diff --git a/contrib/pg_pathman/.gitignore b/contrib/pg_pathman/.gitignore index 7671c4d178..50fb51a52f 100644 --- a/contrib/pg_pathman/.gitignore +++ b/contrib/pg_pathman/.gitignore @@ -1,7 +1,9 @@ .deps -results/pg_pathman.out +isolation_output +results/* regression.diffs regression.out *.o *.so +*.pyc pg_pathman--*.sql diff --git a/contrib/pg_pathman/.travis.yml b/contrib/pg_pathman/.travis.yml index 047a1c52d4..36b5bc04ab 100644 --- a/contrib/pg_pathman/.travis.yml +++ b/contrib/pg_pathman/.travis.yml @@ -14,6 +14,8 @@ before_install: - sudo sh ./travis/apt.postgresql.org.sh env: + - PGVERSION=9.6 CHECK_CODE=true + - PGVERSION=9.6 CHECK_CODE=false - PGVERSION=9.5 CHECK_CODE=true - PGVERSION=9.5 CHECK_CODE=false diff --git a/contrib/pg_pathman/Makefile b/contrib/pg_pathman/Makefile index 977c96b594..a451089773 100644 --- a/contrib/pg_pathman/Makefile +++ b/contrib/pg_pathman/Makefile @@ -1,15 +1,23 @@ # contrib/pg_pathman/Makefile MODULE_big = pg_pathman -OBJS = src/init.o src/utils.o src/runtimeappend.o src/runtime_merge_append.o src/pg_pathman.o src/dsm_array.o \ - src/rangeset.o src/pl_funcs.o src/worker.o src/hooks.o src/nodes_common.o $(WIN32RES) +OBJS = src/init.o src/relation_info.o src/utils.o src/partition_filter.o \ + src/runtimeappend.o src/runtime_merge_append.o src/pg_pathman.o src/rangeset.o \ + src/pl_funcs.o src/pl_range_funcs.o src/pl_hash_funcs.o src/pathman_workers.o \ + src/hooks.o src/nodes_common.o src/xact_handling.o src/copy_stmt_hooking.o \ + src/pg_compat.o $(WIN32RES) EXTENSION = pg_pathman -EXTVERSION = 0.1 +EXTVERSION = 1.0 DATA_built = $(EXTENSION)--$(EXTVERSION).sql PGFILEDESC = "pg_pathman - partitioning tool" -REGRESS = pg_pathman +REGRESS = pathman_basic \ + pathman_runtime_nodes \ + pathman_callbacks \ + pathman_domains \ + pathman_foreign_keys \ + pathman_rowmarks EXTRA_REGRESS_OPTS=--temp-config=$(top_srcdir)/$(subdir)/conf.add EXTRA_CLEAN = $(EXTENSION)--$(EXTVERSION).sql ./isolation_output @@ -27,7 +35,7 @@ endif $(EXTENSION)--$(EXTVERSION).sql: init.sql hash.sql range.sql cat $^ > $@ -ISOLATIONCHECKS=insert_trigger rollback_on_create_partitions +ISOLATIONCHECKS=insert_nodes for_update rollback_on_create_partitions submake-isolation: $(MAKE) -C $(top_builddir)/src/test/isolation all @@ -35,6 +43,6 @@ submake-isolation: isolationcheck: | submake-isolation $(MKDIR_P) isolation_output $(pg_isolation_regress_check) \ - --temp-config=$(top_srcdir)/$(subdir)/conf.add \ - --outputdir=./isolation_output \ - $(ISOLATIONCHECKS) + --temp-config=$(top_srcdir)/$(subdir)/conf.add \ + --outputdir=./isolation_output \ + $(ISOLATIONCHECKS) diff --git a/contrib/pg_pathman/README.md b/contrib/pg_pathman/README.md index 95f0f9059c..ec176bb822 100644 --- a/contrib/pg_pathman/README.md +++ b/contrib/pg_pathman/README.md @@ -1,9 +1,12 @@ [![Build Status](https://travis-ci.org/postgrespro/pg_pathman.svg?branch=master)](https://travis-ci.org/postgrespro/pg_pathman) +[![PGXN version](https://badge.fury.io/pg/pg_pathman.svg)](https://badge.fury.io/pg/pg_pathman) # pg_pathman The `pg_pathman` module provides optimized partitioning mechanism and functions to manage partitions. +The extension is compatible with PostgreSQL 9.5 (9.6 support is coming soon). + ## Overview **Partitioning** means splitting one large table into smaller pieces. Each row in such table is moved to a single partition according to the partitioning key. PostgreSQL supports partitioning via table inheritance: each partition must be created as a child table with CHECK CONSTRAINT. 
For example: @@ -29,20 +32,18 @@ WHERE id = 150 Based on the partitioning type and condition's operator, `pg_pathman` searches for the corresponding partitions and builds the plan. Currently `pg_pathman` supports two partitioning schemes: * **RANGE** - maps rows to partitions using partitioning key ranges assigned to each partition. Optimization is achieved by using the binary search algorithm; -* **HASH** - maps rows to partitions using a generic hash function (only *integer* attributes are supported at the moment). +* **HASH** - maps rows to partitions using a generic hash function. More interesting features are yet to come. Stay tuned! ## Roadmap - * Replace INSERT triggers with a custom node (aka **PartitionFilter**) - * Implement [concurrent partitioning](https://github.com/postgrespro/pg_pathman/tree/concurrent_part) (much more responsive) - * Implement HASH partitioning for non-integer attributes - * Optimize hash join (both tables are partitioned by join key) - * Implement LIST partitioning scheme + + * Implement LIST partitioning scheme; + * Optimize hash join (both tables are partitioned by join key). ## Installation guide To install `pg_pathman`, execute this in the module's directory: -``` +```shell make install USE_PGXS=1 ``` Modify the **`shared_preload_libraries`** parameter in `postgresql.conf` as following: @@ -50,7 +51,7 @@ Modify the **`shared_preload_libraries`** parameter in `postgresql.conf` as foll shared_preload_libraries = 'pg_pathman' ``` It is essential to restart the PostgreSQL instance. After that, execute the following query in psql: -``` +```plpgsql CREATE EXTENSION pg_pathman; ``` @@ -62,108 +63,254 @@ Done! Now it's time to setup your partitioning schemes. ### Partition creation ```plpgsql -create_hash_partitions(relation TEXT, +create_hash_partitions(relation REGCLASS, attribute TEXT, - partitions_count INTEGER) + partitions_count INTEGER, + partition_name TEXT DEFAULT NULL, + partition_data BOOLEAN DEFAULT TRUE) ``` -Performs HASH partitioning for `relation` by integer key `attribute`. Creates `partitions_count` partitions and trigger on INSERT. All the data will be automatically copied from the parent to partitions. +Performs HASH partitioning for `relation` by integer key `attribute`. The `partitions_count` parameter specifies the number of partitions to create; it cannot be changed afterwards. If `partition_data` is `true` then all the data will be automatically copied from the parent table to partitions. Note that data migration may take a while to finish and the table will be locked until the transaction commits. See `partition_table_concurrently()` for a lock-free way to migrate data. Partition creation callback is invoked for each partition if set beforehand (see `set_init_callback()`). ```plpgsql -create_range_partitions(relation TEXT, - attribute TEXT, - start_value ANYELEMENT, - interval ANYELEMENT, - premake INTEGER DEFAULT NULL) +create_range_partitions(relation REGCLASS, + attribute TEXT, + start_value ANYELEMENT, + interval ANYELEMENT, + count INTEGER DEFAULT NULL, + partition_data BOOLEAN DEFAULT TRUE) + +create_range_partitions(relation REGCLASS, + attribute TEXT, + start_value ANYELEMENT, + interval INTERVAL, + count INTEGER DEFAULT NULL, + partition_data BOOLEAN DEFAULT TRUE) +``` +Performs RANGE partitioning for `relation` by partitioning key `attribute`.
`start_value` argument specifies initial value, `interval` sets the range of values in a single partition, `count` is the number of premade partitions (if not set then pathman tries to determine it based on attribute values). Partition creation callback is invoked for each partition if set beforehand. -create_range_partitions(relation TEXT, - attribute TEXT, - start_value ANYELEMENT, - interval INTERVAL, - premake INTEGER DEFAULT NULL) +```plpgsql +create_partitions_from_range(relation REGCLASS, + attribute TEXT, + start_value ANYELEMENT, + end_value ANYELEMENT, + interval ANYELEMENT, + partition_data BOOLEAN DEFAULT TRUE) + +create_partitions_from_range(relation REGCLASS, + attribute TEXT, + start_value ANYELEMENT, + end_value ANYELEMENT, + interval INTERVAL, + partition_data BOOLEAN DEFAULT TRUE) ``` -Performs RANGE partitioning for `relation` by partitioning key `attribute`. `start_value` argument specifies initial value, `interval` sets the range of values in a single partition, `premake` is the number of premade partitions (if not set then pathman tries to determine it based on attribute values). All the data will be automatically copied from the parent to partitions. +Performs RANGE-partitioning from specified range for `relation` by partitioning key `attribute`. Partition creation callback is invoked for each partition if set beforehand. + +### Data migration ```plpgsql -create_partitions_from_range(relation TEXT, - attribute TEXT, - start_value ANYELEMENT, - end_value ANYELEMENT, - interval ANYELEMENT) +partition_table_concurrently(relation REGCLASS) +``` +Starts a background worker to move data from parent table to partitions. The worker utilizes short transactions to copy small batches of data (up to 10K rows per transaction) and thus doesn't significantly interfere with user's activity. -create_partitions_from_range(relation TEXT, - attribute TEXT, - start_value ANYELEMENT, - end_value ANYELEMENT, - interval INTERVAL) +```plpgsql +stop_concurrent_part_task(relation REGCLASS) ``` -Performs RANGE-partitioning from specified range for `relation` by partitioning key `attribute`. Data will be copied to partitions as well. +Stops a background worker performing a concurrent partitioning task. Note: worker will exit after it finishes relocating a current batch. ### Triggers ```plpgsql -create_hash_update_trigger(parent TEXT) +create_hash_update_trigger(parent REGCLASS) ``` Creates the trigger on UPDATE for HASH partitions. The UPDATE trigger isn't created by default because of the overhead. It's useful in cases when the key attribute might change. ```plpgsql -create_range_update_trigger(parent TEXT) +create_range_update_trigger(parent REGCLASS) ``` Same as above, but for a RANGE-partitioned table. ### Post-creation partition management ```plpgsql -split_range_partition(partition TEXT, value ANYELEMENT) +split_range_partition(partition REGCLASS, + value ANYELEMENT, + partition_name TEXT DEFAULT NULL) ``` -Split RANGE `partition` in two by `value`. +Split RANGE `partition` in two by `value`. Partition creation callback is invoked for a new partition if available. ```plpgsql -merge_range_partitions(partition1 TEXT, partition2 TEXT) +merge_range_partitions(partition1 REGCLASS, partition2 REGCLASS) ``` Merge two adjacent RANGE partitions. First, data from `partition2` is copied to `partition1`, then `partition2` is removed. 
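For instance, assuming a table partitioned by a date column with two adjacent partitions `journal_1` and `journal_2` (the names here are purely illustrative), the two functions above could be combined like this:
```plpgsql
/* fold journal_2 into journal_1; journal_2 is dropped once its rows are copied */
SELECT merge_range_partitions('journal_1', 'journal_2');

/* split the enlarged partition back in two at the given key value */
SELECT split_range_partition('journal_1', '2015-02-01'::date);
```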
```plpgsql -append_range_partition(p_relation TEXT) +append_range_partition(p_relation REGCLASS, + partition_name TEXT DEFAULT NULL, + tablespace TEXT DEFAULT NULL) ``` -Append new RANGE partition. +Append new RANGE partition with `pathman_config.range_interval` as interval. ```plpgsql -prepend_range_partition(p_relation TEXT) +prepend_range_partition(p_relation REGCLASS, + partition_name TEXT DEFAULT NULL, + tablespace TEXT DEFAULT NULL) ``` -Prepend new RANGE partition. +Prepend new RANGE partition with `pathman_config.range_interval` as interval. ```plpgsql -add_range_partition(relation TEXT, - start_value ANYELEMENT, - end_value ANYELEMENT) +add_range_partition(relation REGCLASS, + start_value ANYELEMENT, + end_value ANYELEMENT, + partition_name TEXT DEFAULT NULL, + tablespace TEXT DEFAULT NULL) ``` Create new RANGE partition for `relation` with specified range bounds. ```plpgsql -drop_range_partition(partition TEXT) +drop_range_partition(partition TEXT, delete_data BOOLEAN DEFAULT TRUE) ``` -Drop RANGE partition and all its data. +Drop RANGE partition and all of its data if `delete_data` is true. ```plpgsql -attach_range_partition(relation TEXT, - partition TEXT, +attach_range_partition(relation REGCLASS, + partition REGCLASS, start_value ANYELEMENT, end_value ANYELEMENT) ``` -Attach partition to the existing RANGE-partitioned relation. The attached table must have exactly the same structure as the parent table, including the dropped columns. +Attach partition to the existing RANGE-partitioned relation. The attached table must have exactly the same structure as the parent table, including the dropped columns. Partition creation callback is invoked if set (see `pathman_config_params`). ```plpgsql -detach_range_partition(partition TEXT) +detach_range_partition(partition REGCLASS) ``` Detach partition from the existing RANGE-partitioned relation. ```plpgsql -disable_partitioning(relation TEXT) +disable_pathman_for(relation TEXT) ``` Permanently disable `pg_pathman` partitioning mechanism for the specified parent table and remove the insert trigger if it exists. All partitions and data remain unchanged. +```plpgsql +drop_partitions(parent REGCLASS, + delete_data BOOLEAN DEFAULT FALSE) +``` +Drop partitions of the `parent` table (both foreign and local relations). If `delete_data` is `false`, the data is copied to the parent table first. Default is `false`. + + +### Additional parameters + +```plpgsql +set_enable_parent(relation REGCLASS, value BOOLEAN) +``` +Include/exclude parent table into/from query plan. In original PostgreSQL planner parent table is always included into query plan even if it's empty which can lead to additional overhead. You can use `disable_parent()` if you are never going to use parent table as a storage. Default value depends on the `partition_data` parameter that was specified during initial partitioning in `create_range_partitions()` or `create_partitions_from_range()` functions. If the `partition_data` parameter was `true` then all data have already been migrated to partitions and parent table disabled. Otherwise it is enabled. + +```plpgsql +set_auto(relation REGCLASS, value BOOLEAN) +``` +Enable/disable auto partition propagation (only for RANGE partitioning). It is enabled by default. + +```plpgsql +set_init_callback(relation REGCLASS, callback REGPROC DEFAULT 0) +``` +Set partition creation callback to be invoked for each attached or created partition (both HASH and RANGE). 
The callback must have the following signature: `part_init_callback(args JSONB) RETURNS VOID`. Parameter `arg` consists of several fields whose presence depends on partitioning type: +```json +/* RANGE-partitioned table abc (child abc_4) */ +{ + "parent": "abc", + "parttype": "2", + "partition": "abc_4", + "range_max": "401", + "range_min": "301" +} + +/* HASH-partitioned table abc (child abc_0) */ +{ + "parent": "abc", + "parttype": "1", + "partition": "abc_0" +} +``` + +## Views and tables + +#### `pathman_config` --- main config storage +```plpgsql +CREATE TABLE IF NOT EXISTS pathman_config ( + partrel REGCLASS NOT NULL PRIMARY KEY, + attname TEXT NOT NULL, + parttype INTEGER NOT NULL, + range_interval TEXT, + + CHECK (parttype IN (1, 2)) /* check for allowed part types */ ); +``` +This table stores a list of partitioned tables. + +#### `pathman_config_params` --- optional parameters +```plpgsql +CREATE TABLE IF NOT EXISTS pathman_config_params ( + partrel REGCLASS NOT NULL PRIMARY KEY, + enable_parent BOOLEAN NOT NULL DEFAULT TRUE, + auto BOOLEAN NOT NULL DEFAULT TRUE, + init_callback REGPROCEDURE NOT NULL DEFAULT 0); +``` +This table stores optional parameters which override standard behavior. + +#### `pathman_concurrent_part_tasks` --- currently running partitioning workers +```plpgsql +-- helper SRF function +CREATE OR REPLACE FUNCTION show_concurrent_part_tasks() +RETURNS TABLE ( + userid REGROLE, + pid INT, + dbid OID, + relid REGCLASS, + processed INT, + status TEXT) +AS 'pg_pathman', 'show_concurrent_part_tasks_internal' +LANGUAGE C STRICT; + +CREATE OR REPLACE VIEW pathman_concurrent_part_tasks +AS SELECT * FROM show_concurrent_part_tasks(); +``` +This view lists all currently running concurrent partitioning tasks. + +#### `pathman_partition_list` --- list of all existing partitions +```plpgsql +-- helper SRF function +CREATE OR REPLACE FUNCTION show_partition_list() +RETURNS TABLE ( + parent REGCLASS, + partition REGCLASS, + parttype INT4, + partattr TEXT, + range_min TEXT, + range_max TEXT) +AS 'pg_pathman', 'show_partition_list_internal' +LANGUAGE C STRICT; + +CREATE OR REPLACE VIEW pathman_partition_list +AS SELECT * FROM show_partition_list(); +``` +This view lists all existing partitions, as well as their parents and range boundaries (NULL for HASH partitions). 
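As an illustration of the callback mechanism configured with `set_init_callback()` above, here is a minimal sketch of a `part_init_callback(args JSONB) RETURNS VOID` function that merely logs each new partition. The function name and the `journal` table are hypothetical; only the signature and the payload fields come from the description above:
```plpgsql
/* a sketch; the function name and the table 'journal' are illustrative */
CREATE OR REPLACE FUNCTION on_partition_created(args JSONB)
RETURNS VOID AS
$$
BEGIN
	/* "parent", "parttype" and "partition" are always present in the payload;
	   "range_min" / "range_max" appear for RANGE partitions only */
	RAISE NOTICE 'created partition % of parent % (parttype %)',
	             args->>'partition', args->>'parent', args->>'parttype';
END;
$$ LANGUAGE plpgsql;

/* register the callback; it fires for partitions created or attached afterwards */
SELECT set_init_callback('journal', 'on_partition_created'::regproc);
```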
+ + ## Custom plan nodes `pg_pathman` provides a couple of [custom plan nodes](https://wiki.postgresql.org/wiki/CustomScanAPI) which aim to reduce execution time, namely: - `RuntimeAppend` (overrides `Append` plan node) - `RuntimeMergeAppend` (overrides `MergeAppend` plan node) +- `PartitionFilter` (drop-in replacement for INSERT triggers) + +`PartitionFilter` acts as a *proxy node* for INSERT's child scan, which means it can redirect output tuples to the corresponding partition: + +```plpgsql +EXPLAIN (COSTS OFF) +INSERT INTO partitioned_table +SELECT generate_series(1, 10), random(); + QUERY PLAN +----------------------------------------- + Insert on partitioned_table + -> Custom Scan (PartitionFilter) + -> Subquery Scan on "*SELECT*" + -> Result +(4 rows) +``` `RuntimeAppend` and `RuntimeMergeAppend` have much in common: they come in handy in a case when WHERE condition takes form of: ``` @@ -175,7 +322,7 @@ This kind of expressions can no longer be optimized at planning time since the p There are at least several cases that demonstrate usefulness of these nodes: -``` +```plpgsql /* create table we're going to partition */ CREATE TABLE partitioned_table(id INT NOT NULL, payload REAL); @@ -192,7 +339,7 @@ CREATE TABLE some_table AS SELECT generate_series(1, 100) AS VAL; - **`id = (select ... limit 1)`** -``` +```plpgsql EXPLAIN (COSTS OFF, ANALYZE) SELECT * FROM partitioned_table WHERE id = (SELECT * FROM some_table LIMIT 1); QUERY PLAN @@ -232,7 +379,7 @@ WHERE id = (SELECT * FROM some_table LIMIT 1); ``` - **`id = ANY (select ...)`** -``` +```plpgsql EXPLAIN (COSTS OFF, ANALYZE) SELECT * FROM partitioned_table WHERE id = any (SELECT * FROM some_table limit 4); QUERY PLAN @@ -284,15 +431,40 @@ In case you're interested, you can read more about custom nodes at Alexander Kor ### Common tips - You can easily add **_partition_** column containing the names of the underlying partitions using the system attribute called **_tableoid_**: -``` +```plpgsql SELECT tableoid::regclass AS partition, * FROM partitioned_table; ``` -- Though indices on a parent table aren't particularly useful (since it's empty), they act as prototypes for indices on partitions. For each index on the parent table, `pg_pathman` will create a similar index on every partition. +- Though indices on a parent table aren't particularly useful (since it's supposed to be empty), they act as prototypes for indices on partitions. For each index on the parent table, `pg_pathman` will create a similar index on every partition. + +- All running concurrent partitioning tasks can be listed using the `pathman_concurrent_part_tasks` view: +```plpgsql +SELECT * FROM pathman_concurrent_part_tasks; + userid | pid | dbid | relid | processed | status +--------+------+-------+-------+-----------+--------- + dmitry | 7367 | 16384 | test | 472000 | working +(1 row) +``` + +- `pathman_partition_list` in conjunction with `drop_range_partition()` can be used to drop RANGE partitions in a more flexible way compared to good old `DROP TABLE`: +```plpgsql +SELECT drop_range_partition(partition, false) /* move data to parent */ +FROM pathman_partition_list +WHERE parent = 'part_test'::regclass AND range_min::int < 500; +NOTICE: 1 rows copied from part_test_11 +NOTICE: 100 rows copied from part_test_1 +NOTICE: 100 rows copied from part_test_2 + drop_range_partition +---------------------- + dummy_test_11 + dummy_test_1 + dummy_test_2 +(3 rows) +``` ### HASH partitioning Consider an example of HASH partitioning. 
First create a table with some integer column: -``` +```plpgsql CREATE TABLE items ( id SERIAL PRIMARY KEY, name TEXT, @@ -303,13 +475,13 @@ SELECT g, md5(g::text), random() * 100000 FROM generate_series(1, 100000) as g; ``` Now run the `create_hash_partitions()` function with appropriate arguments: -``` +```plpgsql SELECT create_hash_partitions('items', 'id', 100); ``` This will create new partitions and move the data from parent to partitions. Here's an example of the query performing filtering by partitioning key: -``` +```plpgsql SELECT * FROM items WHERE id = 1234; id | name | code ------+----------------------------------+------ @@ -329,7 +501,7 @@ Notice that the `Append` node contains only one child scan which corresponds to > **Important:** pay attention to the fact that `pg_pathman` excludes the parent table from the query plan. To access parent table use ONLY modifier: -``` +```plpgsql EXPLAIN SELECT * FROM ONLY items; QUERY PLAN ------------------------------------------------------ @@ -337,13 +509,12 @@ EXPLAIN SELECT * FROM ONLY items; ``` ### RANGE partitioning Consider an example of RANGE partitioning. Let's create a table containing some dummy logs: -``` +```plpgsql CREATE TABLE journal ( id SERIAL, dt TIMESTAMP NOT NULL, level INTEGER, - msg TEXT -); + msg TEXT); -- similar index will also be created for each partition CREATE INDEX ON journal(dt); @@ -354,47 +525,47 @@ SELECT g, random() * 6, md5(g::text) FROM generate_series('2015-01-01'::date, '2015-12-31'::date, '1 minute') as g; ``` Run the `create_range_partitions()` function to create partitions so that each partition would contain the data for one day: -``` +```plpgsql SELECT create_range_partitions('journal', 'dt', '2015-01-01'::date, '1 day'::interval); ``` It will create 365 partitions and move the data from parent to partitions. New partitions are appended automaticaly by insert trigger, but it can be done manually with the following functions: -``` --- append new partition with specified range +```plpgsql +-- add new partition with specified range SELECT add_range_partition('journal', '2016-01-01'::date, '2016-01-07'::date); -- append new partition with default range SELECT append_range_partition('journal'); ``` The first one creates a partition with specified range. The second one creates a partition with default interval and appends it to the partition list. It is also possible to attach an existing table as partition. For example, we may want to attach an archive table (or even foreign table from another server) for some outdated data: -``` +```plpgsql CREATE FOREIGN TABLE journal_archive ( id INTEGER NOT NULL, dt TIMESTAMP NOT NULL, level INTEGER, - msg TEXT -) SERVER archive_server; + msg TEXT) +SERVER archive_server; SELECT attach_range_partition('journal', 'journal_archive', '2014-01-01'::date, '2015-01-01'::date); ``` > **Important:** the definition of the attached table must match the one of the existing partitioned table, including the dropped columns. 
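Once attached, the archive table shows up in `pathman_partition_list` next to the regular partitions, so a quick sanity check might look like this (using the `journal` table from the example above):
```plpgsql
SELECT parent, partition, range_min, range_max
FROM pathman_partition_list
WHERE parent = 'journal'::regclass;
```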
To merge to adjacent partitions, use the `merge_range_partitions()` function: -``` +```plpgsql SELECT merge_range_partitions('journal_archive', 'journal_1'); ``` To split partition by value, use the `split_range_partition()` function: -``` +```plpgsql SELECT split_range_partition('journal_366', '2016-01-03'::date); ``` To detach partition, use the `detach_range_partition()` function: -``` +```plpgsql SELECT detach_range_partition('journal_archive'); ``` Here's an example of the query performing filtering by partitioning key: -``` +```plpgsql SELECT * FROM journal WHERE dt >= '2015-06-01' AND dt < '2015-06-03'; id | dt | level | msg --------+---------------------+-------+---------------------------------- @@ -419,10 +590,14 @@ There are several user-accessible [GUC](https://www.postgresql.org/docs/9.5/stat - `pg_pathman.enable` --- disable (or enable) `pg_pathman` completely - `pg_pathman.enable_runtimeappend` --- toggle `RuntimeAppend` custom node on\off - `pg_pathman.enable_runtimemergeappend` --- toggle `RuntimeMergeAppend` custom node on\off + - `pg_pathman.enable_partitionfilter` --- toggle `PartitionFilter` custom node on\off + - `pg_pathman.enable_auto_partition` --- toggle automatic partition creation on\off (per session) + - `pg_pathman.insert_into_fdw` --- allow INSERTs into various FDWs `(disabled | postgres | any_fdw)` + - `pg_pathman.override_copy` --- toggle COPY statement hooking on\off -To **permanently** disable `pg_pathman` for some previously partitioned table, use the `disable_partitioning()` function: +To **permanently** disable `pg_pathman` for some previously partitioned table, use the `disable_pathman_for()` function: ``` -SELECT disable_partitioning('range_rel'); +SELECT disable_pathman_for('range_rel'); ``` All sections and data will remain unchanged and will be handled by the standard PostgreSQL inheritance mechanism. @@ -430,6 +605,6 @@ All sections and data will remain unchanged and will be handled by the standard Do not hesitate to post your issues, questions and new ideas at the [issues](https://github.com/postgrespro/pg_pathman/issues) page. ## Authors -Ildar Musin Postgres Professional Ltd., Russia -Alexander Korotkov Postgres Professional Ltd., Russia -Dmitry Ivanov Postgres Professional Ltd., Russia +Ildar Musin Postgres Professional Ltd., Russia +Alexander Korotkov Postgres Professional Ltd., Russia +Dmitry Ivanov Postgres Professional Ltd., Russia diff --git a/contrib/pg_pathman/README.rus.md b/contrib/pg_pathman/README.rus.md index f8c45306a5..6acea3c5d1 100644 --- a/contrib/pg_pathman/README.rus.md +++ b/contrib/pg_pathman/README.rus.md @@ -1,10 +1,15 @@ +[![Build Status](https://travis-ci.org/postgrespro/pg_pathman.svg?branch=master)](https://travis-ci.org/postgrespro/pg_pathman) +[![PGXN version](https://badge.fury.io/pg/pg_pathman.svg)](https://badge.fury.io/pg/pg_pathman) + # pg_pathman Модуль `pg_pathman` предоставляет оптимизированный механизм секционирования, а также функции для создания и управления секциями. +Расширение совместимо с PostgreSQL 9.5 (поддержка 9.6 будет добавлена в одном из ближайших обновлений). + ## Концепция pg_pathman -Секционирование -- это способ разбиения одной большой таблицы на множество меньших по размеру. Для каждой записи можно однозначно определить секцию, в которой она должна храниться посредством вычисления ключа. +**Секционирование** -- это способ разбиения одной большой таблицы на множество меньших по размеру. 
Для каждой записи можно однозначно определить секцию, в которой она должна храниться посредством вычисления ключа. Секционирование в postgres основано на механизме наследования. Каждому наследнику задается условие CHECK CONSTRAINT. Например: ``` @@ -29,16 +34,16 @@ WHERE id = 150 В текущей версии `pg_pathman` поддерживает следующие типы секционирования: -* RANGE - разбивает таблицу на секции по диапазонам ключевого аттрибута; для оптимизации построения плана используется метод бинарного поиска. -* HASH - данные равномерно распределяются по секциям в соответствии со значениями hash-функции, вычисленными по заданному целочисленному атрибуту. +* **RANGE** - разбивает таблицу на секции по диапазонам ключевого аттрибута; для оптимизации построения плана используется метод бинарного поиска. +* **HASH** - данные равномерно распределяются по секциям в соответствии со значениями hash-функции, вычисленными по заданному целочисленному атрибуту. + +More interesting features are yet to come. Stay tuned! ## Roadmap - * Выбор секций на этапе выполнения запроса (полезно для nested loop join, prepared statements); - * Оптимизация выдачи упорядоченных результатов из секционированных таблиц (полезно для merge join, order by); - * Оптимизация hash join для случая, когда обе таблицы секционированы по ключу join’а; + * Предоставить возможность установки пользовательских колбеков на создание\уничтожение партиции (issue [#22](https://github.com/postgrespro/pg_pathman/issues/22)) * LIST-секционирование; - * HASH-секционирование по ключевому атрибуту с типом, отличным от INTEGER. + * Оптимизация hash join для случая, когда обе таблицы секционированы по ключу join’а. ## Установка @@ -55,113 +60,306 @@ shared_preload_libraries = 'pg_pathman' CREATE EXTENSION pg_pathman; ``` -## Функции pg_pathman +> **Важно:** Если вы хотите собрать `pg_pathman` для работы с кастомной сборкой PostgreSQL, не забудьте установить переменную окружения `PG_CONFIG` равной пути к исполняемому файлу pg_config. Узнать больше о сборке расширений для PostgreSQL можно по ссылке: [here](https://wiki.postgresql.org/wiki/Building_and_Installing_PostgreSQL_Extension_Modules). + +## Функции `pg_pathman` ### Создание секций +```plpgsql +create_hash_partitions(relation REGCLASS, + attribute TEXT, + partitions_count INTEGER, + partition_name TEXT DEFAULT NULL) ``` -create_hash_partitions( - relation TEXT, - attribute TEXT, - partitions_count INTEGER) -``` -Выполняет HASH-секционирование таблицы `relation` по целочисленному полю `attribute`. Создает `partitions_count` дочерних секций, а также триггер на вставку. Данные из родительской таблицы будут автоматически скопированы в дочерние. +Выполняет HASH-секционирование таблицы `relation` по целочисленному полю `attribute`. Параметр `partitions_count` определяет, сколько секций будет создано. Если `partition_data` установлен в значение `true`, то данные из родительской таблицы будут автоматически распределены по секциям. Стоит иметь в виду, что миграция данных может занять некоторое время, а данные заблокированы. Для конкурентной миграции данных см. функцию `partition_table_concurrently()`. 
+ +```plpgsql +create_range_partitions(relation REGCLASS, + attribute TEXT, + start_value ANYELEMENT, + interval ANYELEMENT, + count INTEGER DEFAULT NULL + partition_data BOOLEAN DEFAULT true) +create_range_partitions(relation REGCLASS, + attribute TEXT, + start_value ANYELEMENT, + interval INTERVAL, + count INTEGER DEFAULT NULL, + partition_data BOOLEAN DEFAULT true) ``` -create_range_partitions( - relation TEXT, - attribute TEXT, - start_value ANYELEMENT, - interval ANYELEMENT, - premake INTEGER DEFAULT NULL) +Выполняет RANGE-секционирование таблицы `relation` по полю `attribute`. Аргумент `start_value` задает начальное значение, `interval` -- диапазон значений внутри одной секции, `count` -- количество создаваемых секций (если не задано, то pathman попытается определить количество секций на основе значений аттрибута). + +```plpgsql +create_partitions_from_range(relation REGCLASS, + attribute TEXT, + start_value ANYELEMENT, + end_value ANYELEMENT, + interval ANYELEMENT, + partition_data BOOLEAN DEFAULT true) -create_range_partitions( - relation TEXT, - attribute TEXT, - start_value ANYELEMENT, - interval INTERVAL, - premake INTEGER DEFAULT NULL) +create_partitions_from_range(relation REGCLASS, + attribute TEXT, + start_value ANYELEMENT, + end_value ANYELEMENT, + interval INTERVAL, + partition_data BOOLEAN DEFAULT true) ``` -Выполняет RANGE-секционирование таблицы `relation` по полю `attribute`. Аргумент `start_value` задает начальное значение, `interval` -- диапазон значений внутри одной секции, `premake` -- количество заранее создаваемых секций (если не задано, то pathman попытается определить количество секций на основе значений аттрибута). Данные из родительской таблицы будут автоматически скопированы в дочерние. +Выполняет RANGE-секционирование для заданного диапазона таблицы `relation` по полю `attribute`. +### Миграция данных + +```plpgsql +partition_table_concurrently(relation REGCLASS) ``` -create_partitions_from_range( - relation TEXT, - attribute TEXT, - start_value ANYELEMENT, - end_value ANYELEMENT, - interval ANYELEMENT) +Запускает новый процесс (background worker) для конкурентного перемещения данных из родительской таблицы в дочерние секции. Рабочий процесс использует короткие транзакции для перемещения небольших объемов данных (порядка 10 тысяч записей) и, таким образом, не оказывает существенного влияния на работу пользователей. -create_partitions_from_range( - relation TEXT, - attribute TEXT, - start_value ANYELEMENT, - end_value ANYELEMENT, - interval INTERVAL) +```plpgsql +stop_concurrent_part_task(relation REGCLASS) ``` -Выполняет RANGE-секционирование для заданного диапазона таблицы `relation` по полю `attribute`. Данные также будут скопированы в дочерние секции. +Останавливает процесс конкурентного партиционирования. Обратите внимание, что процесс завершается не мгновенно, а только по завершении текущей транзакции. ### Утилиты -``` -create_hash_update_trigger(parent TEXT) +```plpgsql +create_hash_update_trigger(parent REGCLASS) ``` Создает триггер на UPDATE для HASH секций. По-умолчанию триггер на обновление данных не создается, т.к. это создает дополнительные накладные расходы. Триггер полезен только в том случае, когда меняется значение ключевого аттрибута. -``` -create_range_update_trigger(parent TEXT) +```plpgsql +create_range_update_trigger(parent REGCLASS) ``` Аналогично предыдущей, но для RANGE секций. 
### Управление секциями -``` -split_range_partition(partition TEXT, value ANYELEMENT) +```plpgsql +split_range_partition(partition REGCLASS, + value ANYELEMENT, + partition_name TEXT DEFAULT NULL,) ``` Разбивает RANGE секцию `partition` на две секции по значению `value`. -``` -merge_range_partitions(partition1 TEXT, partition2 TEXT) + +```plpgsql +merge_range_partitions(partition1 REGCLASS, partition2 REGCLASS) ``` Объединяет две смежные RANGE секции. Данные из `partition2` копируются в `partition1`, после чего секция `partition2` удаляется. + +```plpgsql +append_range_partition(p_relation REGCLASS, + partition_name TEXT DEFAULT NULL) ``` -append_range_partition(p_relation TEXT) +Добавляет новую RANGE секцию с диапазоном `pathman_config.range_interval` в конец списка секций. + +```plpgsql +prepend_range_partition(p_relation REGCLASS, + partition_name TEXT DEFAULT NULL) ``` -Добавляет новую RANGE секцию в конец списка секций. +Добавляет новую RANGE секцию с диапазоном `pathman_config.range_interval` в начало списка секций. + +```plpgsql +add_range_partition(relation REGCLASS, + start_value ANYELEMENT, + end_value ANYELEMENT, + partition_name TEXT DEFAULT NULL) ``` -prepend_range_partition(p_relation TEXT) +Добавляет новую RANGE секцию с заданным диапазоном к секционированной таблице `relation`. + +```plpgsql +drop_range_partition(partition TEXT) ``` -Добавляет новую RANGE секцию в начало списка секций. +Удаляет RANGE секцию вместе с содержащимися в ней данными. +```plpgsql +attach_range_partition(relation REGCLASS, + partition REGCLASS, + start_value ANYELEMENT, + end_value ANYELEMENT) ``` -add_range_partition( - relation TEXT, - start_value ANYELEMENT, - end_value ANYELEMENT) +Присоединяет существующую таблицу `partition` в качестве секции к ранее секционированной таблице `relation`. Структура присоединяемой таблицы должна в точности повторять структуру родительской. + +```plpgsql +detach_range_partition(partition REGCLASS) ``` -Добавляет новую RANGE секцию с заданным диапазоном к секционированной таблице `relation`. +Отсоединяет секцию `partition`, после чего она становится независимой таблицей. +```plpgsql +disable_pathman_for(relation REGCLASS) ``` -drop_range_partition(partition TEXT) +Отключает механизм секционирования `pg_pathman` для заданной таблицы. При этом созданные ранее секции остаются без изменений. + +```plpgsql +drop_partitions(parent REGCLASS, + delete_data BOOLEAN DEFAULT FALSE) ``` -Удаляет RANGE секцию вместе с содержащимися в ней данными. +Удаляет все секции таблицы `parent`. Если параметр `delete_data` задан как `false` (по-умолчанию `false`), то данные из секций копируются в родительскую таблицу. + +### Дополнительные параметры +```plpgsql +enable_parent(relation REGCLASS) +disable_parent(relation REGCLASS) ``` -attach_range_partition( - relation TEXT, - partition TEXT, - start_value ANYELEMENT, - end_value ANYELEMENT) +Включает/исключает родительскую таблицу в план запроса. В оригинальном планировщике PostgreSQL родительская таблица всегда включается в план запроса, даже если она пуста. Это создает дополнительные накладные расходы. Выполните `disable_parent()`, если вы не собираетесь хранить какие-либо данные в родительской таблице. Значение по-умолчанию зависит от того, был ли установлен параметр `partition_data` при первоначальном разбиении таблицы (см. функции `create_range_partitions()` и `create_partitions_from_range()`). Если он был установлен в значение `true`, то все данные были перемещены в секции, а родительская таблица отключена. 
В противном случае родительская таблица по-умолчанию влючена. + +```plpgsql +enable_auto(relation REGCLASS) +disable_auto(relation REGCLASS) ``` -Присоединяет существующую таблицу `partition` в качестве секции к ранее секционированной таблице `relation`. Структура присоединяемой таблицы должна в точности повторять структуру родительской. +Включает/выключает автоматическое создание секций (только для RANGE секционирования). По-умолчанию включено. + +## Custom plan nodes +`pg_pathman` вводит три новых узла плана (см. [custom plan nodes](https://wiki.postgresql.org/wiki/CustomScanAPI)), предназначенных для оптимизации времени выполнения: + +- `RuntimeAppend` (замещает узел типа `Append`) +- `RuntimeMergeAppend` (замещает узел типа `MergeAppend`) +- `PartitionFilter` (выполняет работу INSERT-триггера) + +`PartitionFilter` работает как прокси-узел для INSERT-запросов, распределяя новые записи по соответствующим секциям: ``` -detach_range_partition(partition TEXT) +EXPLAIN (COSTS OFF) +INSERT INTO partitioned_table +SELECT generate_series(1, 10), random(); + QUERY PLAN +----------------------------------------- + Insert on partitioned_table + -> Custom Scan (PartitionFilter) + -> Subquery Scan on "*SELECT*" + -> Result +(4 rows) ``` -Отсоединяет секцию `partition`, после чего она становится независимой таблицей. +Узлы `RuntimeAppend` и `RuntimeMergeAppend` имеют между собой много общего: они нужны в случает, когда условие WHERE принимает форму: +``` +ПЕРЕМЕННАЯ ОПЕРАТОР ПАРАМЕТР +``` +Подобные выражения не могут быть оптимизированы во время планирования, т.к. значение параметра неизвестно до стадии выполнения. Проблема может быть решена путем встраивания дополнительной процедуры анализа в код `Append` узла, таким образом позволяя ему выбирать лишь необходимые субпланы из всего списка дочерних планов. + +---------- + +Есть по меньшей мере несколько ситуаций, которые демонстрируют полезность таких узлов: + +``` +/* создаем таблицу, которую хотим секционировать */ +CREATE TABLE partitioned_table(id INT NOT NULL, payload REAL); + +/* заполняем данными */ +INSERT INTO partitioned_table +SELECT generate_series(1, 1000), random(); + +/* выполняем секционирование */ +SELECT create_hash_partitions('partitioned_table', 'id', 100); + +/* создаем обычную таблицу */ +CREATE TABLE some_table AS SELECT generate_series(1, 100) AS VAL; +``` + + + - **`id = (select ... 
limit 1)`** ``` -disable_partitioning(relation TEXT) +EXPLAIN (COSTS OFF, ANALYZE) SELECT * FROM partitioned_table +WHERE id = (SELECT * FROM some_table LIMIT 1); + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Custom Scan (RuntimeAppend) (actual time=0.030..0.033 rows=1 loops=1) + InitPlan 1 (returns $0) + -> Limit (actual time=0.011..0.011 rows=1 loops=1) + -> Seq Scan on some_table (actual time=0.010..0.010 rows=1 loops=1) + -> Seq Scan on partitioned_table_70 partitioned_table (actual time=0.004..0.006 rows=1 loops=1) + Filter: (id = $0) + Rows Removed by Filter: 9 + Planning time: 1.131 ms + Execution time: 0.075 ms +(9 rows) + +/* выключаем узел RuntimeAppend */ +SET pg_pathman.enable_runtimeappend = f; + +EXPLAIN (COSTS OFF, ANALYZE) SELECT * FROM partitioned_table +WHERE id = (SELECT * FROM some_table LIMIT 1); + QUERY PLAN +---------------------------------------------------------------------------------- + Append (actual time=0.196..0.274 rows=1 loops=1) + InitPlan 1 (returns $0) + -> Limit (actual time=0.005..0.005 rows=1 loops=1) + -> Seq Scan on some_table (actual time=0.003..0.003 rows=1 loops=1) + -> Seq Scan on partitioned_table_0 (actual time=0.014..0.014 rows=0 loops=1) + Filter: (id = $0) + Rows Removed by Filter: 6 + -> Seq Scan on partitioned_table_1 (actual time=0.003..0.003 rows=0 loops=1) + Filter: (id = $0) + Rows Removed by Filter: 5 + ... /* more plans follow */ + Planning time: 1.140 ms + Execution time: 0.855 ms +(306 rows) +``` + + - **`id = ANY (select ...)`** +``` +EXPLAIN (COSTS OFF, ANALYZE) SELECT * FROM partitioned_table +WHERE id = any (SELECT * FROM some_table limit 4); + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Nested Loop (actual time=0.025..0.060 rows=4 loops=1) + -> Limit (actual time=0.009..0.011 rows=4 loops=1) + -> Seq Scan on some_table (actual time=0.008..0.010 rows=4 loops=1) + -> Custom Scan (RuntimeAppend) (actual time=0.002..0.004 rows=1 loops=4) + -> Seq Scan on partitioned_table_70 partitioned_table (actual time=0.001..0.001 rows=10 loops=1) + -> Seq Scan on partitioned_table_26 partitioned_table (actual time=0.002..0.003 rows=9 loops=1) + -> Seq Scan on partitioned_table_27 partitioned_table (actual time=0.001..0.002 rows=20 loops=1) + -> Seq Scan on partitioned_table_63 partitioned_table (actual time=0.001..0.002 rows=9 loops=1) + Planning time: 0.771 ms + Execution time: 0.101 ms +(10 rows) + +/* выключаем узел RuntimeAppend */ +SET pg_pathman.enable_runtimeappend = f; + +EXPLAIN (COSTS OFF, ANALYZE) SELECT * FROM partitioned_table +WHERE id = any (SELECT * FROM some_table limit 4); + QUERY PLAN +----------------------------------------------------------------------------------------- + Nested Loop Semi Join (actual time=0.531..1.526 rows=4 loops=1) + Join Filter: (partitioned_table.id = some_table.val) + Rows Removed by Join Filter: 3990 + -> Append (actual time=0.190..0.470 rows=1000 loops=1) + -> Seq Scan on partitioned_table (actual time=0.187..0.187 rows=0 loops=1) + -> Seq Scan on partitioned_table_0 (actual time=0.002..0.004 rows=6 loops=1) + -> Seq Scan on partitioned_table_1 (actual time=0.001..0.001 rows=5 loops=1) + -> Seq Scan on partitioned_table_2 (actual time=0.002..0.004 rows=14 loops=1) +... 
/* 96 scans follow */ + -> Materialize (actual time=0.000..0.000 rows=4 loops=1000) + -> Limit (actual time=0.005..0.006 rows=4 loops=1) + -> Seq Scan on some_table (actual time=0.003..0.004 rows=4 loops=1) + Planning time: 2.169 ms + Execution time: 2.059 ms +(110 rows) +``` + + - **`NestLoop` involving a partitioned table**, which is omitted since it's occasionally shown above. + +---------- + +Узнать больше о работе RuntimeAppend можно в [блоге](http://akorotkov.github.io/blog/2016/06/15/pg_pathman-runtime-append/) Александра Короткова. + +## Примеры + +### Common tips +- You can easily add **_partition_** column containing the names of the underlying partitions using the system attribute called **_tableoid_**: +``` +SELECT tableoid::regclass AS partition, * FROM partitioned_table; +``` +- Несмотря на то, что индексы на родительской таблице не очень полезны (т.к. таблица пуста), они тем не менее выполняют роль прототипов для создания индексов в дочерних таблицах: `pg_pathman` автоматически создает аналогичные индексы для каждой новой секции. + +- Получить все текущие процессы конкурентного секционирования можно из представления `pathman_concurrent_part_tasks`: +```plpgsql +SELECT * FROM pathman_concurrent_part_tasks; + userid | pid | dbid | relid | processed | status +--------+------+-------+-------+-----------+--------- + dmitry | 7367 | 16384 | test | 472000 | working +(1 row) ``` -Отключает механизм секционирования `pg_pathman` для заданной таблицы и удаляет триггер на вставку. При этом созданные ранее секции остаются без изменений. -## Примеры использования -### HASH +### HASH секционирование Рассмотрим пример секционирования таблицы, используя HASH-стратегию на примере таблицы товаров. ``` CREATE TABLE items ( @@ -200,7 +398,7 @@ EXPLAIN SELECT * FROM ONLY items; Seq Scan on items (cost=0.00..0.00 rows=1 width=45) ``` -### RANGE +### RANGE секционирование Рассмотрим пример разбиения таблицы по диапазону дат. Пусть у нас имеется таблица логов: ``` CREATE TABLE journal ( @@ -274,14 +472,22 @@ EXPLAIN SELECT * FROM journal WHERE dt >= '2015-06-01' AND dt < '2015-06-03'; ``` ### Деакцивация pg_pathman -Деактивировать механизм pg_pathman для некоторой ранее разделенной таблицы можно следующей командой disable_partitioning(): +Для включения и отключения модуля `pg_pathman` и отдельных его копонентов существует ряд [GUC](https://www.postgresql.org/docs/9.5/static/config-setting.html) переменных: + + - `pg_pathman.enable` --- полная отключение (или включение) модуля `pg_pathman` + - `pg_pathman.enable_runtimeappend` --- включение/отключение функционала `RuntimeAppend` + - `pg_pathman.enable_runtimemergeappend` --- включение/отключение функционала `RuntimeMergeAppend` + - `pg_pathman.enable_partitionfilter` --- включение/отключение функционала `PartitionFilter` + +Чтобы **безвозвратно** отключить механизм `pg_pathman` для отдельной таблицы, используйте фунцию `disable_pathman_for()`. В результате этой операции структура таблиц останется прежней, но для планирования и выполнения запросов будет использоваться стандартный механизм PostgreSQL. ``` -SELECT disable_partitioning('journal'); +SELECT disable_pathman_for('range_rel'); ``` -Все созданные секции и данные останутся по прежнему доступны и будут обрабатываться стандартным планировщиком PostgreSQL. -## Авторы - -Ильдар Мусин Postgres Professional, Россия +## Обратная связь +Если у вас есть вопросы или предложения, а также если вы обнаружили ошибки, напишите нам в разделе [issues](https://github.com/postgrespro/pg_pathman/issues). 
-Александр Коротков Postgres Professional, Россия +## Авторы +Ильдар Мусин Postgres Professional, Россия +Александр Коротков Postgres Professional, Россия +Дмитрий Иванов Postgres Professional, Россия diff --git a/contrib/pg_pathman/expected/for_update.out b/contrib/pg_pathman/expected/for_update.out new file mode 100644 index 0000000000..3e41031ee3 --- /dev/null +++ b/contrib/pg_pathman/expected/for_update.out @@ -0,0 +1,38 @@ +Parsed test spec with 2 sessions + +starting permutation: s1_b s1_update s2_select s1_r +create_range_partitions + +10 +step s1_b: begin; +step s1_update: update test_tbl set id = 2 where id = 1; +step s2_select: select * from test_tbl where id = 1; +id val + +1 1 +step s1_r: rollback; + +starting permutation: s1_b s1_update s2_select_locked s1_r +create_range_partitions + +10 +step s1_b: begin; +step s1_update: update test_tbl set id = 2 where id = 1; +step s2_select_locked: select * from test_tbl where id = 1 for share; +step s1_r: rollback; +step s2_select_locked: <... completed> +id val + +1 1 + +starting permutation: s1_b s1_update s2_select_locked s1_c +create_range_partitions + +10 +step s1_b: begin; +step s1_update: update test_tbl set id = 2 where id = 1; +step s2_select_locked: select * from test_tbl where id = 1 for share; +step s1_c: commit; +step s2_select_locked: <... completed> +id val + diff --git a/contrib/pg_pathman/expected/insert_trigger.out b/contrib/pg_pathman/expected/insert_nodes.out similarity index 100% rename from contrib/pg_pathman/expected/insert_trigger.out rename to contrib/pg_pathman/expected/insert_nodes.out diff --git a/contrib/pg_pathman/expected/pg_pathman.out b/contrib/pg_pathman/expected/pathman_basic.out similarity index 76% rename from contrib/pg_pathman/expected/pg_pathman.out rename to contrib/pg_pathman/expected/pathman_basic.out index c3a0a1626f..b905b02a0b 100644 --- a/contrib/pg_pathman/expected/pg_pathman.out +++ b/contrib/pg_pathman/expected/pathman_basic.out @@ -9,12 +9,87 @@ INSERT INTO test.hash_rel VALUES (1, 1); INSERT INTO test.hash_rel VALUES (2, 2); INSERT INTO test.hash_rel VALUES (3, 3); SELECT pathman.create_hash_partitions('test.hash_rel', 'value', 3); -ERROR: Partitioning key 'value' must be NOT NULL +ERROR: partitioning key 'value' must be NOT NULL ALTER TABLE test.hash_rel ALTER COLUMN value SET NOT NULL; +SELECT pathman.create_hash_partitions('test.hash_rel', 'value', 3, partition_data:=false); + create_hash_partitions +------------------------ + 3 +(1 row) + +EXPLAIN (COSTS OFF) SELECT * FROM test.hash_rel; + QUERY PLAN +------------------------------ + Append + -> Seq Scan on hash_rel + -> Seq Scan on hash_rel_0 + -> Seq Scan on hash_rel_1 + -> Seq Scan on hash_rel_2 +(5 rows) + +SELECT * FROM test.hash_rel; + id | value +----+------- + 1 | 1 + 2 | 2 + 3 | 3 +(3 rows) + +SELECT pathman.set_enable_parent('test.hash_rel', false); + set_enable_parent +------------------- + +(1 row) + +EXPLAIN (COSTS OFF) SELECT * FROM test.hash_rel; + QUERY PLAN +------------------------------ + Append + -> Seq Scan on hash_rel_0 + -> Seq Scan on hash_rel_1 + -> Seq Scan on hash_rel_2 +(4 rows) + +SELECT * FROM test.hash_rel; + id | value +----+------- +(0 rows) + +SELECT pathman.set_enable_parent('test.hash_rel', true); + set_enable_parent +------------------- + +(1 row) + +EXPLAIN (COSTS OFF) SELECT * FROM test.hash_rel; + QUERY PLAN +------------------------------ + Append + -> Seq Scan on hash_rel + -> Seq Scan on hash_rel_0 + -> Seq Scan on hash_rel_1 + -> Seq Scan on hash_rel_2 +(5 rows) + +SELECT * FROM 
test.hash_rel; + id | value +----+------- + 1 | 1 + 2 | 2 + 3 | 3 +(3 rows) + +SELECT pathman.drop_partitions('test.hash_rel'); +NOTICE: function test.hash_rel_upd_trig_func() does not exist, skipping +NOTICE: 0 rows copied from test.hash_rel_0 +NOTICE: 0 rows copied from test.hash_rel_1 +NOTICE: 0 rows copied from test.hash_rel_2 + drop_partitions +----------------- + 3 +(1 row) + SELECT pathman.create_hash_partitions('test.hash_rel', 'Value', 3); -NOTICE: function test.hash_rel_insert_trigger_func() does not exist, skipping -NOTICE: function test.hash_rel_update_trigger_func() does not exist, skipping -NOTICE: Copying data to partitions... create_hash_partitions ------------------------ 3 @@ -55,13 +130,12 @@ CREATE INDEX ON test.range_rel (dt); INSERT INTO test.range_rel (dt, txt) SELECT g, md5(g::TEXT) FROM generate_series('2015-01-01', '2015-04-30', '1 day'::interval) as g; SELECT pathman.create_range_partitions('test.range_rel', 'dt', '2015-01-01'::DATE, '1 month'::INTERVAL, 2); -ERROR: Partitioning key 'dt' must be NOT NULL +ERROR: partitioning key 'dt' must be NOT NULL ALTER TABLE test.range_rel ALTER COLUMN dt SET NOT NULL; SELECT pathman.create_range_partitions('test.range_rel', 'dt', '2015-01-01'::DATE, '1 month'::INTERVAL, 2); -ERROR: Not enough partitions to fit all the values of 'dt' +ERROR: not enough partitions to fit all values of 'dt' SELECT pathman.create_range_partitions('test.range_rel', 'DT', '2015-01-01'::DATE, '1 month'::INTERVAL); NOTICE: sequence "range_rel_seq" does not exist, skipping -NOTICE: Copying data to partitions... create_range_partitions ------------------------- 4 @@ -84,7 +158,6 @@ CREATE TABLE test.num_range_rel ( txt TEXT); SELECT pathman.create_range_partitions('test.num_range_rel', 'id', 0, 1000, 4); NOTICE: sequence "num_range_rel_seq" does not exist, skipping -NOTICE: Copying data to partitions... 
create_range_partitions ------------------------- 4 @@ -123,9 +196,9 @@ SET pg_pathman.enable_runtimemergeappend = OFF; VACUUM; /* update triggers test */ SELECT pathman.create_hash_update_trigger('test.hash_rel'); - create_hash_update_trigger ----------------------------- - + create_hash_update_trigger +----------------------------- + test.hash_rel_upd_trig_func (1 row) UPDATE test.hash_rel SET value = 7 WHERE value = 6; @@ -144,9 +217,9 @@ SELECT * FROM test.hash_rel WHERE value = 7; (1 row) SELECT pathman.create_range_update_trigger('test.num_range_rel'); - create_range_update_trigger ------------------------------------------- - test.num_range_rel_update_trigger_func() + create_range_update_trigger +---------------------------------- + test.num_range_rel_upd_trig_func (1 row) UPDATE test.num_range_rel SET id = 3001 WHERE id = 1; @@ -551,7 +624,7 @@ WHERE j1.dt < '2015-03-01' AND j2.dt >= '2015-02-01' ORDER BY j2.dt; * Test CTE query */ EXPLAIN (COSTS OFF) - WITH ttt AS (SELECT * FROM test.range_rel WHERE dt >= '2015-02-01' AND dt < '2015-03-15') + WITH ttt AS (SELECT * FROM test.range_rel WHERE dt >= '2015-02-01' AND dt < '2015-03-15') SELECT * FROM ttt; QUERY PLAN -------------------------------------------------------------------------------------------- @@ -564,7 +637,7 @@ SELECT * FROM ttt; (6 rows) EXPLAIN (COSTS OFF) - WITH ttt AS (SELECT * FROM test.hash_rel WHERE value = 2) + WITH ttt AS (SELECT * FROM test.hash_rel WHERE value = 2) SELECT * FROM ttt; QUERY PLAN -------------------------------------- @@ -575,231 +648,11 @@ SELECT * FROM ttt; Filter: (value = 2) (5 rows) -/* - * Test RuntimeAppend - */ -create or replace function test.pathman_assert(smt bool, error_msg text) returns text as $$ -begin - if not smt then - raise exception '%', error_msg; - end if; - - return 'ok'; -end; -$$ language plpgsql; -create or replace function test.pathman_equal(a text, b text, error_msg text) returns text as $$ -begin - if a != b then - raise exception '''%'' is not equal to ''%'', %', a, b, error_msg; - end if; - - return 'equal'; -end; -$$ language plpgsql; -create or replace function test.pathman_test(query text) returns jsonb as $$ -declare - plan jsonb; -begin - execute 'explain (analyze, format json)' || query into plan; - - return plan; -end; -$$ language plpgsql; -create or replace function test.pathman_test_1() returns text as $$ -declare - plan jsonb; - num int; -begin - plan = test.pathman_test('select * from test.runtime_test_1 where id = (select * from test.run_values limit 1)'); - - perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, - '"Custom Scan"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Custom Plan Provider')::text, - '"RuntimeAppend"', - 'wrong plan provider'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Relation Name')::text, - format('"runtime_test_1_%s"', pathman.get_hash(hashint4(1), 6)), - 'wrong partition'); - - select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans') into num; - perform test.pathman_equal(num::text, '2', 'expected 2 child plans for custom scan'); - - return 'ok'; -end; -$$ language plpgsql; -create or replace function test.pathman_test_2() returns text as $$ -declare - plan jsonb; - num int; -begin - plan = test.pathman_test('select * from test.runtime_test_1 where id = any (select * from test.run_values limit 4)'); - - perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, - '"Nested Loop"', - 'wrong plan type'); - - perform 
test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Node Type')::text, - '"Custom Scan"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Custom Plan Provider')::text, - '"RuntimeAppend"', - 'wrong plan provider'); - - select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans') into num; - perform test.pathman_equal(num::text, '4', 'expected 4 child plans for custom scan'); - - for i in 0..3 loop - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Plans'->i->'Relation Name')::text, - format('"runtime_test_1_%s"', pathman.get_hash(hashint4(i + 1), 6)), - 'wrong partition'); - - num = plan->0->'Plan'->'Plans'->1->'Plans'->i->'Actual Loops'; - perform test.pathman_equal(num::text, '1', 'expected 1 loop'); - end loop; - - return 'ok'; -end; -$$ language plpgsql; -create or replace function test.pathman_test_3() returns text as $$ -declare - plan jsonb; - num int; -begin - plan = test.pathman_test('select * from test.runtime_test_1 a join test.run_values b on a.id = b.val'); - - perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, - '"Nested Loop"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Node Type')::text, - '"Custom Scan"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Custom Plan Provider')::text, - '"RuntimeAppend"', - 'wrong plan provider'); - - select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans') into num; - perform test.pathman_equal(num::text, '6', 'expected 6 child plans for custom scan'); - - for i in 0..5 loop - num = plan->0->'Plan'->'Plans'->1->'Plans'->i->'Actual Loops'; - perform test.pathman_assert(num > 0 and num <= 1718, 'expected no more than 1718 loops'); - end loop; - - return 'ok'; -end; -$$ language plpgsql; -create or replace function test.pathman_test_4() returns text as $$ -declare - plan jsonb; - num int; -begin - plan = test.pathman_test('select * from test.category c, lateral' || - '(select * from test.runtime_test_2 g where g.category_id = c.id order by rating limit 4) as tg'); - - perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, - '"Nested Loop"', - 'wrong plan type'); - - /* Limit -> Custom Scan */ - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->0->'Node Type')::text, - '"Custom Scan"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->0->'Custom Plan Provider')::text, - '"RuntimeMergeAppend"', - 'wrong plan provider'); - - select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans') into num; - perform test.pathman_equal(num::text, '4', 'expected 4 child plans for custom scan'); - - for i in 0..3 loop - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans'->i->'Relation Name')::text, - format('"runtime_test_2_%s"', pathman.get_hash(hashint4(i + 1), 6)), - 'wrong partition'); - - num = plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans'->i->'Actual Loops'; - perform test.pathman_assert(num = 1, 'expected no more than 1 loops'); - end loop; - - return 'ok'; -end; -$$ language plpgsql; -create table test.run_values as select generate_series(1, 10000) val; -create table test.runtime_test_1(id serial primary key, val real); -insert into test.runtime_test_1 select generate_series(1, 10000), random(); -select pathman.create_hash_partitions('test.runtime_test_1', 'id', 6); -NOTICE: function test.runtime_test_1_insert_trigger_func() does not exist, skipping -NOTICE: 
function test.runtime_test_1_update_trigger_func() does not exist, skipping -NOTICE: Copying data to partitions... - create_hash_partitions ------------------------- - 6 -(1 row) - -create table test.category as (select id, 'cat' || id::text as name from generate_series(1, 4) id); -create table test.runtime_test_2 (id serial, category_id int not null, name text, rating real); -insert into test.runtime_test_2 (select id, (id % 6) + 1 as category_id, 'good' || id::text as name, random() as rating from generate_series(1, 100000) id); -create index on test.runtime_test_2 (category_id, rating); -select pathman.create_hash_partitions('test.runtime_test_2', 'category_id', 6); -NOTICE: function test.runtime_test_2_insert_trigger_func() does not exist, skipping -NOTICE: function test.runtime_test_2_update_trigger_func() does not exist, skipping -NOTICE: Copying data to partitions... - create_hash_partitions ------------------------- - 6 -(1 row) - -analyze test.run_values; -analyze test.runtime_test_1; -set enable_mergejoin = off; -set enable_hashjoin = off; -set pg_pathman.enable_runtimeappend = on; -set pg_pathman.enable_runtimemergeappend = on; -select test.pathman_test_1(); /* RuntimeAppend (select ... where id = (subquery)) */ - pathman_test_1 ----------------- - ok -(1 row) - -select test.pathman_test_2(); /* RuntimeAppend (select ... where id = any(subquery)) */ - pathman_test_2 ----------------- - ok -(1 row) - -select test.pathman_test_3(); /* RuntimeAppend (a join b on a.id = b.val) */ - pathman_test_3 ----------------- - ok -(1 row) - -select test.pathman_test_4(); /* RuntimeMergeAppend (lateral) */ - pathman_test_4 ----------------- - ok -(1 row) - -set pg_pathman.enable_runtimeappend = off; -set pg_pathman.enable_runtimemergeappend = off; -set enable_mergejoin = on; -set enable_hashjoin = on; -drop table test.run_values, test.runtime_test_1, test.runtime_test_2 cascade; -NOTICE: drop cascades to 12 other objects /* * Test split and merge */ /* Split first partition in half */ SELECT pathman.split_range_partition('test.num_range_rel_1', 500); -NOTICE: Creating new partition... -NOTICE: Copying data to new partition... -NOTICE: Altering original partition... -NOTICE: Done! split_range_partition ----------------------- {0,1000} @@ -816,10 +669,6 @@ EXPLAIN (COSTS OFF) SELECT * FROM test.num_range_rel WHERE id BETWEEN 100 AND 70 (5 rows) SELECT pathman.split_range_partition('test.range_rel_1', '2015-01-15'::DATE); -NOTICE: Creating new partition... -NOTICE: Copying data to new partition... -NOTICE: Altering original partition... -NOTICE: Done! split_range_partition ------------------------- {01-01-2015,02-01-2015} @@ -827,10 +676,6 @@ NOTICE: Done! /* Merge two partitions into one */ SELECT pathman.merge_range_partitions('test.num_range_rel_1', 'test.num_range_rel_' || currval('test.num_range_rel_seq')); -NOTICE: Altering first partition... -NOTICE: Copying data... -NOTICE: Dropping second partition... -NOTICE: Done! merge_range_partitions ------------------------ @@ -845,10 +690,6 @@ EXPLAIN (COSTS OFF) SELECT * FROM test.num_range_rel WHERE id BETWEEN 100 AND 70 (3 rows) SELECT pathman.merge_range_partitions('test.range_rel_1', 'test.range_rel_' || currval('test.range_rel_seq')); -NOTICE: Altering first partition... -NOTICE: Copying data... -NOTICE: Dropping second partition... -NOTICE: Done! merge_range_partitions ------------------------ @@ -856,8 +697,6 @@ NOTICE: Done! 
/* Append and prepend partitions */ SELECT pathman.append_range_partition('test.num_range_rel'); -NOTICE: Appending new partition... -NOTICE: Done! append_range_partition ------------------------ test.num_range_rel_6 @@ -871,8 +710,6 @@ EXPLAIN (COSTS OFF) SELECT * FROM test.num_range_rel WHERE id >= 4000; (2 rows) SELECT pathman.prepend_range_partition('test.num_range_rel'); -NOTICE: Prepending new partition... -NOTICE: Done! prepend_range_partition ------------------------- test.num_range_rel_7 @@ -892,16 +729,12 @@ SELECT pathman.drop_range_partition('test.num_range_rel_7'); (1 row) SELECT pathman.append_range_partition('test.range_rel'); -NOTICE: Appending new partition... -NOTICE: Done! append_range_partition ------------------------ test.range_rel_6 (1 row) SELECT pathman.prepend_range_partition('test.range_rel'); -NOTICE: Prepending new partition... -NOTICE: Done! prepend_range_partition ------------------------- test.range_rel_7 @@ -932,9 +765,8 @@ EXPLAIN (COSTS OFF) SELECT * FROM test.range_rel WHERE dt BETWEEN '2014-12-15' A (3 rows) SELECT pathman.add_range_partition('test.range_rel', '2014-12-01'::DATE, '2015-01-02'::DATE); -ERROR: Specified range overlaps with existing partitions +ERROR: specified range overlaps with existing partitions SELECT pathman.add_range_partition('test.range_rel', '2014-12-01'::DATE, '2015-01-01'::DATE); -NOTICE: Done! add_range_partition --------------------- test.range_rel_8 @@ -952,7 +784,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM test.range_rel WHERE dt BETWEEN '2014-12-15' A CREATE TABLE test.range_rel_archive (LIKE test.range_rel INCLUDING ALL); SELECT pathman.attach_range_partition('test.range_rel', 'test.range_rel_archive', '2014-01-01'::DATE, '2015-01-01'::DATE); -ERROR: Specified range overlaps with existing partitions +ERROR: specified range overlaps with existing partitions SELECT pathman.attach_range_partition('test.range_rel', 'test.range_rel_archive', '2014-01-01'::DATE, '2014-12-01'::DATE); attach_range_partition ------------------------ @@ -986,17 +818,61 @@ EXPLAIN (COSTS OFF) SELECT * FROM test.range_rel WHERE dt BETWEEN '2014-11-15' A (4 rows) CREATE TABLE test.range_rel_test1 ( - id SERIAL PRIMARY KEY, - dt TIMESTAMP, - txt TEXT, - abc INTEGER); + id SERIAL PRIMARY KEY, + dt TIMESTAMP, + txt TEXT, + abc INTEGER); SELECT pathman.attach_range_partition('test.range_rel', 'test.range_rel_test1', '2013-01-01'::DATE, '2014-01-01'::DATE); -ERROR: Partition must have the exact same structure as parent +ERROR: partition must have the exact same structure as parent CREATE TABLE test.range_rel_test2 ( - id SERIAL PRIMARY KEY, - dt TIMESTAMP); + id SERIAL PRIMARY KEY, + dt TIMESTAMP); SELECT pathman.attach_range_partition('test.range_rel', 'test.range_rel_test2', '2013-01-01'::DATE, '2014-01-01'::DATE); -ERROR: Partition must have the exact same structure as parent +ERROR: partition must have the exact same structure as parent +/* + * Zero partitions count and adding partitions with specified name + */ +CREATE TABLE test.zero( + id SERIAL PRIMARY KEY, + value INT NOT NULL); +INSERT INTO test.zero SELECT g, g FROM generate_series(1, 100) as g; +SELECT pathman.create_range_partitions('test.zero', 'value', 50, 10, 0); +NOTICE: sequence "zero_seq" does not exist, skipping + create_range_partitions +------------------------- + 0 +(1 row) + +SELECT pathman.append_range_partition('test.zero', 'test.zero_0'); +ERROR: cannot append to empty partitions set +SELECT pathman.prepend_range_partition('test.zero', 'test.zero_1'); +ERROR: cannot prepend to empty 
partitions set +SELECT pathman.add_range_partition('test.zero', 50, 70, 'test.zero_50'); + add_range_partition +--------------------- + test.zero_50 +(1 row) + +SELECT pathman.append_range_partition('test.zero', 'test.zero_appended'); + append_range_partition +------------------------ + test.zero_appended +(1 row) + +SELECT pathman.prepend_range_partition('test.zero', 'test.zero_prepended'); + prepend_range_partition +------------------------- + test.zero_prepended +(1 row) + +SELECT pathman.split_range_partition('test.zero_50', 60, 'test.zero_60'); + split_range_partition +----------------------- + {50,70} +(1 row) + +DROP TABLE test.zero CASCADE; +NOTICE: drop cascades to 4 other objects /* * Check that altering table columns doesn't break trigger */ @@ -1011,15 +887,14 @@ SELECT * FROM test.hash_rel WHERE id = 123; /* * Clean up */ -SELECT pathman.drop_hash_partitions('test.hash_rel'); -NOTICE: drop cascades to trigger test_hash_rel_insert_trigger on table test.hash_rel +SELECT pathman.drop_partitions('test.hash_rel'); NOTICE: drop cascades to 3 other objects -NOTICE: 2 rows copied from test.hash_rel_2 -NOTICE: 3 rows copied from test.hash_rel_1 NOTICE: 2 rows copied from test.hash_rel_0 - drop_hash_partitions ----------------------- - 3 +NOTICE: 3 rows copied from test.hash_rel_1 +NOTICE: 2 rows copied from test.hash_rel_2 + drop_partitions +----------------- + 3 (1 row) SELECT COUNT(*) FROM ONLY test.hash_rel; @@ -1029,20 +904,16 @@ SELECT COUNT(*) FROM ONLY test.hash_rel; (1 row) SELECT pathman.create_hash_partitions('test.hash_rel', 'value', 3); -NOTICE: function test.hash_rel_insert_trigger_func() does not exist, skipping -NOTICE: function test.hash_rel_update_trigger_func() does not exist, skipping -NOTICE: Copying data to partitions... create_hash_partitions ------------------------ 3 (1 row) -SELECT pathman.drop_hash_partitions('test.hash_rel', TRUE); -NOTICE: drop cascades to trigger test_hash_rel_insert_trigger on table test.hash_rel -NOTICE: function test.hash_rel_update_trigger_func() does not exist, skipping - drop_hash_partitions ----------------------- - 3 +SELECT pathman.drop_partitions('test.hash_rel', TRUE); +NOTICE: function test.hash_rel_upd_trig_func() does not exist, skipping + drop_partitions +----------------- + 3 (1 row) SELECT COUNT(*) FROM ONLY test.hash_rel; @@ -1052,15 +923,16 @@ SELECT COUNT(*) FROM ONLY test.hash_rel; (1 row) DROP TABLE test.hash_rel CASCADE; -SELECT pathman.drop_range_partitions('test.num_range_rel'); -NOTICE: 0 rows copied from test.num_range_rel_6 -NOTICE: 2 rows copied from test.num_range_rel_4 -NOTICE: 1000 rows copied from test.num_range_rel_3 -NOTICE: 1000 rows copied from test.num_range_rel_2 +SELECT pathman.drop_partitions('test.num_range_rel'); +NOTICE: drop cascades to 4 other objects NOTICE: 998 rows copied from test.num_range_rel_1 - drop_range_partitions ------------------------ - 5 +NOTICE: 1000 rows copied from test.num_range_rel_2 +NOTICE: 1000 rows copied from test.num_range_rel_3 +NOTICE: 2 rows copied from test.num_range_rel_4 +NOTICE: 0 rows copied from test.num_range_rel_6 + drop_partitions +----------------- + 5 (1 row) DROP TABLE test.num_range_rel CASCADE; @@ -1071,7 +943,6 @@ CREATE TABLE test.range_rel ( id SERIAL PRIMARY KEY, dt TIMESTAMP NOT NULL); SELECT pathman.create_range_partitions('test.range_rel', 'dt', '2015-01-01'::DATE, '10 days'::INTERVAL, 1); -NOTICE: Copying data to partitions... 
create_range_partitions ------------------------- 1 @@ -1109,11 +980,26 @@ SELECT * FROM test.range_rel WHERE dt = '2015-03-15'; 74 | Sun Mar 15 00:00:00 2015 (1 row) +SELECT pathman.set_auto('test.range_rel', false); + set_auto +---------- + +(1 row) + +INSERT INTO test.range_rel (dt) VALUES ('2015-06-01'); +ERROR: no suitable partition for key 'Mon Jun 01 00:00:00 2015' +SELECT pathman.set_auto('test.range_rel', true); + set_auto +---------- + +(1 row) + +INSERT INTO test.range_rel (dt) VALUES ('2015-06-01'); DROP TABLE test.range_rel CASCADE; -NOTICE: drop cascades to 16 other objects +NOTICE: drop cascades to 20 other objects SELECT * FROM pathman.pathman_config; - id | relname | attname | parttype | range_interval -----+---------+---------+----------+---------------- + partrel | attname | parttype | range_interval +---------+---------+----------+---------------- (0 rows) /* Check overlaps */ @@ -1121,7 +1007,6 @@ CREATE TABLE test.num_range_rel ( id SERIAL PRIMARY KEY, txt TEXT); SELECT pathman.create_range_partitions('test.num_range_rel', 'id', 1000, 1000, 4); -NOTICE: Copying data to partitions... create_range_partitions ------------------------- 4 @@ -1174,9 +1059,6 @@ CREATE TABLE test."TeSt" (a INT NOT NULL, b INT); SELECT pathman.create_hash_partitions('test.TeSt', 'a', 3); ERROR: relation "test.test" does not exist at character 39 SELECT pathman.create_hash_partitions('test."TeSt"', 'a', 3); -NOTICE: function test.TeSt_insert_trigger_func() does not exist, skipping -NOTICE: function test.TeSt_update_trigger_func() does not exist, skipping -NOTICE: Copying data to partitions... create_hash_partitions ------------------------ 3 @@ -1196,7 +1078,7 @@ SELECT * FROM test."TeSt"; SELECT pathman.create_hash_update_trigger('test."TeSt"'); create_hash_update_trigger ---------------------------- - + test."TeSt_upd_trig_func" (1 row) UPDATE test."TeSt" SET a = 1; @@ -1224,15 +1106,14 @@ EXPLAIN (COSTS OFF) SELECT * FROM test."TeSt" WHERE a = 1; Filter: (a = 1) (3 rows) -SELECT pathman.drop_hash_partitions('test."TeSt"'); -NOTICE: drop cascades to trigger test_TeSt_insert_trigger on table test."TeSt" +SELECT pathman.drop_partitions('test."TeSt"'); NOTICE: drop cascades to 3 other objects -NOTICE: 3 rows copied from test."TeSt_2" -NOTICE: 0 rows copied from test."TeSt_1" NOTICE: 0 rows copied from test."TeSt_0" - drop_hash_partitions ----------------------- - 3 +NOTICE: 0 rows copied from test."TeSt_1" +NOTICE: 3 rows copied from test."TeSt_2" + drop_partitions +----------------- + 3 (1 row) SELECT * FROM test."TeSt"; @@ -1251,61 +1132,48 @@ INSERT INTO test."RangeRel" (dt, txt) SELECT g, md5(g::TEXT) FROM generate_series('2015-01-01', '2015-01-03', '1 day'::interval) as g; SELECT pathman.create_range_partitions('test."RangeRel"', 'dt', '2015-01-01'::DATE, '1 day'::INTERVAL); NOTICE: sequence "RangeRel_seq" does not exist, skipping -NOTICE: Copying data to partitions... create_range_partitions ------------------------- 3 (1 row) SELECT pathman.append_range_partition('test."RangeRel"'); -NOTICE: Appending new partition... -NOTICE: Done! append_range_partition ------------------------ test."RangeRel_4" (1 row) SELECT pathman.prepend_range_partition('test."RangeRel"'); -NOTICE: Prepending new partition... -NOTICE: Done! prepend_range_partition ------------------------- test."RangeRel_5" (1 row) SELECT pathman.merge_range_partitions('test."RangeRel_1"', 'test."RangeRel_' || currval('test."RangeRel_seq"') || '"'); -NOTICE: Altering first partition... -NOTICE: Copying data... 
-NOTICE: Dropping second partition... -NOTICE: Done! merge_range_partitions ------------------------ (1 row) SELECT pathman.split_range_partition('test."RangeRel_1"', '2015-01-01'::DATE); -NOTICE: Creating new partition... -NOTICE: Copying data to new partition... -NOTICE: Altering original partition... -NOTICE: Done! split_range_partition ------------------------- {12-31-2014,01-02-2015} (1 row) -SELECT pathman.drop_range_partitions('test."RangeRel"'); -NOTICE: 1 rows copied from test."RangeRel_6" -NOTICE: 0 rows copied from test."RangeRel_4" -NOTICE: 1 rows copied from test."RangeRel_3" -NOTICE: 1 rows copied from test."RangeRel_2" +SELECT pathman.drop_partitions('test."RangeRel"'); +NOTICE: function test.RangeRel_upd_trig_func() does not exist, skipping NOTICE: 0 rows copied from test."RangeRel_1" - drop_range_partitions ------------------------ - 5 +NOTICE: 1 rows copied from test."RangeRel_2" +NOTICE: 1 rows copied from test."RangeRel_3" +NOTICE: 0 rows copied from test."RangeRel_4" +NOTICE: 1 rows copied from test."RangeRel_6" + drop_partitions +----------------- + 5 (1 row) SELECT pathman.create_partitions_from_range('test."RangeRel"', 'dt', '2015-01-01'::DATE, '2015-01-05'::DATE, '1 day'::INTERVAL); -NOTICE: Copying data to partitions... create_partitions_from_range ------------------------------ 5 @@ -1314,9 +1182,9 @@ NOTICE: Copying data to partitions... DROP TABLE test."RangeRel" CASCADE; NOTICE: drop cascades to 5 other objects SELECT * FROM pathman.pathman_config; - id | relname | attname | parttype | range_interval -----+--------------------+---------+----------+---------------- - 8 | test.num_range_rel | id | 2 | 1000 + partrel | attname | parttype | range_interval +--------------------+---------+----------+---------------- + test.num_range_rel | id | 2 | 1000 (1 row) CREATE TABLE test."RangeRel" ( @@ -1324,23 +1192,22 @@ CREATE TABLE test."RangeRel" ( dt TIMESTAMP NOT NULL, txt TEXT); SELECT pathman.create_range_partitions('test."RangeRel"', 'id', 1, 100, 3); -NOTICE: Copying data to partitions... create_range_partitions ------------------------- 3 (1 row) -SELECT pathman.drop_range_partitions('test."RangeRel"'); -NOTICE: 0 rows copied from test."RangeRel_3" -NOTICE: 0 rows copied from test."RangeRel_2" +SELECT pathman.drop_partitions('test."RangeRel"'); +NOTICE: function test.RangeRel_upd_trig_func() does not exist, skipping NOTICE: 0 rows copied from test."RangeRel_1" - drop_range_partitions ------------------------ - 3 +NOTICE: 0 rows copied from test."RangeRel_2" +NOTICE: 0 rows copied from test."RangeRel_3" + drop_partitions +----------------- + 3 (1 row) SELECT pathman.create_partitions_from_range('test."RangeRel"', 'id', 1, 300, 100); -NOTICE: Copying data to partitions... create_partitions_from_range ------------------------------ 3 @@ -1357,9 +1224,6 @@ CREATE TABLE hash_rel ( value INTEGER NOT NULL); INSERT INTO hash_rel (value) SELECT g FROM generate_series(1, 10000) as g; SELECT create_hash_partitions('hash_rel', 'value', 3); -NOTICE: function public.hash_rel_insert_trigger_func() does not exist, skipping -NOTICE: function public.hash_rel_update_trigger_func() does not exist, skipping -NOTICE: Copying data to partitions... 
create_hash_partitions ------------------------ 3 @@ -1385,43 +1249,30 @@ CREATE TABLE range_rel ( INSERT INTO range_rel (dt, value) SELECT g, extract(day from g) FROM generate_series('2010-01-01'::date, '2010-12-31'::date, '1 day') as g; SELECT create_range_partitions('range_rel', 'dt', '2010-01-01'::date, '1 month'::interval, 12); NOTICE: sequence "range_rel_seq" does not exist, skipping -NOTICE: Copying data to partitions... create_range_partitions ------------------------- 12 (1 row) SELECT merge_range_partitions('range_rel_1', 'range_rel_2'); -NOTICE: Altering first partition... -NOTICE: Copying data... -NOTICE: Dropping second partition... -NOTICE: Done! merge_range_partitions ------------------------ (1 row) SELECT split_range_partition('range_rel_1', '2010-02-15'::date); -NOTICE: Creating new partition... -NOTICE: Copying data to new partition... -NOTICE: Altering original partition... -NOTICE: Done! split_range_partition ------------------------- {01-01-2010,03-01-2010} (1 row) SELECT append_range_partition('range_rel'); -NOTICE: Appending new partition... -NOTICE: Done! append_range_partition ------------------------ public.range_rel_14 (1 row) SELECT prepend_range_partition('range_rel'); -NOTICE: Prepending new partition... -NOTICE: Done! prepend_range_partition ------------------------- public.range_rel_15 @@ -1505,41 +1356,41 @@ EXPLAIN (COSTS OFF) DELETE FROM range_rel r USING tmp t WHERE r.dt = '2010-01-02 DELETE FROM range_rel r USING tmp t WHERE r.dt = '2010-01-02' AND r.id = t.id; /* Create range partitions from whole range */ -SELECT drop_range_partitions('range_rel'); -NOTICE: 0 rows copied from range_rel_15 -NOTICE: 0 rows copied from range_rel_14 -NOTICE: 14 rows copied from range_rel_13 -NOTICE: 31 rows copied from range_rel_12 -NOTICE: 30 rows copied from range_rel_11 -NOTICE: 31 rows copied from range_rel_10 -NOTICE: 30 rows copied from range_rel_9 -NOTICE: 31 rows copied from range_rel_8 -NOTICE: 31 rows copied from range_rel_7 -NOTICE: 29 rows copied from range_rel_6 -NOTICE: 31 rows copied from range_rel_5 -NOTICE: 30 rows copied from range_rel_4 -NOTICE: 31 rows copied from range_rel_3 +SELECT drop_partitions('range_rel'); +NOTICE: function public.range_rel_upd_trig_func() does not exist, skipping NOTICE: 44 rows copied from range_rel_1 - drop_range_partitions ------------------------ - 14 +NOTICE: 31 rows copied from range_rel_3 +NOTICE: 30 rows copied from range_rel_4 +NOTICE: 31 rows copied from range_rel_5 +NOTICE: 29 rows copied from range_rel_6 +NOTICE: 31 rows copied from range_rel_7 +NOTICE: 31 rows copied from range_rel_8 +NOTICE: 30 rows copied from range_rel_9 +NOTICE: 31 rows copied from range_rel_10 +NOTICE: 30 rows copied from range_rel_11 +NOTICE: 31 rows copied from range_rel_12 +NOTICE: 14 rows copied from range_rel_13 +NOTICE: 0 rows copied from range_rel_14 +NOTICE: 0 rows copied from range_rel_15 + drop_partitions +----------------- + 14 (1 row) SELECT create_partitions_from_range('range_rel', 'id', 1, 1000, 100); -NOTICE: Copying data to partitions... 
create_partitions_from_range ------------------------------ 10 (1 row) -SELECT drop_range_partitions('range_rel', TRUE); - drop_range_partitions ------------------------ - 10 +SELECT drop_partitions('range_rel', TRUE); +NOTICE: function public.range_rel_upd_trig_func() does not exist, skipping + drop_partitions +----------------- + 10 (1 row) SELECT create_partitions_from_range('range_rel', 'dt', '2015-01-01'::date, '2015-12-01'::date, '1 month'::interval); -NOTICE: Copying data to partitions... create_partitions_from_range ------------------------------ 12 @@ -1558,12 +1409,11 @@ CREATE TABLE replies(id SERIAL PRIMARY KEY, message_id INTEGER REFERENCES messag INSERT INTO messages SELECT g, md5(g::text) FROM generate_series(1, 10) as g; INSERT INTO replies SELECT g, g, md5(g::text) FROM generate_series(1, 10) as g; SELECT create_range_partitions('messages', 'id', 1, 100, 2); -WARNING: Foreign key 'replies_message_id_fkey' references to the relation 'messages' -ERROR: Relation 'messages' is referenced from other relations +WARNING: foreign key 'replies_message_id_fkey' references relation 'messages' +ERROR: relation "messages" is referenced from other relations ALTER TABLE replies DROP CONSTRAINT replies_message_id_fkey; SELECT create_range_partitions('messages', 'id', 1, 100, 2); NOTICE: sequence "messages_seq" does not exist, skipping -NOTICE: Copying data to partitions... create_range_partitions ------------------------- 2 @@ -1577,3 +1427,8 @@ EXPLAIN (COSTS OFF) SELECT * FROM messages; -> Seq Scan on messages_2 (3 rows) +DROP SCHEMA test CASCADE; +NOTICE: drop cascades to 13 other objects +DROP EXTENSION pg_pathman CASCADE; +NOTICE: drop cascades to 3 other objects +DROP SCHEMA pathman CASCADE; diff --git a/contrib/pg_pathman/expected/pathman_callbacks.out b/contrib/pg_pathman/expected/pathman_callbacks.out new file mode 100644 index 0000000000..6a997e9ee6 --- /dev/null +++ b/contrib/pg_pathman/expected/pathman_callbacks.out @@ -0,0 +1,85 @@ +\set VERBOSITY terse +CREATE EXTENSION pg_pathman; +CREATE SCHEMA callbacks; +/* Check callbacks */ +CREATE OR REPLACE FUNCTION callbacks.abc_on_part_created_callback( + args JSONB) +RETURNS VOID AS $$ +BEGIN + RAISE WARNING 'callback arg: %', args::TEXT; +END +$$ language plpgsql; +/* set callback to be called on RANGE partitions */ +CREATE TABLE callbacks.abc(a serial, b int); +SELECT create_range_partitions('callbacks.abc', 'a', 1, 100, 2); +NOTICE: sequence "abc_seq" does not exist, skipping + create_range_partitions +------------------------- + 2 +(1 row) + +SELECT set_init_callback('callbacks.abc', + 'callbacks.abc_on_part_created_callback'); + set_init_callback +------------------- + +(1 row) + +INSERT INTO callbacks.abc VALUES (123, 1); +INSERT INTO callbacks.abc VALUES (223, 1); +SELECT append_range_partition('callbacks.abc'); +WARNING: callback arg: {"parent": "abc", "parttype": "2", "partition": "abc_4", "range_max": "401", "range_min": "301"} + append_range_partition +------------------------ + callbacks.abc_4 +(1 row) + +SELECT prepend_range_partition('callbacks.abc'); +WARNING: callback arg: {"parent": "abc", "parttype": "2", "partition": "abc_5", "range_max": "1", "range_min": "-99"} + prepend_range_partition +------------------------- + callbacks.abc_5 +(1 row) + +SELECT add_range_partition('callbacks.abc', 401, 502); +WARNING: callback arg: {"parent": "abc", "parttype": "2", "partition": "abc_6", "range_max": "502", "range_min": "401"} + add_range_partition +--------------------- + callbacks.abc_6 +(1 row) + +SELECT 
drop_partitions('callbacks.abc'); +NOTICE: function callbacks.abc_upd_trig_func() does not exist, skipping +NOTICE: 0 rows copied from callbacks.abc_1 +NOTICE: 1 rows copied from callbacks.abc_2 +NOTICE: 1 rows copied from callbacks.abc_3 +NOTICE: 0 rows copied from callbacks.abc_4 +NOTICE: 0 rows copied from callbacks.abc_5 +NOTICE: 0 rows copied from callbacks.abc_6 + drop_partitions +----------------- + 6 +(1 row) + +/* set callback to be called on HASH partitions */ +SELECT set_init_callback('callbacks.abc', + 'callbacks.abc_on_part_created_callback'); + set_init_callback +------------------- + +(1 row) + +SELECT create_hash_partitions('callbacks.abc', 'a', 5); +WARNING: callback arg: {"parent": "abc", "parttype": "1", "partition": "abc_0"} +WARNING: callback arg: {"parent": "abc", "parttype": "1", "partition": "abc_1"} +WARNING: callback arg: {"parent": "abc", "parttype": "1", "partition": "abc_2"} +WARNING: callback arg: {"parent": "abc", "parttype": "1", "partition": "abc_3"} +WARNING: callback arg: {"parent": "abc", "parttype": "1", "partition": "abc_4"} + create_hash_partitions +------------------------ + 5 +(1 row) + +DROP SCHEMA callbacks CASCADE; +NOTICE: drop cascades to 8 other objects +DROP EXTENSION pg_pathman CASCADE; diff --git a/contrib/pg_pathman/expected/pathman_domains.out b/contrib/pg_pathman/expected/pathman_domains.out new file mode 100644 index 0000000000..283a6d5b83 --- /dev/null +++ b/contrib/pg_pathman/expected/pathman_domains.out @@ -0,0 +1,92 @@ +\set VERBOSITY terse +CREATE EXTENSION pg_pathman; +CREATE SCHEMA domains; +CREATE DOMAIN domains.dom_test AS numeric CHECK (value < 1200); +CREATE TABLE domains.dom_table(val domains.dom_test NOT NULL); +INSERT INTO domains.dom_table SELECT generate_series(1, 999); +SELECT create_range_partitions('domains.dom_table', 'val', 1, 100); +NOTICE: sequence "dom_table_seq" does not exist, skipping + create_range_partitions +------------------------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT * FROM domains.dom_table +WHERE val < 250; + QUERY PLAN +--------------------------------------------------- + Append + -> Seq Scan on dom_table_1 + -> Seq Scan on dom_table_2 + -> Seq Scan on dom_table_3 + Filter: ((val)::numeric < '250'::numeric) +(5 rows) + +INSERT INTO domains.dom_table VALUES(1500); +ERROR: value for domain domains.dom_test violates check constraint "dom_test_check" +INSERT INTO domains.dom_table VALUES(-10); +SELECT append_range_partition('domains.dom_table'); + append_range_partition +------------------------ + domains.dom_table_12 +(1 row) + +SELECT prepend_range_partition('domains.dom_table'); + prepend_range_partition +------------------------- + domains.dom_table_13 +(1 row) + +SELECT merge_range_partitions('domains.dom_table_1', 'domains.dom_table_2'); + merge_range_partitions +------------------------ + +(1 row) + +SELECT split_range_partition('domains.dom_table_1', 50); + split_range_partition +----------------------- + {1,201} +(1 row) + +INSERT INTO domains.dom_table VALUES(1101); +EXPLAIN (COSTS OFF) +SELECT * FROM domains.dom_table +WHERE val < 450; + QUERY PLAN +--------------------------------------------------- + Append + -> Seq Scan on dom_table_13 + -> Seq Scan on dom_table_11 + -> Seq Scan on dom_table_1 + -> Seq Scan on dom_table_14 + -> Seq Scan on dom_table_3 + -> Seq Scan on dom_table_4 + -> Seq Scan on dom_table_5 + Filter: ((val)::numeric < '450'::numeric) +(9 rows) + +SELECT * FROM pathman_partition_list +ORDER BY range_min::INT, range_max::INT; + parent | partition | parttype | 
partattr | range_min | range_max +-------------------+----------------------+----------+----------+-----------+----------- + domains.dom_table | domains.dom_table_13 | 2 | val | -199 | -99 + domains.dom_table | domains.dom_table_11 | 2 | val | -99 | 1 + domains.dom_table | domains.dom_table_1 | 2 | val | 1 | 50 + domains.dom_table | domains.dom_table_14 | 2 | val | 50 | 201 + domains.dom_table | domains.dom_table_3 | 2 | val | 201 | 301 + domains.dom_table | domains.dom_table_4 | 2 | val | 301 | 401 + domains.dom_table | domains.dom_table_5 | 2 | val | 401 | 501 + domains.dom_table | domains.dom_table_6 | 2 | val | 501 | 601 + domains.dom_table | domains.dom_table_7 | 2 | val | 601 | 701 + domains.dom_table | domains.dom_table_8 | 2 | val | 701 | 801 + domains.dom_table | domains.dom_table_9 | 2 | val | 801 | 901 + domains.dom_table | domains.dom_table_10 | 2 | val | 901 | 1001 + domains.dom_table | domains.dom_table_12 | 2 | val | 1001 | 1101 + domains.dom_table | domains.dom_table_15 | 2 | val | 1101 | 1201 +(14 rows) + +DROP SCHEMA domains CASCADE; +NOTICE: drop cascades to 17 other objects +DROP EXTENSION pg_pathman CASCADE; diff --git a/contrib/pg_pathman/expected/pathman_foreign_keys.out b/contrib/pg_pathman/expected/pathman_foreign_keys.out new file mode 100644 index 0000000000..20a4da60d4 --- /dev/null +++ b/contrib/pg_pathman/expected/pathman_foreign_keys.out @@ -0,0 +1,67 @@ +\set VERBOSITY terse +CREATE EXTENSION pg_pathman; +CREATE SCHEMA fkeys; +/* Check primary keys generation */ +CREATE TABLE fkeys.test_ref(comment TEXT UNIQUE); +INSERT INTO fkeys.test_ref VALUES('test'); +CREATE TABLE fkeys.test_fkey( + id INT NOT NULL, + comment TEXT, + FOREIGN KEY (comment) REFERENCES fkeys.test_ref(comment)); +INSERT INTO fkeys.test_fkey SELECT generate_series(1, 1000), 'test'; +SELECT create_range_partitions('fkeys.test_fkey', 'id', 1, 100); +NOTICE: sequence "test_fkey_seq" does not exist, skipping + create_range_partitions +------------------------- + 10 +(1 row) + +INSERT INTO fkeys.test_fkey VALUES(1, 'wrong'); +ERROR: insert or update on table "test_fkey_1" violates foreign key constraint "test_fkey_1_comment_fkey" +INSERT INTO fkeys.test_fkey VALUES(1, 'test'); +SELECT drop_partitions('fkeys.test_fkey'); +NOTICE: function fkeys.test_fkey_upd_trig_func() does not exist, skipping +NOTICE: 101 rows copied from fkeys.test_fkey_1 +NOTICE: 100 rows copied from fkeys.test_fkey_2 +NOTICE: 100 rows copied from fkeys.test_fkey_3 +NOTICE: 100 rows copied from fkeys.test_fkey_4 +NOTICE: 100 rows copied from fkeys.test_fkey_5 +NOTICE: 100 rows copied from fkeys.test_fkey_6 +NOTICE: 100 rows copied from fkeys.test_fkey_7 +NOTICE: 100 rows copied from fkeys.test_fkey_8 +NOTICE: 100 rows copied from fkeys.test_fkey_9 +NOTICE: 100 rows copied from fkeys.test_fkey_10 + drop_partitions +----------------- + 10 +(1 row) + +SELECT create_hash_partitions('fkeys.test_fkey', 'id', 10); + create_hash_partitions +------------------------ + 10 +(1 row) + +INSERT INTO fkeys.test_fkey VALUES(1, 'wrong'); +ERROR: insert or update on table "test_fkey_0" violates foreign key constraint "test_fkey_0_comment_fkey" +INSERT INTO fkeys.test_fkey VALUES(1, 'test'); +SELECT drop_partitions('fkeys.test_fkey'); +NOTICE: function fkeys.test_fkey_upd_trig_func() does not exist, skipping +NOTICE: 100 rows copied from fkeys.test_fkey_0 +NOTICE: 90 rows copied from fkeys.test_fkey_1 +NOTICE: 90 rows copied from fkeys.test_fkey_2 +NOTICE: 116 rows copied from fkeys.test_fkey_3 +NOTICE: 101 rows copied from fkeys.test_fkey_4 
+NOTICE: 90 rows copied from fkeys.test_fkey_5 +NOTICE: 95 rows copied from fkeys.test_fkey_6 +NOTICE: 118 rows copied from fkeys.test_fkey_7 +NOTICE: 108 rows copied from fkeys.test_fkey_8 +NOTICE: 94 rows copied from fkeys.test_fkey_9 + drop_partitions +----------------- + 10 +(1 row) + +DROP SCHEMA fkeys CASCADE; +NOTICE: drop cascades to 3 other objects +DROP EXTENSION pg_pathman CASCADE; diff --git a/contrib/pg_pathman/expected/pathman_rowmarks.out b/contrib/pg_pathman/expected/pathman_rowmarks.out new file mode 100644 index 0000000000..40bd14e62b --- /dev/null +++ b/contrib/pg_pathman/expected/pathman_rowmarks.out @@ -0,0 +1,178 @@ +CREATE EXTENSION pg_pathman; +CREATE SCHEMA rowmarks; +CREATE TABLE rowmarks.first(id int NOT NULL); +CREATE TABLE rowmarks.second(id int NOT NULL); +INSERT INTO rowmarks.first SELECT generate_series(1, 10); +INSERT INTO rowmarks.second SELECT generate_series(1, 10); +SELECT create_hash_partitions('rowmarks.first', 'id', 5); + create_hash_partitions +------------------------ + 5 +(1 row) + +/* Not partitioned */ +SELECT * FROM rowmarks.second ORDER BY id FOR UPDATE; + id +---- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +/* Simple case (plan) */ +EXPLAIN (COSTS OFF) +SELECT * FROM rowmarks.first ORDER BY id FOR UPDATE; + QUERY PLAN +--------------------------------------- + LockRows + -> Sort + Sort Key: first_0.id + -> Append + -> Seq Scan on first_0 + -> Seq Scan on first_1 + -> Seq Scan on first_2 + -> Seq Scan on first_3 + -> Seq Scan on first_4 +(9 rows) + +/* Simple case (execution) */ +SELECT * FROM rowmarks.first ORDER BY id FOR UPDATE; + id +---- + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 +(10 rows) + +SELECT FROM rowmarks.first ORDER BY id FOR UPDATE; +-- +(10 rows) + +SELECT tableoid > 0 FROM rowmarks.first ORDER BY id FOR UPDATE; + ?column? 
+---------- + t + t + t + t + t + t + t + t + t + t +(10 rows) + +/* A little harder (plan) */ +EXPLAIN (COSTS OFF) +SELECT * FROM rowmarks.first +WHERE id = (SELECT id FROM rowmarks.first + ORDER BY id + OFFSET 10 LIMIT 1 + FOR UPDATE) +FOR SHARE; + QUERY PLAN +----------------------------------------------------- + LockRows + InitPlan 1 (returns $1) + -> Limit + -> LockRows + -> Sort + Sort Key: first_0.id + -> Append + -> Seq Scan on first_0 + -> Seq Scan on first_1 + -> Seq Scan on first_2 + -> Seq Scan on first_3 + -> Seq Scan on first_4 + -> Custom Scan (RuntimeAppend) + -> Seq Scan on first_0 first + Filter: (id = $1) + -> Seq Scan on first_1 first + Filter: (id = $1) + -> Seq Scan on first_2 first + Filter: (id = $1) + -> Seq Scan on first_3 first + Filter: (id = $1) + -> Seq Scan on first_4 first + Filter: (id = $1) +(23 rows) + +/* A little harder (execution) */ +SELECT * FROM rowmarks.first +WHERE id = (SELECT id FROM rowmarks.first + ORDER BY id + OFFSET 5 LIMIT 1 + FOR UPDATE) +FOR SHARE; + id +---- + 6 +(1 row) + +/* Two tables (plan) */ +EXPLAIN (COSTS OFF) +SELECT * FROM rowmarks.first +WHERE id = (SELECT id FROM rowmarks.second + ORDER BY id + OFFSET 5 LIMIT 1 + FOR UPDATE) +FOR SHARE; + QUERY PLAN +---------------------------------------------- + LockRows + InitPlan 1 (returns $1) + -> Limit + -> LockRows + -> Sort + Sort Key: second.id + -> Seq Scan on second + -> Custom Scan (RuntimeAppend) + -> Seq Scan on first_0 first + Filter: (id = $1) + -> Seq Scan on first_1 first + Filter: (id = $1) + -> Seq Scan on first_2 first + Filter: (id = $1) + -> Seq Scan on first_3 first + Filter: (id = $1) + -> Seq Scan on first_4 first + Filter: (id = $1) +(18 rows) + +/* Two tables (execution) */ +SELECT * FROM rowmarks.first +WHERE id = (SELECT id FROM rowmarks.second + ORDER BY id + OFFSET 5 LIMIT 1 + FOR UPDATE) +FOR SHARE; + id +---- + 6 +(1 row) + +DROP SCHEMA rowmarks CASCADE; +NOTICE: drop cascades to 7 other objects +DETAIL: drop cascades to table rowmarks.first +drop cascades to table rowmarks.second +drop cascades to table rowmarks.first_0 +drop cascades to table rowmarks.first_1 +drop cascades to table rowmarks.first_2 +drop cascades to table rowmarks.first_3 +drop cascades to table rowmarks.first_4 +DROP EXTENSION pg_pathman; diff --git a/contrib/pg_pathman/expected/pathman_runtime_nodes.out b/contrib/pg_pathman/expected/pathman_runtime_nodes.out new file mode 100644 index 0000000000..98b08710e0 --- /dev/null +++ b/contrib/pg_pathman/expected/pathman_runtime_nodes.out @@ -0,0 +1,291 @@ +\set VERBOSITY terse +CREATE SCHEMA pathman; +CREATE EXTENSION pg_pathman SCHEMA pathman; +CREATE SCHEMA test; +/* + * Test RuntimeAppend + */ +create or replace function test.pathman_assert(smt bool, error_msg text) returns text as $$ +begin + if not smt then + raise exception '%', error_msg; + end if; + + return 'ok'; +end; +$$ language plpgsql; +create or replace function test.pathman_equal(a text, b text, error_msg text) returns text as $$ +begin + if a != b then + raise exception '''%'' is not equal to ''%'', %', a, b, error_msg; + end if; + + return 'equal'; +end; +$$ language plpgsql; +create or replace function test.pathman_test(query text) returns jsonb as $$ +declare + plan jsonb; +begin + execute 'explain (analyze, format json)' || query into plan; + + return plan; +end; +$$ language plpgsql; +create or replace function test.pathman_test_1() returns text as $$ +declare + plan jsonb; + num int; +begin + plan = test.pathman_test('select * from test.runtime_test_1 where id = 
(select * from test.run_values limit 1)'); + + perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, + '"Custom Scan"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Custom Plan Provider')::text, + '"RuntimeAppend"', + 'wrong plan provider'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Relation Name')::text, + format('"runtime_test_1_%s"', pathman.get_hash_part_idx(hashint4(1), 6)), + 'wrong partition'); + + select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans') into num; + perform test.pathman_equal(num::text, '2', 'expected 2 child plans for custom scan'); + + return 'ok'; +end; +$$ language plpgsql +set pg_pathman.enable = true +set enable_mergejoin = off +set enable_hashjoin = off; +create or replace function test.pathman_test_2() returns text as $$ +declare + plan jsonb; + num int; +begin + plan = test.pathman_test('select * from test.runtime_test_1 where id = any (select * from test.run_values limit 4)'); + + perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, + '"Nested Loop"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Node Type')::text, + '"Custom Scan"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Custom Plan Provider')::text, + '"RuntimeAppend"', + 'wrong plan provider'); + + select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans') into num; + perform test.pathman_equal(num::text, '4', 'expected 4 child plans for custom scan'); + + for i in 0..3 loop + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Plans'->i->'Relation Name')::text, + format('"runtime_test_1_%s"', pathman.get_hash_part_idx(hashint4(i + 1), 6)), + 'wrong partition'); + + num = plan->0->'Plan'->'Plans'->1->'Plans'->i->'Actual Loops'; + perform test.pathman_equal(num::text, '1', 'expected 1 loop'); + end loop; + + return 'ok'; +end; +$$ language plpgsql +set pg_pathman.enable = true +set enable_mergejoin = off +set enable_hashjoin = off; +create or replace function test.pathman_test_3() returns text as $$ +declare + plan jsonb; + num int; +begin + plan = test.pathman_test('select * from test.runtime_test_1 a join test.run_values b on a.id = b.val'); + + perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, + '"Nested Loop"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Node Type')::text, + '"Custom Scan"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Custom Plan Provider')::text, + '"RuntimeAppend"', + 'wrong plan provider'); + + select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans') into num; + perform test.pathman_equal(num::text, '6', 'expected 6 child plans for custom scan'); + + for i in 0..5 loop + num = plan->0->'Plan'->'Plans'->1->'Plans'->i->'Actual Loops'; + perform test.pathman_assert(num > 0 and num <= 1718, 'expected no more than 1718 loops'); + end loop; + + return 'ok'; +end; +$$ language plpgsql +set pg_pathman.enable = true +set enable_mergejoin = off +set enable_hashjoin = off; +create or replace function test.pathman_test_4() returns text as $$ +declare + plan jsonb; + num int; +begin + plan = test.pathman_test('select * from test.category c, lateral' || + '(select * from test.runtime_test_2 g where g.category_id = c.id order by rating limit 4) as tg'); + + perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, + '"Nested Loop"', + 'wrong plan type'); + + /* Limit -> 
Custom Scan */ + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->0->'Node Type')::text, + '"Custom Scan"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->0->'Custom Plan Provider')::text, + '"RuntimeMergeAppend"', + 'wrong plan provider'); + + select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans') into num; + perform test.pathman_equal(num::text, '4', 'expected 4 child plans for custom scan'); + + for i in 0..3 loop + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans'->i->'Relation Name')::text, + format('"runtime_test_2_%s"', pathman.get_hash_part_idx(hashint4(i + 1), 6)), + 'wrong partition'); + + num = plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans'->i->'Actual Loops'; + perform test.pathman_assert(num = 1, 'expected no more than 1 loops'); + end loop; + + return 'ok'; +end; +$$ language plpgsql +set pg_pathman.enable = true +set enable_mergejoin = off +set enable_hashjoin = off; +create or replace function test.pathman_test_5() returns text as $$ +declare + res record; +begin + select + from test.runtime_test_3 + where id = (select * from test.vals order by val limit 1) + limit 1 + into res; /* test empty tlist */ + + + select id, generate_series(1, 2) gen, val + from test.runtime_test_3 + where id = any (select * from test.vals order by val limit 5) + order by id, gen, val + offset 1 limit 1 + into res; /* without IndexOnlyScan */ + + perform test.pathman_equal(res.id::text, '1', 'id is incorrect (t2)'); + perform test.pathman_equal(res.gen::text, '2', 'gen is incorrect (t2)'); + perform test.pathman_equal(res.val::text, 'k = 1', 'val is incorrect (t2)'); + + + select id + from test.runtime_test_3 + where id = any (select * from test.vals order by val limit 5) + order by id + offset 3 limit 1 + into res; /* with IndexOnlyScan */ + + perform test.pathman_equal(res.id::text, '4', 'id is incorrect (t3)'); + + + select v.val v1, generate_series(2, 2) gen, t.val v2 + from test.runtime_test_3 t join test.vals v on id = v.val + order by v1, gen, v2 + limit 1 + into res; + + perform test.pathman_equal(res.v1::text, '1', 'v1 is incorrect (t4)'); + perform test.pathman_equal(res.gen::text, '2', 'gen is incorrect (t4)'); + perform test.pathman_equal(res.v2::text, 'k = 1', 'v2 is incorrect (t4)'); + + return 'ok'; +end; +$$ language plpgsql +set pg_pathman.enable = true +set enable_hashjoin = off +set enable_mergejoin = off; +create table test.run_values as select generate_series(1, 10000) val; +create table test.runtime_test_1(id serial primary key, val real); +insert into test.runtime_test_1 select generate_series(1, 10000), random(); +select pathman.create_hash_partitions('test.runtime_test_1', 'id', 6); + create_hash_partitions +------------------------ + 6 +(1 row) + +create table test.category as (select id, 'cat' || id::text as name from generate_series(1, 4) id); +create table test.runtime_test_2 (id serial, category_id int not null, name text, rating real); +insert into test.runtime_test_2 (select id, (id % 6) + 1 as category_id, 'good' || id::text as name, random() as rating from generate_series(1, 100000) id); +create index on test.runtime_test_2 (category_id, rating); +select pathman.create_hash_partitions('test.runtime_test_2', 'category_id', 6); + create_hash_partitions +------------------------ + 6 +(1 row) + +create table test.vals as (select generate_series(1, 10000) as val); +create table test.runtime_test_3(val text, id serial not null); +insert into 
test.runtime_test_3(id, val) select * from generate_series(1, 10000) k, format('k = %s', k); +select pathman.create_hash_partitions('test.runtime_test_3', 'id', 4); + create_hash_partitions +------------------------ + 4 +(1 row) + +create index on test.runtime_test_3 (id); +create index on test.runtime_test_3_0 (id); +analyze test.run_values; +analyze test.runtime_test_1; +analyze test.runtime_test_2; +analyze test.runtime_test_3; +analyze test.runtime_test_3_0; +set pg_pathman.enable_runtimeappend = on; +set pg_pathman.enable_runtimemergeappend = on; +select test.pathman_test_1(); /* RuntimeAppend (select ... where id = (subquery)) */ + pathman_test_1 +---------------- + ok +(1 row) + +select test.pathman_test_2(); /* RuntimeAppend (select ... where id = any(subquery)) */ + pathman_test_2 +---------------- + ok +(1 row) + +select test.pathman_test_3(); /* RuntimeAppend (a join b on a.id = b.val) */ + pathman_test_3 +---------------- + ok +(1 row) + +select test.pathman_test_4(); /* RuntimeMergeAppend (lateral) */ + pathman_test_4 +---------------- + ok +(1 row) + +select test.pathman_test_5(); /* projection tests for RuntimeXXX nodes */ + pathman_test_5 +---------------- + ok +(1 row) + +DROP SCHEMA test CASCADE; +NOTICE: drop cascades to 30 other objects +DROP EXTENSION pg_pathman CASCADE; +DROP SCHEMA pathman CASCADE; diff --git a/contrib/pg_pathman/expected/rollback_on_create_partitions.out b/contrib/pg_pathman/expected/rollback_on_create_partitions.out index 3c3e2d933a..3531107db8 100644 --- a/contrib/pg_pathman/expected/rollback_on_create_partitions.out +++ b/contrib/pg_pathman/expected/rollback_on_create_partitions.out @@ -21,7 +21,6 @@ Append -> Seq Scan on range_rel_8 -> Seq Scan on range_rel_9 -> Seq Scan on range_rel_10 -WARNING: Partitioning of table 'range_rel' has been aborted, removing partitions from pg_pathman's cache step rollback: ROLLBACK; step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; QUERY PLAN @@ -74,8 +73,8 @@ create_range_partitions 10 step savepoint_b: SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; @@ -83,8 +82,6 @@ QUERY PLAN Seq Scan on range_rel step savepoint_c: SAVEPOINT c; -WARNING: All changes in partitioned table 'range_rel' will be discarded -WARNING: Partitioning of table 'range_rel' has been aborted, removing partitions from pg_pathman's cache step rollback: ROLLBACK; step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; QUERY PLAN @@ -100,8 +97,8 @@ create_range_partitions 10 step savepoint_b: SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; @@ -124,12 +121,11 @@ create_range_partitions 10 step savepoint_b: SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 step savepoint_c: SAVEPOINT c; -WARNING: All changes in partitioned table 'range_rel' will be discarded step rollback_b: ROLLBACK TO SAVEPOINT b; step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; QUERY PLAN @@ -145,7 +141,6 @@ Append -> Seq Scan on range_rel_8 -> Seq Scan on range_rel_9 -> Seq Scan on range_rel_10 -WARNING: Partitioning of table 'range_rel' has 
been aborted, removing partitions from pg_pathman's cache step rollback: ROLLBACK; step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; QUERY PLAN @@ -161,12 +156,11 @@ create_range_partitions 10 step savepoint_b: SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 step savepoint_c: SAVEPOINT c; -WARNING: All changes in partitioned table 'range_rel' will be discarded step rollback_b: ROLLBACK TO SAVEPOINT b; step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; QUERY PLAN @@ -207,8 +201,8 @@ create_range_partitions 10 step savepoint_b: SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; @@ -216,8 +210,6 @@ QUERY PLAN Seq Scan on range_rel step savepoint_c: SAVEPOINT c; -WARNING: All changes in partitioned table 'range_rel' will be discarded -WARNING: Partitioning of table 'range_rel' has been aborted, removing partitions from pg_pathman's cache step rollback_a: ROLLBACK TO SAVEPOINT a; step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; QUERY PLAN @@ -238,8 +230,8 @@ create_range_partitions 10 step savepoint_b: SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; @@ -247,8 +239,6 @@ QUERY PLAN Seq Scan on range_rel step savepoint_c: SAVEPOINT c; -WARNING: All changes in partitioned table 'range_rel' will be discarded -WARNING: Partitioning of table 'range_rel' has been aborted, removing partitions from pg_pathman's cache step rollback_a: ROLLBACK TO SAVEPOINT a; step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; QUERY PLAN @@ -269,8 +259,8 @@ create_range_partitions 10 step savepoint_b: SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; @@ -278,18 +268,15 @@ QUERY PLAN Seq Scan on range_rel step savepoint_c: SAVEPOINT c; -WARNING: All changes in partitioned table 'range_rel' will be discarded step rollback_b: ROLLBACK TO SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; QUERY PLAN Seq Scan on range_rel -WARNING: All changes in partitioned table 'range_rel' will be discarded -WARNING: Partitioning of table 'range_rel' has been aborted, removing partitions from pg_pathman's cache step rollback: ROLLBACK; step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; QUERY PLAN @@ -305,8 +292,8 @@ create_range_partitions 10 step savepoint_b: SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; @@ -314,10 +301,9 @@ QUERY PLAN Seq Scan on range_rel step savepoint_c: SAVEPOINT c; -WARNING: All changes in partitioned table 'range_rel' will be discarded step rollback_b: ROLLBACK TO SAVEPOINT b; -step 
drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; @@ -339,12 +325,10 @@ create_range_partitions 10 step savepoint_b: SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 -WARNING: All changes in partitioned table 'range_rel' will be discarded -WARNING: Partitioning of table 'range_rel' has been aborted, removing partitions from pg_pathman's cache step rollback_a: ROLLBACK TO SAVEPOINT a; step create_partitions: SELECT create_range_partitions('range_rel', 'id', 1, 1000); create_range_partitions @@ -364,7 +348,6 @@ Append -> Seq Scan on range_rel_8 -> Seq Scan on range_rel_9 -> Seq Scan on range_rel_10 -WARNING: Partitioning of table 'range_rel' has been aborted, removing partitions from pg_pathman's cache step rollback: ROLLBACK; step show_rel: EXPLAIN (COSTS OFF) SELECT * FROM range_rel; QUERY PLAN @@ -380,12 +363,10 @@ create_range_partitions 10 step savepoint_b: SAVEPOINT b; -step drop_partitions: SELECT drop_range_partitions('range_rel'); -drop_range_partitions +step drop_partitions: SELECT drop_partitions('range_rel'); +drop_partitions 10 -WARNING: All changes in partitioned table 'range_rel' will be discarded -WARNING: Partitioning of table 'range_rel' has been aborted, removing partitions from pg_pathman's cache step rollback_a: ROLLBACK TO SAVEPOINT a; step create_partitions: SELECT create_range_partitions('range_rel', 'id', 1, 1000); create_range_partitions diff --git a/contrib/pg_pathman/hash.sql b/contrib/pg_pathman/hash.sql index 6e0a55d2af..e4001bdceb 100644 --- a/contrib/pg_pathman/hash.sql +++ b/contrib/pg_pathman/hash.sql @@ -12,280 +12,218 @@ * Creates hash partitions for specified relation */ CREATE OR REPLACE FUNCTION @extschema@.create_hash_partitions( - relation REGCLASS - , attribute TEXT - , partitions_count INTEGER -) RETURNS INTEGER AS + parent_relid REGCLASS, + attribute TEXT, + partitions_count INTEGER, + partition_data BOOLEAN DEFAULT TRUE) +RETURNS INTEGER AS $$ DECLARE - v_relname TEXT; - v_child_relname TEXT; - v_type TEXT; - v_plain_schema TEXT; - v_plain_relname TEXT; - v_hashfunc TEXT; + v_child_relname TEXT; + v_plain_schema TEXT; + v_plain_relname TEXT; + v_atttype REGTYPE; + v_hashfunc REGPROC; + v_init_callback REGPROCEDURE; + BEGIN - v_relname := @extschema@.validate_relname(relation); + IF partition_data = true THEN + /* Acquire data modification lock */ + PERFORM @extschema@.prevent_relation_modification(parent_relid); + ELSE + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(parent_relid); + END IF; + + PERFORM @extschema@.validate_relname(parent_relid); attribute := lower(attribute); - PERFORM @extschema@.common_relation_checks(relation, attribute); + PERFORM @extschema@.common_relation_checks(parent_relid, attribute); - v_type := @extschema@.get_attribute_type_name(v_relname, attribute); + /* Fetch atttype and its hash function */ + v_atttype := @extschema@.get_attribute_type(parent_relid, attribute); + v_hashfunc := @extschema@.get_type_hash_func(v_atttype); SELECT * INTO v_plain_schema, v_plain_relname - FROM @extschema@.get_plain_schema_and_relname(relation); + FROM @extschema@.get_plain_schema_and_relname(parent_relid); - v_hashfunc := @extschema@.get_type_hash_func(v_type::regtype::oid)::regproc; + /* Insert new entry 
to pathman config */ + INSERT INTO @extschema@.pathman_config (partrel, attname, parttype) + VALUES (parent_relid, attribute, 1); /* Create partitions and update pg_pathman configuration */ FOR partnum IN 0..partitions_count-1 LOOP v_child_relname := format('%s.%s', - v_plain_schema, + quote_ident(v_plain_schema), quote_ident(v_plain_relname || '_' || partnum)); - EXECUTE format('CREATE TABLE %s (LIKE %s INCLUDING ALL)' - , v_child_relname - , v_relname); - - EXECUTE format('ALTER TABLE %s INHERIT %s' - , v_child_relname - , v_relname); - - EXECUTE format('ALTER TABLE %s ADD CHECK (@extschema@.get_hash(%s(%s), %s) = %s)' - , v_child_relname - , v_hashfunc - , attribute - , partitions_count - , partnum); + EXECUTE format( + 'CREATE TABLE %1$s (LIKE %2$s INCLUDING ALL) INHERITS (%2$s) TABLESPACE %s', + v_child_relname, + parent_relid::TEXT, + @extschema@.get_rel_tablespace_name(parent_relid)); + + EXECUTE format('ALTER TABLE %s ADD CONSTRAINT %s + CHECK (@extschema@.get_hash_part_idx(%s(%s), %s) = %s)', + v_child_relname, + @extschema@.build_check_constraint_name(v_child_relname::REGCLASS, + attribute), + v_hashfunc::TEXT, + attribute, + partitions_count, + partnum); + + PERFORM @extschema@.copy_foreign_keys(parent_relid, v_child_relname::REGCLASS); + + /* Fetch init_callback from 'params' table */ + WITH stub_callback(stub) as (values (0)) + SELECT coalesce(init_callback, 0::REGPROCEDURE) + FROM stub_callback + LEFT JOIN @extschema@.pathman_config_params AS params + ON params.partrel = parent_relid + INTO v_init_callback; + + PERFORM @extschema@.invoke_on_partition_created_callback(parent_relid, + v_child_relname::REGCLASS, + v_init_callback); END LOOP; - INSERT INTO @extschema@.pathman_config (relname, attname, parttype) - VALUES (v_relname, attribute, 1); - - /* Create triggers */ - PERFORM @extschema@.create_hash_insert_trigger(v_relname, attribute, partitions_count); - /* Do not create update trigger by default */ - -- PERFORM @extschema@.create_hash_update_trigger(relation, attribute, partitions_count); /* Notify backend about changes */ - PERFORM @extschema@.on_create_partitions(relation::oid); + PERFORM @extschema@.on_create_partitions(parent_relid); /* Copy data */ - PERFORM @extschema@.partition_data(relation); + IF partition_data = true THEN + PERFORM @extschema@.set_enable_parent(parent_relid, false); + PERFORM @extschema@.partition_data(parent_relid); + ELSE + PERFORM @extschema@.set_enable_parent(parent_relid, true); + END IF; RETURN partitions_count; END -$$ LANGUAGE plpgsql; - -/* - * Creates hash trigger for specified relation - */ -CREATE OR REPLACE FUNCTION @extschema@.create_hash_insert_trigger( - IN relation REGCLASS - , IN attr TEXT - , IN partitions_count INTEGER) -RETURNS VOID AS -$$ -DECLARE - func TEXT := ' - CREATE OR REPLACE FUNCTION %s() - RETURNS TRIGGER AS $body$ - DECLARE - hash INTEGER; - BEGIN - hash := @extschema@.get_hash(%s(NEW.%s), %s); - %s - RETURN NULL; - END $body$ LANGUAGE plpgsql;'; - funcname TEXT; - trigger TEXT := ' - CREATE TRIGGER %s - BEFORE INSERT ON %s - FOR EACH ROW EXECUTE PROCEDURE %s();'; - triggername TEXT; - insert_stmt TEXT; - relname TEXT; - schema TEXT; - atttype TEXT; - hashfunc TEXT; -BEGIN - /* drop trigger and corresponding function */ - PERFORM @extschema@.drop_hash_triggers(relation); - - SELECT * INTO schema, relname - FROM @extschema@.get_plain_schema_and_relname(relation); - - /* generate INSERT statement for trigger */ - insert_stmt = format('EXECUTE format(''INSERT INTO %s.%s SELECT $1.*'', hash) USING NEW;' - , schema, 
quote_ident(relname || '_%s')); - - /* format and create new trigger for relation */ - funcname := schema || '.' || quote_ident(format('%s_insert_trigger_func', relname)); - triggername := quote_ident(format('%s_%s_insert_trigger', schema, relname)); - - /* base hash function for type */ - atttype := @extschema@.get_attribute_type_name(relation, attr); - hashfunc := @extschema@.get_type_hash_func(atttype::regtype::oid)::regproc; - - func := format(func, funcname, hashfunc, attr, partitions_count, insert_stmt); - trigger := format(trigger, triggername, relation, funcname); - EXECUTE func; - EXECUTE trigger; -END -$$ LANGUAGE plpgsql; - -/* - * Drops all partitions for specified relation - */ -CREATE OR REPLACE FUNCTION @extschema@.drop_hash_partitions( - IN relation REGCLASS - , delete_data BOOLEAN DEFAULT FALSE) -RETURNS INTEGER AS -$$ -DECLARE - v_relname TEXT; - v_rec RECORD; - v_rows INTEGER; - v_part_count INTEGER := 0; -BEGIN - v_relname := @extschema@.validate_relname(relation); - - /* Drop trigger first */ - PERFORM @extschema@.drop_hash_triggers(relation); - DELETE FROM @extschema@.pathman_config WHERE relname::regclass = relation; - - FOR v_rec in (SELECT inhrelid::regclass::text AS tbl - FROM pg_inherits WHERE inhparent = relation::oid) - LOOP - IF NOT delete_data THEN - EXECUTE format('WITH part_data AS (DELETE FROM %s RETURNING *) - INSERT INTO %s SELECT * FROM part_data' - , v_rec.tbl - , relation::text); - GET DIAGNOSTICS v_rows = ROW_COUNT; - RAISE NOTICE '% rows copied from %', v_rows, v_rec.tbl; - END IF; - EXECUTE format('DROP TABLE %s', v_rec.tbl); - v_part_count := v_part_count + 1; - END LOOP; - - /* Notify backend about changes */ - PERFORM @extschema@.on_remove_partitions(relation::oid); - - RETURN v_part_count; -END -$$ LANGUAGE plpgsql; - -/* - * Drops hash trigger - */ -CREATE OR REPLACE FUNCTION @extschema@.drop_hash_triggers(IN relation REGCLASS) -RETURNS VOID AS -$$ -DECLARE - relname TEXT; - schema TEXT; - funcname TEXT; -BEGIN - SELECT * INTO schema, relname - FROM @extschema@.get_plain_schema_and_relname(relation); - - funcname := schema || '.' || quote_ident(format('%s_insert_trigger_func', relname)); - EXECUTE format('DROP FUNCTION IF EXISTS %s() CASCADE', funcname); - funcname := schema || '.' 
|| quote_ident(format('%s_update_trigger_func', relname)); - EXECUTE format('DROP FUNCTION IF EXISTS %s() CASCADE', funcname); -END -$$ LANGUAGE plpgsql; +$$ LANGUAGE plpgsql +SET client_min_messages = WARNING; /* * Creates an update trigger */ CREATE OR REPLACE FUNCTION @extschema@.create_hash_update_trigger( - IN relation REGCLASS) -RETURNS VOID AS + parent_relid REGCLASS) +RETURNS TEXT AS $$ DECLARE - func TEXT := ' - CREATE OR REPLACE FUNCTION %s() - RETURNS TRIGGER AS - $body$ - DECLARE old_hash INTEGER; new_hash INTEGER; q TEXT; - BEGIN - old_hash := @extschema@.get_hash(%9$s(OLD.%2$s), %3$s); - new_hash := @extschema@.get_hash(%9$s(NEW.%2$s), %3$s); - IF old_hash = new_hash THEN RETURN NEW; END IF; - q := format(''DELETE FROM %8$s WHERE %4$s'', old_hash); - EXECUTE q USING %5$s; - q := format(''INSERT INTO %8$s VALUES (%6$s)'', new_hash); - EXECUTE q USING %7$s; - RETURN NULL; - END $body$ LANGUAGE plpgsql'; - trigger TEXT := ' - CREATE TRIGGER %s - BEFORE UPDATE ON %s - FOR EACH ROW EXECUTE PROCEDURE %s()'; - att_names TEXT; - old_fields TEXT; - new_fields TEXT; - att_val_fmt TEXT; - att_fmt TEXT; - relid INTEGER; - partitions_count INTEGER; - attr TEXT; - plain_schema TEXT; - plain_relname TEXT; - funcname TEXT; - triggername TEXT; - child_relname_format TEXT; - atttype TEXT; - hashfunc TEXT; + func TEXT := 'CREATE OR REPLACE FUNCTION %1$s() + RETURNS TRIGGER AS + $body$ + DECLARE + old_idx INTEGER; /* partition indices */ + new_idx INTEGER; + + BEGIN + old_idx := @extschema@.get_hash_part_idx(%9$s(OLD.%2$s), %3$s); + new_idx := @extschema@.get_hash_part_idx(%9$s(NEW.%2$s), %3$s); + + IF old_idx = new_idx THEN + RETURN NEW; + END IF; + + EXECUTE format(''DELETE FROM %8$s WHERE %4$s'', old_idx) + USING %5$s; + + EXECUTE format(''INSERT INTO %8$s VALUES (%6$s)'', new_idx) + USING %7$s; + + RETURN NULL; + END $body$ + LANGUAGE plpgsql'; + + trigger TEXT := 'CREATE TRIGGER %s + BEFORE UPDATE ON %s + FOR EACH ROW EXECUTE PROCEDURE %s()'; + + att_names TEXT; + old_fields TEXT; + new_fields TEXT; + att_val_fmt TEXT; + att_fmt TEXT; + attr TEXT; + plain_schema TEXT; + plain_relname TEXT; + child_relname_format TEXT; + funcname TEXT; + triggername TEXT; + atttype REGTYPE; + partitions_count INTEGER; + BEGIN - SELECT * INTO plain_schema, plain_relname - FROM @extschema@.get_plain_schema_and_relname(relation); + attr := attname FROM @extschema@.pathman_config WHERE partrel = parent_relid; + + IF attr IS NULL THEN + RAISE EXCEPTION 'table "%" is not partitioned', parent_relid::TEXT; + END IF; - relid := relation::oid; SELECT string_agg(attname, ', '), string_agg('OLD.' || attname, ', '), string_agg('NEW.' 
|| attname, ', '), - string_agg('CASE WHEN NOT $' || attnum || ' IS NULL THEN ' || attname || ' = $' || attnum || - ' ELSE ' || attname || ' IS NULL END', ' AND '), + string_agg('CASE WHEN NOT $' || attnum || ' IS NULL THEN ' || + attname || ' = $' || attnum || ' ' || + 'ELSE ' || + attname || ' IS NULL END', + ' AND '), string_agg('$' || attnum, ', ') - FROM pg_attribute - WHERE attrelid=relid AND attnum>0 + FROM pg_catalog.pg_attribute + WHERE attrelid = parent_relid AND attnum > 0 INTO att_names, old_fields, new_fields, att_val_fmt, att_fmt; - attr := attname FROM @extschema@.pathman_config WHERE relname::regclass = relation; + partitions_count := COUNT(*) FROM pg_catalog.pg_inherits + WHERE inhparent = parent_relid::oid; - IF attr IS NULL THEN - RAISE EXCEPTION 'Table % is not partitioned', quote_ident(relation::TEXT); - END IF; + /* Build trigger & trigger function's names */ + funcname := @extschema@.build_update_trigger_func_name(parent_relid); + triggername := @extschema@.build_update_trigger_name(parent_relid); - partitions_count := COUNT(*) FROM pg_inherits WHERE inhparent = relation::oid; + /* Build partition name template */ + SELECT * INTO plain_schema, plain_relname + FROM @extschema@.get_plain_schema_and_relname(parent_relid); - /* Function name, trigger name and child relname template */ - funcname := plain_schema || '.' || quote_ident(format('%s_update_trigger_func', plain_relname)); - child_relname_format := plain_schema || '.' || quote_ident(plain_relname || '_%s'); - triggername := quote_ident(format('%s_%s_update_trigger', plain_schema, plain_relname)); + child_relname_format := quote_ident(plain_schema) || '.' || + quote_ident(plain_relname || '_%s'); - /* base hash function for type */ - atttype := @extschema@.get_attribute_type_name(relation, attr); - hashfunc := @extschema@.get_type_hash_func(atttype::regtype::oid)::regproc; + /* Fetch base hash function for atttype */ + atttype := @extschema@.get_attribute_type(parent_relid, attr); /* Format function definition and execute it */ - func := format(func, funcname, attr, partitions_count, att_val_fmt, - old_fields, att_fmt, new_fields, child_relname_format, hashfunc); - EXECUTE func; + EXECUTE format(func, funcname, attr, partitions_count, att_val_fmt, + old_fields, att_fmt, new_fields, child_relname_format, + @extschema@.get_type_hash_func(atttype)::TEXT); - /* Create triggers on child relations */ + /* Create trigger on every partition */ FOR num IN 0..partitions_count-1 LOOP - EXECUTE format(trigger - , triggername - , format(child_relname_format, num) - , funcname); + EXECUTE format(trigger, + triggername, + format(child_relname_format, num), + funcname); END LOOP; + + return funcname; END $$ LANGUAGE plpgsql; + +/* + * Returns hash function OID for specified type + */ +CREATE OR REPLACE FUNCTION @extschema@.get_type_hash_func(REGTYPE) +RETURNS REGPROC AS 'pg_pathman', 'get_type_hash_func' +LANGUAGE C STRICT; + +/* + * Calculates hash for integer value + */ +CREATE OR REPLACE FUNCTION @extschema@.get_hash_part_idx(INTEGER, INTEGER) +RETURNS INTEGER AS 'pg_pathman', 'get_hash_part_idx' +LANGUAGE C STRICT; diff --git a/contrib/pg_pathman/init.sql b/contrib/pg_pathman/init.sql index e2e3b31c5f..4d56242bae 100644 --- a/contrib/pg_pathman/init.sql +++ b/contrib/pg_pathman/init.sql @@ -1,7 +1,7 @@ /* ------------------------------------------------------------------------ * * init.sql - * Creates config table and provides common utility functions + * Creates config table and provides common utility functions * * 
Copyright (c) 2015-2016, Postgres Professional * @@ -10,197 +10,352 @@ /* * Pathman config - * relname - schema qualified relation name - * attname - partitioning key - * parttype - partitioning type: - * 1 - HASH - * 2 - RANGE - * range_interval - base interval for RANGE partitioning in string representation + * partrel - regclass (relation type, stored as Oid) + * attname - partitioning key + * parttype - partitioning type: + * 1 - HASH + * 2 - RANGE + * range_interval - base interval for RANGE partitioning as string */ CREATE TABLE IF NOT EXISTS @extschema@.pathman_config ( - id SERIAL PRIMARY KEY, - relname VARCHAR(127), - attname VARCHAR(127), - parttype INTEGER, - range_interval TEXT + partrel REGCLASS NOT NULL PRIMARY KEY, + attname TEXT NOT NULL, + parttype INTEGER NOT NULL, + range_interval TEXT, + + CHECK (parttype IN (1, 2)) /* check for allowed part types */ +); + +/* + * Optional parameters for partitioned tables. + * partrel - regclass (relation type, stored as Oid) + * enable_parent - add parent table to plan + * auto - enable automatic partition creation + * init_callback - cb to be executed on partition creation + */ +CREATE TABLE IF NOT EXISTS @extschema@.pathman_config_params ( + partrel REGCLASS NOT NULL PRIMARY KEY, + enable_parent BOOLEAN NOT NULL DEFAULT TRUE, + auto BOOLEAN NOT NULL DEFAULT TRUE, + init_callback REGPROCEDURE NOT NULL DEFAULT 0 ); +CREATE UNIQUE INDEX i_pathman_config_params +ON @extschema@.pathman_config_params(partrel); + +/* + * Invalidate relcache every time someone changes parameters config. + */ +CREATE OR REPLACE FUNCTION @extschema@.pathman_config_params_trigger_func() +RETURNS TRIGGER AS +$$ +BEGIN + IF TG_OP IN ('INSERT', 'UPDATE') THEN + PERFORM @extschema@.invalidate_relcache(NEW.partrel); + END IF; + + IF TG_OP IN ('UPDATE', 'DELETE') THEN + PERFORM @extschema@.invalidate_relcache(OLD.partrel); + END IF; + + IF TG_OP = 'DELETE' THEN + RETURN OLD; + ELSE + RETURN NEW; + END IF; +END +$$ +LANGUAGE plpgsql; + +CREATE TRIGGER pathman_config_params_trigger +BEFORE INSERT OR UPDATE OR DELETE ON @extschema@.pathman_config_params +FOR EACH ROW EXECUTE PROCEDURE @extschema@.pathman_config_params_trigger_func(); + +/* + * Enable dump of config tables with pg_dump. + */ SELECT pg_catalog.pg_extension_config_dump('@extschema@.pathman_config', ''); +SELECT pg_catalog.pg_extension_config_dump('@extschema@.pathman_config_params', ''); -CREATE OR REPLACE FUNCTION @extschema@.on_create_partitions(relid OID) -RETURNS VOID AS 'pg_pathman', 'on_partitions_created' LANGUAGE C STRICT; -CREATE OR REPLACE FUNCTION @extschema@.on_update_partitions(relid OID) -RETURNS VOID AS 'pg_pathman', 'on_partitions_updated' LANGUAGE C STRICT; +CREATE OR REPLACE FUNCTION @extschema@.partitions_count(relation REGCLASS) +RETURNS INT AS +$$ +BEGIN + RETURN count(*) FROM pg_inherits WHERE inhparent = relation; +END +$$ +LANGUAGE plpgsql STRICT; -CREATE OR REPLACE FUNCTION @extschema@.on_remove_partitions(relid OID) -RETURNS VOID AS 'pg_pathman', 'on_partitions_removed' LANGUAGE C STRICT; +/* + * Add a row describing the optional parameter to pathman_config_params. 
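+ *
+ * Usage sketch (the table name 'journal' is illustrative): this generic
+ * setter upserts a single column of pathman_config_params, e.g.
+ *
+ *     SELECT @extschema@.pathman_set_param('journal'::REGCLASS, 'enable_parent', false);
+ *
+ * The wrappers defined below (set_enable_parent(), set_auto(),
+ * set_init_callback()) call it with the proper column name.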
+ */ +CREATE OR REPLACE FUNCTION @extschema@.pathman_set_param( + relation REGCLASS, + param TEXT, + value ANYELEMENT) +RETURNS VOID AS +$$ +BEGIN + EXECUTE format('INSERT INTO @extschema@.pathman_config_params + (partrel, %1$s) VALUES ($1, $2) + ON CONFLICT (partrel) DO UPDATE SET %1$s = $2', param) + USING relation, value; +END +$$ +LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION @extschema@.find_or_create_range_partition(relid OID, value ANYELEMENT) -RETURNS OID AS 'pg_pathman', 'find_or_create_range_partition' LANGUAGE C STRICT; +/* + * Include\exclude parent relation in query plan. + */ +CREATE OR REPLACE FUNCTION @extschema@.set_enable_parent( + relation REGCLASS, + value BOOLEAN) +RETURNS VOID AS +$$ +BEGIN + PERFORM @extschema@.pathman_set_param(relation, 'enable_parent', value); +END +$$ +LANGUAGE plpgsql STRICT; +/* + * Enable\disable automatic partition creation. + */ +CREATE OR REPLACE FUNCTION @extschema@.set_auto( + relation REGCLASS, + value BOOLEAN) +RETURNS VOID AS +$$ +BEGIN + PERFORM @extschema@.pathman_set_param(relation, 'auto', value); +END +$$ +LANGUAGE plpgsql STRICT; /* - * Returns min and max values for specified RANGE partition. + * Set partition creation callback */ -CREATE OR REPLACE FUNCTION @extschema@.get_partition_range( - parent_relid OID, partition_relid OID, dummy ANYELEMENT) -RETURNS ANYARRAY AS 'pg_pathman', 'get_partition_range' LANGUAGE C STRICT; +CREATE OR REPLACE FUNCTION @extschema@.set_init_callback( + relation REGCLASS, + callback REGPROC DEFAULT 0) +RETURNS VOID AS +$$ +BEGIN + PERFORM @extschema@.validate_on_partition_created_callback(callback); + PERFORM @extschema@.pathman_set_param(relation, 'init_callback', callback); +END +$$ +LANGUAGE plpgsql; +/* + * Show all existing parents and partitions. + */ +CREATE OR REPLACE FUNCTION @extschema@.show_partition_list() +RETURNS TABLE ( + parent REGCLASS, + partition REGCLASS, + parttype INT4, + partattr TEXT, + range_min TEXT, + range_max TEXT) +AS 'pg_pathman', 'show_partition_list_internal' LANGUAGE C STRICT; /* - * Returns N-th range (in form of array) + * View for show_partition_list(). */ -CREATE OR REPLACE FUNCTION @extschema@.get_range_by_idx( - parent_relid OID, idx INTEGER, dummy ANYELEMENT) -RETURNS ANYARRAY AS 'pg_pathman', 'get_range_by_idx' LANGUAGE C STRICT; +CREATE OR REPLACE VIEW @extschema@.pathman_partition_list +AS SELECT * FROM @extschema@.show_partition_list(); /* - * Returns min value of the first range for relation + * Show all existing concurrent partitioning tasks. */ -CREATE OR REPLACE FUNCTION @extschema@.get_min_range_value( - parent_relid OID, dummy ANYELEMENT) -RETURNS ANYELEMENT AS 'pg_pathman', 'get_min_range_value' LANGUAGE C STRICT; +CREATE OR REPLACE FUNCTION @extschema@.show_concurrent_part_tasks() +RETURNS TABLE ( + userid REGROLE, + pid INT, + dbid OID, + relid REGCLASS, + processed INT, + status TEXT) +AS 'pg_pathman', 'show_concurrent_part_tasks_internal' LANGUAGE C STRICT; /* - * Returns max value of the last range for relation + * View for show_concurrent_part_tasks(). */ -CREATE OR REPLACE FUNCTION @extschema@.get_max_range_value( - parent_relid OID, dummy ANYELEMENT) -RETURNS ANYELEMENT AS 'pg_pathman', 'get_max_range_value' LANGUAGE C STRICT; +CREATE OR REPLACE VIEW @extschema@.pathman_concurrent_part_tasks +AS SELECT * FROM @extschema@.show_concurrent_part_tasks(); /* - * Checks if range overlaps with existing partitions. - * Returns TRUE if overlaps and FALSE otherwise. + * Partition table using ConcurrentPartWorker. 
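+ *
+ * Usage sketch (table and column names are illustrative): create partitions
+ * without moving any data, then let the background worker migrate the rows:
+ *
+ *     SELECT @extschema@.create_range_partitions('journal', 'dt',
+ *                                                '2016-01-01'::DATE,
+ *                                                '1 month'::INTERVAL,
+ *                                                12, false);  -- partition_data = false
+ *     SELECT @extschema@.partition_table_concurrently('journal');
+ *     SELECT * FROM @extschema@.pathman_concurrent_part_tasks;  -- monitor progress
+ *     SELECT @extschema@.stop_concurrent_part_task('journal');  -- cancel if needed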
*/ -CREATE OR REPLACE FUNCTION @extschema@.check_overlap( - parent_relid OID, range_min ANYELEMENT, range_max ANYELEMENT) -RETURNS BOOLEAN AS 'pg_pathman', 'check_overlap' LANGUAGE C STRICT; +CREATE OR REPLACE FUNCTION @extschema@.partition_table_concurrently(relation regclass) +RETURNS VOID AS 'pg_pathman', 'partition_table_concurrently' LANGUAGE C STRICT; /* - * Copy rows to partitions + * Stop concurrent partitioning task. */ -CREATE OR REPLACE FUNCTION @extschema@.partition_data( - p_parent regclass - , p_invalidate_cache_on_error BOOLEAN DEFAULT FALSE - , OUT p_total BIGINT) +CREATE OR REPLACE FUNCTION @extschema@.stop_concurrent_part_task(relation regclass) +RETURNS BOOL AS 'pg_pathman', 'stop_concurrent_part_task' LANGUAGE C STRICT; + + +/* + * Copy rows to partitions concurrently. + */ +CREATE OR REPLACE FUNCTION @extschema@._partition_data_concurrent( + p_relation REGCLASS, + p_min ANYELEMENT DEFAULT NULL::text, + p_max ANYELEMENT DEFAULT NULL::text, + p_limit INT DEFAULT NULL, + OUT p_total BIGINT) AS $$ DECLARE - relname TEXT; - rec RECORD; - cnt BIGINT := 0; + v_attr TEXT; + v_limit_clause TEXT := ''; + v_where_clause TEXT := ''; + ctids TID[]; + BEGIN - relname := @extschema@.validate_relname(p_parent); + SELECT attname INTO v_attr + FROM @extschema@.pathman_config WHERE partrel = p_relation; p_total := 0; - /* Create partitions and copy rest of the data */ + /* Format LIMIT clause if needed */ + IF NOT p_limit IS NULL THEN + v_limit_clause := format('LIMIT %s', p_limit); + END IF; + + /* Format WHERE clause if needed */ + IF NOT p_min IS NULL THEN + v_where_clause := format('%1$s >= $1', v_attr); + END IF; + + IF NOT p_max IS NULL THEN + IF NOT p_min IS NULL THEN + v_where_clause := v_where_clause || ' AND '; + END IF; + v_where_clause := v_where_clause || format('%1$s < $2', v_attr); + END IF; + + IF v_where_clause != '' THEN + v_where_clause := 'WHERE ' || v_where_clause; + END IF; + + /* Lock rows and copy data */ RAISE NOTICE 'Copying data to partitions...'; + EXECUTE format('SELECT array(SELECT ctid FROM ONLY %1$s %2$s %3$s FOR UPDATE NOWAIT)', + p_relation, v_where_clause, v_limit_clause) + USING p_min, p_max + INTO ctids; + EXECUTE format(' - WITH part_data AS ( - DELETE FROM ONLY %s RETURNING *) - INSERT INTO %s SELECT * FROM part_data' - , relname - , relname); + WITH data AS ( + DELETE FROM ONLY %1$s WHERE ctid = ANY($1) RETURNING *) + INSERT INTO %1$s SELECT * FROM data', + p_relation) + USING ctids; + + /* Get number of inserted rows */ GET DIAGNOSTICS p_total = ROW_COUNT; RETURN; END $$ -LANGUAGE plpgsql; - +LANGUAGE plpgsql +SET pg_pathman.enable_partitionfilter = on; /* ensures that PartitionFilter is ON */ /* - * Disable pathman partitioning for specified relation + * Old school way to distribute rows to partitions. 
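+ *
+ * Unlike _partition_data_concurrent() above, this moves all rows out of the
+ * parent in a single statement and returns their number. Minimal sketch
+ * (table name is illustrative):
+ *
+ *     SELECT @extschema@.partition_data('journal'::REGCLASS);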
*/ -CREATE OR REPLACE FUNCTION @extschema@.disable_partitioning(IN relation TEXT) -RETURNS VOID AS +CREATE OR REPLACE FUNCTION @extschema@.partition_data( + parent_relid REGCLASS, + OUT p_total BIGINT) +AS $$ DECLARE - v_parttype INTEGER; + relname TEXT; + rec RECORD; + cnt BIGINT := 0; + BEGIN - relation := @extschema@.validate_relname(relation); - v_parttype := parttype FROM pathman_config WHERE relname = relation; - - DELETE FROM @extschema@.pathman_config WHERE relname = relation; - IF v_parttype = 1 THEN - PERFORM @extschema@.drop_hash_triggers(relation); - ELSIF v_parttype = 2 THEN - PERFORM @extschema@.drop_range_triggers(relation); - END IF; + p_total := 0; - /* Notify backend about changes */ - PERFORM on_remove_partitions(relation::regclass::integer); + /* Create partitions and copy rest of the data */ + EXECUTE format('WITH part_data AS (DELETE FROM ONLY %1$s RETURNING *) + INSERT INTO %1$s SELECT * FROM part_data', + parent_relid::TEXT); + + /* Get number of inserted rows */ + GET DIAGNOSTICS p_total = ROW_COUNT; + RETURN; END $$ -LANGUAGE plpgsql; - +LANGUAGE plpgsql STRICT +SET pg_pathman.enable_partitionfilter = on; /* ensures that PartitionFilter is ON */ /* - * Returns attribute type name for relation + * Disable pathman partitioning for specified relation. */ -CREATE OR REPLACE FUNCTION @extschema@.get_attribute_type_name( - p_relation REGCLASS - , p_attname TEXT - , OUT p_atttype TEXT) -RETURNS TEXT AS +CREATE OR REPLACE FUNCTION @extschema@.disable_pathman_for( + parent_relid REGCLASS) +RETURNS VOID AS $$ BEGIN - SELECT typname::TEXT INTO p_atttype - FROM pg_type JOIN pg_attribute on atttypid = "oid" - WHERE attrelid = p_relation::oid and attname = lower(p_attname); -END -$$ -LANGUAGE plpgsql; + PERFORM @extschema@.validate_relname(parent_relid); + DELETE FROM @extschema@.pathman_config WHERE partrel = parent_relid; + PERFORM @extschema@.drop_triggers(parent_relid); -/* - * Checks if attribute is nullable - */ -CREATE OR REPLACE FUNCTION @extschema@.is_attribute_nullable( - p_relation REGCLASS - , p_attname TEXT - , OUT p_nullable BOOLEAN) -RETURNS BOOLEAN AS -$$ -BEGIN - SELECT NOT attnotnull INTO p_nullable - FROM pg_type JOIN pg_attribute on atttypid = "oid" - WHERE attrelid = p_relation::oid and attname = lower(p_attname); + /* Notify backend about changes */ + PERFORM @extschema@.on_remove_partitions(parent_relid); END $$ -LANGUAGE plpgsql; - +LANGUAGE plpgsql STRICT; /* - * Aggregates several common relation checks before partitioning. Suitable for every partitioning type. + * Aggregates several common relation checks before partitioning. + * Suitable for every partitioning type. 
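+ *
+ * Besides rejecting temporary and already-partitioned tables (and parents
+ * referenced by foreign keys), the checks below require a NOT NULL
+ * partitioning key; a typical preparation step (names are illustrative):
+ *
+ *     ALTER TABLE journal ALTER COLUMN dt SET NOT NULL;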
*/ CREATE OR REPLACE FUNCTION @extschema@.common_relation_checks( - p_relation REGCLASS - , p_attribute TEXT) + p_relation REGCLASS, + p_attribute TEXT) RETURNS BOOLEAN AS $$ DECLARE - v_rec RECORD; - is_referenced BOOLEAN; + v_rec RECORD; + is_referenced BOOLEAN; + rel_persistence CHAR; + BEGIN - IF EXISTS (SELECT * FROM @extschema@.pathman_config WHERE relname::regclass = p_relation) THEN - RAISE EXCEPTION 'Relation "%" has already been partitioned', p_relation; + /* Ignore temporary tables */ + SELECT relpersistence FROM pg_catalog.pg_class + WHERE oid = p_relation INTO rel_persistence; + + IF rel_persistence = 't'::CHAR THEN + RAISE EXCEPTION 'temporary table "%" cannot be partitioned', + p_relation::TEXT; + END IF; + + IF EXISTS (SELECT * FROM @extschema@.pathman_config + WHERE partrel = p_relation) THEN + RAISE EXCEPTION 'relation "%" has already been partitioned', p_relation; END IF; IF @extschema@.is_attribute_nullable(p_relation, p_attribute) THEN - RAISE EXCEPTION 'Partitioning key ''%'' must be NOT NULL', p_attribute; + RAISE EXCEPTION 'partitioning key ''%'' must be NOT NULL', p_attribute; END IF; - /* Check if there are foreign keys reference to the relation */ + /* Check if there are foreign keys that reference the relation */ FOR v_rec IN (SELECT * FROM pg_constraint WHERE confrelid = p_relation::regclass::oid) LOOP is_referenced := TRUE; - RAISE WARNING 'Foreign key ''%'' references to the relation ''%''', v_rec.conname, p_relation; + RAISE WARNING 'foreign key ''%'' references relation ''%''', + v_rec.conname, p_relation; END LOOP; IF is_referenced THEN - RAISE EXCEPTION 'Relation ''%'' is referenced from other relations', p_relation; + RAISE EXCEPTION 'relation "%" is referenced from other relations', p_relation; END IF; RETURN TRUE; @@ -209,146 +364,409 @@ $$ LANGUAGE plpgsql; /* - * Returns relname without quotes or something + * Returns relname without quotes or something. */ -CREATE OR REPLACE FUNCTION @extschema@.get_plain_schema_and_relname(cls regclass, OUT schema TEXT, OUT relname TEXT) +CREATE OR REPLACE FUNCTION @extschema@.get_plain_schema_and_relname( + cls REGCLASS, + OUT schema TEXT, + OUT relname TEXT) AS $$ BEGIN - SELECT relnamespace::regnamespace, pg_class.relname FROM pg_class WHERE oid = cls::oid + SELECT pg_catalog.pg_class.relnamespace::regnamespace, + pg_catalog.pg_class.relname + FROM pg_catalog.pg_class WHERE oid = cls::oid INTO schema, relname; END $$ -LANGUAGE plpgsql; - - -CREATE OR REPLACE FUNCTION @extschema@.get_plain_relname(cls regclass) -RETURNS TEXT AS -$$ -BEGIN - RETURN relname FROM pg_class WHERE oid = cls::oid; -END -$$ -LANGUAGE plpgsql; - +LANGUAGE plpgsql STRICT; /* - * Validates relation name. It must be schema qualified + * Returns the schema-qualified name of table. */ -CREATE OR REPLACE FUNCTION @extschema@.validate_relname(cls regclass) +CREATE OR REPLACE FUNCTION @extschema@.get_schema_qualified_name( + cls REGCLASS, + delimiter TEXT DEFAULT '.', + suffix TEXT DEFAULT '') RETURNS TEXT AS $$ BEGIN - RETURN @extschema@.get_schema_qualified_name(cls, '.'); + RETURN (SELECT quote_ident(relnamespace::regnamespace::text) || + delimiter || + quote_ident(relname || suffix) + FROM pg_catalog.pg_class + WHERE oid = cls::oid); END $$ -LANGUAGE plpgsql; - +LANGUAGE plpgsql STRICT; /* - * Returns schema-qualified name for table + * Validates relation name. It must be schema qualified. 
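+ *
+ * Example (illustrative): for an existing table public.journal the call
+ *
+ *     SELECT @extschema@.validate_relname('journal'::REGCLASS);
+ *
+ * returns the quoted, schema-qualified name (public.journal).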
*/ -CREATE OR REPLACE FUNCTION @extschema@.get_schema_qualified_name( - cls REGCLASS - , delimiter TEXT DEFAULT '_' - , suffix TEXT DEFAULT '') +CREATE OR REPLACE FUNCTION @extschema@.validate_relname( + cls REGCLASS) RETURNS TEXT AS $$ +DECLARE + relname TEXT; + BEGIN - RETURN (SELECT quote_ident(relnamespace::regnamespace::text) || - delimiter || - quote_ident(relname || suffix) - FROM pg_class - WHERE oid = cls::oid); + relname = @extschema@.get_schema_qualified_name(cls); + + IF relname IS NULL THEN + RAISE EXCEPTION 'relation %s does not exist', cls; + END IF; + + RETURN relname; END $$ LANGUAGE plpgsql; /* - * Check if two relations have equal structures + * Check if two relations have equal structures. */ -CREATE OR REPLACE FUNCTION @extschema@.validate_relations_equality(relation1 OID, relation2 OID) +CREATE OR REPLACE FUNCTION @extschema@.validate_relations_equality( + relation1 OID, relation2 OID) RETURNS BOOLEAN AS $$ DECLARE - rec RECORD; + rec RECORD; + BEGIN FOR rec IN ( WITH - a1 AS (select * from pg_attribute where attrelid = relation1 and attnum > 0), - a2 AS (select * from pg_attribute where attrelid = relation2 and attnum > 0) + a1 AS (select * from pg_catalog.pg_attribute + where attrelid = relation1 and attnum > 0), + a2 AS (select * from pg_catalog.pg_attribute + where attrelid = relation2 and attnum > 0) SELECT a1.attname name1, a2.attname name2, a1.atttypid type1, a2.atttypid type2 FROM a1 FULL JOIN a2 ON a1.attnum = a2.attnum ) LOOP IF rec.name1 IS NULL OR rec.name2 IS NULL OR rec.name1 != rec.name2 THEN - RETURN False; + RETURN false; END IF; END LOOP; - RETURN True; + RETURN true; END $$ LANGUAGE plpgsql; /* - * Check if regclass if date or timestamp + * DDL trigger that deletes entry from pathman_config table. */ -CREATE OR REPLACE FUNCTION @extschema@.is_date(cls REGTYPE) -RETURNS BOOLEAN AS +CREATE OR REPLACE FUNCTION @extschema@.pathman_ddl_trigger_func() +RETURNS event_trigger AS $$ +DECLARE + obj record; + pg_class_oid oid; BEGIN - RETURN cls IN ('timestamp'::regtype, 'timestamptz'::regtype, 'date'::regtype); + pg_class_oid = 'pg_catalog.pg_class'::regclass; + + /* Handle 'DROP TABLE' events */ + WITH to_be_deleted AS ( + SELECT cfg.partrel AS rel FROM pg_event_trigger_dropped_objects() AS events + JOIN @extschema@.pathman_config AS cfg ON cfg.partrel::oid = events.objid + WHERE events.classid = pg_class_oid + ) + DELETE FROM @extschema@.pathman_config + WHERE partrel IN (SELECT rel FROM to_be_deleted); + + /* Cleanup params table too */ + WITH to_be_deleted AS ( + SELECT cfg.partrel AS rel FROM pg_event_trigger_dropped_objects() AS events + JOIN @extschema@.pathman_config_params AS cfg ON cfg.partrel::oid = events.objid + WHERE events.classid = pg_class_oid + ) + DELETE FROM @extschema@.pathman_config_params + WHERE partrel IN (SELECT rel FROM to_be_deleted); END $$ LANGUAGE plpgsql; /* - * DDL trigger that deletes entry from pathman_config + * Drop triggers. */ -CREATE OR REPLACE FUNCTION @extschema@.pathman_ddl_trigger_func() -RETURNS event_trigger AS +CREATE OR REPLACE FUNCTION @extschema@.drop_triggers( + parent_relid REGCLASS) +RETURNS VOID AS +$$ +BEGIN + EXECUTE format('DROP FUNCTION IF EXISTS %s() CASCADE', + @extschema@.build_update_trigger_func_name(parent_relid)); +END +$$ LANGUAGE plpgsql STRICT; + +/* + * Drop partitions. If delete_data set to TRUE, partitions + * will be dropped with all the data. 
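+ *
+ * Usage sketch (table name is illustrative):
+ *
+ *     SELECT @extschema@.drop_partitions('journal'::REGCLASS);        -- copy rows back to parent, then drop partitions
+ *     SELECT @extschema@.drop_partitions('journal'::REGCLASS, true);  -- drop partitions together with their data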
+ */ +CREATE OR REPLACE FUNCTION @extschema@.drop_partitions( + parent_relid REGCLASS, + delete_data BOOLEAN DEFAULT FALSE) +RETURNS INTEGER AS $$ DECLARE - obj record; + v_rec RECORD; + v_rows BIGINT; + v_part_count INTEGER := 0; + conf_num_del INTEGER; + v_relkind CHAR; + BEGIN - FOR obj IN SELECT * FROM pg_event_trigger_dropped_objects() as events - JOIN @extschema@.pathman_config as cfg ON cfg.relname = events.object_identity + PERFORM @extschema@.validate_relname(parent_relid); + + /* Drop trigger first */ + PERFORM @extschema@.drop_triggers(parent_relid); + + WITH config_num_deleted AS (DELETE FROM @extschema@.pathman_config + WHERE partrel = parent_relid + RETURNING *) + SELECT count(*) from config_num_deleted INTO conf_num_del; + + DELETE FROM @extschema@.pathman_config_params WHERE partrel = parent_relid; + + IF conf_num_del = 0 THEN + RAISE EXCEPTION 'relation "%" has no partitions', parent_relid::TEXT; + END IF; + + FOR v_rec IN (SELECT inhrelid::REGCLASS AS tbl + FROM pg_catalog.pg_inherits + WHERE inhparent::regclass = parent_relid + ORDER BY inhrelid ASC) LOOP - IF obj.object_type = 'table' THEN - EXECUTE 'DELETE FROM @extschema@.pathman_config WHERE relname = $1' - USING obj.object_identity; + IF NOT delete_data THEN + EXECUTE format('INSERT INTO %s SELECT * FROM %s', + parent_relid::TEXT, + v_rec.tbl::TEXT); + GET DIAGNOSTICS v_rows = ROW_COUNT; + + /* Show number of copied rows */ + RAISE NOTICE '% rows copied from %', v_rows, v_rec.tbl::TEXT; END IF; + + SELECT relkind FROM pg_catalog.pg_class + WHERE oid = v_rec.tbl + INTO v_relkind; + + /* + * Determine the kind of child relation. It can be either regular + * table (r) or foreign table (f). Depending on relkind we use + * DROP TABLE or DROP FOREIGN TABLE. + */ + IF v_relkind = 'f' THEN + EXECUTE format('DROP FOREIGN TABLE %s', v_rec.tbl::TEXT); + ELSE + EXECUTE format('DROP TABLE %s', v_rec.tbl::TEXT); + END IF; + + v_part_count := v_part_count + 1; END LOOP; + + /* Notify backend about changes */ + PERFORM @extschema@.on_remove_partitions(parent_relid); + + RETURN v_part_count; END +$$ LANGUAGE plpgsql +SET pg_pathman.enable_partitionfilter = off; /* ensures that PartitionFilter is OFF */ + + +/* + * Copy all of parent's foreign keys. + */ +CREATE OR REPLACE FUNCTION @extschema@.copy_foreign_keys( + parent_relid REGCLASS, + partition REGCLASS) +RETURNS VOID AS $$ -LANGUAGE plpgsql; +DECLARE + rec RECORD; + +BEGIN + PERFORM @extschema@.validate_relname(parent_relid); + PERFORM @extschema@.validate_relname(partition); + FOR rec IN (SELECT oid as conid FROM pg_catalog.pg_constraint + WHERE conrelid = parent_relid AND contype = 'f') + LOOP + EXECUTE format('ALTER TABLE %s ADD %s', + partition::TEXT, + pg_get_constraintdef(rec.conid)); + END LOOP; +END +$$ LANGUAGE plpgsql STRICT; + + +/* + * Create DDL trigger to call pathman_ddl_trigger_func(). 
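+ *
+ * As a consequence, dropping a partitioned parent (name is illustrative):
+ *
+ *     DROP TABLE journal CASCADE;
+ *
+ * also removes its rows from pathman_config and pathman_config_params
+ * through pathman_ddl_trigger_func() defined above.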
+ */ CREATE EVENT TRIGGER pathman_ddl_trigger ON sql_drop EXECUTE PROCEDURE @extschema@.pathman_ddl_trigger_func(); + + +CREATE OR REPLACE FUNCTION @extschema@.on_create_partitions( + relid REGCLASS) +RETURNS VOID AS 'pg_pathman', 'on_partitions_created' +LANGUAGE C STRICT; + +CREATE OR REPLACE FUNCTION @extschema@.on_update_partitions( + relid REGCLASS) +RETURNS VOID AS 'pg_pathman', 'on_partitions_updated' +LANGUAGE C STRICT; + +CREATE OR REPLACE FUNCTION @extschema@.on_remove_partitions( + relid REGCLASS) +RETURNS VOID AS 'pg_pathman', 'on_partitions_removed' +LANGUAGE C STRICT; + + /* - * Acquire partitions lock to prevent concurrent partitions creation + * Get parent of pg_pathman's partition. */ -CREATE OR REPLACE FUNCTION @extschema@.acquire_partitions_lock() -RETURNS VOID AS 'pg_pathman', 'acquire_partitions_lock' LANGUAGE C STRICT; +CREATE OR REPLACE FUNCTION @extschema@.get_parent_of_partition(REGCLASS) +RETURNS REGCLASS AS 'pg_pathman', 'get_parent_of_partition_pl' +LANGUAGE C STRICT; /* - * Release partitions lock + * Extract basic type of a domain. */ -CREATE OR REPLACE FUNCTION @extschema@.release_partitions_lock() -RETURNS VOID AS 'pg_pathman', 'release_partitions_lock' LANGUAGE C STRICT; +CREATE OR REPLACE FUNCTION @extschema@.get_base_type(REGTYPE) +RETURNS REGTYPE AS 'pg_pathman', 'get_base_type_pl' +LANGUAGE C STRICT; + +/* + * Returns attribute type name for relation. + */ +CREATE OR REPLACE FUNCTION @extschema@.get_attribute_type( + REGCLASS, TEXT) +RETURNS REGTYPE AS 'pg_pathman', 'get_attribute_type_pl' +LANGUAGE C STRICT; + +/* + * Return tablespace name for specified relation. + */ +CREATE OR REPLACE FUNCTION @extschema@.get_rel_tablespace_name(REGCLASS) +RETURNS TEXT AS 'pg_pathman', 'get_rel_tablespace_name' +LANGUAGE C STRICT; + + +/* + * Checks if attribute is nullable + */ +CREATE OR REPLACE FUNCTION @extschema@.is_attribute_nullable( + REGCLASS, TEXT) +RETURNS BOOLEAN AS 'pg_pathman', 'is_attribute_nullable' +LANGUAGE C STRICT; + +/* + * Check if regclass is date or timestamp. + */ +CREATE OR REPLACE FUNCTION @extschema@.is_date_type( + typid REGTYPE) +RETURNS BOOLEAN AS 'pg_pathman', 'is_date_type' +LANGUAGE C STRICT; + + +/* + * Build check constraint name for a specified relation's column. + */ +CREATE OR REPLACE FUNCTION @extschema@.build_check_constraint_name( + REGCLASS, INT2) +RETURNS TEXT AS 'pg_pathman', 'build_check_constraint_name_attnum' +LANGUAGE C STRICT; + +CREATE OR REPLACE FUNCTION @extschema@.build_check_constraint_name( + REGCLASS, TEXT) +RETURNS TEXT AS 'pg_pathman', 'build_check_constraint_name_attname' +LANGUAGE C STRICT; + +/* + * Build update trigger and its underlying function's names. + */ +CREATE OR REPLACE FUNCTION @extschema@.build_update_trigger_name( + REGCLASS) +RETURNS TEXT AS 'pg_pathman', 'build_update_trigger_name' +LANGUAGE C STRICT; + +CREATE OR REPLACE FUNCTION @extschema@.build_update_trigger_func_name( + REGCLASS) +RETURNS TEXT AS 'pg_pathman', 'build_update_trigger_func_name' +LANGUAGE C STRICT; + + +/* + * Attach a previously partitioned table. + */ +CREATE OR REPLACE FUNCTION @extschema@.add_to_pathman_config( + parent_relid REGCLASS, + attname TEXT, + range_interval TEXT DEFAULT NULL) +RETURNS BOOLEAN AS 'pg_pathman', 'add_to_pathman_config' +LANGUAGE C; + +CREATE OR REPLACE FUNCTION @extschema@.invalidate_relcache(relid OID) +RETURNS VOID AS 'pg_pathman' LANGUAGE C STRICT; + + +/* + * Lock partitioned relation to restrict concurrent + * modification of partitioning scheme. 
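+ *
+ * Intended call pattern (as used by create_range_partitions() in range.sql):
+ * take prevent_relation_modification() when data is about to be moved,
+ * lock_partitioned_relation() when only the partitioning scheme changes:
+ *
+ *     IF partition_data = true THEN
+ *         PERFORM @extschema@.prevent_relation_modification(parent_relid);
+ *     ELSE
+ *         PERFORM @extschema@.lock_partitioned_relation(parent_relid);
+ *     END IF;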
+ */ + CREATE OR REPLACE FUNCTION @extschema@.lock_partitioned_relation( + REGCLASS) + RETURNS VOID AS 'pg_pathman', 'lock_partitioned_relation' + LANGUAGE C STRICT; + +/* + * Lock relation to restrict concurrent modification of data. + */ + CREATE OR REPLACE FUNCTION @extschema@.prevent_relation_modification( + REGCLASS) + RETURNS VOID AS 'pg_pathman', 'prevent_relation_modification' + LANGUAGE C STRICT; + + +/* + * DEBUG: Place this inside some plpgsql fuction and set breakpoint. + */ +CREATE OR REPLACE FUNCTION @extschema@.debug_capture() +RETURNS VOID AS 'pg_pathman', 'debug_capture' +LANGUAGE C STRICT; + +/* + * Checks that callback function meets specific requirements. Particularly it + * must have the only JSONB argument and VOID return type. + */ +CREATE OR REPLACE FUNCTION @extschema@.validate_on_partition_created_callback( + callback REGPROC) +RETURNS VOID AS 'pg_pathman', 'validate_on_part_init_callback_pl' +LANGUAGE C STRICT; + /* - * Returns hash function OID for specified type + * Invoke init_callback on RANGE partition. */ -CREATE OR REPLACE FUNCTION @extschema@.get_type_hash_func(OID) -RETURNS OID AS 'pg_pathman', 'get_type_hash_func' LANGUAGE C STRICT; +CREATE OR REPLACE FUNCTION @extschema@.invoke_on_partition_created_callback( + parent_relid REGCLASS, + partition REGCLASS, + init_callback REGPROCEDURE, + start_value ANYELEMENT, + end_value ANYELEMENT) +RETURNS VOID AS 'pg_pathman', 'invoke_on_partition_created_callback' +LANGUAGE C; /* - * Calculates hash for integer value + * Invoke init_callback on HASH partition. */ -CREATE OR REPLACE FUNCTION @extschema@.get_hash(INTEGER, INTEGER) -RETURNS INTEGER AS 'pg_pathman', 'get_hash' LANGUAGE C STRICT; +CREATE OR REPLACE FUNCTION @extschema@.invoke_on_partition_created_callback( + parent_relid REGCLASS, + partition REGCLASS, + init_callback REGPROCEDURE) +RETURNS VOID AS 'pg_pathman', 'invoke_on_partition_created_callback' +LANGUAGE C; diff --git a/contrib/pg_pathman/pg_pathman.control b/contrib/pg_pathman/pg_pathman.control index d42ea8c09d..ecc4ef641f 100644 --- a/contrib/pg_pathman/pg_pathman.control +++ b/contrib/pg_pathman/pg_pathman.control @@ -1,4 +1,4 @@ # pg_pathman extension -comment 'Partitioning tool' -default_version = '0.1' +comment 'Partitioning tool ver. 1.0' +default_version = '1.0' module_pathname='$libdir/pg_pathman' diff --git a/contrib/pg_pathman/range.sql b/contrib/pg_pathman/range.sql index 024091bba2..dfad1fdcd8 100644 --- a/contrib/pg_pathman/range.sql +++ b/contrib/pg_pathman/range.sql @@ -8,18 +8,24 @@ * ------------------------------------------------------------------------ */ -CREATE OR REPLACE FUNCTION @extschema@.get_sequence_name(plain_schema TEXT, plain_relname TEXT) +CREATE OR REPLACE FUNCTION @extschema@.get_sequence_name( + plain_schema TEXT, + plain_relname TEXT) RETURNS TEXT AS $$ BEGIN - RETURN format('%s.%s', plain_schema, quote_ident(format('%s_seq', plain_relname))); + RETURN format('%s.%s', + quote_ident(plain_schema), + quote_ident(format('%s_seq', plain_relname))); END $$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION @extschema@.create_or_replace_sequence(plain_schema TEXT, plain_relname TEXT, OUT seq_name TEXT) +CREATE OR REPLACE FUNCTION @extschema@.create_or_replace_sequence( + plain_schema TEXT, + plain_relname TEXT, + OUT seq_name TEXT) AS $$ -DECLARE BEGIN seq_name := @extschema@.get_sequence_name(plain_schema, plain_relname); EXECUTE format('DROP SEQUENCE IF EXISTS %s', seq_name); @@ -28,38 +34,91 @@ END $$ LANGUAGE plpgsql; +/* + * Check RANGE partition boundaries. 
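+ *
+ * p_start_value must not exceed the minimum and p_end_value must be strictly
+ * greater than the maximum of the existing values of p_attribute, otherwise
+ * an exception is raised. Illustrative call for a table whose 'dt' values
+ * all fall within 2016:
+ *
+ *     SELECT @extschema@.check_boundaries('journal'::REGCLASS, 'dt',
+ *                                         '2016-01-01'::DATE, '2017-01-01'::DATE);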
+ */ +CREATE OR REPLACE FUNCTION @extschema@.check_boundaries( + parent_relid REGCLASS, + p_attribute TEXT, + p_start_value ANYELEMENT, + p_end_value ANYELEMENT) +RETURNS VOID AS +$$ +DECLARE + v_min p_start_value%TYPE; + v_max p_start_value%TYPE; + v_count BIGINT; + +BEGIN + /* Get min and max values */ + EXECUTE format('SELECT count(*), min(%1$s), max(%1$s) + FROM %2$s WHERE NOT %1$s IS NULL', + p_attribute, parent_relid::TEXT) + INTO v_count, v_min, v_max; + + /* Check if column has NULL values */ + IF v_count > 0 AND (v_min IS NULL OR v_max IS NULL) THEN + RAISE EXCEPTION '''%'' column contains NULL values', p_attribute; + END IF; + + /* Check lower boundary */ + IF p_start_value > v_min THEN + RAISE EXCEPTION 'start value is less than minimum value of ''%''', + p_attribute; + END IF; + + /* Check upper boundary */ + IF p_end_value <= v_max THEN + RAISE EXCEPTION 'not enough partitions to fit all values of ''%''', + p_attribute; + END IF; +END +$$ LANGUAGE plpgsql; + /* * Creates RANGE partitions for specified relation based on datetime attribute */ CREATE OR REPLACE FUNCTION @extschema@.create_range_partitions( - p_relation REGCLASS - , p_attribute TEXT - , p_start_value ANYELEMENT - , p_interval INTERVAL - , p_count INTEGER DEFAULT NULL) + parent_relid REGCLASS, + p_attribute TEXT, + p_start_value ANYELEMENT, + p_interval INTERVAL, + p_count INTEGER DEFAULT NULL, + partition_data BOOLEAN DEFAULT TRUE) RETURNS INTEGER AS $$ DECLARE - v_relname TEXT; - v_rows_count INTEGER; - v_max p_start_value%TYPE; - v_cur_value p_start_value%TYPE := p_start_value; - v_plain_relname TEXT; - v_plain_schema TEXT; - i INTEGER; + v_rows_count INTEGER; + v_atttype REGTYPE; + v_max p_start_value%TYPE; + v_cur_value p_start_value%TYPE := p_start_value; + p_end_value p_start_value%TYPE; + i INTEGER; + BEGIN - v_relname := @extschema@.validate_relname(p_relation); + IF partition_data = true THEN + /* Acquire data modification lock */ + PERFORM @extschema@.prevent_relation_modification(parent_relid); + ELSE + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(parent_relid); + END IF; + + PERFORM @extschema@.validate_relname(parent_relid); p_attribute := lower(p_attribute); - PERFORM @extschema@.common_relation_checks(v_relname, p_attribute); + PERFORM @extschema@.common_relation_checks(parent_relid, p_attribute); + + IF p_count < 0 THEN + RAISE EXCEPTION '''p_count'' must not be less than 0'; + END IF; /* Try to determine partitions count if not set */ IF p_count IS NULL THEN - EXECUTE format('SELECT count(*), max(%s) FROM %s' - , p_attribute, p_relation) + EXECUTE format('SELECT count(*), max(%s) FROM %s', p_attribute, parent_relid) INTO v_rows_count, v_max; IF v_rows_count = 0 THEN - RAISE EXCEPTION 'Cannot determine partitions count for empty table'; + RAISE EXCEPTION 'cannot determine partitions count for empty table'; END IF; p_count := 0; @@ -70,44 +129,64 @@ BEGIN END LOOP; END IF; - /* Check boundaries */ - EXECUTE format('SELECT @extschema@.check_boundaries(''%s'', ''%s'', ''%s'', ''%s''::%s)' - , v_relname - , p_attribute - , p_start_value - , p_start_value + p_interval*p_count - , pg_typeof(p_start_value)); + v_atttype := @extschema@.get_base_type(pg_typeof(p_start_value)); + + /* + * In case when user doesn't want to automatically create partitions + * and specifies partition count as 0 then do not check boundaries + */ + IF p_count != 0 THEN + /* compute right bound of partitioning through additions */ + p_end_value := p_start_value; + FOR i IN 1..p_count + LOOP + 
p_end_value := p_end_value + p_interval; + END LOOP; + + /* Check boundaries */ + EXECUTE format('SELECT @extschema@.check_boundaries(''%s'', ''%s'', ''%s'', ''%s''::%s)', + parent_relid, + p_attribute, + p_start_value, + p_end_value, + v_atttype::TEXT); + END IF; /* Create sequence for child partitions names */ - SELECT * INTO v_plain_schema, v_plain_relname FROM @extschema@.get_plain_schema_and_relname(p_relation); - PERFORM @extschema@.create_or_replace_sequence(v_plain_schema, v_plain_relname); + PERFORM @extschema@.create_or_replace_sequence(schema, relname) + FROM @extschema@.get_plain_schema_and_relname(parent_relid); /* Insert new entry to pathman config */ - INSERT INTO @extschema@.pathman_config (relname, attname, parttype, range_interval) - VALUES (v_relname, p_attribute, 2, p_interval::text); + INSERT INTO @extschema@.pathman_config (partrel, attname, parttype, range_interval) + VALUES (parent_relid, p_attribute, 2, p_interval::TEXT); - /* create first partition */ + /* Create first partition */ FOR i IN 1..p_count LOOP - EXECUTE format('SELECT @extschema@.create_single_range_partition($1, $2, $3::%s);', pg_typeof(p_start_value)) - USING v_relname, p_start_value, p_start_value + p_interval; + EXECUTE + format('SELECT @extschema@.create_single_range_partition($1, $2, $3::%s, tablespace:=$4)', + v_atttype::TEXT) + USING + parent_relid, + p_start_value, + p_start_value + p_interval, + @extschema@.get_rel_tablespace_name(parent_relid); p_start_value := p_start_value + p_interval; END LOOP; - /* Create triggers */ - PERFORM @extschema@.create_range_insert_trigger(v_relname, p_attribute); - -- PERFORM create_hash_update_trigger(relation, attribute, partitions_count); /* Notify backend about changes */ - PERFORM @extschema@.on_create_partitions(p_relation::oid); + PERFORM @extschema@.on_create_partitions(parent_relid); - /* Copy data */ - PERFORM @extschema@.partition_data(p_relation); + /* Relocate data if asked to */ + IF partition_data = true THEN + PERFORM @extschema@.set_enable_parent(parent_relid, false); + PERFORM @extschema@.partition_data(parent_relid); + ELSE + PERFORM @extschema@.set_enable_parent(parent_relid, true); + END IF; RETURN p_count; - -EXCEPTION WHEN others THEN - RAISE EXCEPTION '%', SQLERRM; END $$ LANGUAGE plpgsql; @@ -115,38 +194,45 @@ $$ LANGUAGE plpgsql; * Creates RANGE partitions for specified relation based on numerical attribute */ CREATE OR REPLACE FUNCTION @extschema@.create_range_partitions( - p_relation REGCLASS - , p_attribute TEXT - , p_start_value ANYELEMENT - , p_interval ANYELEMENT - , p_count INTEGER DEFAULT NULL) + parent_relid REGCLASS, + p_attribute TEXT, + p_start_value ANYELEMENT, + p_interval ANYELEMENT, + p_count INTEGER DEFAULT NULL, + partition_data BOOLEAN DEFAULT TRUE) RETURNS INTEGER AS $$ DECLARE - v_relname TEXT; - v_rows_count INTEGER; - v_max p_start_value%TYPE; - v_cur_value p_start_value%TYPE := p_start_value; - i INTEGER; - v_plain_schema TEXT; - v_plain_relname TEXT; + v_rows_count INTEGER; + v_max p_start_value%TYPE; + v_cur_value p_start_value%TYPE := p_start_value; + p_end_value p_start_value%TYPE; + i INTEGER; + BEGIN - v_relname := @extschema@.validate_relname(p_relation); + IF partition_data = true THEN + /* Acquire data modification lock */ + PERFORM @extschema@.prevent_relation_modification(parent_relid); + ELSE + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(parent_relid); + END IF; + + PERFORM @extschema@.validate_relname(parent_relid); p_attribute := lower(p_attribute); - PERFORM 
@extschema@.common_relation_checks(p_relation, p_attribute); + PERFORM @extschema@.common_relation_checks(parent_relid, p_attribute); - IF p_count <= 0 THEN - RAISE EXCEPTION 'Partitions count must be greater than zero'; + IF p_count < 0 THEN + RAISE EXCEPTION 'partitions count must not be less than zero'; END IF; /* Try to determine partitions count if not set */ IF p_count IS NULL THEN - EXECUTE format('SELECT count(*), max(%s) FROM %s' - , p_attribute, p_relation) + EXECUTE format('SELECT count(*), max(%s) FROM %s', p_attribute, parent_relid) INTO v_rows_count, v_max; IF v_rows_count = 0 THEN - RAISE EXCEPTION 'Cannot determine partitions count for empty table'; + RAISE EXCEPTION 'cannot determine partitions count for empty table'; END IF; IF v_max IS NULL THEN @@ -161,43 +247,57 @@ BEGIN END LOOP; END IF; - /* check boundaries */ - PERFORM @extschema@.check_boundaries(p_relation - , p_attribute - , p_start_value - , p_start_value + p_interval*p_count); + /* + * In case when user doesn't want to automatically create partitions + * and specifies partition count as 0 then do not check boundaries + */ + IF p_count != 0 THEN + /* compute right bound of partitioning through additions */ + p_end_value := p_start_value; + FOR i IN 1..p_count + LOOP + p_end_value := p_end_value + p_interval; + END LOOP; + + /* check boundaries */ + PERFORM @extschema@.check_boundaries(parent_relid, + p_attribute, + p_start_value, + p_end_value); + END IF; /* Create sequence for child partitions names */ - SELECT * INTO v_plain_schema, v_plain_relname FROM @extschema@.get_plain_schema_and_relname(p_relation); - PERFORM @extschema@.create_or_replace_sequence(v_plain_schema, v_plain_relname); + PERFORM @extschema@.create_or_replace_sequence(schema, relname) + FROM @extschema@.get_plain_schema_and_relname(parent_relid); /* Insert new entry to pathman config */ - INSERT INTO @extschema@.pathman_config (relname, attname, parttype, range_interval) - VALUES (v_relname, p_attribute, 2, p_interval::text); + INSERT INTO @extschema@.pathman_config (partrel, attname, parttype, range_interval) + VALUES (parent_relid, p_attribute, 2, p_interval::TEXT); /* create first partition */ FOR i IN 1..p_count LOOP - PERFORM @extschema@.create_single_range_partition(p_relation - , p_start_value - , p_start_value + p_interval); + PERFORM @extschema@.create_single_range_partition( + parent_relid, + p_start_value, + p_start_value + p_interval, + tablespace := @extschema@.get_rel_tablespace_name(parent_relid)); + p_start_value := p_start_value + p_interval; END LOOP; - /* Create triggers */ - PERFORM @extschema@.create_range_insert_trigger(p_relation, p_attribute); - -- PERFORM create_hash_update_trigger(relation, attribute, partitions_count); - /* Notify backend about changes */ - PERFORM @extschema@.on_create_partitions(p_relation::regclass::oid); + PERFORM @extschema@.on_create_partitions(parent_relid); - /* Copy data */ - PERFORM @extschema@.partition_data(p_relation); + /* Relocate data if asked to */ + IF partition_data = true THEN + PERFORM @extschema@.set_enable_parent(parent_relid, false); + PERFORM @extschema@.partition_data(parent_relid); + ELSE + PERFORM @extschema@.set_enable_parent(parent_relid, true); + END IF; RETURN p_count; - -EXCEPTION WHEN others THEN - RAISE EXCEPTION '%', SQLERRM; END $$ LANGUAGE plpgsql; @@ -205,63 +305,72 @@ $$ LANGUAGE plpgsql; * Creates RANGE partitions for specified range */ CREATE OR REPLACE FUNCTION @extschema@.create_partitions_from_range( - p_relation REGCLASS - , p_attribute TEXT - , 
p_start_value ANYELEMENT - , p_end_value ANYELEMENT - , p_interval ANYELEMENT) + parent_relid REGCLASS, + p_attribute TEXT, + p_start_value ANYELEMENT, + p_end_value ANYELEMENT, + p_interval ANYELEMENT, + partition_data BOOLEAN DEFAULT TRUE) RETURNS INTEGER AS $$ DECLARE - v_relname TEXT; - v_plain_schema TEXT; - v_plain_relname TEXT; - i INTEGER := 0; + part_count INTEGER := 0; + BEGIN - v_relname := @extschema@.validate_relname(p_relation); + IF partition_data = true THEN + /* Acquire data modification lock */ + PERFORM @extschema@.prevent_relation_modification(parent_relid); + ELSE + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(parent_relid); + END IF; + + PERFORM @extschema@.validate_relname(parent_relid); p_attribute := lower(p_attribute); - PERFORM @extschema@.common_relation_checks(p_relation, p_attribute); + PERFORM @extschema@.common_relation_checks(parent_relid, p_attribute); IF p_interval <= 0 THEN - RAISE EXCEPTION 'Interval must be positive'; + RAISE EXCEPTION 'interval must be positive'; END IF; - /* Create sequence for child partitions names */ - SELECT * INTO v_plain_schema, v_plain_relname FROM @extschema@.get_plain_schema_and_relname(p_relation); - PERFORM @extschema@.create_or_replace_sequence(v_plain_schema, v_plain_relname); + /* Check boundaries */ + PERFORM @extschema@.check_boundaries(parent_relid, + p_attribute, + p_start_value, + p_end_value); - /* check boundaries */ - PERFORM @extschema@.check_boundaries(p_relation - , p_attribute - , p_start_value - , p_end_value); + /* Create sequence for child partitions names */ + PERFORM @extschema@.create_or_replace_sequence(schema, relname) + FROM @extschema@.get_plain_schema_and_relname(parent_relid); /* Insert new entry to pathman config */ - INSERT INTO @extschema@.pathman_config (relname, attname, parttype, range_interval) - VALUES (v_relname, p_attribute, 2, p_interval::text); + INSERT INTO @extschema@.pathman_config (partrel, attname, parttype, range_interval) + VALUES (parent_relid, p_attribute, 2, p_interval::TEXT); WHILE p_start_value <= p_end_value LOOP - PERFORM @extschema@.create_single_range_partition(p_relation - , p_start_value - , p_start_value + p_interval); + PERFORM @extschema@.create_single_range_partition( + parent_relid, + p_start_value, + p_start_value + p_interval, + tablespace := @extschema@.get_rel_tablespace_name(parent_relid)); + p_start_value := p_start_value + p_interval; - i := i + 1; + part_count := part_count + 1; END LOOP; - /* Create triggers */ - PERFORM @extschema@.create_range_insert_trigger(p_relation, p_attribute); - /* Notify backend about changes */ - PERFORM @extschema@.on_create_partitions(p_relation::regclass::oid); - - /* Copy data */ - PERFORM @extschema@.partition_data(p_relation); + PERFORM @extschema@.on_create_partitions(parent_relid); - RETURN i; + /* Relocate data if asked to */ + IF partition_data = true THEN + PERFORM @extschema@.set_enable_parent(parent_relid, false); + PERFORM @extschema@.partition_data(parent_relid); + ELSE + PERFORM @extschema@.set_enable_parent(parent_relid, true); + END IF; -EXCEPTION WHEN others THEN - RAISE EXCEPTION '%', SQLERRM; + RETURN part_count; /* number of created partitions */ END $$ LANGUAGE plpgsql; @@ -269,291 +378,261 @@ $$ LANGUAGE plpgsql; * Creates RANGE partitions for specified range based on datetime attribute */ CREATE OR REPLACE FUNCTION @extschema@.create_partitions_from_range( - p_relation REGCLASS - , p_attribute TEXT - , p_start_value ANYELEMENT - , p_end_value ANYELEMENT - , p_interval 
INTERVAL) + parent_relid REGCLASS, + p_attribute TEXT, + p_start_value ANYELEMENT, + p_end_value ANYELEMENT, + p_interval INTERVAL, + partition_data BOOLEAN DEFAULT TRUE) RETURNS INTEGER AS $$ DECLARE - v_relname TEXT; - v_plain_schema TEXT; - v_plain_relname TEXT; - i INTEGER := 0; + part_count INTEGER := 0; + BEGIN - v_relname := @extschema@.validate_relname(p_relation); + IF partition_data = true THEN + /* Acquire data modification lock */ + PERFORM @extschema@.prevent_relation_modification(parent_relid); + ELSE + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(parent_relid); + END IF; + + PERFORM @extschema@.validate_relname(parent_relid); p_attribute := lower(p_attribute); - PERFORM @extschema@.common_relation_checks(p_relation, p_attribute); + PERFORM @extschema@.common_relation_checks(parent_relid, p_attribute); - /* Create sequence for child partitions names */ - SELECT * INTO v_plain_schema, v_plain_relname FROM @extschema@.get_plain_schema_and_relname(p_relation); - PERFORM @extschema@.create_or_replace_sequence(v_plain_schema, v_plain_relname); + /* Check boundaries */ + PERFORM @extschema@.check_boundaries(parent_relid, + p_attribute, + p_start_value, + p_end_value); - /* check boundaries */ - PERFORM @extschema@.check_boundaries(p_relation - , p_attribute - , p_start_value - , p_end_value); + /* Create sequence for child partitions names */ + PERFORM @extschema@.create_or_replace_sequence(schema, relname) + FROM @extschema@.get_plain_schema_and_relname(parent_relid); /* Insert new entry to pathman config */ - INSERT INTO @extschema@.pathman_config (relname, attname, parttype, range_interval) - VALUES (v_relname, p_attribute, 2, p_interval::text); + INSERT INTO @extschema@.pathman_config (partrel, attname, parttype, range_interval) + VALUES (parent_relid, p_attribute, 2, p_interval::TEXT); WHILE p_start_value <= p_end_value LOOP - EXECUTE format('SELECT @extschema@.create_single_range_partition($1, $2, $3::%s);', pg_typeof(p_start_value)) - USING p_relation, p_start_value, p_start_value + p_interval; + EXECUTE + format('SELECT @extschema@.create_single_range_partition($1, $2, $3::%s, tablespace:=$4);', + @extschema@.get_base_type(pg_typeof(p_start_value))::TEXT) + USING + parent_relid, + p_start_value, + p_start_value + p_interval, + @extschema@.get_rel_tablespace_name(parent_relid); + p_start_value := p_start_value + p_interval; - i := i + 1; + part_count := part_count + 1; END LOOP; - /* Create triggers */ - PERFORM @extschema@.create_range_insert_trigger(p_relation, p_attribute); - /* Notify backend about changes */ - PERFORM @extschema@.on_create_partitions(p_relation::regclass::oid); - - /* Copy data */ - PERFORM @extschema@.partition_data(p_relation); - - RETURN i; - -EXCEPTION WHEN others THEN - RAISE EXCEPTION '%', SQLERRM; -END -$$ LANGUAGE plpgsql; - -/* - * - */ -CREATE OR REPLACE FUNCTION @extschema@.check_boundaries( - p_relation REGCLASS - , p_attribute TEXT - , p_start_value ANYELEMENT - , p_end_value ANYELEMENT) -RETURNS VOID AS -$$ -DECLARE - v_min p_start_value%TYPE; - v_max p_start_value%TYPE; - v_count INTEGER; -BEGIN - /* Get min and max values */ - EXECUTE format('SELECT count(*), min(%s), max(%s) FROM %s WHERE NOT %s IS NULL', - p_attribute, p_attribute, p_relation::text, p_attribute) - INTO v_count, v_min, v_max; - - /* check if column has NULL values */ - IF v_count > 0 AND (v_min IS NULL OR v_max IS NULL) THEN - RAISE EXCEPTION '''%'' column has NULL values', p_attribute; - END IF; - - /* check lower boundary */ - IF 
p_start_value > v_min THEN - RAISE EXCEPTION 'Start value is less than minimum value of ''%''' - , p_attribute; - END IF; - - /* check upper boundary */ - IF p_end_value <= v_max THEN - RAISE EXCEPTION 'Not enough partitions to fit all the values of ''%''' - , p_attribute; - END IF; -END -$$ LANGUAGE plpgsql; + PERFORM @extschema@.on_create_partitions(parent_relid); -/* - * Formats range condition. Utility function. - */ -CREATE OR REPLACE FUNCTION @extschema@.get_range_condition( - p_attname TEXT - , p_start_value ANYELEMENT - , p_end_value ANYELEMENT) -RETURNS TEXT AS -$$ -DECLARE - v_type REGTYPE; - v_sql TEXT; -BEGIN - /* determine the type of values */ - v_type := pg_typeof(p_start_value); - - /* we cannot use placeholders in DDL queries, so we are using format(...) */ - IF v_type IN ('date'::regtype, 'timestamp'::regtype, 'timestamptz'::regtype) THEN - v_sql := '%s >= ''%s'' AND %s < ''%s'''; + /* Relocate data if asked to */ + IF partition_data = true THEN + PERFORM @extschema@.set_enable_parent(parent_relid, false); + PERFORM @extschema@.partition_data(parent_relid); ELSE - v_sql := '%s >= %s AND %s < %s'; + PERFORM @extschema@.set_enable_parent(parent_relid, true); END IF; - v_sql := format(v_sql - , p_attname - , p_start_value - , p_attname - , p_end_value); - RETURN v_sql; + RETURN part_count; /* number of created partitions */ END -$$ -LANGUAGE plpgsql; +$$ LANGUAGE plpgsql; /* - * Creates new RANGE partition. Returns partition name + * Creates new RANGE partition. Returns partition name. + * NOTE: This function SHOULD NOT take xact_handling lock (BGWs in 9.5). */ CREATE OR REPLACE FUNCTION @extschema@.create_single_range_partition( - p_parent REGCLASS - , p_start_value ANYELEMENT - , p_end_value ANYELEMENT) + parent_relid REGCLASS, + p_start_value ANYELEMENT, + p_end_value ANYELEMENT, + partition_name TEXT DEFAULT NULL, + tablespace TEXT DEFAULT NULL) RETURNS TEXT AS $$ DECLARE - v_part_num INT; - v_child_relname TEXT; - v_plain_child_relname TEXT; - v_attname TEXT; - v_sql TEXT; - v_cond TEXT; - v_plain_schema TEXT; - v_plain_relname TEXT; - v_child_relname_exists INTEGER := 1; - v_seq_name TEXT; + v_part_num INT; + v_child_relname TEXT; + v_plain_child_relname TEXT; + v_attname TEXT; + v_plain_schema TEXT; + v_plain_relname TEXT; + v_child_relname_exists BOOL; + v_seq_name TEXT; + v_init_callback REGPROCEDURE; + BEGIN v_attname := attname FROM @extschema@.pathman_config - WHERE relname::regclass = p_parent; + WHERE partrel = parent_relid; IF v_attname IS NULL THEN - RAISE EXCEPTION 'Table % is not partitioned', quote_ident(p_parent::TEXT); + RAISE EXCEPTION 'table "%" is not partitioned', parent_relid::TEXT; END IF; SELECT * INTO v_plain_schema, v_plain_relname - FROM @extschema@.get_plain_schema_and_relname(p_parent); + FROM @extschema@.get_plain_schema_and_relname(parent_relid); v_seq_name := @extschema@.get_sequence_name(v_plain_schema, v_plain_relname); - /* get next value from sequence */ - LOOP - v_part_num := nextval(v_seq_name); - v_plain_child_relname := format('%s_%s', v_plain_relname, v_part_num); - v_child_relname := format('%s.%s', - v_plain_schema, - quote_ident(v_plain_child_relname)); - v_child_relname_exists := count(*) - FROM pg_class - WHERE relnamespace::regnamespace || '.' 
|| relname = v_child_relname - LIMIT 1; - EXIT WHEN v_child_relname_exists = 0; - END LOOP; - - EXECUTE format('CREATE TABLE %s (LIKE %s INCLUDING ALL)' - , v_child_relname - , p_parent); + IF partition_name IS NULL THEN + /* Get next value from sequence */ + LOOP + v_part_num := nextval(v_seq_name); + v_plain_child_relname := format('%s_%s', v_plain_relname, v_part_num); + v_child_relname := format('%s.%s', + quote_ident(v_plain_schema), + quote_ident(v_plain_child_relname)); + + v_child_relname_exists := count(*) > 0 + FROM pg_class + WHERE relname = v_plain_child_relname AND + relnamespace = v_plain_schema::regnamespace + LIMIT 1; + + EXIT WHEN v_child_relname_exists = false; + END LOOP; + ELSE + v_child_relname := partition_name; + END IF; - EXECUTE format('ALTER TABLE %s INHERIT %s' - , v_child_relname - , p_parent); + IF tablespace IS NULL THEN + tablespace := @extschema@.get_rel_tablespace_name(parent_relid); + END IF; - v_cond := @extschema@.get_range_condition(v_attname, p_start_value, p_end_value); - v_sql := format('ALTER TABLE %s ADD CONSTRAINT %s CHECK (%s)' - , v_child_relname - , quote_ident(format('%s_%s_check', v_plain_schema, v_plain_child_relname)) - , v_cond); + EXECUTE format('CREATE TABLE %1$s (LIKE %2$s INCLUDING ALL) + INHERITS (%2$s) TABLESPACE %3$s', + v_child_relname, + parent_relid::TEXT, + tablespace); + + EXECUTE format('ALTER TABLE %s ADD CONSTRAINT %s CHECK (%s)', + v_child_relname, + @extschema@.build_check_constraint_name(v_child_relname::REGCLASS, + v_attname), + @extschema@.build_range_condition(v_attname, + p_start_value, + p_end_value)); + + PERFORM @extschema@.copy_foreign_keys(parent_relid, v_child_relname::REGCLASS); + + /* Fetch init_callback from 'params' table */ + WITH stub_callback(stub) as (values (0)) + SELECT coalesce(init_callback, 0::REGPROCEDURE) + FROM stub_callback + LEFT JOIN @extschema@.pathman_config_params AS params + ON params.partrel = parent_relid + INTO v_init_callback; + + PERFORM @extschema@.invoke_on_partition_created_callback(parent_relid, + v_child_relname::REGCLASS, + v_init_callback, + p_start_value, + p_end_value); - EXECUTE v_sql; RETURN v_child_relname; END -$$ LANGUAGE plpgsql; +$$ LANGUAGE plpgsql +SET client_min_messages = WARNING; /* * Split RANGE partition */ CREATE OR REPLACE FUNCTION @extschema@.split_range_partition( - p_partition REGCLASS - , p_value ANYELEMENT - , OUT p_range ANYARRAY) + p_partition REGCLASS, + p_value ANYELEMENT, + partition_name TEXT DEFAULT NULL, + OUT p_range ANYARRAY) RETURNS ANYARRAY AS $$ DECLARE - v_parent_relid OID; - v_child_relid OID := p_partition::oid; - v_attname TEXT; - v_cond TEXT; - v_new_partition TEXT; - v_part_type INTEGER; - v_part_relname TEXT; - v_plain_schema TEXT; - v_plain_relname TEXT; - v_check_name TEXT; + v_parent REGCLASS; + v_attname TEXT; + v_atttype REGTYPE; + v_cond TEXT; + v_new_partition TEXT; + v_part_type INTEGER; + v_check_name TEXT; + BEGIN - v_part_relname := @extschema@.validate_relname(p_partition); + PERFORM @extschema@.validate_relname(p_partition); + v_parent = @extschema@.get_parent_of_partition(p_partition); - v_parent_relid := inhparent - FROM pg_inherits - WHERE inhrelid = v_child_relid; + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(v_parent); + + /* Acquire data modification lock (prevent further modifications) */ + PERFORM @extschema@.prevent_relation_modification(p_partition); SELECT attname, parttype FROM @extschema@.pathman_config - WHERE relname::regclass = v_parent_relid::regclass + WHERE partrel = 
v_parent INTO v_attname, v_part_type; IF v_attname IS NULL THEN - RAISE EXCEPTION 'Table % is not partitioned', - quote_ident(v_parent_relid::regclass::text); + RAISE EXCEPTION 'table "%" is not partitioned', v_parent::TEXT; END IF; - SELECT * INTO v_plain_schema, v_plain_relname - FROM @extschema@.get_plain_schema_and_relname(p_partition); - - /* Check if this is RANGE partition */ + /* Check if this is a RANGE partition */ IF v_part_type != 2 THEN - RAISE EXCEPTION 'Specified partition isn''t RANGE partition'; + RAISE EXCEPTION '"%" is not a RANGE partition', p_partition::TEXT; END IF; + v_atttype = @extschema@.get_attribute_type(v_parent, v_attname); + /* Get partition values range */ - p_range := @extschema@.get_partition_range(v_parent_relid, v_child_relid, 0); + EXECUTE format('SELECT @extschema@.get_part_range($1, NULL::%s)', + @extschema@.get_base_type(v_atttype)::TEXT) + USING p_partition + INTO p_range; + IF p_range IS NULL THEN - RAISE EXCEPTION 'Could not find specified partition'; + RAISE EXCEPTION 'could not find specified partition'; END IF; /* Check if value fit into the range */ IF p_range[1] > p_value OR p_range[2] <= p_value THEN - RAISE EXCEPTION 'Specified value does not fit into the range [%, %)', + RAISE EXCEPTION 'specified value does not fit into the range [%, %)', p_range[1], p_range[2]; END IF; /* Create new partition */ - RAISE NOTICE 'Creating new partition...'; - v_new_partition := @extschema@.create_single_range_partition( - @extschema@.get_schema_qualified_name(v_parent_relid::regclass, '.'), - p_value, - p_range[2]); + v_new_partition := @extschema@.create_single_range_partition(v_parent, + p_value, + p_range[2], + partition_name); /* Copy data */ - RAISE NOTICE 'Copying data to new partition...'; - v_cond := @extschema@.get_range_condition(v_attname, p_value, p_range[2]); - EXECUTE format(' - WITH part_data AS ( - DELETE FROM %s WHERE %s RETURNING *) - INSERT INTO %s SELECT * FROM part_data' - , p_partition - , v_cond - , v_new_partition); + v_cond := @extschema@.build_range_condition(v_attname, p_value, p_range[2]); + EXECUTE format('WITH part_data AS (DELETE FROM %s WHERE %s RETURNING *) + INSERT INTO %s SELECT * FROM part_data', + p_partition::TEXT, + v_cond, + v_new_partition); /* Alter original partition */ - RAISE NOTICE 'Altering original partition...'; - v_cond := @extschema@.get_range_condition(v_attname, p_range[1], p_value); - v_check_name := quote_ident(format('%s_%s_check', v_plain_schema, v_plain_relname)); - EXECUTE format('ALTER TABLE %s DROP CONSTRAINT %s' - , p_partition::text - , v_check_name); - EXECUTE format('ALTER TABLE %s ADD CONSTRAINT %s CHECK (%s)' - , p_partition - , v_check_name - , v_cond); + v_cond := @extschema@.build_range_condition(v_attname, p_range[1], p_value); + v_check_name := @extschema@.build_check_constraint_name(p_partition, v_attname); - /* Tell backend to reload configuration */ - PERFORM @extschema@.on_update_partitions(v_parent_relid::oid); + EXECUTE format('ALTER TABLE %s DROP CONSTRAINT %s', + p_partition::TEXT, + v_check_name); - RAISE NOTICE 'Done!'; + EXECUTE format('ALTER TABLE %s ADD CONSTRAINT %s CHECK (%s)', + p_partition::TEXT, + v_check_name, + v_cond); + + /* Tell backend to reload configuration */ + PERFORM @extschema@.on_update_partitions(v_parent); END $$ LANGUAGE plpgsql; @@ -563,59 +642,58 @@ LANGUAGE plpgsql; * Merge RANGE partitions */ CREATE OR REPLACE FUNCTION @extschema@.merge_range_partitions( - p_partition1 REGCLASS - , p_partition2 REGCLASS) + partition1 REGCLASS, + partition2 
REGCLASS) RETURNS VOID AS $$ DECLARE - v_parent_relid1 OID; - v_parent_relid2 OID; - v_part1_relid OID := p_partition1::oid; - v_part2_relid OID := p_partition2::oid; - v_part1_relname TEXT; - v_part2_relname TEXT; - v_attname TEXT; - v_part_type INTEGER; - v_atttype TEXT; -BEGIN - v_part1_relname := @extschema@.validate_relname(p_partition1); - v_part2_relname := @extschema@.validate_relname(p_partition2); + v_parent1 REGCLASS; + v_parent2 REGCLASS; + v_attname TEXT; + v_part_type INTEGER; + v_atttype REGTYPE; - IF v_part1_relid = v_part2_relid THEN - RAISE EXCEPTION 'Cannot merge partition to itself'; +BEGIN + IF partition1 = partition2 THEN + RAISE EXCEPTION 'cannot merge partition with itself'; END IF; - v_parent_relid1 := inhparent FROM pg_inherits WHERE inhrelid = v_part1_relid; - v_parent_relid2 := inhparent FROM pg_inherits WHERE inhrelid = v_part2_relid; + v_parent1 := @extschema@.get_parent_of_partition(partition1); + v_parent2 := @extschema@.get_parent_of_partition(partition2); - IF v_parent_relid1 != v_parent_relid2 THEN - RAISE EXCEPTION 'Cannot merge partitions having different parents'; + /* Acquire data modification locks (prevent further modifications) */ + PERFORM @extschema@.prevent_relation_modification(partition1); + PERFORM @extschema@.prevent_relation_modification(partition2); + + IF v_parent1 != v_parent2 THEN + RAISE EXCEPTION 'cannot merge partitions with different parents'; END IF; + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(v_parent1); + SELECT attname, parttype FROM @extschema@.pathman_config - WHERE relname::regclass = v_parent_relid1::regclass + WHERE partrel = v_parent1 INTO v_attname, v_part_type; IF v_attname IS NULL THEN - RAISE EXCEPTION 'Table % is not partitioned', - quote_ident(v_parent_relid1::regclass::text); + RAISE EXCEPTION 'table "%" is not partitioned', v_parent1::TEXT; END IF; - /* Check if this is RANGE partition */ + /* Check if this is a RANGE partition */ IF v_part_type != 2 THEN - RAISE EXCEPTION 'Specified partitions aren''t RANGE partitions'; + RAISE EXCEPTION 'specified partitions aren''t RANGE partitions'; END IF; - v_atttype := @extschema@.get_attribute_type_name(p_partition1, v_attname); + v_atttype := @extschema@.get_attribute_type(partition1, v_attname); - EXECUTE format('SELECT @extschema@.merge_range_partitions_internal($1, $2 , $3, NULL::%s)', v_atttype) - USING v_parent_relid1, p_partition1 , p_partition2; + EXECUTE format('SELECT @extschema@.merge_range_partitions_internal($1, $2, $3, NULL::%s)', + @extschema@.get_base_type(v_atttype)::TEXT) + USING v_parent1, partition1, partition2; /* Tell backend to reload configuration */ - PERFORM @extschema@.on_update_partitions(v_parent_relid1::oid); - - RAISE NOTICE 'Done!'; + PERFORM @extschema@.on_update_partitions(v_parent1); END $$ LANGUAGE plpgsql; @@ -625,147 +703,171 @@ LANGUAGE plpgsql; * Merge two partitions. All data will be copied to the first one. Second * partition will be destroyed. * - * Notes: dummy field is used to pass the element type to the function - * (it is necessary because of pseudo-types used in function) + * NOTE: dummy field is used to pass the element type to the function + * (it is necessary because of pseudo-types used in function). 
*/ CREATE OR REPLACE FUNCTION @extschema@.merge_range_partitions_internal( - p_parent_relid OID - , p_part1 REGCLASS - , p_part2 REGCLASS - , dummy ANYELEMENT - , OUT p_range ANYARRAY) + parent_relid REGCLASS, + partition1 REGCLASS, + partition2 REGCLASS, + dummy ANYELEMENT, + OUT p_range ANYARRAY) RETURNS ANYARRAY AS $$ DECLARE - v_attname TEXT; - v_cond TEXT; - v_plain_schema TEXT; - v_plain_relname TEXT; - v_child_relname TEXT; - v_check_name TEXT; + v_attname TEXT; + v_atttype REGTYPE; + v_check_name TEXT; + BEGIN SELECT attname FROM @extschema@.pathman_config - WHERE relname::regclass = p_parent_relid::regclass + WHERE partrel = parent_relid INTO v_attname; IF v_attname IS NULL THEN - RAISE EXCEPTION 'Table % is not partitioned', - quote_ident(p_parent_relid::regclass::text); + RAISE EXCEPTION 'table "%" is not partitioned', parent_relid::TEXT; END IF; - SELECT * INTO v_plain_schema, v_plain_relname - FROM @extschema@.get_plain_schema_and_relname(p_part1); + v_atttype = @extschema@.get_attribute_type(parent_relid, v_attname); - /* - * Get ranges - * first and second elements of array are MIN and MAX of partition1 - * third and forth elements are MIN and MAX of partition2 - */ - p_range := @extschema@.get_partition_range(p_parent_relid, p_part1::oid, 0) || - @extschema@.get_partition_range(p_parent_relid, p_part2::oid, 0); + /* We have to pass fake NULL casted to column's type */ + EXECUTE format('SELECT @extschema@.get_part_range($1, NULL::%1$s) || + @extschema@.get_part_range($2, NULL::%1$s)', + @extschema@.get_base_type(v_atttype)::TEXT) + USING partition1, partition2 + INTO p_range; /* Check if ranges are adjacent */ IF p_range[1] != p_range[4] AND p_range[2] != p_range[3] THEN - RAISE EXCEPTION 'Merge failed. Partitions must be adjacent'; + RAISE EXCEPTION 'merge failed, partitions must be adjacent'; END IF; - /* Extend first partition */ - v_cond := @extschema@.get_range_condition(v_attname - , least(p_range[1], p_range[3]) - , greatest(p_range[2], p_range[4])); - - /* Alter first partition */ - RAISE NOTICE 'Altering first partition...'; - v_check_name := quote_ident(v_plain_schema || '_' || v_plain_relname || '_check'); - EXECUTE format('ALTER TABLE %s DROP CONSTRAINT %s' - , p_part1::text - , v_check_name); - EXECUTE format('ALTER TABLE %s ADD CONSTRAINT %s CHECK (%s)' - , p_part1::text - , v_check_name - , v_cond); + /* Drop constraint on first partition... */ + v_check_name := @extschema@.build_check_constraint_name(partition1, v_attname); + EXECUTE format('ALTER TABLE %s DROP CONSTRAINT %s', + partition1::TEXT, + v_check_name); + + /* and create a new one */ + EXECUTE format('ALTER TABLE %s ADD CONSTRAINT %s CHECK (%s)', + partition1::TEXT, + v_check_name, + @extschema@.build_range_condition(v_attname, + least(p_range[1], p_range[3]), + greatest(p_range[2], p_range[4]))); /* Copy data from second partition to the first one */ - RAISE NOTICE 'Copying data...'; EXECUTE format('WITH part_data AS (DELETE FROM %s RETURNING *) - INSERT INTO %s SELECT * FROM part_data' - , p_part2::text - , p_part1::text); + INSERT INTO %s SELECT * FROM part_data', + partition2::TEXT, + partition1::TEXT); /* Remove second partition */ - RAISE NOTICE 'Dropping second partition...'; - EXECUTE format('DROP TABLE %s', p_part2::text); + EXECUTE format('DROP TABLE %s', partition2::TEXT); END $$ LANGUAGE plpgsql; /* - * Append new partition + * Append new partition. 
*/ CREATE OR REPLACE FUNCTION @extschema@.append_range_partition( - p_relation REGCLASS) + parent_relid REGCLASS, + partition_name TEXT DEFAULT NULL, + tablespace TEXT DEFAULT NULL) RETURNS TEXT AS $$ DECLARE - v_attname TEXT; - v_atttype TEXT; - v_part_name TEXT; - v_interval TEXT; + v_attname TEXT; + v_atttype REGTYPE; + v_part_name TEXT; + v_interval TEXT; + BEGIN - /* Prevent concurrent partition creation */ - PERFORM @extschema@.acquire_partitions_lock(); + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(parent_relid); SELECT attname, range_interval FROM @extschema@.pathman_config - WHERE relname::regclass = p_relation + WHERE partrel = parent_relid INTO v_attname, v_interval; IF v_attname IS NULL THEN - RAISE EXCEPTION 'Table % is not partitioned', quote_ident(p_relation::TEXT); + RAISE EXCEPTION 'table "%" is not partitioned', parent_relid::TEXT; END IF; - v_atttype := @extschema@.get_attribute_type_name(p_relation, v_attname); + v_atttype := @extschema@.get_attribute_type(parent_relid, v_attname); - EXECUTE format('SELECT @extschema@.append_partition_internal($1, $2, $3, ARRAY[]::%s[])', v_atttype) - INTO v_part_name - USING p_relation, v_atttype, v_interval; + EXECUTE + format('SELECT @extschema@.append_partition_internal($1, $2, $3, ARRAY[]::%s[], $4, $5)', + @extschema@.get_base_type(v_atttype)::TEXT) + USING + parent_relid, + v_atttype, + v_interval, + partition_name, + tablespace + INTO + v_part_name; /* Invalidate cache */ - PERFORM @extschema@.on_update_partitions(p_relation::oid); - - /* Release lock */ - PERFORM @extschema@.release_partitions_lock(); - - RAISE NOTICE 'Done!'; + PERFORM @extschema@.on_update_partitions(parent_relid); RETURN v_part_name; - -EXCEPTION WHEN others THEN - RAISE EXCEPTION '%', SQLERRM; END $$ LANGUAGE plpgsql; - +/* + * Spawn logic for append_partition(). We have to + * separate this in order to pass the 'p_range'. + * + * NOTE: we don't take a xact_handling lock here. 
+ */ CREATE OR REPLACE FUNCTION @extschema@.append_partition_internal( - p_relation REGCLASS - , p_atttype TEXT - , p_interval TEXT - , p_range ANYARRAY DEFAULT NULL) + parent_relid REGCLASS, + p_atttype REGTYPE, + p_interval TEXT, + p_range ANYARRAY DEFAULT NULL, + partition_name TEXT DEFAULT NULL, + tablespace TEXT DEFAULT NULL) RETURNS TEXT AS $$ DECLARE - v_part_name TEXT; + v_part_name TEXT; + v_atttype REGTYPE; + BEGIN - p_range := @extschema@.get_range_by_idx(p_relation::oid, -1, 0); - RAISE NOTICE 'Appending new partition...'; - IF @extschema@.is_date(p_atttype::regtype) THEN - v_part_name := @extschema@.create_single_range_partition(p_relation - , p_range[2] - , p_range[2] + p_interval::interval); + IF @extschema@.partitions_count(parent_relid) = 0 THEN + RAISE EXCEPTION 'cannot append to empty partitions set'; + END IF; + + v_atttype := @extschema@.get_base_type(p_atttype); + + /* We have to pass fake NULL casted to column's type */ + EXECUTE format('SELECT @extschema@.get_part_range($1, -1, NULL::%s)', + v_atttype::TEXT) + USING parent_relid + INTO p_range; + + IF @extschema@.is_date_type(p_atttype) THEN + v_part_name := @extschema@.create_single_range_partition( + parent_relid, + p_range[2], + p_range[2] + p_interval::interval, + partition_name, + tablespace); ELSE - EXECUTE format('SELECT @extschema@.create_single_range_partition($1, $2, $2 + $3::%s)', p_atttype) - USING p_relation, p_range[2], p_interval - INTO v_part_name; + EXECUTE + format('SELECT @extschema@.create_single_range_partition($1, $2, $2 + $3::%s, $4, $5)', + v_atttype::TEXT) + USING + parent_relid, + p_range[2], + p_interval, + partition_name, + tablespace + INTO + v_part_name; END IF; RETURN v_part_name; @@ -775,72 +877,102 @@ LANGUAGE plpgsql; /* - * Prepend new partition + * Prepend new partition. 
*/ -CREATE OR REPLACE FUNCTION @extschema@.prepend_range_partition(p_relation REGCLASS) +CREATE OR REPLACE FUNCTION @extschema@.prepend_range_partition( + parent_relid REGCLASS, + partition_name TEXT DEFAULT NULL, + tablespace TEXT DEFAULT NULL) RETURNS TEXT AS $$ DECLARE - v_attname TEXT; - v_atttype TEXT; - v_part_name TEXT; - v_interval TEXT; -BEGIN - /* Prevent concurrent partition creation */ - PERFORM @extschema@.acquire_partitions_lock(); + v_attname TEXT; + v_atttype REGTYPE; + v_part_name TEXT; + v_interval TEXT; +BEGIN SELECT attname, range_interval FROM @extschema@.pathman_config - WHERE relname::regclass = p_relation + WHERE partrel = parent_relid INTO v_attname, v_interval; IF v_attname IS NULL THEN - RAISE EXCEPTION 'Table % is not partitioned', quote_ident(p_relation::TEXT); + RAISE EXCEPTION 'table "%" is not partitioned', parent_relid::TEXT; END IF; - v_atttype := @extschema@.get_attribute_type_name(p_relation, v_attname); + v_atttype := @extschema@.get_attribute_type(parent_relid, v_attname); - EXECUTE format('SELECT @extschema@.prepend_partition_internal($1, $2, $3, ARRAY[]::%s[])', v_atttype) - INTO v_part_name - USING p_relation, v_atttype, v_interval; + EXECUTE + format('SELECT @extschema@.prepend_partition_internal($1, $2, $3, ARRAY[]::%s[], $4, $5)', + @extschema@.get_base_type(v_atttype)::TEXT) + USING + parent_relid, + v_atttype, + v_interval, + partition_name, + tablespace + INTO + v_part_name; /* Invalidate cache */ - PERFORM @extschema@.on_update_partitions(p_relation::oid); - - /* Release lock */ - PERFORM @extschema@.release_partitions_lock(); - - RAISE NOTICE 'Done!'; + PERFORM @extschema@.on_update_partitions(parent_relid); RETURN v_part_name; - -EXCEPTION WHEN others THEN - RAISE EXCEPTION '%', SQLERRM; END $$ LANGUAGE plpgsql; - +/* + * Spawn logic for prepend_partition(). We have to + * separate this in order to pass the 'p_range'. + * + * NOTE: we don't take a xact_handling lock here. 
+ */ CREATE OR REPLACE FUNCTION @extschema@.prepend_partition_internal( - p_relation REGCLASS - , p_atttype TEXT - , p_interval TEXT - , p_range ANYARRAY DEFAULT NULL) + parent_relid REGCLASS, + p_atttype REGTYPE, + p_interval TEXT, + p_range ANYARRAY DEFAULT NULL, + partition_name TEXT DEFAULT NULL, + tablespace TEXT DEFAULT NULL) RETURNS TEXT AS $$ DECLARE - v_part_name TEXT; + v_part_name TEXT; + v_atttype REGTYPE; + BEGIN - p_range := @extschema@.get_range_by_idx(p_relation::oid, 0, 0); - RAISE NOTICE 'Prepending new partition...'; + IF @extschema@.partitions_count(parent_relid) = 0 THEN + RAISE EXCEPTION 'cannot prepend to empty partitions set'; + END IF; - IF @extschema@.is_date(p_atttype::regtype) THEN - v_part_name := @extschema@.create_single_range_partition(p_relation - , p_range[1] - p_interval::interval - , p_range[1]); + v_atttype := @extschema@.get_base_type(p_atttype); + + /* We have to pass fake NULL casted to column's type */ + EXECUTE format('SELECT @extschema@.get_part_range($1, 0, NULL::%s)', + v_atttype::TEXT) + USING parent_relid + INTO p_range; + + IF @extschema@.is_date_type(p_atttype) THEN + v_part_name := @extschema@.create_single_range_partition( + parent_relid, + p_range[1] - p_interval::interval, + p_range[1], + partition_name, + tablespace); ELSE - EXECUTE format('SELECT @extschema@.create_single_range_partition($1, $2 - $3::%s, $2)', p_atttype) - USING p_relation, p_range[1], p_interval - INTO v_part_name; + EXECUTE + format('SELECT @extschema@.create_single_range_partition($1, $2 - $3::%s, $2, $4, $5)', + v_atttype::TEXT) + USING + parent_relid, + p_range[1], + p_interval, + partition_name, + tablespace + INTO + v_part_name; END IF; RETURN v_part_name; @@ -853,38 +985,39 @@ LANGUAGE plpgsql; * Add new partition */ CREATE OR REPLACE FUNCTION @extschema@.add_range_partition( - p_relation REGCLASS - , p_start_value ANYELEMENT - , p_end_value ANYELEMENT) + parent_relid REGCLASS, + p_start_value ANYELEMENT, + p_end_value ANYELEMENT, + partition_name TEXT DEFAULT NULL, + tablespace TEXT DEFAULT NULL) RETURNS TEXT AS $$ DECLARE - v_part_name TEXT; + v_part_name TEXT; + BEGIN - /* Prevent concurrent partition creation */ - PERFORM @extschema@.acquire_partitions_lock(); + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(parent_relid); - /* check range overlap */ - IF @extschema@.check_overlap(p_relation::oid, p_start_value, p_end_value) != FALSE THEN - RAISE EXCEPTION 'Specified range overlaps with existing partitions'; + IF p_start_value >= p_end_value THEN + RAISE EXCEPTION 'failed to create partition: p_start_value is greater than p_end_value'; END IF; - IF p_start_value >= p_end_value THEN - RAISE EXCEPTION 'Failed to create partition: p_start_value is greater than p_end_value'; + /* check range overlap */ + IF @extschema@.partitions_count(parent_relid) > 0 + AND @extschema@.check_overlap(parent_relid, p_start_value, p_end_value) THEN + RAISE EXCEPTION 'specified range overlaps with existing partitions'; END IF; /* Create new partition */ - v_part_name := @extschema@.create_single_range_partition(p_relation, p_start_value, p_end_value); - PERFORM @extschema@.on_update_partitions(p_relation::oid); + v_part_name := @extschema@.create_single_range_partition(parent_relid, + p_start_value, + p_end_value, + partition_name, + tablespace); + PERFORM @extschema@.on_update_partitions(parent_relid); - /* Release lock */ - PERFORM @extschema@.release_partitions_lock(); - - RAISE NOTICE 'Done!'; RETURN v_part_name; - -EXCEPTION WHEN others THEN - 
RAISE EXCEPTION '%', SQLERRM; END $$ LANGUAGE plpgsql; @@ -894,109 +1027,140 @@ LANGUAGE plpgsql; * Drop range partition */ CREATE OR REPLACE FUNCTION @extschema@.drop_range_partition( - p_partition REGCLASS) + p_partition REGCLASS, + delete_data BOOLEAN DEFAULT TRUE) RETURNS TEXT AS $$ DECLARE - v_part_name TEXT := p_partition::TEXT; - v_parent TEXT; - v_count INTEGER; + parent_relid REGCLASS; + part_name TEXT; + v_relkind CHAR; + v_rows BIGINT; + v_part_type INTEGER; + BEGIN - /* Prevent concurrent partition management */ - PERFORM @extschema@.acquire_partitions_lock(); + parent_relid := @extschema@.get_parent_of_partition(p_partition); + part_name := p_partition::TEXT; /* save the name to be returned */ + + SELECT parttype + FROM @extschema@.pathman_config + WHERE partrel = parent_relid + INTO v_part_type; + + /* Check if this is a RANGE partition */ + IF v_part_type != 2 THEN + RAISE EXCEPTION '"%" is not a RANGE partition', p_partition::TEXT; + END IF; + + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(parent_relid); - /* Parent table name */ - SELECT inhparent::regclass INTO v_parent - FROM pg_inherits WHERE inhrelid::regclass = p_partition; + IF NOT delete_data THEN + EXECUTE format('INSERT INTO %s SELECT * FROM %s', + parent_relid::TEXT, + p_partition::TEXT); + GET DIAGNOSTICS v_rows = ROW_COUNT; - IF v_parent IS NULL THEN - RAISE EXCEPTION 'Partition ''%'' not found', p_partition; + /* Show number of copied rows */ + RAISE NOTICE '% rows copied from %', v_rows, p_partition::TEXT; END IF; - /* Drop table and update cache */ - EXECUTE format('DROP TABLE %s', p_partition::TEXT); - PERFORM @extschema@.on_update_partitions(v_parent::regclass::oid); + SELECT relkind FROM pg_catalog.pg_class + WHERE oid = p_partition + INTO v_relkind; - /* Release lock */ - PERFORM @extschema@.release_partitions_lock(); + /* + * Determine the kind of child relation. It can be either regular + * table (r) or foreign table (f). Depending on relkind we use + * DROP TABLE or DROP FOREIGN TABLE. 
+ */ + IF v_relkind = 'f' THEN + EXECUTE format('DROP FOREIGN TABLE %s', p_partition::TEXT); + ELSE + EXECUTE format('DROP TABLE %s', p_partition::TEXT); + END IF; - RETURN v_part_name; + /* Invalidate cache */ + PERFORM @extschema@.on_update_partitions(parent_relid); -EXCEPTION WHEN others THEN - RAISE EXCEPTION '%', SQLERRM; + RETURN part_name; END $$ -LANGUAGE plpgsql; +LANGUAGE plpgsql +SET pg_pathman.enable_partitionfilter = off; /* ensures that PartitionFilter is OFF */ /* * Attach range partition */ CREATE OR REPLACE FUNCTION @extschema@.attach_range_partition( - p_relation REGCLASS - , p_partition REGCLASS - , p_start_value ANYELEMENT - , p_end_value ANYELEMENT) + parent_relid REGCLASS, + p_partition REGCLASS, + p_start_value ANYELEMENT, + p_end_value ANYELEMENT) RETURNS TEXT AS $$ DECLARE - v_attname TEXT; - v_cond TEXT; - v_plain_partname TEXT; - v_plain_schema TEXT; - rel_persistence CHAR; + v_attname TEXT; + rel_persistence CHAR; + v_init_callback REGPROCEDURE; + BEGIN + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(parent_relid); + /* Ignore temporary tables */ - SELECT relpersistence FROM pg_catalog.pg_class WHERE oid = p_partition INTO rel_persistence; + SELECT relpersistence FROM pg_catalog.pg_class + WHERE oid = p_partition INTO rel_persistence; + IF rel_persistence = 't'::CHAR THEN - RAISE EXCEPTION 'Temporary table % cannot be used as a partition', - quote_ident(p_partition::TEXT); + RAISE EXCEPTION 'temporary table "%" cannot be used as a partition', + p_partition::TEXT; END IF; - /* Prevent concurrent partition management */ - PERFORM @extschema@.acquire_partitions_lock(); - - IF @extschema@.check_overlap(p_relation::oid, p_start_value, p_end_value) != FALSE THEN - RAISE EXCEPTION 'Specified range overlaps with existing partitions'; + IF @extschema@.check_overlap(parent_relid, p_start_value, p_end_value) THEN + RAISE EXCEPTION 'specified range overlaps with existing partitions'; END IF; - IF NOT @extschema@.validate_relations_equality(p_relation, p_partition) THEN - RAISE EXCEPTION 'Partition must have the exact same structure as parent'; + IF NOT @extschema@.validate_relations_equality(parent_relid, p_partition) THEN + RAISE EXCEPTION 'partition must have the exact same structure as parent'; END IF; /* Set inheritance */ - EXECUTE format('ALTER TABLE %s INHERIT %s' - , p_partition - , p_relation); + EXECUTE format('ALTER TABLE %s INHERIT %s', p_partition, parent_relid); - /* Set check constraint */ - v_attname := attname - FROM @extschema@.pathman_config - WHERE relname::regclass = p_relation; + v_attname := attname FROM @extschema@.pathman_config WHERE partrel = parent_relid; IF v_attname IS NULL THEN - RAISE EXCEPTION 'Table % is not partitioned', quote_ident(p_relation::TEXT); + RAISE EXCEPTION 'table "%" is not partitioned', parent_relid::TEXT; END IF; - v_cond := @extschema@.get_range_condition(v_attname, p_start_value, p_end_value); - - /* Plain partition name and schema */ - SELECT * INTO v_plain_schema, v_plain_partname FROM @extschema@.get_plain_schema_and_relname(p_partition); - - EXECUTE format('ALTER TABLE %s ADD CONSTRAINT %s CHECK (%s)' - , p_partition - , v_plain_schema || '_' || quote_ident(v_plain_partname || '_check') - , v_cond); + /* Set check constraint */ + EXECUTE format('ALTER TABLE %s ADD CONSTRAINT %s CHECK (%s)', + p_partition::TEXT, + @extschema@.build_check_constraint_name(p_partition, v_attname), + @extschema@.build_range_condition(v_attname, + p_start_value, + p_end_value)); + + /* Fetch init_callback from 
'params' table */ + WITH stub_callback(stub) as (values (0)) + SELECT coalesce(init_callback, 0::REGPROCEDURE) + FROM stub_callback + LEFT JOIN @extschema@.pathman_config_params AS params + ON params.partrel = parent_relid + INTO v_init_callback; + + PERFORM @extschema@.invoke_on_partition_created_callback(parent_relid, + p_partition, + v_init_callback, + p_start_value, + p_end_value); /* Invalidate cache */ - PERFORM @extschema@.on_update_partitions(p_relation::oid); + PERFORM @extschema@.on_update_partitions(parent_relid); - /* Release lock */ - PERFORM @extschema@.release_partitions_lock(); RETURN p_partition; - -EXCEPTION WHEN others THEN - RAISE EXCEPTION '%', SQLERRM; END $$ LANGUAGE plpgsql; @@ -1006,326 +1170,187 @@ LANGUAGE plpgsql; * Detach range partition */ CREATE OR REPLACE FUNCTION @extschema@.detach_range_partition( - p_partition TEXT) + p_partition REGCLASS) RETURNS TEXT AS $$ DECLARE - v_parent TEXT; + v_attname TEXT; + parent_relid REGCLASS; + BEGIN - /* Prevent concurrent partition management */ - PERFORM @extschema@.acquire_partitions_lock(); + parent_relid := @extschema@.get_parent_of_partition(p_partition); + + /* Acquire lock on parent */ + PERFORM @extschema@.lock_partitioned_relation(parent_relid); - /* Parent table */ - SELECT inhparent::regclass INTO v_parent - FROM pg_inherits WHERE inhrelid = p_partition::regclass::oid; + v_attname := attname + FROM @extschema@.pathman_config + WHERE partrel = parent_relid; + + IF v_attname IS NULL THEN + RAISE EXCEPTION 'table "%" is not partitioned', parent_relid::TEXT; + END IF; /* Remove inheritance */ - EXECUTE format('ALTER TABLE %s NO INHERIT %s' - , p_partition - , v_parent); + EXECUTE format('ALTER TABLE %s NO INHERIT %s', + p_partition::TEXT, + parent_relid::TEXT); /* Remove check constraint */ - EXECUTE format('ALTER TABLE %s DROP CONSTRAINT %s_check' - , p_partition - , @extschema@.get_schema_qualified_name(p_partition::regclass)); + EXECUTE format('ALTER TABLE %s DROP CONSTRAINT %s', + p_partition::TEXT, + @extschema@.build_check_constraint_name(p_partition, v_attname)); /* Invalidate cache */ - PERFORM @extschema@.on_update_partitions(v_parent::regclass::oid); + PERFORM @extschema@.on_update_partitions(parent_relid); - /* Release lock */ - PERFORM @extschema@.release_partitions_lock(); RETURN p_partition; - -EXCEPTION WHEN others THEN - RAISE EXCEPTION '%', SQLERRM; END $$ LANGUAGE plpgsql; -/* - * Creates range partitioning insert trigger - */ -CREATE OR REPLACE FUNCTION @extschema@.create_range_insert_trigger( - v_relation REGCLASS - , v_attname TEXT) -RETURNS VOID AS -$$ -DECLARE - v_func TEXT := ' - CREATE OR REPLACE FUNCTION %s() - RETURNS TRIGGER - AS $body$ - DECLARE - v_part_relid OID; - BEGIN - IF TG_OP = ''INSERT'' THEN - IF NEW.%2$s IS NULL THEN - RAISE EXCEPTION ''ERROR: NULL value in partitioning key''; - END IF; - v_part_relid := @extschema@.find_or_create_range_partition(TG_RELID, NEW.%2$s); - IF NOT v_part_relid IS NULL THEN - EXECUTE format(''INSERT INTO %%s SELECT $1.*'', v_part_relid::regclass) - USING NEW; - ELSE - RAISE EXCEPTION ''ERROR: Cannot find partition''; - END IF; - END IF; - RETURN NULL; - END - $body$ LANGUAGE plpgsql;'; - v_funcname TEXT; - v_trigger TEXT := ' - CREATE TRIGGER %s - BEFORE INSERT ON %s - FOR EACH ROW EXECUTE PROCEDURE %s();'; - v_triggername TEXT; - v_plain_relname TEXT; - v_plain_schema TEXT; -BEGIN - SELECT * INTO v_plain_schema, v_plain_relname - FROM @extschema@.get_plain_schema_and_relname(v_relation); - - v_funcname := 
format(quote_ident('%s_insert_trigger_func'), v_plain_relname); - v_triggername := format('"%s_%s_insert_trigger"', v_plain_schema, v_plain_relname); - - v_func := format(v_func, v_funcname, v_attname); - v_trigger := format(v_trigger, v_triggername, v_relation, v_funcname); - - EXECUTE v_func; - EXECUTE v_trigger; - RETURN; -END -$$ LANGUAGE plpgsql; - - /* * Creates an update trigger */ CREATE OR REPLACE FUNCTION @extschema@.create_range_update_trigger( - IN relation REGCLASS) + IN parent_relid REGCLASS) RETURNS TEXT AS $$ DECLARE - func TEXT := ' - CREATE OR REPLACE FUNCTION %s_update_trigger_func() - RETURNS TRIGGER AS - $body$ - DECLARE - old_oid INTEGER; - new_oid INTEGER; - q TEXT; - BEGIN - old_oid := TG_RELID; - new_oid := @extschema@.find_or_create_range_partition(''%1$s''::regclass::oid, NEW.%2$s); - IF old_oid = new_oid THEN RETURN NEW; END IF; - q := format(''DELETE FROM %%s WHERE %4$s'', old_oid::regclass::text); - EXECUTE q USING %5$s; - q := format(''INSERT INTO %%s VALUES (%6$s)'', new_oid::regclass::text); - EXECUTE q USING %7$s; - RETURN NULL; - END $body$ LANGUAGE plpgsql'; - trigger TEXT := 'CREATE TRIGGER %s_update_trigger ' || - 'BEFORE UPDATE ON %s ' || - 'FOR EACH ROW EXECUTE PROCEDURE %s_update_trigger_func()'; - att_names TEXT; - old_fields TEXT; - new_fields TEXT; - att_val_fmt TEXT; - att_fmt TEXT; - relid INTEGER; - rec RECORD; - num INTEGER := 0; - attr TEXT; + func TEXT := 'CREATE OR REPLACE FUNCTION %1$s() + RETURNS TRIGGER AS + $body$ + DECLARE + old_oid Oid; + new_oid Oid; + + BEGIN + old_oid := TG_RELID; + new_oid := @extschema@.find_or_create_range_partition( + ''%2$s''::regclass, NEW.%3$s); + + IF old_oid = new_oid THEN + RETURN NEW; + END IF; + + EXECUTE format(''DELETE FROM %%s WHERE %5$s'', + old_oid::regclass::text) + USING %6$s; + + EXECUTE format(''INSERT INTO %%s VALUES (%7$s)'', + new_oid::regclass::text) + USING %8$s; + + RETURN NULL; + END $body$ + LANGUAGE plpgsql'; + + trigger TEXT := 'CREATE TRIGGER %s ' || + 'BEFORE UPDATE ON %s ' || + 'FOR EACH ROW EXECUTE PROCEDURE %s()'; + + triggername TEXT; + funcname TEXT; + att_names TEXT; + old_fields TEXT; + new_fields TEXT; + att_val_fmt TEXT; + att_fmt TEXT; + attr TEXT; + rec RECORD; + BEGIN - relid := relation::oid; + attr := attname FROM @extschema@.pathman_config WHERE partrel = parent_relid; + + IF attr IS NULL THEN + RAISE EXCEPTION 'table "%" is not partitioned', parent_relid::TEXT; + END IF; + SELECT string_agg(attname, ', '), string_agg('OLD.' || attname, ', '), string_agg('NEW.' 
|| attname, ', '), - string_agg('CASE WHEN NOT $' || attnum || ' IS NULL THEN ' || attname || ' = $' || attnum || - ' ELSE ' || attname || ' IS NULL END', ' AND '), + string_agg('CASE WHEN NOT $' || attnum || ' IS NULL THEN ' || + attname || ' = $' || attnum || ' ' || + 'ELSE ' || + attname || ' IS NULL END', + ' AND '), string_agg('$' || attnum, ', ') FROM pg_attribute - WHERE attrelid=relid AND attnum>0 - INTO att_names, - old_fields, - new_fields, - att_val_fmt, - att_fmt; - - attr := attname - FROM @extschema@.pathman_config - WHERE relname::regclass = relation; - - IF attr IS NULL THEN - RAISE EXCEPTION 'Table % is not partitioned', quote_ident(relation::TEXT); - END IF; - - EXECUTE format(func, relation, attr, 0, att_val_fmt, + WHERE attrelid::REGCLASS = parent_relid AND attnum > 0 + INTO att_names, + old_fields, + new_fields, + att_val_fmt, + att_fmt; + + /* Build trigger & trigger function's names */ + funcname := @extschema@.build_update_trigger_func_name(parent_relid); + triggername := @extschema@.build_update_trigger_name(parent_relid); + + /* Create function for trigger */ + EXECUTE format(func, funcname, parent_relid, attr, 0, att_val_fmt, old_fields, att_fmt, new_fields); - FOR rec in (SELECT * FROM pg_inherits WHERE inhparent = relation::regclass::oid) + + /* Create trigger on every partition */ + FOR rec in (SELECT * FROM pg_catalog.pg_inherits + WHERE inhparent = parent_relid) LOOP - EXECUTE format(trigger - , @extschema@.get_schema_qualified_name(relation::regclass) - , rec.inhrelid::regclass - , relation); - num := num + 1; + EXECUTE format(trigger, + triggername, + rec.inhrelid::REGCLASS::TEXT, + funcname); END LOOP; - RETURN format('%s_update_trigger_func()', relation); + RETURN funcname; END $$ LANGUAGE plpgsql; - /* - * Drop partitions - * If delete_data set to TRUE then partitions will be dropped with all the data + * Construct CHECK constraint condition for a range partition. */ -CREATE OR REPLACE FUNCTION @extschema@.drop_range_partitions( - relation REGCLASS - , delete_data BOOLEAN DEFAULT FALSE) -RETURNS INTEGER AS -$$ -DECLARE - v_rec RECORD; - v_rows INTEGER; - v_part_count INTEGER := 0; - v_relname TEXT; -BEGIN - v_relname := @extschema@.validate_relname(relation); - - /* Drop trigger first */ - PERFORM @extschema@.drop_range_triggers(relation); - - FOR v_rec IN (SELECT inhrelid::regclass::text AS tbl - FROM pg_inherits WHERE inhparent::regclass = relation) - LOOP - IF NOT delete_data THEN - EXECUTE format('WITH part_data AS (DELETE FROM %s RETURNING *) - INSERT INTO %s SELECT * FROM part_data' - , v_rec.tbl - , relation::text); - GET DIAGNOSTICS v_rows = ROW_COUNT; - RAISE NOTICE '% rows copied from %', v_rows, v_rec.tbl; - END IF; - EXECUTE format('DROP TABLE %s', v_rec.tbl); - v_part_count := v_part_count + 1; - END LOOP; - - DELETE FROM @extschema@.pathman_config WHERE relname::regclass = relation; - - /* Notify backend about changes */ - PERFORM @extschema@.on_remove_partitions(relation::oid); - - RETURN v_part_count; -END -$$ LANGUAGE plpgsql; - +CREATE OR REPLACE FUNCTION @extschema@.build_range_condition( + p_attname TEXT, + p_start_value ANYELEMENT, + p_end_value ANYELEMENT) +RETURNS TEXT AS 'pg_pathman', 'build_range_condition' +LANGUAGE C; /* - * Drop trigger + * Returns N-th range (as an array of two elements). 
*/ -CREATE OR REPLACE FUNCTION @extschema@.drop_range_triggers(IN relation REGCLASS) -RETURNS VOID AS -$$ -DECLARE - schema TEXT; - relname TEXT; -BEGIN - SELECT * INTO schema, relname - FROM @extschema@.get_plain_schema_and_relname(relation); - - EXECUTE format('DROP TRIGGER IF EXISTS %s ON %s CASCADE' - , format('"%s_%s_insert_trigger"', schema, relname) - , relation::TEXT); -END -$$ LANGUAGE plpgsql; - +CREATE OR REPLACE FUNCTION @extschema@.get_part_range( + parent_relid REGCLASS, + partition_idx INTEGER, + dummy ANYELEMENT) +RETURNS ANYARRAY AS 'pg_pathman', 'get_part_range_by_idx' +LANGUAGE C; /* - * Internal function used to create new partitions on insert or update trigger. - * Invoked from C-function find_or_create_range_partition(). + * Returns min and max values for specified RANGE partition. */ -CREATE OR REPLACE FUNCTION @extschema@.append_partitions_on_demand_internal( - p_relid OID - , p_new_value ANYELEMENT) -RETURNS OID AS -$$ -DECLARE - v_cnt INTEGER := 0; - i INTEGER := 0; - v_part TEXT; - v_interval TEXT; - v_attname TEXT; - v_min p_new_value%TYPE; - v_max p_new_value%TYPE; - v_cur_value p_new_value%TYPE; - v_next_value p_new_value%TYPE; - v_is_date BOOLEAN; -BEGIN - /* get attribute name and interval */ - SELECT attname, range_interval - FROM @extschema@.pathman_config - WHERE relname::regclass = p_relid::regclass - INTO v_attname, v_interval; +CREATE OR REPLACE FUNCTION @extschema@.get_part_range( + partition_relid REGCLASS, + dummy ANYELEMENT) +RETURNS ANYARRAY AS 'pg_pathman', 'get_part_range_by_oid' +LANGUAGE C; - IF v_attname IS NULL THEN - RAISE EXCEPTION 'Table % is not partitioned', - quote_ident(p_relid::regclass::text); - END IF; - - v_min := @extschema@.get_min_range_value(p_relid::regclass::oid, p_new_value); - v_max := @extschema@.get_max_range_value(p_relid::regclass::oid, p_new_value); - - v_is_date := @extschema@.is_date(pg_typeof(p_new_value)::regtype); - - IF p_new_value >= v_max THEN - v_cur_value := v_max; - WHILE v_cur_value <= p_new_value AND i < 1000 - LOOP - IF v_is_date THEN - v_next_value := v_cur_value + v_interval::interval; - ELSE - EXECUTE format('SELECT $1 + $2::%s', pg_typeof(p_new_value)) - USING v_cur_value, v_interval - INTO v_next_value; - END IF; - - v_part := @extschema@.create_single_range_partition( - @extschema@.get_schema_qualified_name(p_relid::regclass, '.') - , v_cur_value - , v_next_value); - i := i + 1; - v_cur_value := v_next_value; - RAISE NOTICE 'partition % created', v_part; - END LOOP; - ELSIF p_new_value <= v_min THEN - v_cur_value := v_min; - WHILE v_cur_value >= p_new_value AND i < 1000 - LOOP - IF v_is_date THEN - v_next_value := v_cur_value - v_interval::interval; - ELSE - EXECUTE format('SELECT $1 - $2::%s', pg_typeof(p_new_value)) - USING v_cur_value, v_interval - INTO v_next_value; - END IF; - - v_part := @extschema@.create_single_range_partition( - @extschema@.get_schema_qualified_name(p_relid::regclass, '.') - , v_next_value - , v_cur_value); - i := i + 1; - v_cur_value := v_next_value; - RAISE NOTICE 'partition % created', v_part; - END LOOP; - ELSE - RAISE NOTICE 'Not implemented yet'; - END IF; +/* + * Checks if range overlaps with existing partitions. + * Returns TRUE if overlaps and FALSE otherwise. 
+ */ +CREATE OR REPLACE FUNCTION @extschema@.check_overlap( + parent_relid REGCLASS, + range_min ANYELEMENT, + range_max ANYELEMENT) +RETURNS BOOLEAN AS 'pg_pathman', 'check_overlap' +LANGUAGE C; - IF i > 0 THEN - RETURN v_part::regclass::oid; - END IF; - RETURN NULL; -END -$$ LANGUAGE plpgsql; +/* + * Needed for an UPDATE trigger. + */ +CREATE OR REPLACE FUNCTION @extschema@.find_or_create_range_partition( + parent_relid REGCLASS, + value ANYELEMENT) +RETURNS REGCLASS AS 'pg_pathman', 'find_or_create_range_partition' +LANGUAGE C; diff --git a/contrib/pg_pathman/specs/for_update.spec b/contrib/pg_pathman/specs/for_update.spec new file mode 100644 index 0000000000..55ea24af3a --- /dev/null +++ b/contrib/pg_pathman/specs/for_update.spec @@ -0,0 +1,32 @@ +setup +{ + create extension pg_pathman; + create table test_tbl(id int not null, val real); + insert into test_tbl select i, i from generate_series(1, 1000) as i; + select create_range_partitions('test_tbl', 'id', 1, 100, 10); +} + +teardown +{ + drop table test_tbl cascade; + drop extension pg_pathman; +} + +session "s1" +step "s1_b" { begin; } +step "s1_c" { commit; } +step "s1_r" { rollback; } +step "s1_update" { update test_tbl set id = 2 where id = 1; } + +session "s2" +step "s2_b" { begin; } +step "s2_c" { commit; } +step "s2_select_locked" { select * from test_tbl where id = 1 for share; } +step "s2_select" { select * from test_tbl where id = 1; } + + +permutation "s1_b" "s1_update" "s2_select" "s1_r" + +permutation "s1_b" "s1_update" "s2_select_locked" "s1_r" + +permutation "s1_b" "s1_update" "s2_select_locked" "s1_c" diff --git a/contrib/pg_pathman/specs/insert_trigger.spec b/contrib/pg_pathman/specs/insert_nodes.spec similarity index 97% rename from contrib/pg_pathman/specs/insert_trigger.spec rename to contrib/pg_pathman/specs/insert_nodes.spec index 126c900afd..93df4102f6 100644 --- a/contrib/pg_pathman/specs/insert_trigger.spec +++ b/contrib/pg_pathman/specs/insert_nodes.spec @@ -7,7 +7,7 @@ setup teardown { - SELECT drop_range_partitions('range_rel'); + SELECT drop_partitions('range_rel'); DROP TABLE range_rel CASCADE; DROP EXTENSION pg_pathman; } diff --git a/contrib/pg_pathman/specs/rollback_on_create_partitions.spec b/contrib/pg_pathman/specs/rollback_on_create_partitions.spec index 83e634e632..41fc48d114 100644 --- a/contrib/pg_pathman/specs/rollback_on_create_partitions.spec +++ b/contrib/pg_pathman/specs/rollback_on_create_partitions.spec @@ -6,7 +6,6 @@ setup teardown { - SELECT drop_range_partitions('range_rel'); DROP TABLE range_rel CASCADE; DROP EXTENSION pg_pathman; } @@ -17,7 +16,7 @@ step "rollback" { ROLLBACK; } step "commit" { COMMIT; } step "insert_data" { INSERT INTO range_rel SELECT generate_series(1, 10000); } step "create_partitions" { SELECT create_range_partitions('range_rel', 'id', 1, 1000); } -step "drop_partitions" { SELECT drop_range_partitions('range_rel'); } +step "drop_partitions" { SELECT drop_partitions('range_rel'); } step "savepoint_a" { SAVEPOINT a; } step "rollback_a" { ROLLBACK TO SAVEPOINT a; } step "savepoint_b" { SAVEPOINT b; } diff --git a/contrib/pg_pathman/sql/pg_pathman.sql b/contrib/pg_pathman/sql/pathman_basic.sql similarity index 67% rename from contrib/pg_pathman/sql/pg_pathman.sql rename to contrib/pg_pathman/sql/pathman_basic.sql index f7d50e1739..2faadd426d 100644 --- a/contrib/pg_pathman/sql/pg_pathman.sql +++ b/contrib/pg_pathman/sql/pathman_basic.sql @@ -12,6 +12,16 @@ INSERT INTO test.hash_rel VALUES (2, 2); INSERT INTO test.hash_rel VALUES (3, 3); SELECT 
pathman.create_hash_partitions('test.hash_rel', 'value', 3); ALTER TABLE test.hash_rel ALTER COLUMN value SET NOT NULL; +SELECT pathman.create_hash_partitions('test.hash_rel', 'value', 3, partition_data:=false); +EXPLAIN (COSTS OFF) SELECT * FROM test.hash_rel; +SELECT * FROM test.hash_rel; +SELECT pathman.set_enable_parent('test.hash_rel', false); +EXPLAIN (COSTS OFF) SELECT * FROM test.hash_rel; +SELECT * FROM test.hash_rel; +SELECT pathman.set_enable_parent('test.hash_rel', true); +EXPLAIN (COSTS OFF) SELECT * FROM test.hash_rel; +SELECT * FROM test.hash_rel; +SELECT pathman.drop_partitions('test.hash_rel'); SELECT pathman.create_hash_partitions('test.hash_rel', 'Value', 3); SELECT COUNT(*) FROM test.hash_rel; SELECT COUNT(*) FROM ONLY test.hash_rel; @@ -147,208 +157,13 @@ WHERE j1.dt < '2015-03-01' AND j2.dt >= '2015-02-01' ORDER BY j2.dt; * Test CTE query */ EXPLAIN (COSTS OFF) - WITH ttt AS (SELECT * FROM test.range_rel WHERE dt >= '2015-02-01' AND dt < '2015-03-15') + WITH ttt AS (SELECT * FROM test.range_rel WHERE dt >= '2015-02-01' AND dt < '2015-03-15') SELECT * FROM ttt; EXPLAIN (COSTS OFF) - WITH ttt AS (SELECT * FROM test.hash_rel WHERE value = 2) + WITH ttt AS (SELECT * FROM test.hash_rel WHERE value = 2) SELECT * FROM ttt; - -/* - * Test RuntimeAppend - */ - -create or replace function test.pathman_assert(smt bool, error_msg text) returns text as $$ -begin - if not smt then - raise exception '%', error_msg; - end if; - - return 'ok'; -end; -$$ language plpgsql; - -create or replace function test.pathman_equal(a text, b text, error_msg text) returns text as $$ -begin - if a != b then - raise exception '''%'' is not equal to ''%'', %', a, b, error_msg; - end if; - - return 'equal'; -end; -$$ language plpgsql; - -create or replace function test.pathman_test(query text) returns jsonb as $$ -declare - plan jsonb; -begin - execute 'explain (analyze, format json)' || query into plan; - - return plan; -end; -$$ language plpgsql; - -create or replace function test.pathman_test_1() returns text as $$ -declare - plan jsonb; - num int; -begin - plan = test.pathman_test('select * from test.runtime_test_1 where id = (select * from test.run_values limit 1)'); - - perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, - '"Custom Scan"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Custom Plan Provider')::text, - '"RuntimeAppend"', - 'wrong plan provider'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Relation Name')::text, - format('"runtime_test_1_%s"', pathman.get_hash(hashint4(1), 6)), - 'wrong partition'); - - select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans') into num; - perform test.pathman_equal(num::text, '2', 'expected 2 child plans for custom scan'); - - return 'ok'; -end; -$$ language plpgsql; - -create or replace function test.pathman_test_2() returns text as $$ -declare - plan jsonb; - num int; -begin - plan = test.pathman_test('select * from test.runtime_test_1 where id = any (select * from test.run_values limit 4)'); - - perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, - '"Nested Loop"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Node Type')::text, - '"Custom Scan"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Custom Plan Provider')::text, - '"RuntimeAppend"', - 'wrong plan provider'); - - select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans') into num; - perform 
test.pathman_equal(num::text, '4', 'expected 4 child plans for custom scan'); - - for i in 0..3 loop - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Plans'->i->'Relation Name')::text, - format('"runtime_test_1_%s"', pathman.get_hash(hashint4(i + 1), 6)), - 'wrong partition'); - - num = plan->0->'Plan'->'Plans'->1->'Plans'->i->'Actual Loops'; - perform test.pathman_equal(num::text, '1', 'expected 1 loop'); - end loop; - - return 'ok'; -end; -$$ language plpgsql; - -create or replace function test.pathman_test_3() returns text as $$ -declare - plan jsonb; - num int; -begin - plan = test.pathman_test('select * from test.runtime_test_1 a join test.run_values b on a.id = b.val'); - - perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, - '"Nested Loop"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Node Type')::text, - '"Custom Scan"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Custom Plan Provider')::text, - '"RuntimeAppend"', - 'wrong plan provider'); - - select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans') into num; - perform test.pathman_equal(num::text, '6', 'expected 6 child plans for custom scan'); - - for i in 0..5 loop - num = plan->0->'Plan'->'Plans'->1->'Plans'->i->'Actual Loops'; - perform test.pathman_assert(num > 0 and num <= 1718, 'expected no more than 1718 loops'); - end loop; - - return 'ok'; -end; -$$ language plpgsql; - -create or replace function test.pathman_test_4() returns text as $$ -declare - plan jsonb; - num int; -begin - plan = test.pathman_test('select * from test.category c, lateral' || - '(select * from test.runtime_test_2 g where g.category_id = c.id order by rating limit 4) as tg'); - - perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, - '"Nested Loop"', - 'wrong plan type'); - - /* Limit -> Custom Scan */ - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->0->'Node Type')::text, - '"Custom Scan"', - 'wrong plan type'); - - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->0->'Custom Plan Provider')::text, - '"RuntimeMergeAppend"', - 'wrong plan provider'); - - select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans') into num; - perform test.pathman_equal(num::text, '4', 'expected 4 child plans for custom scan'); - - for i in 0..3 loop - perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans'->i->'Relation Name')::text, - format('"runtime_test_2_%s"', pathman.get_hash(hashint4(i + 1), 6)), - 'wrong partition'); - - num = plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans'->i->'Actual Loops'; - perform test.pathman_assert(num = 1, 'expected no more than 1 loops'); - end loop; - - return 'ok'; -end; -$$ language plpgsql; - - -create table test.run_values as select generate_series(1, 10000) val; -create table test.runtime_test_1(id serial primary key, val real); -insert into test.runtime_test_1 select generate_series(1, 10000), random(); -select pathman.create_hash_partitions('test.runtime_test_1', 'id', 6); - -create table test.category as (select id, 'cat' || id::text as name from generate_series(1, 4) id); -create table test.runtime_test_2 (id serial, category_id int not null, name text, rating real); -insert into test.runtime_test_2 (select id, (id % 6) + 1 as category_id, 'good' || id::text as name, random() as rating from generate_series(1, 100000) id); -create index on test.runtime_test_2 (category_id, rating); -select 
pathman.create_hash_partitions('test.runtime_test_2', 'category_id', 6); - -analyze test.run_values; -analyze test.runtime_test_1; - -set enable_mergejoin = off; -set enable_hashjoin = off; -set pg_pathman.enable_runtimeappend = on; -set pg_pathman.enable_runtimemergeappend = on; -select test.pathman_test_1(); /* RuntimeAppend (select ... where id = (subquery)) */ -select test.pathman_test_2(); /* RuntimeAppend (select ... where id = any(subquery)) */ -select test.pathman_test_3(); /* RuntimeAppend (a join b on a.id = b.val) */ -select test.pathman_test_4(); /* RuntimeMergeAppend (lateral) */ - -set pg_pathman.enable_runtimeappend = off; -set pg_pathman.enable_runtimemergeappend = off; -set enable_mergejoin = on; -set enable_hashjoin = on; - -drop table test.run_values, test.runtime_test_1, test.runtime_test_2 cascade; - /* * Test split and merge */ @@ -387,16 +202,32 @@ EXPLAIN (COSTS OFF) SELECT * FROM test.range_rel WHERE dt BETWEEN '2014-11-15' A SELECT pathman.detach_range_partition('test.range_rel_archive'); EXPLAIN (COSTS OFF) SELECT * FROM test.range_rel WHERE dt BETWEEN '2014-11-15' AND '2015-01-15'; CREATE TABLE test.range_rel_test1 ( - id SERIAL PRIMARY KEY, - dt TIMESTAMP, - txt TEXT, - abc INTEGER); + id SERIAL PRIMARY KEY, + dt TIMESTAMP, + txt TEXT, + abc INTEGER); SELECT pathman.attach_range_partition('test.range_rel', 'test.range_rel_test1', '2013-01-01'::DATE, '2014-01-01'::DATE); CREATE TABLE test.range_rel_test2 ( - id SERIAL PRIMARY KEY, - dt TIMESTAMP); + id SERIAL PRIMARY KEY, + dt TIMESTAMP); SELECT pathman.attach_range_partition('test.range_rel', 'test.range_rel_test2', '2013-01-01'::DATE, '2014-01-01'::DATE); +/* + * Zero partitions count and adding partitions with specified name + */ +CREATE TABLE test.zero( + id SERIAL PRIMARY KEY, + value INT NOT NULL); +INSERT INTO test.zero SELECT g, g FROM generate_series(1, 100) as g; +SELECT pathman.create_range_partitions('test.zero', 'value', 50, 10, 0); +SELECT pathman.append_range_partition('test.zero', 'test.zero_0'); +SELECT pathman.prepend_range_partition('test.zero', 'test.zero_1'); +SELECT pathman.add_range_partition('test.zero', 50, 70, 'test.zero_50'); +SELECT pathman.append_range_partition('test.zero', 'test.zero_appended'); +SELECT pathman.prepend_range_partition('test.zero', 'test.zero_prepended'); +SELECT pathman.split_range_partition('test.zero_50', 60, 'test.zero_60'); +DROP TABLE test.zero CASCADE; + /* * Check that altering table columns doesn't break trigger */ @@ -407,14 +238,14 @@ SELECT * FROM test.hash_rel WHERE id = 123; /* * Clean up */ -SELECT pathman.drop_hash_partitions('test.hash_rel'); +SELECT pathman.drop_partitions('test.hash_rel'); SELECT COUNT(*) FROM ONLY test.hash_rel; SELECT pathman.create_hash_partitions('test.hash_rel', 'value', 3); -SELECT pathman.drop_hash_partitions('test.hash_rel', TRUE); +SELECT pathman.drop_partitions('test.hash_rel', TRUE); SELECT COUNT(*) FROM ONLY test.hash_rel; DROP TABLE test.hash_rel CASCADE; -SELECT pathman.drop_range_partitions('test.num_range_rel'); +SELECT pathman.drop_partitions('test.num_range_rel'); DROP TABLE test.num_range_rel CASCADE; DROP TABLE test.range_rel CASCADE; @@ -435,6 +266,11 @@ SELECT * FROM test.range_rel WHERE dt = '2014-12-15'; EXPLAIN (COSTS OFF) SELECT * FROM test.range_rel WHERE dt = '2015-03-15'; SELECT * FROM test.range_rel WHERE dt = '2015-03-15'; +SELECT pathman.set_auto('test.range_rel', false); +INSERT INTO test.range_rel (dt) VALUES ('2015-06-01'); +SELECT pathman.set_auto('test.range_rel', true); +INSERT INTO 
test.range_rel (dt) VALUES ('2015-06-01'); + DROP TABLE test.range_rel CASCADE; SELECT * FROM pathman.pathman_config; @@ -464,7 +300,7 @@ UPDATE test."TeSt" SET a = 1; SELECT * FROM test."TeSt"; SELECT * FROM test."TeSt" WHERE a = 1; EXPLAIN (COSTS OFF) SELECT * FROM test."TeSt" WHERE a = 1; -SELECT pathman.drop_hash_partitions('test."TeSt"'); +SELECT pathman.drop_partitions('test."TeSt"'); SELECT * FROM test."TeSt"; CREATE TABLE test."RangeRel" ( @@ -478,7 +314,7 @@ SELECT pathman.append_range_partition('test."RangeRel"'); SELECT pathman.prepend_range_partition('test."RangeRel"'); SELECT pathman.merge_range_partitions('test."RangeRel_1"', 'test."RangeRel_' || currval('test."RangeRel_seq"') || '"'); SELECT pathman.split_range_partition('test."RangeRel_1"', '2015-01-01'::DATE); -SELECT pathman.drop_range_partitions('test."RangeRel"'); +SELECT pathman.drop_partitions('test."RangeRel"'); SELECT pathman.create_partitions_from_range('test."RangeRel"', 'dt', '2015-01-01'::DATE, '2015-01-05'::DATE, '1 day'::INTERVAL); DROP TABLE test."RangeRel" CASCADE; SELECT * FROM pathman.pathman_config; @@ -487,7 +323,7 @@ CREATE TABLE test."RangeRel" ( dt TIMESTAMP NOT NULL, txt TEXT); SELECT pathman.create_range_partitions('test."RangeRel"', 'id', 1, 100, 3); -SELECT pathman.drop_range_partitions('test."RangeRel"'); +SELECT pathman.drop_partitions('test."RangeRel"'); SELECT pathman.create_partitions_from_range('test."RangeRel"', 'id', 1, 300, 100); DROP TABLE test."RangeRel" CASCADE; @@ -535,9 +371,9 @@ EXPLAIN (COSTS OFF) DELETE FROM range_rel r USING tmp t WHERE r.dt = '2010-01-02 DELETE FROM range_rel r USING tmp t WHERE r.dt = '2010-01-02' AND r.id = t.id; /* Create range partitions from whole range */ -SELECT drop_range_partitions('range_rel'); +SELECT drop_partitions('range_rel'); SELECT create_partitions_from_range('range_rel', 'id', 1, 1000, 100); -SELECT drop_range_partitions('range_rel', TRUE); +SELECT drop_partitions('range_rel', TRUE); SELECT create_partitions_from_range('range_rel', 'dt', '2015-01-01'::date, '2015-12-01'::date, '1 month'::interval); EXPLAIN (COSTS OFF) SELECT * FROM range_rel WHERE dt = '2015-12-15'; @@ -549,3 +385,8 @@ SELECT create_range_partitions('messages', 'id', 1, 100, 2); ALTER TABLE replies DROP CONSTRAINT replies_message_id_fkey; SELECT create_range_partitions('messages', 'id', 1, 100, 2); EXPLAIN (COSTS OFF) SELECT * FROM messages; + + +DROP SCHEMA test CASCADE; +DROP EXTENSION pg_pathman CASCADE; +DROP SCHEMA pathman CASCADE; diff --git a/contrib/pg_pathman/sql/pathman_callbacks.sql b/contrib/pg_pathman/sql/pathman_callbacks.sql new file mode 100644 index 0000000000..3aa174cd23 --- /dev/null +++ b/contrib/pg_pathman/sql/pathman_callbacks.sql @@ -0,0 +1,41 @@ +\set VERBOSITY terse + +CREATE EXTENSION pg_pathman; +CREATE SCHEMA callbacks; + +/* Check callbacks */ + +CREATE OR REPLACE FUNCTION callbacks.abc_on_part_created_callback( + args JSONB) +RETURNS VOID AS $$ +BEGIN + RAISE WARNING 'callback arg: %', args::TEXT; +END +$$ language plpgsql; + + +/* set callback to be called on RANGE partitions */ +CREATE TABLE callbacks.abc(a serial, b int); +SELECT create_range_partitions('callbacks.abc', 'a', 1, 100, 2); + +SELECT set_init_callback('callbacks.abc', + 'callbacks.abc_on_part_created_callback'); + +INSERT INTO callbacks.abc VALUES (123, 1); +INSERT INTO callbacks.abc VALUES (223, 1); + +SELECT append_range_partition('callbacks.abc'); +SELECT prepend_range_partition('callbacks.abc'); +SELECT add_range_partition('callbacks.abc', 401, 502); + +SELECT 
drop_partitions('callbacks.abc'); + + +/* set callback to be called on HASH partitions */ +SELECT set_init_callback('callbacks.abc', + 'callbacks.abc_on_part_created_callback'); +SELECT create_hash_partitions('callbacks.abc', 'a', 5); + + +DROP SCHEMA callbacks CASCADE; +DROP EXTENSION pg_pathman CASCADE; diff --git a/contrib/pg_pathman/sql/pathman_domains.sql b/contrib/pg_pathman/sql/pathman_domains.sql new file mode 100644 index 0000000000..bc5d227e4e --- /dev/null +++ b/contrib/pg_pathman/sql/pathman_domains.sql @@ -0,0 +1,37 @@ +\set VERBOSITY terse + +CREATE EXTENSION pg_pathman; +CREATE SCHEMA domains; + +CREATE DOMAIN domains.dom_test AS numeric CHECK (value < 1200); + +CREATE TABLE domains.dom_table(val domains.dom_test NOT NULL); +INSERT INTO domains.dom_table SELECT generate_series(1, 999); + +SELECT create_range_partitions('domains.dom_table', 'val', 1, 100); + +EXPLAIN (COSTS OFF) +SELECT * FROM domains.dom_table +WHERE val < 250; + +INSERT INTO domains.dom_table VALUES(1500); +INSERT INTO domains.dom_table VALUES(-10); + +SELECT append_range_partition('domains.dom_table'); +SELECT prepend_range_partition('domains.dom_table'); +SELECT merge_range_partitions('domains.dom_table_1', 'domains.dom_table_2'); +SELECT split_range_partition('domains.dom_table_1', 50); + +INSERT INTO domains.dom_table VALUES(1101); + +EXPLAIN (COSTS OFF) +SELECT * FROM domains.dom_table +WHERE val < 450; + + +SELECT * FROM pathman_partition_list +ORDER BY range_min::INT, range_max::INT; + + +DROP SCHEMA domains CASCADE; +DROP EXTENSION pg_pathman CASCADE; diff --git a/contrib/pg_pathman/sql/pathman_foreign_keys.sql b/contrib/pg_pathman/sql/pathman_foreign_keys.sql new file mode 100644 index 0000000000..a2032815de --- /dev/null +++ b/contrib/pg_pathman/sql/pathman_foreign_keys.sql @@ -0,0 +1,29 @@ +\set VERBOSITY terse + +CREATE EXTENSION pg_pathman; +CREATE SCHEMA fkeys; + +/* Check primary keys generation */ +CREATE TABLE fkeys.test_ref(comment TEXT UNIQUE); +INSERT INTO fkeys.test_ref VALUES('test'); + +CREATE TABLE fkeys.test_fkey( + id INT NOT NULL, + comment TEXT, + FOREIGN KEY (comment) REFERENCES fkeys.test_ref(comment)); + +INSERT INTO fkeys.test_fkey SELECT generate_series(1, 1000), 'test'; + +SELECT create_range_partitions('fkeys.test_fkey', 'id', 1, 100); +INSERT INTO fkeys.test_fkey VALUES(1, 'wrong'); +INSERT INTO fkeys.test_fkey VALUES(1, 'test'); +SELECT drop_partitions('fkeys.test_fkey'); + +SELECT create_hash_partitions('fkeys.test_fkey', 'id', 10); +INSERT INTO fkeys.test_fkey VALUES(1, 'wrong'); +INSERT INTO fkeys.test_fkey VALUES(1, 'test'); +SELECT drop_partitions('fkeys.test_fkey'); + + +DROP SCHEMA fkeys CASCADE; +DROP EXTENSION pg_pathman CASCADE; diff --git a/contrib/pg_pathman/sql/pathman_rowmarks.sql b/contrib/pg_pathman/sql/pathman_rowmarks.sql new file mode 100644 index 0000000000..8397b7fc01 --- /dev/null +++ b/contrib/pg_pathman/sql/pathman_rowmarks.sql @@ -0,0 +1,62 @@ +CREATE EXTENSION pg_pathman; +CREATE SCHEMA rowmarks; + + +CREATE TABLE rowmarks.first(id int NOT NULL); +CREATE TABLE rowmarks.second(id int NOT NULL); + +INSERT INTO rowmarks.first SELECT generate_series(1, 10); +INSERT INTO rowmarks.second SELECT generate_series(1, 10); + + +SELECT create_hash_partitions('rowmarks.first', 'id', 5); + +/* Not partitioned */ +SELECT * FROM rowmarks.second ORDER BY id FOR UPDATE; + +/* Simple case (plan) */ +EXPLAIN (COSTS OFF) +SELECT * FROM rowmarks.first ORDER BY id FOR UPDATE; + +/* Simple case (execution) */ +SELECT * FROM rowmarks.first ORDER BY id FOR UPDATE; 
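The callback machinery exercised in pathman_callbacks.sql above (`set_init_callback()` plus a function taking a single JSONB argument) can do more than emit warnings. Below is a minimal, hypothetical sketch of a callback that records every partition created for `callbacks.abc`; the log table, function name and the `parent`/`partition` JSONB keys are illustrative assumptions, not part of the test suite.

```plpgsql
/* Hypothetical example only: log every partition created for callbacks.abc.
 * The 'parent' and 'partition' JSONB keys are assumptions; check the payload
 * printed by the RAISE WARNING in the callback test above to confirm them. */
CREATE TABLE callbacks.created_parts(
    parent      TEXT,
    part        TEXT,
    created_at  TIMESTAMP DEFAULT now());

CREATE OR REPLACE FUNCTION callbacks.log_new_partition(args JSONB)
RETURNS VOID AS $$
BEGIN
    INSERT INTO callbacks.created_parts(parent, part)
    VALUES (args->>'parent', args->>'partition');
END
$$ LANGUAGE plpgsql;

SELECT set_init_callback('callbacks.abc', 'callbacks.log_new_partition');
SELECT append_range_partition('callbacks.abc');  /* callback fires for the new partition */
```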
+SELECT FROM rowmarks.first ORDER BY id FOR UPDATE; +SELECT tableoid > 0 FROM rowmarks.first ORDER BY id FOR UPDATE; + +/* A little harder (plan) */ +EXPLAIN (COSTS OFF) +SELECT * FROM rowmarks.first +WHERE id = (SELECT id FROM rowmarks.first + ORDER BY id + OFFSET 10 LIMIT 1 + FOR UPDATE) +FOR SHARE; + +/* A little harder (execution) */ +SELECT * FROM rowmarks.first +WHERE id = (SELECT id FROM rowmarks.first + ORDER BY id + OFFSET 5 LIMIT 1 + FOR UPDATE) +FOR SHARE; + +/* Two tables (plan) */ +EXPLAIN (COSTS OFF) +SELECT * FROM rowmarks.first +WHERE id = (SELECT id FROM rowmarks.second + ORDER BY id + OFFSET 5 LIMIT 1 + FOR UPDATE) +FOR SHARE; + +/* Two tables (execution) */ +SELECT * FROM rowmarks.first +WHERE id = (SELECT id FROM rowmarks.second + ORDER BY id + OFFSET 5 LIMIT 1 + FOR UPDATE) +FOR SHARE; + + +DROP SCHEMA rowmarks CASCADE; +DROP EXTENSION pg_pathman; diff --git a/contrib/pg_pathman/sql/pathman_runtime_nodes.sql b/contrib/pg_pathman/sql/pathman_runtime_nodes.sql new file mode 100644 index 0000000000..517995b9be --- /dev/null +++ b/contrib/pg_pathman/sql/pathman_runtime_nodes.sql @@ -0,0 +1,272 @@ +\set VERBOSITY terse + +CREATE SCHEMA pathman; +CREATE EXTENSION pg_pathman SCHEMA pathman; +CREATE SCHEMA test; + +/* + * Test RuntimeAppend + */ + +create or replace function test.pathman_assert(smt bool, error_msg text) returns text as $$ +begin + if not smt then + raise exception '%', error_msg; + end if; + + return 'ok'; +end; +$$ language plpgsql; + +create or replace function test.pathman_equal(a text, b text, error_msg text) returns text as $$ +begin + if a != b then + raise exception '''%'' is not equal to ''%'', %', a, b, error_msg; + end if; + + return 'equal'; +end; +$$ language plpgsql; + +create or replace function test.pathman_test(query text) returns jsonb as $$ +declare + plan jsonb; +begin + execute 'explain (analyze, format json)' || query into plan; + + return plan; +end; +$$ language plpgsql; + +create or replace function test.pathman_test_1() returns text as $$ +declare + plan jsonb; + num int; +begin + plan = test.pathman_test('select * from test.runtime_test_1 where id = (select * from test.run_values limit 1)'); + + perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, + '"Custom Scan"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Custom Plan Provider')::text, + '"RuntimeAppend"', + 'wrong plan provider'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Relation Name')::text, + format('"runtime_test_1_%s"', pathman.get_hash_part_idx(hashint4(1), 6)), + 'wrong partition'); + + select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans') into num; + perform test.pathman_equal(num::text, '2', 'expected 2 child plans for custom scan'); + + return 'ok'; +end; +$$ language plpgsql +set pg_pathman.enable = true +set enable_mergejoin = off +set enable_hashjoin = off; + +create or replace function test.pathman_test_2() returns text as $$ +declare + plan jsonb; + num int; +begin + plan = test.pathman_test('select * from test.runtime_test_1 where id = any (select * from test.run_values limit 4)'); + + perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, + '"Nested Loop"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Node Type')::text, + '"Custom Scan"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Custom Plan Provider')::text, + '"RuntimeAppend"', + 'wrong plan provider'); + + select count(*) from 
jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans') into num; + perform test.pathman_equal(num::text, '4', 'expected 4 child plans for custom scan'); + + for i in 0..3 loop + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Plans'->i->'Relation Name')::text, + format('"runtime_test_1_%s"', pathman.get_hash_part_idx(hashint4(i + 1), 6)), + 'wrong partition'); + + num = plan->0->'Plan'->'Plans'->1->'Plans'->i->'Actual Loops'; + perform test.pathman_equal(num::text, '1', 'expected 1 loop'); + end loop; + + return 'ok'; +end; +$$ language plpgsql +set pg_pathman.enable = true +set enable_mergejoin = off +set enable_hashjoin = off; + +create or replace function test.pathman_test_3() returns text as $$ +declare + plan jsonb; + num int; +begin + plan = test.pathman_test('select * from test.runtime_test_1 a join test.run_values b on a.id = b.val'); + + perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, + '"Nested Loop"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Node Type')::text, + '"Custom Scan"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Custom Plan Provider')::text, + '"RuntimeAppend"', + 'wrong plan provider'); + + select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans') into num; + perform test.pathman_equal(num::text, '6', 'expected 6 child plans for custom scan'); + + for i in 0..5 loop + num = plan->0->'Plan'->'Plans'->1->'Plans'->i->'Actual Loops'; + perform test.pathman_assert(num > 0 and num <= 1718, 'expected no more than 1718 loops'); + end loop; + + return 'ok'; +end; +$$ language plpgsql +set pg_pathman.enable = true +set enable_mergejoin = off +set enable_hashjoin = off; + +create or replace function test.pathman_test_4() returns text as $$ +declare + plan jsonb; + num int; +begin + plan = test.pathman_test('select * from test.category c, lateral' || + '(select * from test.runtime_test_2 g where g.category_id = c.id order by rating limit 4) as tg'); + + perform test.pathman_equal((plan->0->'Plan'->'Node Type')::text, + '"Nested Loop"', + 'wrong plan type'); + + /* Limit -> Custom Scan */ + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->0->'Node Type')::text, + '"Custom Scan"', + 'wrong plan type'); + + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->0->'Custom Plan Provider')::text, + '"RuntimeMergeAppend"', + 'wrong plan provider'); + + select count(*) from jsonb_array_elements_text(plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans') into num; + perform test.pathman_equal(num::text, '4', 'expected 4 child plans for custom scan'); + + for i in 0..3 loop + perform test.pathman_equal((plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans'->i->'Relation Name')::text, + format('"runtime_test_2_%s"', pathman.get_hash_part_idx(hashint4(i + 1), 6)), + 'wrong partition'); + + num = plan->0->'Plan'->'Plans'->1->'Plans'->0->'Plans'->i->'Actual Loops'; + perform test.pathman_assert(num = 1, 'expected no more than 1 loops'); + end loop; + + return 'ok'; +end; +$$ language plpgsql +set pg_pathman.enable = true +set enable_mergejoin = off +set enable_hashjoin = off; + +create or replace function test.pathman_test_5() returns text as $$ +declare + res record; +begin + select + from test.runtime_test_3 + where id = (select * from test.vals order by val limit 1) + limit 1 + into res; /* test empty tlist */ + + + select id, generate_series(1, 2) gen, val + from test.runtime_test_3 + where id = any (select * from test.vals order by val limit 5) 
+ order by id, gen, val + offset 1 limit 1 + into res; /* without IndexOnlyScan */ + + perform test.pathman_equal(res.id::text, '1', 'id is incorrect (t2)'); + perform test.pathman_equal(res.gen::text, '2', 'gen is incorrect (t2)'); + perform test.pathman_equal(res.val::text, 'k = 1', 'val is incorrect (t2)'); + + + select id + from test.runtime_test_3 + where id = any (select * from test.vals order by val limit 5) + order by id + offset 3 limit 1 + into res; /* with IndexOnlyScan */ + + perform test.pathman_equal(res.id::text, '4', 'id is incorrect (t3)'); + + + select v.val v1, generate_series(2, 2) gen, t.val v2 + from test.runtime_test_3 t join test.vals v on id = v.val + order by v1, gen, v2 + limit 1 + into res; + + perform test.pathman_equal(res.v1::text, '1', 'v1 is incorrect (t4)'); + perform test.pathman_equal(res.gen::text, '2', 'gen is incorrect (t4)'); + perform test.pathman_equal(res.v2::text, 'k = 1', 'v2 is incorrect (t4)'); + + return 'ok'; +end; +$$ language plpgsql +set pg_pathman.enable = true +set enable_hashjoin = off +set enable_mergejoin = off; + + + +create table test.run_values as select generate_series(1, 10000) val; +create table test.runtime_test_1(id serial primary key, val real); +insert into test.runtime_test_1 select generate_series(1, 10000), random(); +select pathman.create_hash_partitions('test.runtime_test_1', 'id', 6); + +create table test.category as (select id, 'cat' || id::text as name from generate_series(1, 4) id); +create table test.runtime_test_2 (id serial, category_id int not null, name text, rating real); +insert into test.runtime_test_2 (select id, (id % 6) + 1 as category_id, 'good' || id::text as name, random() as rating from generate_series(1, 100000) id); +create index on test.runtime_test_2 (category_id, rating); +select pathman.create_hash_partitions('test.runtime_test_2', 'category_id', 6); + +create table test.vals as (select generate_series(1, 10000) as val); +create table test.runtime_test_3(val text, id serial not null); +insert into test.runtime_test_3(id, val) select * from generate_series(1, 10000) k, format('k = %s', k); +select pathman.create_hash_partitions('test.runtime_test_3', 'id', 4); +create index on test.runtime_test_3 (id); +create index on test.runtime_test_3_0 (id); + + +analyze test.run_values; +analyze test.runtime_test_1; +analyze test.runtime_test_2; +analyze test.runtime_test_3; +analyze test.runtime_test_3_0; + +set pg_pathman.enable_runtimeappend = on; +set pg_pathman.enable_runtimemergeappend = on; + +select test.pathman_test_1(); /* RuntimeAppend (select ... where id = (subquery)) */ +select test.pathman_test_2(); /* RuntimeAppend (select ... 
where id = any(subquery)) */ +select test.pathman_test_3(); /* RuntimeAppend (a join b on a.id = b.val) */ +select test.pathman_test_4(); /* RuntimeMergeAppend (lateral) */ +select test.pathman_test_5(); /* projection tests for RuntimeXXX nodes */ + + +DROP SCHEMA test CASCADE; +DROP EXTENSION pg_pathman CASCADE; +DROP SCHEMA pathman CASCADE; + diff --git a/contrib/pg_pathman/src/copy_stmt_hooking.c b/contrib/pg_pathman/src/copy_stmt_hooking.c new file mode 100644 index 0000000000..7b06a4b436 --- /dev/null +++ b/contrib/pg_pathman/src/copy_stmt_hooking.c @@ -0,0 +1,560 @@ +/* ------------------------------------------------------------------------ + * + * copy_stmt_hooking.c + * Override COPY TO/FROM statement for partitioned tables + * + * Copyright (c) 2016, Postgres Professional + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * ------------------------------------------------------------------------ + */ + +#include "copy_stmt_hooking.h" +#include "init.h" +#include "partition_filter.h" +#include "relation_info.h" + +#include "access/htup_details.h" +#include "access/sysattr.h" +#include "access/xact.h" +#include "catalog/namespace.h" +#include "catalog/pg_attribute.h" +#include "commands/copy.h" +#include "commands/trigger.h" +#include "executor/executor.h" +#include "foreign/fdwapi.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/rls.h" + +#include "libpq/libpq.h" + + +static uint64 PathmanCopyFrom(CopyState cstate, + Relation parent_rel, + List *range_table, + bool old_protocol); + +static void prepare_rri_fdw_for_copy(EState *estate, + ResultRelInfoHolder *rri_holder, + void *arg); + + +/* + * Is pg_pathman supposed to handle this COPY stmt? + */ +bool +is_pathman_related_copy(Node *parsetree) +{ + CopyStmt *copy_stmt = (CopyStmt *) parsetree; + Oid partitioned_table; + + Assert(IsPathmanReady()); + + if (!IsOverrideCopyEnabled()) + { + elog(DEBUG1, "COPY statement hooking is disabled"); + return false; + } + + /* Check that it's a CopyStmt */ + if (!IsA(parsetree, CopyStmt)) + return false; + + /* Also check that stmt->relation exists */ + if (!copy_stmt->relation) + return false; + + /* Get partition's Oid while locking it */ + partitioned_table = RangeVarGetRelid(copy_stmt->relation, + (copy_stmt->is_from ? + RowExclusiveLock : + AccessShareLock), + false); + + /* Check that relation is partitioned */ + if (get_pathman_relation_info(partitioned_table)) + { + ListCell *lc; + + /* Analyze options list */ + foreach (lc, copy_stmt->options) + { + DefElem *defel = (DefElem *) lfirst(lc); + + Assert(IsA(defel, DefElem)); + + /* We do not support freeze */ + if (strcmp(defel->defname, "freeze") == 0) + elog(ERROR, "freeze is not supported for partitioned tables"); + } + + elog(DEBUG1, "Overriding default behavior for COPY [%u]", partitioned_table); + return true; + } + + return false; +} + +/* + * CopyGetAttnums - build an integer list of attnums to be copied + * + * The input attnamelist is either the user-specified column list, + * or NIL if there was none (in which case we want all the non-dropped + * columns). + * + * rel can be NULL ... it's only used for error reports. 
+ */ +static List * +CopyGetAttnums(TupleDesc tupDesc, Relation rel, List *attnamelist) +{ + List *attnums = NIL; + + if (attnamelist == NIL) + { + /* Generate default column list */ + Form_pg_attribute *attr = tupDesc->attrs; + int attr_count = tupDesc->natts; + int i; + + for (i = 0; i < attr_count; i++) + { + if (attr[i]->attisdropped) + continue; + attnums = lappend_int(attnums, i + 1); + } + } + else + { + /* Validate the user-supplied list and extract attnums */ + ListCell *l; + + foreach(l, attnamelist) + { + char *name = strVal(lfirst(l)); + int attnum; + int i; + + /* Lookup column name */ + attnum = InvalidAttrNumber; + for (i = 0; i < tupDesc->natts; i++) + { + if (tupDesc->attrs[i]->attisdropped) + continue; + if (namestrcmp(&(tupDesc->attrs[i]->attname), name) == 0) + { + attnum = tupDesc->attrs[i]->attnum; + break; + } + } + if (attnum == InvalidAttrNumber) + { + if (rel != NULL) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" of relation \"%s\" does not exist", + name, RelationGetRelationName(rel)))); + else + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" does not exist", + name))); + } + /* Check for duplicates */ + if (list_member_int(attnums, attnum)) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_COLUMN), + errmsg("column \"%s\" specified more than once", + name))); + attnums = lappend_int(attnums, attnum); + } + } + + return attnums; +} + +/* + * Execute COPY TO/FROM statement for a partitioned table. + * NOTE: based on DoCopy() (see copy.c). + */ +void +PathmanDoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed) +{ + CopyState cstate; + bool is_from = stmt->is_from; + bool pipe = (stmt->filename == NULL); + Relation rel; + Node *query = NULL; + List *range_table = NIL; + + /* Disallow COPY TO/FROM file or program except to superusers. */ + if (!pipe && !superuser()) + { + if (stmt->is_program) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to COPY to or from an external program"), + errhint("Anyone can COPY to stdout or from stdin. " + "psql's \\copy command also works for anyone."))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to COPY to or from a file"), + errhint("Anyone can COPY to stdout or from stdin. " + "psql's \\copy command also works for anyone."))); + } + + if (stmt->relation) + { + TupleDesc tupDesc; + AclMode required_access = (is_from ? ACL_INSERT : ACL_SELECT); + List *attnums; + ListCell *cur; + RangeTblEntry *rte; + + Assert(!stmt->query); + + /* Open the relation (we've locked it in is_pathman_related_copy()) */ + rel = heap_openrv(stmt->relation, NoLock); + + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_RELATION; + rte->relid = RelationGetRelid(rel); + rte->relkind = rel->rd_rel->relkind; + rte->requiredPerms = required_access; + range_table = list_make1(rte); + + tupDesc = RelationGetDescr(rel); + attnums = CopyGetAttnums(tupDesc, rel, stmt->attlist); + foreach(cur, attnums) + { + int attno = lfirst_int(cur) - FirstLowInvalidHeapAttributeNumber; + + if (is_from) + rte->insertedCols = bms_add_member(rte->insertedCols, attno); + else + rte->selectedCols = bms_add_member(rte->selectedCols, attno); + } + ExecCheckRTPerms(range_table, true); + + /* + * We should perform a query instead of low-level heap scan whenever: + * a) table has a RLS policy; + * b) table is partitioned & it's COPY FROM. 
+ */ + if (check_enable_rls(rte->relid, InvalidOid, false) == RLS_ENABLED || + is_from == false) /* rewrite COPY table TO statements */ + { + SelectStmt *select; + ColumnRef *cr; + ResTarget *target; + RangeVar *from; + + if (is_from) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("COPY FROM not supported with row-level security"), + errhint("Use INSERT statements instead."))); + + /* Build target list */ + cr = makeNode(ColumnRef); + + if (!stmt->attlist) + cr->fields = list_make1(makeNode(A_Star)); + else + cr->fields = stmt->attlist; + + cr->location = 1; + + target = makeNode(ResTarget); + target->name = NULL; + target->indirection = NIL; + target->val = (Node *) cr; + target->location = 1; + + /* + * Build RangeVar for from clause, fully qualified based on the + * relation which we have opened and locked. + */ + from = makeRangeVar(get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel), -1); + + /* Build query */ + select = makeNode(SelectStmt); + select->targetList = list_make1(target); + select->fromClause = list_make1(from); + + query = (Node *) select; + + /* + * Close the relation for now, but keep the lock on it to prevent + * changes between now and when we start the query-based COPY. + * + * We'll reopen it later as part of the query-based COPY. + */ + heap_close(rel, NoLock); + rel = NULL; + } + } + else + { + Assert(stmt->query); + + query = stmt->query; + rel = NULL; + } + + /* COPY ... FROM ... */ + if (is_from) + { + bool is_old_protocol = PG_PROTOCOL_MAJOR(FrontendProtocol) < 3 && + stmt->filename == NULL; + + /* There should be relation */ + if (!rel) elog(FATAL, "No relation for PATHMAN COPY FROM"); + + /* check read-only transaction and parallel mode */ + if (XactReadOnly && !rel->rd_islocaltemp) + PreventCommandIfReadOnly("PATHMAN COPY FROM"); + PreventCommandIfParallelMode("PATHMAN COPY FROM"); + + cstate = BeginCopyFrom(rel, stmt->filename, stmt->is_program, + stmt->attlist, stmt->options); + *processed = PathmanCopyFrom(cstate, rel, range_table, is_old_protocol); + EndCopyFrom(cstate); + } + /* COPY ... TO ... */ + else + { + CopyStmt modified_copy_stmt; + + /* We should've created a query */ + Assert(query); + + /* Copy 'stmt' and override some of the fields */ + modified_copy_stmt = *stmt; + modified_copy_stmt.relation = NULL; + modified_copy_stmt.query = query; + + /* Call standard DoCopy using a new CopyStmt */ + DoCopy(&modified_copy_stmt, queryString, processed); + } + + /* + * Close the relation. If reading, we can release the AccessShareLock we + * got; if writing, we should hold the lock until end of transaction to + * ensure that updates will be committed before lock is released. + */ + if (rel != NULL) + heap_close(rel, (is_from ? NoLock : AccessShareLock)); +} + +/* + * Copy FROM file to relation. 
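 * Each input row is routed to a matching partition via select_partition_for_insert(),
 * so copying into the parent table transparently fills its partitions.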
+ */ +static uint64 +PathmanCopyFrom(CopyState cstate, Relation parent_rel, + List *range_table, bool old_protocol) +{ + HeapTuple tuple; + TupleDesc tupDesc; + Datum *values; + bool *nulls; + + ResultPartsStorage parts_storage; + ResultRelInfo *parent_result_rel; + + EState *estate = CreateExecutorState(); /* for ExecConstraints() */ + ExprContext *econtext; + TupleTableSlot *myslot; + MemoryContext oldcontext = CurrentMemoryContext; + + uint64 processed = 0; + + + tupDesc = RelationGetDescr(parent_rel); + + parent_result_rel = makeNode(ResultRelInfo); + InitResultRelInfo(parent_result_rel, + parent_rel, + 1, /* dummy rangetable index */ + 0); + ExecOpenIndices(parent_result_rel, false); + + estate->es_result_relations = parent_result_rel; + estate->es_num_result_relations = 1; + estate->es_result_relation_info = parent_result_rel; + estate->es_range_table = range_table; + + /* Initialize ResultPartsStorage */ + init_result_parts_storage(&parts_storage, estate, false, + ResultPartsStorageStandard, + prepare_rri_fdw_for_copy, NULL); + parts_storage.saved_rel_info = parent_result_rel; + + /* Set up a tuple slot too */ + myslot = ExecInitExtraTupleSlot(estate); + ExecSetSlotDescriptor(myslot, tupDesc); + /* Triggers might need a slot as well */ + estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate); + + /* Prepare to catch AFTER triggers. */ + AfterTriggerBeginQuery(); + + /* + * Check BEFORE STATEMENT insertion triggers. It's debatable whether we + * should do this for COPY, since it's not really an "INSERT" statement as + * such. However, executing these triggers maintains consistency with the + * EACH ROW triggers that we already fire on COPY. + */ + ExecBSInsertTriggers(estate, parent_result_rel); + + values = (Datum *) palloc(tupDesc->natts * sizeof(Datum)); + nulls = (bool *) palloc(tupDesc->natts * sizeof(bool)); + + econtext = GetPerTupleExprContext(estate); + + for (;;) + { + TupleTableSlot *slot; + bool skip_tuple; + Oid tuple_oid = InvalidOid; + + const PartRelationInfo *prel; + ResultRelInfoHolder *rri_holder_child; + ResultRelInfo *child_result_rel; + + CHECK_FOR_INTERRUPTS(); + + ResetPerTupleExprContext(estate); + + /* Fetch PartRelationInfo for parent relation */ + prel = get_pathman_relation_info(RelationGetRelid(parent_rel)); + + /* Switch into per tuple memory context */ + MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + + if (!NextCopyFrom(cstate, econtext, values, nulls, &tuple_oid)) + break; + + if (nulls[prel->attnum - 1]) + elog(ERROR, ERR_PART_ATTR_NULL); + + /* Search for a matching partition */ + rri_holder_child = select_partition_for_insert(prel, &parts_storage, + values[prel->attnum - 1], + estate, false); + child_result_rel = rri_holder_child->result_rel_info; + estate->es_result_relation_info = child_result_rel; + + /* And now we can form the input tuple. */ + tuple = heap_form_tuple(tupDesc, values, nulls); + if (tuple_oid != InvalidOid) + HeapTupleSetOid(tuple, tuple_oid); + + /* + * Constraints might reference the tableoid column, so initialize + * t_tableOid before evaluating them. + */ + tuple->t_tableOid = RelationGetRelid(child_result_rel->ri_RelationDesc); + + /* Triggers and stuff need to be invoked in query context. 
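 * (the per-tuple context used above is reset for every row, so anything the
 * triggers allocate must live in the caller's context instead)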
*/ + MemoryContextSwitchTo(oldcontext); + + /* Place tuple in tuple slot --- but slot shouldn't free it */ + slot = myslot; + ExecStoreTuple(tuple, slot, InvalidBuffer, false); + + skip_tuple = false; + + /* BEFORE ROW INSERT Triggers */ + if (child_result_rel->ri_TrigDesc && + child_result_rel->ri_TrigDesc->trig_insert_before_row) + { + slot = ExecBRInsertTriggers(estate, child_result_rel, slot); + + if (slot == NULL) /* "do nothing" */ + skip_tuple = true; + else /* trigger might have changed tuple */ + tuple = ExecMaterializeSlot(slot); + } + + /* Proceed if we still have a tuple */ + if (!skip_tuple) + { + List *recheckIndexes = NIL; + + /* Check the constraints of the tuple */ + if (child_result_rel->ri_RelationDesc->rd_att->constr) + ExecConstraints(child_result_rel, slot, estate); + + /* OK, store the tuple and create index entries for it */ + simple_heap_insert(child_result_rel->ri_RelationDesc, tuple); + + if (child_result_rel->ri_NumIndices > 0) + recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self), + estate, false, NULL, NIL); + + /* AFTER ROW INSERT Triggers */ + ExecARInsertTriggers(estate, child_result_rel, tuple, + recheckIndexes); + + list_free(recheckIndexes); + + /* + * We count only tuples not suppressed by a BEFORE INSERT trigger; + * this is the same definition used by execMain.c for counting + * tuples inserted by an INSERT command. + */ + processed++; + } + } + + MemoryContextSwitchTo(oldcontext); + + /* + * In the old protocol, tell pqcomm that we can process normal protocol + * messages again. + */ + if (old_protocol) + pq_endmsgread(); + + /* Execute AFTER STATEMENT insertion triggers */ + ExecASInsertTriggers(estate, parent_result_rel); + + /* Handle queued AFTER triggers */ + AfterTriggerEndQuery(estate); + + pfree(values); + pfree(nulls); + + ExecResetTupleTable(estate->es_tupleTable, false); + + /* Close partitions and destroy hash table */ + fini_result_parts_storage(&parts_storage, true); + + FreeExecutorState(estate); + + return processed; +} + +/* + * COPY FROM does not support FDWs, emit ERROR. 
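 * Registered with init_result_parts_storage() above, so it runs for each
 * partition when its ResultRelInfo is first created during PathmanCopyFrom().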
+ */ +static void +prepare_rri_fdw_for_copy(EState *estate, + ResultRelInfoHolder *rri_holder, + void *arg) +{ + ResultRelInfo *rri = rri_holder->result_rel_info; + FdwRoutine *fdw_routine = rri->ri_FdwRoutine; + + if (fdw_routine != NULL) + elog(ERROR, "cannot copy to foreign partition \"%s\"", + get_rel_name(RelationGetRelid(rri->ri_RelationDesc))); +} diff --git a/contrib/pg_pathman/src/copy_stmt_hooking.h b/contrib/pg_pathman/src/copy_stmt_hooking.h new file mode 100644 index 0000000000..389a411c43 --- /dev/null +++ b/contrib/pg_pathman/src/copy_stmt_hooking.h @@ -0,0 +1,23 @@ +/* ------------------------------------------------------------------------ + * + * copy_stmt_hooking.h + * Transaction-specific locks and other functions + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#ifndef COPY_STMT_HOOKING_H +#define COPY_STMT_HOOKING_H + + +#include "postgres.h" +#include "commands/copy.h" +#include "nodes/nodes.h" + + +bool is_pathman_related_copy(Node *parsetree); +void PathmanDoCopy(const CopyStmt *stmt, const char *queryString, uint64 *processed); + +#endif diff --git a/contrib/pg_pathman/src/dsm_array.c b/contrib/pg_pathman/src/dsm_array.c deleted file mode 100644 index 5667ba04fb..0000000000 --- a/contrib/pg_pathman/src/dsm_array.c +++ /dev/null @@ -1,302 +0,0 @@ -/* ------------------------------------------------------------------------ - * - * init.c - * This module allocates large DSM segment to store arrays, - * initializes it with block structure and provides functions to - * allocate and free arrays - * - * Copyright (c) 2015-2016, Postgres Professional - * - * ------------------------------------------------------------------------ - */ -#include "pathman.h" -#include "storage/shmem.h" -#include "storage/dsm.h" -#include "storage/lwlock.h" -#include - - -static dsm_segment *segment = NULL; - -typedef struct DsmConfig -{ - dsm_handle segment_handle; - size_t block_size; - size_t blocks_count; - size_t first_free; -} DsmConfig; - -static DsmConfig *dsm_cfg = NULL; - -/* - * Block header - * - * Its size must be 4 bytes for 32bit and 8 bytes for 64bit. Otherwise it could - * screw up an alignment (for example on Sparc9) - */ -typedef uintptr_t BlockHeader; -typedef BlockHeader* BlockHeaderPtr; - -#define FREE_BIT 0x80000000 -#define is_free(header) \ - ((*header) & FREE_BIT) -#define set_free(header) \ - ((*header) | FREE_BIT) -#define set_used(header) \ - ((*header) & ~FREE_BIT) -#define get_length(header) \ - ((*header) & ~FREE_BIT) -#define set_length(header, length) \ - ((length) | ((*header) & FREE_BIT)) - -/* - * Amount of memory that need to be requested in shared memory to store dsm - * config - */ -Size -get_dsm_shared_size() -{ - return (Size) MAXALIGN(sizeof(DsmConfig)); -} - -/* - * Initialize dsm config for arrays - */ -void -init_dsm_config() -{ - bool found; - dsm_cfg = ShmemInitStruct("pathman dsm_array config", sizeof(DsmConfig), &found); - if (!found) - { - dsm_cfg->segment_handle = 0; - dsm_cfg->block_size = 0; - dsm_cfg->blocks_count = INITIAL_BLOCKS_COUNT; - dsm_cfg->first_free = 0; - } -} - -/* - * Attach process to dsm_array segment. This function is used for - * background workers only. Use init_dsm_segment() in backend processes. - */ -void -attach_dsm_array_segment() -{ - segment = dsm_attach(dsm_cfg->segment_handle); -} - -/* - * Initialize dsm segment. 
Returns true if new segment was created and - * false if attached to existing segment - */ -bool -init_dsm_segment(size_t blocks_count, size_t block_size) -{ - bool ret; - - /* if there is already an existing segment then attach to it */ - if (dsm_cfg->segment_handle != 0) - { - ret = false; - segment = dsm_attach(dsm_cfg->segment_handle); - } - - /* - * If segment hasn't been created yet or has already been destroyed - * (it happens when last session detaches segment) then create new one - */ - if (dsm_cfg->segment_handle == 0 || segment == NULL) - { - /* create segment */ - segment = dsm_create(block_size * blocks_count, 0); - dsm_cfg->segment_handle = dsm_segment_handle(segment); - dsm_cfg->first_free = 0; - dsm_cfg->block_size = block_size; - dsm_cfg->blocks_count = blocks_count; - init_dsm_table(block_size, 0, dsm_cfg->blocks_count); - ret = true; - } - - /* - * Keep mapping till the end of the session. Otherwise it would be - * destroyed by the end of transaction - */ - dsm_pin_mapping(segment); - - return ret; -} - -/* - * Initialize allocated segment with block structure - */ -void -init_dsm_table(size_t block_size, size_t start, size_t end) -{ - int i; - BlockHeaderPtr header; - char *ptr = dsm_segment_address(segment); - - /* create blocks */ - for (i=start; ifirst_free; iblocks_count; ) - { - header = (BlockHeaderPtr) &ptr[i * dsm_cfg->block_size]; - if (is_free(header)) - { - if (!collecting_blocks) - { - offset = i * dsm_cfg->block_size; - total_length = dsm_cfg->block_size - sizeof(BlockHeader); - min_pos = i; - collecting_blocks = true; - } - else - { - total_length += dsm_cfg->block_size; - } - i++; - } - else - { - collecting_blocks = false; - offset = 0; - total_length = 0; - i += get_length(header); - } - - if (total_length >= size_requested) - { - max_pos = i-1; - found = true; - break; - } - } - - /* - * If dsm segment size is not enough then resize it (or allocate bigger - * for segment SysV and Windows, not implemented yet) - */ - if (!found) - { - size_t new_blocks_count = dsm_cfg->blocks_count * 2; - - dsm_resize(segment, new_blocks_count * dsm_cfg->block_size); - init_dsm_table(dsm_cfg->block_size, dsm_cfg->blocks_count, new_blocks_count); - dsm_cfg->blocks_count = new_blocks_count; - - /* try again */ - return alloc_dsm_array(arr, entry_size, length); - } - - /* look up for first free block */ - if (dsm_cfg->first_free == min_pos) - { - for (; iblocks_count; ) - { - header = (BlockHeaderPtr) &ptr[i * dsm_cfg->block_size]; - if (is_free(header)) - { - dsm_cfg->first_free = i; - break; - } - else - { - i += get_length(header); - } - } - } - - /* if we found enough of space */ - if (total_length >= size_requested) - { - header = (BlockHeaderPtr) &ptr[min_pos * dsm_cfg->block_size]; - *header = set_used(header); - *header = set_length(header, max_pos - min_pos + 1); - - arr->offset = offset; - arr->length = length; - } -} - -void -free_dsm_array(DsmArray *arr) -{ - int start = arr->offset / dsm_cfg->block_size; - int i = 0; - char *ptr = dsm_segment_address(segment); - BlockHeaderPtr header = (BlockHeaderPtr) &ptr[start * dsm_cfg->block_size]; - size_t blocks_count = get_length(header); - - /* set blocks free */ - for(; i < blocks_count; i++) - { - header = (BlockHeaderPtr) &ptr[(start + i) * dsm_cfg->block_size]; - *header = set_free(header); - *header = set_length(header, 1); - } - - if (start < dsm_cfg->first_free) - dsm_cfg->first_free = start; - - arr->offset = 0; - arr->length = 0; -} - -void -resize_dsm_array(DsmArray *arr, size_t entry_size, size_t length) -{ 
- void *array_data; - size_t array_data_size; - void *buffer; - - /* Copy data from array to temporary buffer */ - array_data = dsm_array_get_pointer(arr); - array_data_size = arr->length * entry_size; - buffer = palloc(array_data_size); - memcpy(buffer, array_data, array_data_size); - - /* Free array */ - free_dsm_array(arr); - - /* Allocate new array */ - alloc_dsm_array(arr, entry_size, length); - - /* Copy data to new array */ - array_data = dsm_array_get_pointer(arr); - memcpy(array_data, buffer, array_data_size); - - pfree(buffer); -} - -void * -dsm_array_get_pointer(const DsmArray *arr) -{ - return (char *) dsm_segment_address(segment) + arr->offset + sizeof(BlockHeader); -} diff --git a/contrib/pg_pathman/src/hooks.c b/contrib/pg_pathman/src/hooks.c index 240ed4ef09..e3a368b6ab 100644 --- a/contrib/pg_pathman/src/hooks.c +++ b/contrib/pg_pathman/src/hooks.c @@ -7,19 +7,31 @@ * * ------------------------------------------------------------------------ */ -#include "postgres.h" -#include "optimizer/cost.h" -#include "optimizer/restrictinfo.h" -#include "utils/guc.h" + +#include "copy_stmt_hooking.h" #include "hooks.h" -#include "pathman.h" +#include "init.h" +#include "partition_filter.h" +#include "pg_compat.h" #include "runtimeappend.h" #include "runtime_merge_append.h" #include "utils.h" +#include "xact_handling.h" + +#include "access/transam.h" +#include "miscadmin.h" +#include "optimizer/cost.h" +#include "optimizer/restrictinfo.h" +#include "utils/typcache.h" set_join_pathlist_hook_type set_join_pathlist_next = NULL; set_rel_pathlist_hook_type set_rel_pathlist_hook_next = NULL; +planner_hook_type planner_hook_next = NULL; +post_parse_analyze_hook_type post_parse_analyze_hook_next = NULL; +shmem_startup_hook_type shmem_startup_hook_next = NULL; +ProcessUtility_hook_type process_utility_hook_next = NULL; + /* Take care of joins */ void @@ -30,87 +42,94 @@ pathman_join_pathlist_hook(PlannerInfo *root, JoinType jointype, JoinPathExtraData *extra) { - JoinCostWorkspace workspace; - Path *outer, - *inner; - Relids inner_required; - RangeTblEntry *inner_entry = root->simple_rte_array[innerrel->relid]; - PartRelationInfo *inner_prel; - NestPath *nest_path; - List *pathkeys = NIL; - List *joinrestrictclauses = extra->restrictlist; - List *joinclauses, - *otherclauses; - ListCell *lc; - double paramsel; - WalkerContext context; - bool innerrel_rinfo_contains_part_attr; - + JoinCostWorkspace workspace; + RangeTblEntry *inner_rte = root->simple_rte_array[innerrel->relid]; + const PartRelationInfo *inner_prel; + List *pathkeys = NIL, + *joinclauses, + *otherclauses; + ListCell *lc; + WalkerContext context; + double paramsel; + bool innerrel_rinfo_contains_part_attr; + + /* Call hooks set by other extensions */ if (set_join_pathlist_next) set_join_pathlist_next(root, joinrel, outerrel, innerrel, jointype, extra); - if (jointype == JOIN_UNIQUE_OUTER || - jointype == JOIN_UNIQUE_INNER) - { - jointype = JOIN_INNER; - } - - if (jointype == JOIN_FULL || !pg_pathman_enable_runtimeappend) + /* Check that both pg_pathman & RuntimeAppend nodes are enabled */ + if (!IsPathmanReady() || !pg_pathman_enable_runtimeappend) return; - if (innerrel->reloptkind != RELOPT_BASEREL || - !inner_entry->inh || - !(inner_prel = get_pathman_relation_info(inner_entry->relid, NULL))) + if (jointype == JOIN_FULL) + return; /* handling full joins is meaningless */ + + /* Check that innerrel is a BASEREL with inheritors & PartRelationInfo */ + if (innerrel->reloptkind != RELOPT_BASEREL || !inner_rte->inh || + !(inner_prel 
= get_pathman_relation_info(inner_rte->relid))) { return; /* Obviously not our case */ } + /* + * These codes are used internally in the planner, but are not supported + * by the executor (nor, indeed, by most of the planner). + */ + if (jointype == JOIN_UNIQUE_OUTER || jointype == JOIN_UNIQUE_INNER) + jointype = JOIN_INNER; /* replace with a proper value */ + /* Extract join clauses which will separate partitions */ if (IS_OUTER_JOIN(extra->sjinfo->jointype)) { - extract_actual_join_clauses(joinrestrictclauses, + extract_actual_join_clauses(extra->restrictlist, &joinclauses, &otherclauses); } else { /* We can treat all clauses alike for an inner join */ - joinclauses = extract_actual_clauses(joinrestrictclauses, false); + joinclauses = extract_actual_clauses(extra->restrictlist, false); otherclauses = NIL; } paramsel = 1.0; foreach (lc, joinclauses) { - WrapperNode *wrap; + WrapperNode *wrap; - context.prel = inner_prel; - context.econtext = NULL; - context.hasLeast = false; - context.hasGreatest = false; + InitWalkerContext(&context, inner_prel, NULL, false); wrap = walk_expr_tree((Expr *) lfirst(lc), &context); paramsel *= wrap->paramsel; } + /* Check that innerrel's RestrictInfos contain partitioned column */ innerrel_rinfo_contains_part_attr = - check_rinfo_for_partitioned_attr(innerrel->baserestrictinfo, - innerrel->relid, - inner_prel->attnum); + get_partitioned_attr_clauses(innerrel->baserestrictinfo, + inner_prel, innerrel->relid) != NULL; foreach (lc, innerrel->pathlist) { AppendPath *cur_inner_path = (AppendPath *) lfirst(lc); - ParamPathInfo *ppi; + Path *outer, + *inner; + NestPath *nest_path; /* NestLoop we're creating */ + ParamPathInfo *ppi; /* parameterization info */ + Relids inner_required; /* required paremeterization relids */ + List *filtered_joinclauses = NIL; + ListCell *rinfo_lc; if (!IsA(cur_inner_path, AppendPath)) continue; + /* Select cheapest path for outerrel */ outer = outerrel->cheapest_total_path; + /* Make innerrel path depend on outerrel's column */ inner_required = bms_union(PATH_REQ_OUTER((Path *) cur_inner_path), bms_make_singleton(outerrel->relid)); + /* Get the ParamPathInfo for a parameterized path */ ppi = get_baserel_parampathinfo(root, innerrel, inner_required); /* @@ -118,17 +137,15 @@ pathman_join_pathlist_hook(PlannerInfo *root, * ppi->ppi_clauses reference partition attribute */ if (!(innerrel_rinfo_contains_part_attr || - (ppi && check_rinfo_for_partitioned_attr(ppi->ppi_clauses, - innerrel->relid, - inner_prel->attnum)))) + (ppi && get_partitioned_attr_clauses(ppi->ppi_clauses, + inner_prel, + innerrel->relid)))) continue; - inner = create_runtimeappend_path(root, cur_inner_path, - ppi, - paramsel); + inner = create_runtimeappend_path(root, cur_inner_path, ppi, paramsel); initial_cost_nestloop(root, &workspace, jointype, - outer, inner, + outer, inner, /* built paths */ extra->sjinfo, &extra->semifactors); pathkeys = build_join_pathkeys(root, joinrel, jointype, outer->pathkeys); @@ -139,39 +156,62 @@ pathman_join_pathlist_hook(PlannerInfo *root, pathkeys, calc_nestloop_required_outer(outer, inner)); + /* Discard all clauses that are to be evaluated by 'inner' */ + foreach (rinfo_lc, extra->restrictlist) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(rinfo_lc); + + Assert(IsA(rinfo, RestrictInfo)); + if (!join_clause_is_movable_to(rinfo, inner->parent)) + filtered_joinclauses = lappend(filtered_joinclauses, rinfo); + } + + /* + * Override 'rows' value produced by standard estimator. 
+ * Currently we use get_parameterized_joinrel_size() since + * it works just fine, but this might change some day. + */ + nest_path->path.rows = get_parameterized_joinrel_size_compat(root, + joinrel, + outer, + inner, + extra->sjinfo, + filtered_joinclauses); + + /* Finally we can add the new NestLoop path */ add_path(joinrel, (Path *) nest_path); } } /* Cope with simple relations */ void -pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte) +pathman_rel_pathlist_hook(PlannerInfo *root, + RelOptInfo *rel, + Index rti, + RangeTblEntry *rte) { - PartRelationInfo *prel = NULL; - RangeTblEntry **new_rte_array; - RelOptInfo **new_rel_array; - bool found; - int len; + const PartRelationInfo *prel; + RangeTblEntry **new_rte_array; + RelOptInfo **new_rel_array; + int len; /* Invoke original hook if needed */ if (set_rel_pathlist_hook_next != NULL) set_rel_pathlist_hook_next(root, rel, rti, rte); - if (!pg_pathman_enable) - return; + if (!IsPathmanReady()) + return; /* pg_pathman is not ready */ /* This works only for SELECT queries (at least for now) */ if (root->parse->commandType != CMD_SELECT || !list_member_oid(inheritance_enabled_relids, rte->relid)) return; - /* Lookup partitioning information for parent relation */ - prel = get_pathman_relation_info(rte->relid, &found); - - if (prel != NULL && found) + /* Proceed iff relation 'rel' is partitioned */ + if ((prel = get_pathman_relation_info(rte->relid)) != NULL) { ListCell *lc; - Oid *dsm_arr; + Oid *children; List *ranges, *wrappers; PathKey *pathkeyAsc = NULL, @@ -214,24 +254,20 @@ pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTb pathkeyDesc = (PathKey *) linitial(pathkeys); } - rte->inh = true; - dsm_arr = (Oid *) dsm_array_get_pointer(&prel->children); - ranges = list_make1_int(make_irange(0, prel->children_count - 1, false)); + rte->inh = true; /* we must restore 'inh' flag! 
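 * (it was cleared by disable_inheritance() in the planner hook so that the
 * standard inheritance expansion would skip tables partitioned by pg_pathman)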
*/ + + children = PrelGetChildrenArray(prel); + ranges = list_make1_irange(make_irange(0, PrelLastChild(prel), false)); /* Make wrappers over restrictions and collect final rangeset */ - context.prel = prel; - context.econtext = NULL; - context.hasLeast = false; - context.hasGreatest = false; + InitWalkerContext(&context, prel, NULL, false); wrappers = NIL; foreach(lc, rel->baserestrictinfo) { WrapperNode *wrap; - RestrictInfo *rinfo = (RestrictInfo*) lfirst(lc); + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); wrap = walk_expr_tree(rinfo->clause, &context); - if (!lc->next) - finish_least_greatest(wrap, &context); paramsel *= wrap->paramsel; wrappers = lappend(wrappers, wrap); @@ -241,11 +277,12 @@ pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTb /* * Expand simple_rte_array and simple_rel_array */ + len = irange_list_length(ranges); + if (prel->enable_parent) + len++; - if (ranges) + if (len > 0) { - len = irange_list_length(ranges); - /* Expand simple_rel_array and simple_rte_array */ new_rel_array = (RelOptInfo **) palloc0((root->simple_rel_array_size + len) * sizeof(RelOptInfo *)); @@ -255,11 +292,11 @@ pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTb palloc0((root->simple_rel_array_size + len) * sizeof(RangeTblEntry *)); /* Copy relations to the new arrays */ - for (i = 0; i < root->simple_rel_array_size; i++) - { - new_rel_array[i] = root->simple_rel_array[i]; - new_rte_array[i] = root->simple_rte_array[i]; - } + for (i = 0; i < root->simple_rel_array_size; i++) + { + new_rel_array[i] = root->simple_rel_array[i]; + new_rte_array[i] = root->simple_rte_array[i]; + } /* Free old arrays */ pfree(root->simple_rel_array); @@ -270,6 +307,10 @@ pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTb root->simple_rte_array = new_rte_array; } + /* Add parent if needed */ + if (prel->enable_parent) + append_child_relation(root, rel, rti, rte, 0, rte->relid, NULL); + /* * Iterate all indexes in rangeset and append corresponding child * relations. 
@@ -278,8 +319,9 @@ pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTb { IndexRange irange = lfirst_irange(lc); - for (i = irange_lower(irange); i <= irange_upper(irange); i++) - append_child_relation(root, rel, rti, rte, i, dsm_arr[i], wrappers); + for (i = irange.ir_lower; i <= irange.ir_upper; i++) + append_child_relation(root, rel, rti, rte, i, children[i], + wrappers); } /* Clear old path list */ @@ -287,21 +329,21 @@ pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTb rel->pathlist = NIL; set_append_rel_pathlist(root, rel, rti, rte, pathkeyAsc, pathkeyDesc); - set_append_rel_size(root, rel, rti, rte); + set_append_rel_size_compat(root, rel, rti, rte); - /* No need to go further, return */ + /* No need to go further (both nodes are disabled), return */ if (!(pg_pathman_enable_runtimeappend || pg_pathman_enable_runtime_merge_append)) return; - /* RuntimeAppend is pointless if there are no params in clauses */ + /* Runtime[Merge]Append is pointless if there are no params in clauses */ if (!clause_contains_params((Node *) get_actual_clauses(rel->baserestrictinfo))) return; + /* Check that rel's RestrictInfo contains partitioned column */ rel_rinfo_contains_part_attr = - check_rinfo_for_partitioned_attr(rel->baserestrictinfo, - rel->relid, - prel->attnum); + get_partitioned_attr_clauses(rel->baserestrictinfo, + prel, rel->relid) != NULL; foreach (lc, rel->pathlist) { @@ -312,8 +354,7 @@ pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTb /* Skip if rel contains some join-related stuff or path type mismatched */ if (!(IsA(cur_path, AppendPath) || IsA(cur_path, MergeAppendPath)) || - rel->has_eclass_joins || - rel->joininfo) + rel->has_eclass_joins || rel->joininfo) { continue; } @@ -323,9 +364,8 @@ pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTb * ppi->ppi_clauses reference partition attribute */ if (!(rel_rinfo_contains_part_attr || - (ppi && check_rinfo_for_partitioned_attr(ppi->ppi_clauses, - rel->relid, - prel->attnum)))) + (ppi && get_partitioned_attr_clauses(ppi->ppi_clauses, + prel, rel->relid)))) continue; if (IsA(cur_path, AppendPath) && pg_pathman_enable_runtimeappend) @@ -333,8 +373,16 @@ pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTb ppi, paramsel); else if (IsA(cur_path, MergeAppendPath) && pg_pathman_enable_runtime_merge_append) + { + /* Check struct layout compatibility */ + if (offsetof(AppendPath, subpaths) != + offsetof(MergeAppendPath, subpaths)) + elog(FATAL, "Struct layouts of AppendPath and " + "MergeAppendPath differ"); + inner_path = create_runtimemergeappend_path(root, cur_path, ppi, paramsel); + } if (inner_path) add_path(rel, inner_path); @@ -342,20 +390,248 @@ pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTb } } +/* + * Intercept 'pg_pathman.enable' GUC assignments. + */ void pg_pathman_enable_assign_hook(bool newval, void *extra) { + elog(DEBUG2, "pg_pathman_enable_assign_hook() [newval = %s] triggered", + newval ? 
"true" : "false"); + /* Return quickly if nothing has changed */ - if (newval == (pg_pathman_enable && + if (newval == (pg_pathman_init_state.pg_pathman_enable && + pg_pathman_init_state.auto_partition && + pg_pathman_init_state.override_copy && pg_pathman_enable_runtimeappend && - pg_pathman_enable_runtime_merge_append)) + pg_pathman_enable_runtime_merge_append && + pg_pathman_enable_partition_filter)) return; + pg_pathman_init_state.auto_partition = newval; + pg_pathman_init_state.override_copy = newval; pg_pathman_enable_runtime_merge_append = newval; pg_pathman_enable_runtimeappend = newval; + pg_pathman_enable_partition_filter = newval; elog(NOTICE, - "RuntimeAppend and RuntimeMergeAppend nodes have been %s", + "RuntimeAppend, RuntimeMergeAppend and PartitionFilter nodes " + "and some other options have been %s", newval ? "enabled" : "disabled"); } +/* + * Planner hook. It disables inheritance for tables that have been partitioned + * by pathman to prevent standart PostgreSQL partitioning mechanism from + * handling that tables. + */ +PlannedStmt * +pathman_planner_hook(Query *parse, int cursorOptions, ParamListInfo boundParams) +{ +#define ExecuteForPlanTree(planned_stmt, proc) \ + do { \ + ListCell *lc; \ + proc((planned_stmt)->rtable, (planned_stmt)->planTree); \ + foreach (lc, (planned_stmt)->subplans) \ + proc((planned_stmt)->rtable, (Plan *) lfirst(lc)); \ + } while (0) + + PlannedStmt *result; + + /* FIXME: fix these commands (traverse whole query tree) */ + if (IsPathmanReady()) + { + switch(parse->commandType) + { + case CMD_SELECT: + disable_inheritance(parse); + rowmark_add_tableoids(parse); /* add attributes for rowmarks */ + break; + + case CMD_UPDATE: + case CMD_DELETE: + disable_inheritance_cte(parse); + disable_inheritance_subselect(parse); + handle_modification_query(parse); + break; + + default: + break; + } + } + + /* Invoke original hook if needed */ + if (planner_hook_next) + result = planner_hook_next(parse, cursorOptions, boundParams); + else + result = standard_planner(parse, cursorOptions, boundParams); + + if (IsPathmanReady()) + { + /* Give rowmark-related attributes correct names */ + ExecuteForPlanTree(result, postprocess_lock_rows); + + /* Add PartitionFilter node for INSERT queries */ + ExecuteForPlanTree(result, add_partition_filters); + } + + list_free(inheritance_disabled_relids); + list_free(inheritance_enabled_relids); + inheritance_disabled_relids = NIL; + inheritance_enabled_relids = NIL; + + return result; +} + +/* + * Post parse analysis hook. 
It makes sure the config is loaded before executing + * any statement, including utility commands + */ +void +pathman_post_parse_analysis_hook(ParseState *pstate, Query *query) +{ + /* Invoke original hook if needed */ + if (post_parse_analyze_hook_next) + post_parse_analyze_hook_next(pstate, query); + + /* We shouldn't do anything on BEGIN or SET ISOLATION LEVEL stmts */ + if (query->commandType == CMD_UTILITY && + (xact_is_transaction_stmt(query->utilityStmt) || + xact_is_set_transaction_stmt(query->utilityStmt))) + { + return; + } + + /* Finish delayed invalidation jobs */ + if (IsPathmanReady()) + finish_delayed_invalidation(); + + /* Load config if pg_pathman exists & it's still necessary */ + if (IsPathmanEnabled() && + !IsPathmanInitialized() && + /* Now evaluate the most expensive clause */ + get_pathman_schema() != InvalidOid) + { + load_config(); /* perform main cache initialization */ + } + + inheritance_disabled_relids = NIL; + inheritance_enabled_relids = NIL; +} + +/* + * Initialize dsm_config & shmem_config. + */ +void +pathman_shmem_startup_hook(void) +{ + /* Invoke original hook if needed */ + if (shmem_startup_hook_next != NULL) + shmem_startup_hook_next(); + + /* Allocate shared memory objects */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + init_shmem_config(); + LWLockRelease(AddinShmemInitLock); +} + +/* + * Invalidate PartRelationInfo cache entry if needed. + */ +void +pathman_relcache_hook(Datum arg, Oid relid) +{ + PartParentSearch search; + Oid partitioned_table; + + if (!IsPathmanReady()) + return; + + /* We shouldn't even consider special OIDs */ + if (relid < FirstNormalObjectId) + return; + + /* Invalidation event for PATHMAN_CONFIG table (probably DROP) */ + if (relid == get_pathman_config_relid()) + delay_pathman_shutdown(); + + /* Invalidate PartParentInfo cache if needed */ + partitioned_table = forget_parent_of_partition(relid, &search); + + switch (search) + { + /* It is (or was) a valid partition */ + case PPS_ENTRY_PART_PARENT: + case PPS_ENTRY_PARENT: + { + elog(DEBUG2, "Invalidation message for partition %u [%u]", + relid, MyProcPid); + + delay_invalidation_parent_rel(partitioned_table); + } + break; + + /* Both syscache and pathman's cache say it isn't a partition */ + case PPS_ENTRY_NOT_FOUND: + { + if (partitioned_table != InvalidOid) + delay_invalidation_parent_rel(partitioned_table); +#ifdef NOT_USED + elog(DEBUG2, "Invalidation message for relation %u [%u]", + relid, MyProcPid); +#endif + } + break; + + /* We can't say anything (state is not transactional) */ + case PPS_NOT_SURE: + { + elog(DEBUG2, "Invalidation message for vague relation %u [%u]", + relid, MyProcPid); + + delay_invalidation_vague_rel(relid); + } + break; + + default: + elog(ERROR, "Not implemented yet (%s)", + CppAsString(pathman_relcache_hook)); + break; + } +} + +/* + * Utility function invoker hook. 
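 * Overrides COPY TO/FROM for partitioned tables by calling PathmanDoCopy();
 * all other statements fall through to standard_ProcessUtility().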
+ */ +void +pathman_process_utility_hook(Node *parsetree, + const char *queryString, + ProcessUtilityContext context, + ParamListInfo params, + DestReceiver *dest, + char *completionTag) +{ + /* Call hooks set by other extensions */ + if (process_utility_hook_next) + process_utility_hook_next(parsetree, queryString, + context, params, + dest, completionTag); + + /* Override standard COPY statement if needed */ + if (IsPathmanReady() && is_pathman_related_copy(parsetree)) + { + uint64 processed; + + PathmanDoCopy((CopyStmt *) parsetree, queryString, &processed); + if (completionTag) + snprintf(completionTag, COMPLETION_TAG_BUFSIZE, + "PATHMAN COPY " UINT64_FORMAT, processed); + + return; /* don't call standard_ProcessUtility() */ + } + + /* Call internal implementation */ + standard_ProcessUtility(parsetree, queryString, + context, params, + dest, completionTag); +} diff --git a/contrib/pg_pathman/src/hooks.h b/contrib/pg_pathman/src/hooks.h index 33513982e7..5b349a3440 100644 --- a/contrib/pg_pathman/src/hooks.h +++ b/contrib/pg_pathman/src/hooks.h @@ -7,20 +7,56 @@ * * ------------------------------------------------------------------------ */ + #ifndef JOIN_HOOK_H #define JOIN_HOOK_H #include "postgres.h" +#include "optimizer/planner.h" #include "optimizer/paths.h" +#include "parser/analyze.h" +#include "storage/ipc.h" +#include "tcop/utility.h" + extern set_join_pathlist_hook_type set_join_pathlist_next; extern set_rel_pathlist_hook_type set_rel_pathlist_hook_next; +extern planner_hook_type planner_hook_next; +extern post_parse_analyze_hook_type post_parse_analyze_hook_next; +extern shmem_startup_hook_type shmem_startup_hook_next; +extern ProcessUtility_hook_type process_utility_hook_next; -void pathman_join_pathlist_hook(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *outerrel, - RelOptInfo *innerrel, JoinType jointype, JoinPathExtraData *extra); -void pathman_rel_pathlist_hook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); +void pathman_join_pathlist_hook(PlannerInfo *root, + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + JoinPathExtraData *extra); + +void pathman_rel_pathlist_hook(PlannerInfo *root, + RelOptInfo *rel, + Index rti, + RangeTblEntry *rte); void pg_pathman_enable_assign_hook(char newval, void *extra); +PlannedStmt * pathman_planner_hook(Query *parse, + int cursorOptions, + ParamListInfo boundParams); + +void pathman_post_parse_analysis_hook(ParseState *pstate, + Query *query); + +void pathman_shmem_startup_hook(void); + +void pathman_relcache_hook(Datum arg, Oid relid); + +void pathman_process_utility_hook(Node *parsetree, + const char *queryString, + ProcessUtilityContext context, + ParamListInfo params, + DestReceiver *dest, + char *completionTag); + #endif diff --git a/contrib/pg_pathman/src/init.c b/contrib/pg_pathman/src/init.c index e763c876c4..83d153713b 100644 --- a/contrib/pg_pathman/src/init.c +++ b/contrib/pg_pathman/src/init.c @@ -5,594 +5,845 @@ * * Copyright (c) 2015-2016, Postgres Professional * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * * ------------------------------------------------------------------------ */ + +#include "hooks.h" +#include "init.h" #include "pathman.h" -#include "miscadmin.h" -#include "executor/spi.h" -#include "catalog/pg_type.h" -#include "catalog/pg_class.h" -#include "catalog/pg_constraint.h" -#include "catalog/pg_operator.h" +#include 
"pathman_workers.h" +#include "relation_info.h" +#include "utils.h" + #include "access/htup_details.h" -#include "utils/syscache.h" +#include "access/sysattr.h" +#include "catalog/indexing.h" +#include "catalog/pg_constraint.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_inherits_fn.h" +#include "catalog/pg_type.h" +#include "miscadmin.h" +#include "optimizer/clauses.h" +#include "parser/parse_coerce.h" +#include "utils/datum.h" +#include "utils/inval.h" #include "utils/builtins.h" -#include "utils/typcache.h" +#include "utils/fmgroids.h" +#include "utils/memutils.h" #include "utils/lsyscache.h" -#include "utils/bytea.h" #include "utils/snapmgr.h" -#include "optimizer/clauses.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + +#if PG_VERSION_NUM >= 90600 +#include "catalog/pg_constraint_fn.h" +#endif + + +/* Help user in case of emergency */ +#define INIT_ERROR_HINT "pg_pathman will be disabled to allow you to resolve this issue" + +/* Initial size of 'partitioned_rels' table */ +#define PART_RELS_SIZE 10 +#define CHILD_FACTOR 500 + + +/* Storage for PartRelationInfos */ +HTAB *partitioned_rels = NULL; +/* Storage for PartParentInfos */ +HTAB *parent_cache = NULL; -HTAB *relations = NULL; -HTAB *range_restrictions = NULL; -bool initialization_needed = true; +/* pg_pathman's init status */ +PathmanInitState pg_pathman_init_state; +/* Shall we install new relcache callback? */ +static bool relcache_callback_needed = true; + +/* Functions for various local caches */ +static bool init_pathman_relation_oids(void); +static void fini_pathman_relation_oids(void); +static void init_local_cache(void); +static void fini_local_cache(void); +static void read_pathman_config(void); + +static Expr *get_partition_constraint_expr(Oid partition, AttrNumber part_attno); -static bool validate_partition_constraints(const Oid *children_oids, - const uint32 children_count, - Snapshot snapshot, - PartRelationInfo *prel, - RangeRelation *rangerel); -static bool validate_range_constraint(Expr *, PartRelationInfo *, Datum *, Datum *); -static bool validate_hash_constraint(Expr *expr, PartRelationInfo *prel, int *hash); -static bool read_opexpr_const(OpExpr *opexpr, int varattno, Datum *val); static int cmp_range_entries(const void *p1, const void *p2, void *arg); +static bool validate_range_constraint(const Expr *expr, + const PartRelationInfo *prel, + Datum *min, + Datum *max); + +static bool validate_hash_constraint(const Expr *expr, + const PartRelationInfo *prel, + uint32 *part_hash); + +static bool read_opexpr_const(const OpExpr *opexpr, + const PartRelationInfo *prel, + Datum *val); + +static int oid_cmp(const void *p1, const void *p2); + /* - * This argument contains type compare function - * and 'by_val' flag (absent in RangeEntry) which - * are required for the cmp_range_entries() function. + * Save and restore main init state. */ -typedef struct -{ - FmgrInfo *cmp_proc; - bool by_val; -} cmp_range_entries_arg; - -Size -pathman_memsize() +void +save_pathman_init_state(PathmanInitState *temp_init_state) { - Size size; + *temp_init_state = pg_pathman_init_state; +} - size = get_dsm_shared_size() + MAXALIGN(sizeof(PathmanState)); - return size; +void +restore_pathman_init_state(const PathmanInitState *temp_init_state) +{ + pg_pathman_init_state = *temp_init_state; } +/* + * Create main GUCs. 
+ */ void -init_shmem_config() +init_main_pathman_toggles(void) { - bool found; + /* Main toggle, load_config() will enable it */ + DefineCustomBoolVariable("pg_pathman.enable", + "Enables pg_pathman's optimizations during the planner stage", + NULL, + &pg_pathman_init_state.pg_pathman_enable, + true, + PGC_SUSET, + 0, + NULL, + pg_pathman_enable_assign_hook, + NULL); + + /* Global toggle for automatic partition creation */ + DefineCustomBoolVariable("pg_pathman.enable_auto_partition", + "Enables automatic partition creation", + NULL, + &pg_pathman_init_state.auto_partition, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + /* Global toggle for COPY stmt handling */ + DefineCustomBoolVariable("pg_pathman.override_copy", + "Override COPY statement handling", + NULL, + &pg_pathman_init_state.override_copy, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); +} - /* Check if module was initialized in postmaster */ - pmstate = ShmemInitStruct("pathman state", sizeof(PathmanState), &found); - if (!found) +/* + * Create local PartRelationInfo cache & load pg_pathman's config. + * Return true on success. May occasionally emit ERROR. + */ +bool +load_config(void) +{ + /* + * Try to cache important relids. + * + * Once CREATE EXTENSION stmt is processed, get_pathman_schema() + * function starts returning perfectly valid schema Oid, which + * means we have to check that *ALL* pg_pathman's relations' Oids + * have been cached properly. Only then can we assume that + * initialization is not needed anymore. + */ + if (!init_pathman_relation_oids()) + return false; /* remain 'uninitialized', exit before creating main caches */ + + init_local_cache(); /* create 'partitioned_rels' hash table */ + read_pathman_config(); /* read PATHMAN_CONFIG table & fill cache */ + + /* Register pathman_relcache_hook(), currently we can't unregister it */ + if (relcache_callback_needed) { - /* - * Initialize locks in postmaster - */ - if (!IsUnderPostmaster) - { - /* Initialize locks */ - pmstate->load_config_lock = LWLockAssign(); - pmstate->dsm_init_lock = LWLockAssign(); - pmstate->edit_partitions_lock = LWLockAssign(); - } + CacheRegisterRelcacheCallback(pathman_relcache_hook, PointerGetDatum(NULL)); + relcache_callback_needed = false; } - create_relations_hashtable(); - create_range_restrictions_hashtable(); + /* Mark pg_pathman as initialized */ + pg_pathman_init_state.initialization_needed = false; + + elog(DEBUG2, "pg_pathman's config has been loaded successfully [%u]", MyProcPid); + + return true; } /* - * Initialize hashtables + * Destroy local caches & free memory. */ void -load_config(void) +unload_config(void) { - bool new_segment_created; - Oid *databases; + /* Don't forget to reset pg_pathman's cached relids */ + fini_pathman_relation_oids(); - initialization_needed = false; + /* Destroy 'partitioned_rels' & 'parent_cache' hash tables */ + fini_local_cache(); - LWLockAcquire(pmstate->dsm_init_lock, LW_EXCLUSIVE); - new_segment_created = init_dsm_segment(INITIAL_BLOCKS_COUNT, 32); + /* Mark pg_pathman as uninitialized */ + pg_pathman_init_state.initialization_needed = true; - /* If dsm segment just created */ - if (new_segment_created) - { - /* - * Allocate databases array and put current database - * oid into it. 
This array contains databases oids - * that have already been cached (to prevent repeat caching) - */ - if (&pmstate->databases.length > 0) - free_dsm_array(&pmstate->databases); - alloc_dsm_array(&pmstate->databases, sizeof(Oid), 1); - databases = (Oid *) dsm_array_get_pointer(&pmstate->databases); - databases[0] = MyDatabaseId; - } - else - { - int databases_count = pmstate->databases.length; - int i; + elog(DEBUG2, "pg_pathman's config has been unloaded successfully [%u]", MyProcPid); +} - /* Check if we already cached config for current database */ - databases = (Oid *) dsm_array_get_pointer(&pmstate->databases); - for(i=0; idsm_init_lock); - return; - } +/* + * Estimate total amount of shmem needed for pg_pathman to run. + */ +Size +estimate_pathman_shmem_size(void) +{ + return estimate_concurrent_part_task_slots_size() + + MAXALIGN(sizeof(PathmanState)); +} - /* Put current database oid to databases list */ - resize_dsm_array(&pmstate->databases, sizeof(Oid), databases_count + 1); - databases = (Oid *) dsm_array_get_pointer(&pmstate->databases); - databases[databases_count] = MyDatabaseId; - } +/* + * Cache *all* important pg_pathman's relids at once. + * We should NOT rely on any previously cached values. + */ +static bool +init_pathman_relation_oids(void) +{ + Oid schema = get_pathman_schema(); + Assert(schema != InvalidOid); + + /* Cache PATHMAN_CONFIG relation's Oid */ + pathman_config_relid = get_relname_relid(PATHMAN_CONFIG, schema); + if (pathman_config_relid == InvalidOid) + return false; + + /* Cache PATHMAN_CONFIG_PARAMS relation's Oid */ + pathman_config_params_relid = get_relname_relid(PATHMAN_CONFIG_PARAMS, + schema); + if (pathman_config_params_relid == InvalidOid) + return false; - /* Load cache */ - LWLockAcquire(pmstate->load_config_lock, LW_EXCLUSIVE); - load_relations(new_segment_created); - LWLockRelease(pmstate->load_config_lock); - LWLockRelease(pmstate->dsm_init_lock); + /* NOTE: add more relations to be cached right here ^^^ */ + + /* Everything is fine, proceed */ + return true; } /* - * Returns extension schema name or NULL. Caller is responsible for freeing - * the memory. + * Forget *all* pg_pathman's cached relids. */ -char * -get_extension_schema() +static void +fini_pathman_relation_oids(void) { - int ret; - bool isnull; + pathman_config_relid = InvalidOid; + pathman_config_params_relid = InvalidOid; - ret = SPI_exec("SELECT extnamespace::regnamespace::text FROM pg_extension WHERE extname = 'pg_pathman'", 0); - if (ret > 0 && SPI_tuptable != NULL && SPI_processed > 0) - { - TupleDesc tupdesc = SPI_tuptable->tupdesc; - SPITupleTable *tuptable = SPI_tuptable; - HeapTuple tuple = tuptable->vals[0]; - Datum datum = SPI_getbinval(tuple, tupdesc, 1, &isnull); + /* NOTE: add more relations to be forgotten right here ^^^ */ +} - if (isnull) - return NULL; +/* + * Initialize per-process resources. 
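+ *
+ * Namely the 'partitioned_rels' (PartRelationInfo) and 'parent_cache'
+ * (PartParentInfo) hash tables, both of which live in TopMemoryContext.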
+ */ +static void +init_local_cache(void) +{ + HASHCTL ctl; - return TextDatumGetCString(datum); - } - return NULL; + memset(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(PartRelationInfo); + ctl.hcxt = TopMemoryContext; /* place data to persistent mcxt */ + + partitioned_rels = hash_create("pg_pathman's partitioned relations cache", + PART_RELS_SIZE, &ctl, HASH_ELEM | HASH_BLOBS); + + memset(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(PartParentInfo); + ctl.hcxt = TopMemoryContext; /* place data to persistent mcxt */ + + parent_cache = hash_create("pg_pathman's partition parents cache", + PART_RELS_SIZE * CHILD_FACTOR, + &ctl, HASH_ELEM | HASH_BLOBS); } /* - * Loads partitioned tables structure to hashtable. - * - * TODO: reload just the specified relation + * Safely free per-process resources. */ -void -load_relations(bool reinitialize) +static void +fini_local_cache(void) { - int ret, - i, - proc; - bool isnull; - List *part_oids = NIL; - ListCell *lc; - char *schema; - TypeCacheEntry *tce; - PartRelationInfo *prel; - char sql[] = "SELECT pg_class.oid, pg_attribute.attnum, cfg.parttype, pg_attribute.atttypid " - "FROM %s.pathman_config as cfg " - "JOIN pg_class ON pg_class.oid = cfg.relname::regclass::oid " - "JOIN pg_attribute ON pg_attribute.attname = lower(cfg.attname) " - "AND attrelid = pg_class.oid"; - char *query; - - SPI_connect(); - schema = get_extension_schema(); - - /* If extension isn't exist then just quit */ - if (!schema) + HASH_SEQ_STATUS status; + PartRelationInfo *prel; + + hash_seq_init(&status, partitioned_rels); + while((prel = (PartRelationInfo *) hash_seq_search(&status)) != NULL) { - SPI_finish(); - return; + if (PrelIsValid(prel)) + { + FreeChildrenArray(prel); + FreeRangesArray(prel); + } } - /* Put schema name to the query */ - query = psprintf(sql, schema); - ret = SPI_exec(query, 0); - proc = SPI_processed; + /* Now we can safely destroy hash tables */ + hash_destroy(partitioned_rels); + hash_destroy(parent_cache); + partitioned_rels = NULL; + parent_cache = NULL; +} - if (ret > 0 && SPI_tuptable != NULL) - { - TupleDesc tupdesc = SPI_tuptable->tupdesc; - SPITupleTable *tuptable = SPI_tuptable; +/* + * Initializes pg_pathman's global state (PathmanState) & locks. + */ +void +init_shmem_config(void) +{ + bool found; - for (i=0; ivals[i]; - int oid = DatumGetObjectId(SPI_getbinval(tuple, tupdesc, 1, &isnull)); - - key.dbid = MyDatabaseId; - key.relid = oid; - prel = (PartRelationInfo*) - hash_search(relations, (const void *) &key, HASH_ENTER, NULL); + /* NOTE: dsm_array is redundant, hence the commented code */ + /* pmstate->dsm_init_lock = LWLockAssign(); */ + } + } - prel->attnum = DatumGetInt32(SPI_getbinval(tuple, tupdesc, 2, &isnull)); - prel->parttype = DatumGetInt32(SPI_getbinval(tuple, tupdesc, 3, &isnull)); - prel->atttype = DatumGetObjectId(SPI_getbinval(tuple, tupdesc, 4, &isnull)); + /* Allocate some space for concurrent part slots */ + init_concurrent_part_task_slots(); +} - tce = lookup_type_cache(prel->atttype, TYPECACHE_CMP_PROC | TYPECACHE_HASH_PROC); - prel->cmp_proc = tce->cmp_proc; - prel->hash_proc = tce->hash_proc; +/* + * Fill PartRelationInfo with partition-related info. 
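+ *
+ * HASH partitions are placed into 'children' by their hash value, while
+ * RANGE partitions are collected into 'ranges' and sorted by lower bound.
+ * A malformed check constraint disables pg_pathman and raises an ERROR.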
+ */ +void +fill_prel_with_partitions(const Oid *partitions, + const uint32 parts_count, + PartRelationInfo *prel) +{ + uint32 i; + Expr *con_expr; + MemoryContext mcxt = TopMemoryContext; - part_oids = lappend_int(part_oids, oid); - } - } - pfree(query); + /* Allocate memory for 'prel->children' & 'prel->ranges' (if needed) */ + prel->children = MemoryContextAllocZero(mcxt, parts_count * sizeof(Oid)); + if (prel->parttype == PT_RANGE) + prel->ranges = MemoryContextAllocZero(mcxt, parts_count * sizeof(RangeEntry)); + prel->children_count = parts_count; - /* Load children information */ - foreach(lc, part_oids) + for (i = 0; i < PrelChildrenCount(prel); i++) { - Oid oid = (int) lfirst_int(lc); + con_expr = get_partition_constraint_expr(partitions[i], prel->attnum); - prel = get_pathman_relation_info(oid, NULL); - switch(prel->parttype) + /* Perform a partitioning_type-dependent task */ + switch (prel->parttype) { - case PT_RANGE: - if (reinitialize && prel->children.length > 0) + case PT_HASH: { - RangeRelation *rangerel = get_pathman_range_relation(oid, NULL); - free_dsm_array(&prel->children); - free_dsm_array(&rangerel->ranges); - prel->children_count = 0; + uint32 hash; /* hash value < parts_count */ + + if (validate_hash_constraint(con_expr, prel, &hash)) + prel->children[hash] = partitions[i]; + else + { + DisablePathman(); /* disable pg_pathman since config is broken */ + ereport(ERROR, + (errmsg("Wrong constraint format for HASH partition \"%s\"", + get_rel_name_or_relid(partitions[i])), + errhint(INIT_ERROR_HINT))); + } } - load_partitions(oid, GetCatalogSnapshot(oid)); break; - case PT_HASH: - if (reinitialize && prel->children.length > 0) + + case PT_RANGE: { - free_dsm_array(&prel->children); - prel->children_count = 0; + Datum range_min, range_max; + + if (validate_range_constraint(con_expr, prel, + &range_min, &range_max)) + { + prel->ranges[i].child_oid = partitions[i]; + prel->ranges[i].min = range_min; + prel->ranges[i].max = range_max; + } + else + { + DisablePathman(); /* disable pg_pathman since config is broken */ + ereport(ERROR, + (errmsg("Wrong constraint format for RANGE partition \"%s\"", + get_rel_name_or_relid(partitions[i])), + errhint(INIT_ERROR_HINT))); + } } - load_partitions(oid, GetCatalogSnapshot(oid)); break; + + default: + { + DisablePathman(); /* disable pg_pathman since config is broken */ + ereport(ERROR, + (errmsg("Unknown partitioning type for relation \"%s\"", + get_rel_name_or_relid(PrelParentRelid(prel))), + errhint(INIT_ERROR_HINT))); + } } } - SPI_finish(); -} -void -create_relations_hashtable() -{ - HASHCTL ctl; + /* Finalize 'prel' for a RANGE-partitioned table */ + if (prel->parttype == PT_RANGE) + { + MemoryContext old_mcxt; - memset(&ctl, 0, sizeof(ctl)); - ctl.keysize = sizeof(RelationKey); - ctl.entrysize = sizeof(PartRelationInfo); + /* Sort partitions by RangeEntry->min asc */ + qsort_arg((void *) prel->ranges, PrelChildrenCount(prel), + sizeof(RangeEntry), cmp_range_entries, + (void *) &prel->cmp_proc); + + /* Initialize 'prel->children' array */ + for (i = 0; i < PrelChildrenCount(prel); i++) + prel->children[i] = prel->ranges[i].child_oid; + + /* Copy all min & max Datums to the persistent mcxt */ + old_mcxt = MemoryContextSwitchTo(TopMemoryContext); + for (i = 0; i < PrelChildrenCount(prel); i++) + { + prel->ranges[i].max = datumCopy(prel->ranges[i].max, + prel->attbyval, + prel->attlen); - /* Already exists, recreate */ - if (relations != NULL) - hash_destroy(relations); + prel->ranges[i].min = datumCopy(prel->ranges[i].min, + 
prel->attbyval, + prel->attlen); + } + MemoryContextSwitchTo(old_mcxt); + + } - relations = ShmemInitHash("Partitioning relation info", 1024, 1024, &ctl, - HASH_ELEM | HASH_BLOBS); +#ifdef USE_ASSERT_CHECKING + /* Check that each partition Oid has been assigned properly */ + if (prel->parttype == PT_HASH) + for (i = 0; i < PrelChildrenCount(prel); i++) + { + if (prel->children[i] == InvalidOid) + { + DisablePathman(); /* disable pg_pathman since config is broken */ + elog(ERROR, "pg_pathman's cache for relation \"%s\" " + "has not been properly initialized", + get_rel_name_or_relid(PrelParentRelid(prel))); + } + } +#endif } /* - * Load and validate CHECK constraints + * find_inheritance_children + * + * Returns an array containing the OIDs of all relations which + * inherit *directly* from the relation with OID 'parentrelId'. + * + * The specified lock type is acquired on each child relation (but not on the + * given rel; caller should already have locked it). If lockmode is NoLock + * then no locks are acquired, but caller must beware of race conditions + * against possible DROPs of child relations. + * + * borrowed from pg_inherits.c */ -void -load_partitions(Oid parent_oid, Snapshot snapshot) +Oid * +find_inheritance_children_array(Oid parentrelId, LOCKMODE lockmode, uint32 *size) { - PartRelationInfo *prel = NULL; - RangeRelation *rangerel = NULL; - SPIPlanPtr plan = NULL; - bool found; - int ret, - i, - children_count = 0; - Datum vals[1]; - Oid types[1] = { INT4OID }; - const bool nulls[1] = { false }; - Oid *children_oids; - - vals[0] = Int32GetDatum(parent_oid); - prel = get_pathman_relation_info(parent_oid, NULL); - - /* Skip if already loaded */ - if (prel->children.length > 0) - return; - - /* Load children oids */ - plan = SPI_prepare("SELECT inhrelid FROM pg_inherits " - "WHERE inhparent = $1", - 1, types); - - ret = SPI_execute_snapshot(plan, vals, nulls, snapshot, - InvalidSnapshot, true, false, 0); - - if (ret == SPI_OK_SELECT) + Relation relation; + SysScanDesc scan; + ScanKeyData key[1]; + HeapTuple inheritsTuple; + Oid inhrelid; + Oid *oidarr; + uint32 maxoids, + numoids, + i; + + /* + * Can skip the scan if pg_class shows the relation has never had a + * subclass. + */ + if (!has_subclass(parentrelId)) { - /* Initialize children data for validate_partition_constraints() */ - children_count = SPI_processed; - children_oids = palloc(sizeof(Oid) * children_count); + *size = 0; + return NULL; + } - for(i = 0; i < children_count; i++) - { - TupleDesc tupdesc = SPI_tuptable->tupdesc; - HeapTuple tuple = SPI_tuptable->vals[i]; - bool isnull; + /* + * Scan pg_inherits and build a working array of subclass OIDs. 
+ */ + maxoids = 32; + oidarr = (Oid *) palloc(maxoids * sizeof(Oid)); + numoids = 0; - children_oids[i] = DatumGetObjectId(SPI_getbinval(tuple, - tupdesc, - 1, &isnull)); - } - } - else return; /* there are no children */ + relation = heap_open(InheritsRelationId, AccessShareLock); - if (children_count > 0) - { - alloc_dsm_array(&prel->children, sizeof(Oid), children_count); + ScanKeyInit(&key[0], + Anum_pg_inherits_inhparent, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(parentrelId)); - /* allocate ranges array is dsm */ - if (prel->parttype == PT_RANGE) + scan = systable_beginscan(relation, InheritsParentIndexId, true, + NULL, 1, key); + + while ((inheritsTuple = systable_getnext(scan)) != NULL) + { + inhrelid = ((Form_pg_inherits) GETSTRUCT(inheritsTuple))->inhrelid; + if (numoids >= maxoids) { - TypeCacheEntry *tce = lookup_type_cache(prel->atttype, 0); - RelationKey key; - - key.dbid = MyDatabaseId; - key.relid = parent_oid; - rangerel = (RangeRelation *) hash_search(range_restrictions, - (const void *) &key, - HASH_ENTER, &found); - rangerel->by_val = tce->typbyval; - alloc_dsm_array(&rangerel->ranges, sizeof(RangeEntry), children_count); + maxoids *= 2; + oidarr = (Oid *) repalloc(oidarr, maxoids * sizeof(Oid)); } + oidarr[numoids++] = inhrelid; + } + + systable_endscan(scan); + + heap_close(relation, AccessShareLock); - /* Validate partitions constraints */ - if (!validate_partition_constraints(children_oids, - children_count, - snapshot, - prel, - rangerel)) + /* + * If we found more than one child, sort them by OID. This ensures + * reasonably consistent behavior regardless of the vagaries of an + * indexscan. This is important since we need to be sure all backends + * lock children in the same order to avoid needless deadlocks. + */ + if (numoids > 1) + qsort(oidarr, numoids, sizeof(Oid), oid_cmp); + + /* + * Acquire locks and build the result list. + */ + for (i = 0; i < numoids; i++) + { + inhrelid = oidarr[i]; + + if (lockmode != NoLock) { - RelationKey key; + /* Get the lock to synchronize against concurrent drop */ + LockRelationOid(inhrelid, lockmode); /* - * If validation failed then pg_pathman cannot handle this relation. - * Remove it from the cache + * Now that we have the lock, double-check to see if the relation + * really exists or not. If not, assume it was dropped while we + * waited to acquire lock, and ignore it. */ - key.dbid = MyDatabaseId; - key.relid = parent_oid; - - free_dsm_array(&prel->children); - free_dsm_array(&rangerel->ranges); - hash_search(relations, (const void *) &key, HASH_REMOVE, &found); - if (prel->parttype == PT_RANGE) - hash_search(range_restrictions, - (const void *) &key, - HASH_REMOVE, &found); - - elog(WARNING, "Validation failed for relation '%s'. " - "It will not be handled by pg_pathman", - get_rel_name(parent_oid)); + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(inhrelid))) + { + /* Release useless lock */ + UnlockRelationOid(inhrelid, lockmode); + /* And ignore this relation */ + continue; + } } - else - prel->children_count = children_count; - - pfree(children_oids); } + + *size = numoids; + return oidarr; } -static bool -validate_partition_constraints(const Oid *children_oids, - const uint32 children_count, - Snapshot snapshot, - PartRelationInfo *prel, - RangeRelation *rangerel) +/* + * Generate check constraint name for a partition. + * + * This function does not perform sanity checks at all. 
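+ *
+ * E.g. for a (hypothetical) table "abc" partitioned by attribute #2 the
+ * resulting name would be "pathman_abc_2_check".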
+ */ +char * +build_check_constraint_name_internal(Oid relid, AttrNumber attno) { - Expr *expr = NULL; /* constraint object tree */ - RangeEntry *ranges = NULL; - Oid *dsm_children = NULL; /* points to prel->children */ - int i; - - - if (children_count == 0) - return false; /* nothing to do here */ + return psprintf("pathman_%s_%u_check", get_rel_name(relid), attno); +} - dsm_children = dsm_array_get_pointer(&prel->children); - if (prel->parttype == PT_RANGE) - ranges = (RangeEntry *) dsm_array_get_pointer(&rangerel->ranges); +/* + * Check that relation 'relid' is partitioned by pg_pathman. + * + * Extract tuple into 'values' and 'isnull' if they're provided. + */ +bool +pathman_config_contains_relation(Oid relid, Datum *values, bool *isnull, + TransactionId *xmin) +{ + Relation rel; + HeapScanDesc scan; + ScanKeyData key[1]; + Snapshot snapshot; + HeapTuple htup; + bool contains_rel = false; - /* Iterate through children */ - for (i = 0; i < children_count; i++) - { - Form_pg_constraint constraint = NULL; - uint32 cur_processed; - bool found_valid_constraint = false; + ScanKeyInit(&key[0], + Anum_pathman_config_partrel, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); - /* SPI args */ - Datum oids[1] = { ObjectIdGetDatum(children_oids[i]) }; - const bool nulls[1] = { false }; - Oid types[1] = { INT4OID }; + /* Open PATHMAN_CONFIG with latest snapshot available */ + rel = heap_open(get_pathman_config_relid(), AccessShareLock); - SPIPlanPtr plan = NULL; - int ret; /* SPI result code */ + /* Check that 'partrel' column is if regclass type */ + Assert(RelationGetDescr(rel)-> + attrs[Anum_pathman_config_partrel - 1]-> + atttypid == REGCLASSOID); + /* Check that number of columns == Natts_pathman_config */ + Assert(RelationGetDescr(rel)->natts == Natts_pathman_config); - /* Select constraints for this partition */ - plan = SPI_prepare("SELECT * FROM pg_constraint " - "WHERE conrelid = $1 AND contype = 'c'", - 1, types); + snapshot = RegisterSnapshot(GetLatestSnapshot()); + scan = heap_beginscan(rel, snapshot, 1, key); - ret = SPI_execute_snapshot(plan, oids, nulls, snapshot, - InvalidSnapshot, true, false, 0); + while ((htup = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + contains_rel = true; /* found partitioned table */ - if (ret != SPI_OK_SELECT) + /* Extract data if necessary */ + if (values && isnull) { - elog(WARNING, - "No constraints found for partition %s", - get_rel_name(children_oids[i])); + heap_deform_tuple(htup, RelationGetDescr(rel), values, isnull); - return false; /* keep compiler happy */ + /* Perform checks for non-NULL columns */ + Assert(!isnull[Anum_pathman_config_partrel - 1]); + Assert(!isnull[Anum_pathman_config_attname - 1]); + Assert(!isnull[Anum_pathman_config_parttype - 1]); } - /* Iterate through all available check constraints to find a valid one */ - for (cur_processed = 0; - cur_processed < SPI_processed && !found_valid_constraint; - cur_processed++) + /* Set xmin if necessary */ + if (xmin) { - int hash; /* temp hash value for HASH partitioning */ - RangeEntry re; /* temporary RangeEntry */ - Datum min, /* RangeEntry's min value */ - max; /* RangeEntry's max value */ + Datum value; + bool isnull; - Datum conbin_datum; /* nodeToString representation of constraint */ - bool conbin_isnull; + value = heap_getsysattr(htup, + MinTransactionIdAttributeNumber, + RelationGetDescr(rel), + &isnull); - HeapTuple tuple = SPI_tuptable->vals[cur_processed]; + Assert(!isnull); + *xmin = DatumGetTransactionId(value); + } + } + /* Clean resources */ + 
heap_endscan(scan); + UnregisterSnapshot(snapshot); + heap_close(rel, AccessShareLock); - /* Get 'conbin' from the 'pg_constraint' */ - constraint = (Form_pg_constraint) GETSTRUCT(tuple); - conbin_datum = SysCacheGetAttr(CONSTROID, tuple, - Anum_pg_constraint_conbin, - &conbin_isnull); + elog(DEBUG2, "PATHMAN_CONFIG table %s relation %u", + (contains_rel ? "contains" : "doesn't contain"), relid); - /* Handle unexpected null value */ - if (conbin_isnull) - continue; /* not a valid value anyway, skipping */ + return contains_rel; +} - /* Convert constraint datum to object tree */ - expr = (Expr *) stringToNode(TextDatumGetCString(conbin_datum)); +/* + * Loads additional pathman parameters like 'enable_parent' or 'auto' + * from PATHMAN_CONFIG_PARAMS + */ +bool +read_pathman_params(Oid relid, Datum *values, bool *isnull) +{ + Relation rel; + HeapScanDesc scan; + ScanKeyData key[1]; + Snapshot snapshot; + HeapTuple htup; + bool row_found = false; + + ScanKeyInit(&key[0], + Anum_pathman_config_params_partrel, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + + rel = heap_open(get_pathman_config_params_relid(), AccessShareLock); + snapshot = RegisterSnapshot(GetLatestSnapshot()); + scan = heap_beginscan(rel, snapshot, 1, key); + + /* There should be just 1 row */ + if ((htup = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + /* Extract data if necessary */ + heap_deform_tuple(htup, RelationGetDescr(rel), values, isnull); + row_found = true; + + /* Perform checks for non-NULL columns */ + Assert(!isnull[Anum_pathman_config_params_partrel - 1]); + Assert(!isnull[Anum_pathman_config_params_enable_parent - 1]); + Assert(!isnull[Anum_pathman_config_params_auto - 1]); + Assert(!isnull[Anum_pathman_config_params_init_callback - 1]); + } - switch(prel->parttype) - { - case PT_RANGE: - if (!validate_range_constraint(expr, prel, &min, &max)) - continue; + /* Clean resources */ + heap_endscan(scan); + UnregisterSnapshot(snapshot); + heap_close(rel, AccessShareLock); - /* If datum is referenced by val then just assign */ - if (rangerel->by_val) - { - re.min = min; - re.max = max; - } - /* else copy data by pointer */ - else - { - memcpy(&re.min, DatumGetPointer(min), sizeof(re.min)); - memcpy(&re.max, DatumGetPointer(max), sizeof(re.max)); - } - re.child_oid = constraint->conrelid; - ranges[i] = re; /* copy struct 're' */ + return row_found; +} - /* Found valid range constraint */ - found_valid_constraint = true; - break; +/* + * Go through the PATHMAN_CONFIG table and create PartRelationInfo entries. 
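+ *
+ * If a stored row references a relation that no longer exists,
+ * pg_pathman is disabled and an ERROR is raised.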
+ */ +static void +read_pathman_config(void) +{ + Relation rel; + HeapScanDesc scan; + Snapshot snapshot; + HeapTuple htup; - case PT_HASH: - if (!validate_hash_constraint(expr, prel, &hash)) - continue; + /* Open PATHMAN_CONFIG with latest snapshot available */ + rel = heap_open(get_pathman_config_relid(), AccessShareLock); - /* Copy oid to the prel's 'children' array */ - dsm_children[hash] = constraint->conrelid; + /* Check that 'partrel' column is if regclass type */ + Assert(RelationGetDescr(rel)-> + attrs[Anum_pathman_config_partrel - 1]-> + atttypid == REGCLASSOID); - /* Found valid hash constraint */ - found_valid_constraint = true; - break; - } - } + /* Check that number of columns == Natts_pathman_config */ + Assert(RelationGetDescr(rel)->natts == Natts_pathman_config); - /* No constraint matches pattern, aborting */ - if (!found_valid_constraint) - { - elog(ERROR, - "Cannot find valid check constraint for partition %s", - get_rel_name(children_oids[i])); + snapshot = RegisterSnapshot(GetLatestSnapshot()); + scan = heap_beginscan(rel, snapshot, 0, NULL); - return false; /* keep compiler happy */ + /* Examine each row and create a PartRelationInfo in local cache */ + while((htup = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + Datum values[Natts_pathman_config]; + bool isnull[Natts_pathman_config]; + Oid relid; /* partitioned table */ + PartType parttype; /* partitioning type */ + text *attname; /* partitioned column name */ + + /* Extract Datums from tuple 'htup' */ + heap_deform_tuple(htup, RelationGetDescr(rel), values, isnull); + + /* These attributes are marked as NOT NULL, check anyway */ + Assert(!isnull[Anum_pathman_config_partrel - 1]); + Assert(!isnull[Anum_pathman_config_parttype - 1]); + Assert(!isnull[Anum_pathman_config_attname - 1]); + + /* Extract values from Datums */ + relid = DatumGetObjectId(values[Anum_pathman_config_partrel - 1]); + parttype = DatumGetPartType(values[Anum_pathman_config_parttype - 1]); + attname = DatumGetTextP(values[Anum_pathman_config_attname - 1]); + + /* Check that relation 'relid' exists */ + if (get_rel_type_id(relid) == InvalidOid) + { + DisablePathman(); /* disable pg_pathman since config is broken */ + ereport(ERROR, + (errmsg("Table \"%s\" contains nonexistent relation %u", + PATHMAN_CONFIG, relid), + errhint(INIT_ERROR_HINT))); } - /* Don't forget to free plan */ - SPI_freeplan(plan); + /* Create or update PartRelationInfo for this partitioned table */ + refresh_pathman_relation_info(relid, parttype, text_to_cstring(attname)); } - /* - * Sort range partitions and check if they overlap - */ - if (prel->parttype == PT_RANGE) - { - TypeCacheEntry *tce; - bool byVal = rangerel->by_val; - cmp_range_entries_arg sort_arg; /* for qsort_arg() */ - - /* Sort ascending */ - tce = lookup_type_cache(prel->atttype, - TYPECACHE_CMP_PROC | TYPECACHE_CMP_PROC_FINFO); - - /* Initialize qsort_arg comparator()'s argument */ - sort_arg.cmp_proc = &tce->cmp_proc_finfo; - sort_arg.by_val = byVal; - - qsort_arg(ranges, children_count, sizeof(RangeEntry), - cmp_range_entries, (void *) &sort_arg); + /* Clean resources */ + heap_endscan(scan); + UnregisterSnapshot(snapshot); + heap_close(rel, AccessShareLock); +} - /* Copy sorted oids to the prel's 'children' array */ - for(i = 0; i < children_count; i++) - dsm_children[i] = ranges[i].child_oid; +/* + * Get constraint expression tree for a partition. + * + * build_check_constraint_name_internal() is used to build conname. 
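+ *
+ * If the expected constraint is missing or its 'conbin' is NULL,
+ * pg_pathman is disabled (an ERROR or WARNING is emitted accordingly).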
+ */ +static Expr * +get_partition_constraint_expr(Oid partition, AttrNumber part_attno) +{ + Oid conid; /* constraint Oid */ + char *conname; /* constraint name */ + HeapTuple con_tuple; + Datum conbin_datum; + bool conbin_isnull; + Expr *expr; /* expression tree for constraint */ + + conname = build_check_constraint_name_internal(partition, part_attno); + conid = get_relation_constraint_oid(partition, conname, true); + if (conid == InvalidOid) + { + DisablePathman(); /* disable pg_pathman since config is broken */ + ereport(ERROR, + (errmsg("constraint \"%s\" for partition \"%s\" does not exist", + conname, get_rel_name_or_relid(partition)), + errhint(INIT_ERROR_HINT))); + } - /* Check if some ranges overlap */ - for(i = 0; i < children_count - 1; i++) - { - Datum cur_upper = PATHMAN_GET_DATUM(ranges[i].max, byVal), - next_lower = PATHMAN_GET_DATUM(ranges[i + 1].min, byVal); + con_tuple = SearchSysCache1(CONSTROID, ObjectIdGetDatum(conid)); + conbin_datum = SysCacheGetAttr(CONSTROID, con_tuple, + Anum_pg_constraint_conbin, + &conbin_isnull); + if (conbin_isnull) + { + DisablePathman(); /* disable pg_pathman since config is broken */ + ereport(WARNING, + (errmsg("constraint \"%s\" for partition \"%s\" has NULL conbin", + conname, get_rel_name_or_relid(partition)), + errhint(INIT_ERROR_HINT))); + pfree(conname); + + return NULL; /* could not parse */ + } + pfree(conname); - bool overlap = DatumGetInt32(FunctionCall2(&tce->cmp_proc_finfo, - next_lower, - cur_upper)) < 0; - if (overlap) - { - elog(WARNING, "Partitions %s and %s overlap", - get_rel_name(ranges[i].child_oid), - get_rel_name(ranges[i + 1].child_oid)); + /* Finally we get a constraint expression tree */ + expr = (Expr *) stringToNode(TextDatumGetCString(conbin_datum)); - return false; /* keep compiler happy */ - } - } - } + /* Don't foreget to release syscache tuple */ + ReleaseSysCache(con_tuple); - return true; /* everything seems to be fine */ + return expr; } /* qsort comparison function for RangeEntries */ static int cmp_range_entries(const void *p1, const void *p2, void *arg) { - const RangeEntry *v1 = (const RangeEntry *) p1; - const RangeEntry *v2 = (const RangeEntry *) p2; - const cmp_range_entries_arg *sort_arg = (const cmp_range_entries_arg *) arg; + const RangeEntry *v1 = (const RangeEntry *) p1; + const RangeEntry *v2 = (const RangeEntry *) p2; + + Oid cmp_proc_oid = *(Oid *) arg; - return FunctionCall2(sort_arg->cmp_proc, - PATHMAN_GET_DATUM(v1->min, sort_arg->by_val), - PATHMAN_GET_DATUM(v2->min, sort_arg->by_val)); + return OidFunctionCall2(cmp_proc_oid, v1->min, v2->min); } /* - * Validates range constraint. It MUST have the exact format: - * VARIABLE >= CONST AND VARIABLE < CONST + * Validates range constraint. It MUST have this exact format: + * + * VARIABLE >= CONST AND VARIABLE < CONST + * + * Writes 'min' & 'max' values on success. 
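+ *
+ * For instance, a partition covering [10, 20) on a (hypothetical)
+ * column "val" would carry CHECK (val >= 10 AND val < 20).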
*/ static bool -validate_range_constraint(Expr *expr, PartRelationInfo *prel, Datum *min, Datum *max) +validate_range_constraint(const Expr *expr, + const PartRelationInfo *prel, + Datum *min, + Datum *max) { - TypeCacheEntry *tce; - BoolExpr *boolexpr = (BoolExpr *) expr; - OpExpr *opexpr; + const TypeCacheEntry *tce; + const BoolExpr *boolexpr = (const BoolExpr *) expr; + const OpExpr *opexpr; + + if (!expr) + return false; /* it should be an AND operator on top */ if (!and_clause((Node *) expr)) @@ -602,9 +853,10 @@ validate_range_constraint(Expr *expr, PartRelationInfo *prel, Datum *min, Datum /* check that left operand is >= operator */ opexpr = (OpExpr *) linitial(boolexpr->args); - if (get_op_opfamily_strategy(opexpr->opno, tce->btree_opf) == BTGreaterEqualStrategyNumber) + if (BTGreaterEqualStrategyNumber == get_op_opfamily_strategy(opexpr->opno, + tce->btree_opf)) { - if (!read_opexpr_const(opexpr, prel->attnum, min)) + if (!read_opexpr_const(opexpr, prel, min)) return false; } else @@ -612,9 +864,10 @@ validate_range_constraint(Expr *expr, PartRelationInfo *prel, Datum *min, Datum /* check that right operand is < operator */ opexpr = (OpExpr *) lsecond(boolexpr->args); - if (get_op_opfamily_strategy(opexpr->opno, tce->btree_opf) == BTLessStrategyNumber) + if (BTLessStrategyNumber == get_op_opfamily_strategy(opexpr->opno, + tce->btree_opf)) { - if (!read_opexpr_const(opexpr, prel->attnum, max)) + if (!read_opexpr_const(opexpr, prel, max)) return false; } else @@ -624,144 +877,205 @@ validate_range_constraint(Expr *expr, PartRelationInfo *prel, Datum *min, Datum } /* - * Reads const value from expressions of kind: VAR >= CONST or VAR < CONST + * Reads const value from expressions of kind: + * 1) VAR >= CONST OR VAR < CONST + * 2) RELABELTYPE(VAR) >= CONST OR RELABELTYPE(VAR) < CONST */ static bool -read_opexpr_const(OpExpr *opexpr, int varattno, Datum *val) +read_opexpr_const(const OpExpr *opexpr, + const PartRelationInfo *prel, + Datum *val) { - Node *left = linitial(opexpr->args); - Node *right = lsecond(opexpr->args); + const Node *left; + const Node *right; + const Var *part_attr; /* partitioned column */ + const Const *constant; - if ( !IsA(left, Var) || !IsA(right, Const) ) + if (list_length(opexpr->args) != 2) return false; - if ( ((Var*) left)->varattno != varattno ) + + left = linitial(opexpr->args); + right = lsecond(opexpr->args); + + /* VAR is a part of RelabelType node */ + if (IsA(left, RelabelType) && IsA(right, Const)) + { + Var *var = (Var *) ((RelabelType *) left)->arg; + + if (IsA(var, Var)) + part_attr = var; + else + return false; + } + /* left arg is of type VAR */ + else if (IsA(left, Var) && IsA(right, Const)) + { + part_attr = (Var *) left; + } + /* Something is wrong, retreat! 
*/ + else return false; + + /* VAR.attno == partitioned attribute number */ + if (part_attr->varoattno != prel->attnum) return false; - *val = ((Const*) right)->constvalue; + + /* CONST is NOT NULL */ + if (((Const *) right)->constisnull) + return false; + + constant = (Const *) right; + + /* Check that types are binary coercible */ + if (IsBinaryCoercible(constant->consttype, prel->atttype)) + { + *val = constant->constvalue; + } + /* If not, try to perfrom a type cast */ + else + { + CoercionPathType ret; + Oid castfunc = InvalidOid; + + ret = find_coercion_pathway(prel->atttype, constant->consttype, + COERCION_EXPLICIT, &castfunc); + + switch (ret) + { + /* There's a function */ + case COERCION_PATH_FUNC: + { + /* Perform conversion */ + Assert(castfunc != InvalidOid); + *val = OidFunctionCall1(castfunc, constant->constvalue); + } + break; + + /* Types are binary compatible (no implicit cast) */ + case COERCION_PATH_RELABELTYPE: + { + /* We don't perform any checks here */ + *val = constant->constvalue; + } + break; + + /* TODO: implement these if needed */ + case COERCION_PATH_ARRAYCOERCE: + case COERCION_PATH_COERCEVIAIO: + + /* There's no cast available */ + case COERCION_PATH_NONE: + default: + { + elog(WARNING, "Constant type in some check constraint " + "does not match the partitioned column's type"); + return false; + } + } + } return true; } /* - * Validate hash constraint. It MUST have the exact format - * VARIABLE % CONST = CONST + * Validate hash constraint. It MUST have this exact format: + * + * get_hash_part_idx(TYPE_HASH_PROC(VALUE), PARTITIONS_COUNT) = CUR_PARTITION_HASH + * + * Writes 'part_hash' hash value for this partition on success. */ static bool -validate_hash_constraint(Expr *expr, PartRelationInfo *prel, int *hash) +validate_hash_constraint(const Expr *expr, + const PartRelationInfo *prel, + uint32 *part_hash) { - OpExpr *eqexpr; - TypeCacheEntry *tce; - FuncExpr *gethashfunc; - FuncExpr *funcexpr; - Var *var; + const TypeCacheEntry *tce; + const OpExpr *eq_expr; + const FuncExpr *get_hash_expr, + *type_hash_proc_expr; + const Var *var; /* partitioned column */ + + if (!expr) + return false; if (!IsA(expr, OpExpr)) return false; - eqexpr = (OpExpr *) expr; + eq_expr = (const OpExpr *) expr; - /* - * We expect get_hash() function on the left - * TODO: check that it is really the 'get_hash' function - */ - if (!IsA(linitial(eqexpr->args), FuncExpr)) + /* Check that left expression is a function call */ + if (!IsA(linitial(eq_expr->args), FuncExpr)) return false; - gethashfunc = (FuncExpr *) linitial(eqexpr->args); - /* Is this an equality operator? */ - tce = lookup_type_cache(gethashfunc->funcresulttype, TYPECACHE_BTREE_OPFAMILY); - if (get_op_opfamily_strategy(eqexpr->opno, tce->btree_opf) != BTEqualStrategyNumber) + get_hash_expr = (FuncExpr *) linitial(eq_expr->args); /* get_hash_part_idx(...) */ + + /* Is 'eqexpr' an equality operator? 
*/ + tce = lookup_type_cache(get_hash_expr->funcresulttype, TYPECACHE_BTREE_OPFAMILY); + if (BTEqualStrategyNumber != get_op_opfamily_strategy(eq_expr->opno, + tce->btree_opf)) return false; - if (list_length(gethashfunc->args) == 2) + if (list_length(get_hash_expr->args) == 2) { - Node *first = linitial(gethashfunc->args); - Node *second = lsecond(gethashfunc->args); - Const *mod_result; + Node *first = linitial(get_hash_expr->args); /* arg #1: TYPE_HASH_PROC(VALUE) */ + Node *second = lsecond(get_hash_expr->args); /* arg #2: PARTITIONS_COUNT */ + Const *cur_partition_hash; /* hash value for this partition */ - if ( !IsA(first, FuncExpr) || !IsA(second, Const) ) + if (!IsA(first, FuncExpr) || !IsA(second, Const)) return false; - /* Check that function is the base hash function for the type */ - funcexpr = (FuncExpr *) first; - if (funcexpr->funcid != prel->hash_proc || - (!IsA(linitial(funcexpr->args), Var) && !IsA(linitial(funcexpr->args), - RelabelType))) + type_hash_proc_expr = (FuncExpr *) first; + + /* Check that function is indeed TYPE_HASH_PROC */ + if (type_hash_proc_expr->funcid != prel->hash_proc || + !(IsA(linitial(type_hash_proc_expr->args), Var) || + IsA(linitial(type_hash_proc_expr->args), RelabelType))) + { return false; + } - /* Check that argument is partitioning key attribute */ - if (IsA(linitial(funcexpr->args), RelabelType)) - var = (Var *) ((RelabelType *) linitial(funcexpr->args))->arg; + /* Extract argument into 'var' */ + if (IsA(linitial(type_hash_proc_expr->args), RelabelType)) + var = (Var *) ((RelabelType *) linitial(type_hash_proc_expr->args))->arg; else - var = (Var *) linitial(funcexpr->args); - if (var->varattno != prel->attnum) - return false; + var = (Var *) linitial(type_hash_proc_expr->args); - /* Check that const value less than partitions count */ - if (DatumGetInt32(((Const*) second)->constvalue) != prel->children.length) + /* Check that 'var' is the partitioning key attribute */ + if (var->varoattno != prel->attnum) return false; - if ( !IsA(lsecond(eqexpr->args), Const) ) + /* Check that PARTITIONS_COUNT is equal to total amount of partitions */ + if (DatumGetUInt32(((Const *) second)->constvalue) != PrelChildrenCount(prel)) return false; - mod_result = lsecond(eqexpr->args); - *hash = DatumGetInt32(mod_result->constvalue); - return true; - } - - return false; -} - -/* - * Create range restrictions table - */ -void -create_range_restrictions_hashtable() -{ - HASHCTL ctl; - - memset(&ctl, 0, sizeof(ctl)); - ctl.keysize = sizeof(RelationKey); - ctl.entrysize = sizeof(RangeRelation); - range_restrictions = ShmemInitHash("pg_pathman range restrictions", - 1024, 1024, &ctl, HASH_ELEM | HASH_BLOBS); -} + /* Check that CUR_PARTITION_HASH is Const */ + if (!IsA(lsecond(eq_expr->args), Const)) + return false; -/* - * Remove partitions from pathman's cache - */ -void -remove_relation_info(Oid relid) -{ - PartRelationInfo *prel; - RangeRelation *rangerel; - RelationKey key; + cur_partition_hash = lsecond(eq_expr->args); - key.dbid = MyDatabaseId; - key.relid = relid; + /* Check that CUR_PARTITION_HASH is NOT NULL */ + if (cur_partition_hash->constisnull) + return false; - prel = get_pathman_relation_info(relid, NULL); + *part_hash = DatumGetUInt32(cur_partition_hash->constvalue); + if (*part_hash >= PrelChildrenCount(prel)) + return false; - /* If there is nothing to remove then just return */ - if (!prel) - { - elog(DEBUG2, "pg_pathman's cache does not contain relation %u", relid); - return; + return true; /* everything seems to be ok */ } - /* Remove 
children relations */ - switch (prel->parttype) - { - case PT_HASH: - free_dsm_array(&prel->children); - break; - - case PT_RANGE: - rangerel = get_pathman_range_relation(relid, NULL); - free_dsm_array(&rangerel->ranges); - free_dsm_array(&prel->children); - hash_search(range_restrictions, (const void *) &key, HASH_REMOVE, NULL); - break; - } + return false; +} - prel->children_count = 0; - hash_search(relations, (const void *) &key, HASH_REMOVE, 0); +/* needed for find_inheritance_children_array() function */ +static int +oid_cmp(const void *p1, const void *p2) +{ + Oid v1 = *((const Oid *) p1); + Oid v2 = *((const Oid *) p2); + + if (v1 < v2) + return -1; + if (v1 > v2) + return 1; + return 0; } diff --git a/contrib/pg_pathman/src/init.h b/contrib/pg_pathman/src/init.h new file mode 100644 index 0000000000..effb2675c7 --- /dev/null +++ b/contrib/pg_pathman/src/init.h @@ -0,0 +1,128 @@ +/* ------------------------------------------------------------------------ + * + * init.h + * Initialization functions + * + * Copyright (c) 2015-2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#ifndef PATHMAN_INIT_H +#define PATHMAN_INIT_H + +#include "relation_info.h" + +#include "postgres.h" +#include "storage/lmgr.h" +#include "utils/guc.h" +#include "utils/hsearch.h" +#include "utils/snapshot.h" + + +/* + * pg_pathman's initialization state structure. + */ +typedef struct +{ + bool pg_pathman_enable; /* GUC variable implementation */ + bool auto_partition; /* GUC variable for auto partition propagation */ + bool override_copy; /* override COPY TO/FROM */ + bool initialization_needed; /* do we need to perform init? */ +} PathmanInitState; + + +extern HTAB *partitioned_rels; +extern HTAB *parent_cache; + +/* pg_pathman's initialization state */ +extern PathmanInitState pg_pathman_init_state; + + +/* + * Check if pg_pathman is initialized. + */ +#define IsPathmanInitialized() ( !pg_pathman_init_state.initialization_needed ) + +/* + * Check if pg_pathman is enabled. + */ +#define IsPathmanEnabled() ( pg_pathman_init_state.pg_pathman_enable ) + +/* + * Check if pg_pathman is initialized & enabled. + */ +#define IsPathmanReady() ( IsPathmanInitialized() && IsPathmanEnabled() ) + +/* + * Should we override COPY stmt handling? + */ +#define IsOverrideCopyEnabled() ( pg_pathman_init_state.override_copy ) + +/* + * Check if auto partition creation is enabled. + */ +#define IsAutoPartitionEnabled() ( pg_pathman_init_state.auto_partition ) + +/* + * Enable/disable auto partition propagation. Note that this only works if + * partitioned relation supports this. See enable_auto() and disable_auto() + * functions. + */ +#define SetAutoPartitionEnabled(value) \ + do { \ + Assert((value) == true || (value) == false); \ + pg_pathman_init_state.auto_partition = (value); \ + } while (0) + +/* + * Emergency disable mechanism. + */ +#define DisablePathman() \ + do { \ + pg_pathman_init_state.pg_pathman_enable = false; \ + pg_pathman_init_state.auto_partition = false; \ + pg_pathman_init_state.override_copy = false; \ + pg_pathman_init_state.initialization_needed = true; \ + } while (0) + + +/* + * Save and restore PathmanInitState. + */ +void save_pathman_init_state(PathmanInitState *temp_init_state); +void restore_pathman_init_state(const PathmanInitState *temp_init_state); + +/* + * Create main GUC variables. 
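+ *
+ * These are pg_pathman.enable, pg_pathman.enable_auto_partition and
+ * pg_pathman.override_copy; being PGC_SUSET, a superuser may toggle
+ * them at runtime, e.g. (illustration only):
+ *
+ *     SET pg_pathman.override_copy = off;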
+ */ +void init_main_pathman_toggles(void); + +Size estimate_pathman_shmem_size(void); +void init_shmem_config(void); + +bool load_config(void); +void unload_config(void); + + +void fill_prel_with_partitions(const Oid *partitions, + const uint32 parts_count, + PartRelationInfo *prel); + +Oid *find_inheritance_children_array(Oid parentrelId, + LOCKMODE lockmode, + uint32 *size); + +char *build_check_constraint_name_internal(Oid relid, + AttrNumber attno); + +bool pathman_config_contains_relation(Oid relid, + Datum *values, + bool *isnull, + TransactionId *xmin); + +bool read_pathman_params(Oid relid, + Datum *values, + bool *isnull); + +#endif diff --git a/contrib/pg_pathman/src/nodes_common.c b/contrib/pg_pathman/src/nodes_common.c index a66587f831..f75bd2f123 100644 --- a/contrib/pg_pathman/src/nodes_common.c +++ b/contrib/pg_pathman/src/nodes_common.c @@ -7,14 +7,20 @@ * * ------------------------------------------------------------------------ */ -#include "postgres.h" -#include "optimizer/paths.h" + #include "nodes_common.h" #include "runtimeappend.h" +#include "utils.h" + +#include "access/sysattr.h" #include "optimizer/restrictinfo.h" -#include "optimizer/plancat.h" +#include "optimizer/var.h" #include "utils/memutils.h" -#include "utils.h" + + +/* Allocation settings */ +#define INITIAL_ALLOC_NUM 10 +#define ALLOC_EXP 2 /* Compare plans by 'original_order' */ @@ -78,11 +84,13 @@ transform_plans_into_states(RuntimeAppendState *scan_state, static ChildScanCommon * select_required_plans(HTAB *children_table, Oid *parts, int nparts, int *nres) { - int allocated = 10; - int used = 0; - ChildScanCommon *result = palloc(10 * sizeof(ChildScanCommon)); + uint32 allocated = INITIAL_ALLOC_NUM, + used = 0; + ChildScanCommon *result; int i; + result = (ChildScanCommon *) palloc(allocated * sizeof(ChildScanCommon)); + for (i = 0; i < nparts; i++) { ChildScanCommon child = hash_search(children_table, @@ -93,7 +101,7 @@ select_required_plans(HTAB *children_table, Oid *parts, int nparts, int *nres) if (allocated <= used) { - allocated *= 2; + allocated = allocated * ALLOC_EXP + 1; result = repalloc(result, allocated * sizeof(ChildScanCommon)); } @@ -104,39 +112,6 @@ select_required_plans(HTAB *children_table, Oid *parts, int nparts, int *nres) return result; } -/* Transform partition ranges into plain array of partition Oids */ -static Oid * -get_partition_oids(List *ranges, int *n, PartRelationInfo *prel) -{ - ListCell *range_cell; - int allocated = 10; - int used = 0; - Oid *result = palloc(allocated * sizeof(Oid)); - Oid *children = dsm_array_get_pointer(&prel->children); - - foreach (range_cell, ranges) - { - int i; - int a = irange_lower(lfirst_irange(range_cell)); - int b = irange_upper(lfirst_irange(range_cell)); - - for (i = a; i <= b; i++) - { - if (allocated <= used) - { - allocated *= 2; - result = repalloc(result, allocated * sizeof(Oid)); - } - - Assert(i < prel->children_count); - result[used++] = children[i]; - } - } - - *n = used; - return result; -} - /* Replace Vars' varnos with the value provided by 'parent' */ static List * replace_tlist_varnos(List *child_tlist, RelOptInfo *parent) @@ -148,7 +123,7 @@ replace_tlist_varnos(List *child_tlist, RelOptInfo *parent) foreach (lc, child_tlist) { Var *var = (Var *) ((TargetEntry *) lfirst(lc))->expr; - Var *newvar = palloc(sizeof(Var)); + Var *newvar = (Var *) palloc(sizeof(Var)); Assert(IsA(var, Var)); @@ -164,13 +139,49 @@ replace_tlist_varnos(List *child_tlist, RelOptInfo *parent) return result; } +/* Append partition attribute in 
case it's not present in target list */ +static List * +append_part_attr_to_tlist(List *tlist, Index relno, const PartRelationInfo *prel) +{ + ListCell *lc; + bool part_attr_found = false; + + foreach (lc, tlist) + { + TargetEntry *te = (TargetEntry *) lfirst(lc); + Var *var = (Var *) te->expr; + + if (IsA(var, Var) && var->varoattno == prel->attnum) + part_attr_found = true; + } + + if (!part_attr_found) + { + Var *newvar = makeVar(relno, + prel->attnum, + prel->atttype, + prel->atttypmod, + prel->attcollid, + 0); + + Index last_item = list_length(tlist) + 1; + + tlist = lappend(tlist, makeTargetEntry((Expr *) newvar, + last_item, + NULL, false)); + } + + return tlist; +} + static void -pack_runtimeappend_private(CustomScan *cscan, RuntimeAppendPath *path) +pack_runtimeappend_private(CustomScan *cscan, RuntimeAppendPath *path, + bool enable_parent) { ChildScanCommon *children = path->children; int nchildren = path->nchildren; - List *custom_private = NIL; - List *custom_oids = NIL; + List *custom_private = NIL, + *custom_oids = NIL; int i; for (i = 0; i < nchildren; i++) @@ -180,33 +191,41 @@ pack_runtimeappend_private(CustomScan *cscan, RuntimeAppendPath *path) pfree(children[i]); } - /* Save main table and partition relids as first element of 'custom_private' */ + /* Save parent & partition Oids and a flag as first element of 'custom_private' */ custom_private = lappend(custom_private, - list_make2(list_make1_oid(path->relid), - custom_oids)); + list_make3(list_make1_oid(path->relid), + custom_oids, /* list of Oids */ + list_make1_int(enable_parent))); + /* Store freshly built 'custom_private' */ cscan->custom_private = custom_private; } static void unpack_runtimeappend_private(RuntimeAppendState *scan_state, CustomScan *cscan) { - ListCell *oid_cell; - ListCell *plan_cell; - List *runtimeappend_private = linitial(cscan->custom_private); - List *custom_oids = (List *) lsecond(runtimeappend_private); - int nchildren = list_length(custom_oids); + ListCell *oid_cell, + *plan_cell; + List *runtimeappend_private = linitial(cscan->custom_private), + *custom_oids; /* Oids of partitions */ + int custom_oids_count; /* number of partitions */ + HTAB *children_table; HASHCTL *children_table_config = &scan_state->children_table_config; int i; + /* Extract Oids list from packed data */ + custom_oids = (List *) lsecond(runtimeappend_private); + custom_oids_count = list_length(custom_oids); + memset(children_table_config, 0, sizeof(HASHCTL)); children_table_config->keysize = sizeof(Oid); children_table_config->entrysize = sizeof(ChildScanCommonData); - children_table = hash_create("Plan storage", nchildren, + children_table = hash_create("RuntimeAppend plan storage", + custom_oids_count, children_table_config, - HASH_ELEM | HASH_BLOBS); + HASH_ELEM | HASH_BLOBS); i = 0; forboth (oid_cell, custom_oids, plan_cell, cscan->custom_plans) @@ -225,8 +244,83 @@ unpack_runtimeappend_private(RuntimeAppendState *scan_state, CustomScan *cscan) child->original_order = i++; /* will be used in EXPLAIN */ } + /* Finally fill 'scan_state' with unpacked elements */ scan_state->children_table = children_table; scan_state->relid = linitial_oid(linitial(runtimeappend_private)); + scan_state->enable_parent = (bool) linitial_int(lthird(runtimeappend_private)); +} + +/* + * Filter all available clauses and extract relevant ones. 
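+ *
+ * A clause qualifies only if the partitioned column of 'partitioned_rel'
+ * is the single attribute it references.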
+ */ +List * +get_partitioned_attr_clauses(List *restrictinfo_list, + const PartRelationInfo *prel, + Index partitioned_rel) +{ +#define AdjustAttno(attno) \ + ( (AttrNumber) (attno + FirstLowInvalidHeapAttributeNumber) ) + + List *result = NIL; + ListCell *l; + + foreach(l, restrictinfo_list) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); + Bitmapset *varattnos = NULL; + int part_attno; + + Assert(IsA(rinfo, RestrictInfo)); + pull_varattnos((Node *) rinfo->clause, partitioned_rel, &varattnos); + + if (bms_get_singleton_member(varattnos, &part_attno) && + AdjustAttno(part_attno) == prel->attnum) + { + result = lappend(result, rinfo->clause); + } + } + return result; +} + + +/* Transform partition ranges into plain array of partition Oids */ +Oid * +get_partition_oids(List *ranges, int *n, const PartRelationInfo *prel, + bool include_parent) +{ + ListCell *range_cell; + uint32 allocated = INITIAL_ALLOC_NUM, + used = 0; + Oid *result = (Oid *) palloc(allocated * sizeof(Oid)); + Oid *children = PrelGetChildrenArray(prel); + + /* If required, add parent to result */ + Assert(INITIAL_ALLOC_NUM >= 1); + if (include_parent) + result[used++] = PrelParentRelid(prel); + + /* Deal with selected partitions */ + foreach (range_cell, ranges) + { + uint32 i; + uint32 a = lfirst_irange(range_cell).ir_lower, + b = lfirst_irange(range_cell).ir_upper; + + for (i = a; i <= b; i++) + { + if (allocated <= used) + { + allocated = allocated * ALLOC_EXP + 1; + result = repalloc(result, allocated * sizeof(Oid)); + } + + Assert(i < PrelChildrenCount(prel)); + result[used++] = children[i]; + } + } + + *n = used; + return result; } Path * @@ -245,7 +339,7 @@ create_append_path_common(PlannerInfo *root, RuntimeAppendPath *result; - result = palloc0(size); + result = (RuntimeAppendPath *) palloc0(size); NodeSetTag(result, T_CustomPath); result->cpath.path.pathtype = T_CustomScan; @@ -266,13 +360,16 @@ create_append_path_common(PlannerInfo *root, result->relid = inner_entry->relid; result->nchildren = list_length(inner_append->subpaths); - result->children = palloc(result->nchildren * sizeof(ChildScanCommon)); + result->children = (ChildScanCommon *) + palloc(result->nchildren * sizeof(ChildScanCommon)); i = 0; foreach (lc, inner_append->subpaths) { Path *path = lfirst(lc); Index relindex = path->parent->relid; - ChildScanCommon child = palloc(sizeof(ChildScanCommonData)); + ChildScanCommon child; + + child = (ChildScanCommon) palloc(sizeof(ChildScanCommonData)); result->cpath.path.startup_cost += path->startup_cost; result->cpath.path.total_cost += path->total_cost; @@ -301,84 +398,55 @@ create_append_plan_common(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *custom_plans, CustomScanMethods *scan_methods) { - RuntimeAppendPath *gpath = (RuntimeAppendPath *) best_path; - CustomScan *cscan; + RuntimeAppendPath *rpath = (RuntimeAppendPath *) best_path; + const PartRelationInfo *prel; + CustomScan *cscan; + + prel = get_pathman_relation_info(rpath->relid); + Assert(prel); cscan = makeNode(CustomScan); - cscan->custom_scan_tlist = NIL; + cscan->custom_scan_tlist = NIL; /* initial value (empty list) */ + cscan->scan.plan.targetlist = NIL; if (custom_plans) { ListCell *lc1, *lc2; - forboth (lc1, gpath->cpath.custom_paths, lc2, custom_plans) + forboth (lc1, rpath->cpath.custom_paths, lc2, custom_plans) { Plan *child_plan = (Plan *) lfirst(lc2); RelOptInfo *child_rel = ((Path *) lfirst(lc1))->parent; - /* We inforce IndexOnlyScans to return all available columns */ - if (IsA(child_plan, IndexOnlyScan)) - { - 
IndexOptInfo *indexinfo = ((IndexPath *) lfirst(lc1))->indexinfo; - RangeTblEntry *rentry = root->simple_rte_array[child_rel->relid]; - Relation child_relation; - - /* TODO: find out whether we need locks or not */ - child_relation = heap_open(rentry->relid, NoLock); - child_plan->targetlist = build_index_tlist(root, indexinfo, - child_relation); - heap_close(child_relation, NoLock); - - if (!cscan->custom_scan_tlist) - { - /* Set appropriate tlist for child scans */ - cscan->custom_scan_tlist = - replace_tlist_varnos(child_plan->targetlist, rel); - - /* Replace parent's tlist as well */ - tlist = cscan->custom_scan_tlist; - } - } - /* Don't generate useless physical tlists that will be replaced */ - else if (!cscan->custom_scan_tlist) - child_plan->targetlist = build_physical_tlist(root, child_rel); - } + /* Replace rel's tlist with a matching one */ + if (!cscan->scan.plan.targetlist) + tlist = replace_tlist_varnos(child_plan->targetlist, rel); - /* - * Go through the other (non-IOS) plans and replace their - * physical tlists with the new 'custom_scan_tlist'. - */ - if (cscan->custom_scan_tlist) - forboth (lc1, gpath->cpath.custom_paths, lc2, custom_plans) - { - Plan *child_plan = (Plan *) lfirst(lc2); - RelOptInfo *child_rel = ((Path *) lfirst(lc1))->parent; + /* Add partition attribute if necessary (for ExecQual()) */ + child_plan->targetlist = append_part_attr_to_tlist(child_plan->targetlist, + child_rel->relid, + prel); - if (!IsA(child_plan, IndexOnlyScan)) - child_plan->targetlist = - replace_tlist_varnos(cscan->custom_scan_tlist, child_rel); - } + /* Now make custom_scan_tlist match child plans' targetlists */ + if (!cscan->custom_scan_tlist) + cscan->custom_scan_tlist = replace_tlist_varnos(child_plan->targetlist, + rel); + } } cscan->scan.plan.qual = NIL; cscan->scan.plan.targetlist = tlist; - /* - * Initialize custom_scan_tlist if it's not - * ready yet (there are no IndexOnlyScans). 
- */ - if (!cscan->custom_scan_tlist) - cscan->custom_scan_tlist = tlist; - + /* Since we're not scanning any real table directly */ cscan->scan.scanrelid = 0; - cscan->custom_exprs = get_actual_clauses(clauses); + cscan->custom_exprs = get_partitioned_attr_clauses(clauses, prel, rel->relid); cscan->custom_plans = custom_plans; - cscan->methods = scan_methods; - pack_runtimeappend_private(cscan, gpath); + /* Cache 'prel->enable_parent' as well */ + pack_runtimeappend_private(cscan, rpath, prel->enable_parent); return &cscan->scan.plan; } @@ -388,19 +456,17 @@ create_append_scan_state_common(CustomScan *node, CustomExecMethods *exec_methods, uint32 size) { - RuntimeAppendState *scan_state = palloc0(size); + RuntimeAppendState *scan_state; + scan_state = (RuntimeAppendState *) palloc0(size); NodeSetTag(scan_state, T_CustomScanState); + scan_state->css.flags = node->flags; scan_state->css.methods = exec_methods; scan_state->custom_exprs = node->custom_exprs; unpack_runtimeappend_private(scan_state, node); - /* Fill in relation info using main table's relid */ - scan_state->prel = get_pathman_relation_info(scan_state->relid, NULL); - Assert(scan_state->prel); - scan_state->cur_plans = NULL; scan_state->ncur_plans = 0; scan_state->running_idx = 0; @@ -426,14 +492,16 @@ exec_append_common(CustomScanState *node, { RuntimeAppendState *scan_state = (RuntimeAppendState *) node; + /* ReScan if no plans are selected */ if (scan_state->ncur_plans == 0) ExecReScan(&node->ss.ps); for (;;) { + /* Fetch next tuple if we're done with Projections */ if (!node->ss.ps.ps_TupFromTlist) { - fetch_next_tuple(node); + fetch_next_tuple(node); /* use specific callback */ if (TupIsNull(scan_state->slot)) return NULL; @@ -475,33 +543,33 @@ end_append_common(CustomScanState *node) void rescan_append_common(CustomScanState *node) { - RuntimeAppendState *scan_state = (RuntimeAppendState *) node; - ExprContext *econtext = node->ss.ps.ps_ExprContext; - PartRelationInfo *prel = scan_state->prel; - List *ranges; - ListCell *lc; - Oid *parts; - int nparts; - WalkerContext wcxt; + RuntimeAppendState *scan_state = (RuntimeAppendState *) node; + ExprContext *econtext = node->ss.ps.ps_ExprContext; + const PartRelationInfo *prel; + List *ranges; + ListCell *lc; + WalkerContext wcxt; + Oid *parts; + int nparts; - ranges = list_make1_int(make_irange(0, prel->children_count - 1, false)); + prel = get_pathman_relation_info(scan_state->relid); + Assert(prel); - wcxt.prel = prel; - wcxt.econtext = econtext; - wcxt.hasLeast = false; - wcxt.hasGreatest = false; + /* First we select all available partitions... */ + ranges = list_make1_irange(make_irange(0, PrelLastChild(prel), false)); + InitWalkerContext(&wcxt, prel, econtext, false); foreach (lc, scan_state->custom_exprs) { WrapperNode *wn; + /* ... 
then we cut off irrelevant ones using the provided clauses */ wn = walk_expr_tree((Expr *) lfirst(lc), &wcxt); - ranges = irange_list_intersect(ranges, wn->rangeset); } /* Get Oids of the required partitions */ - parts = get_partition_oids(ranges, &nparts, prel); + parts = get_partition_oids(ranges, &nparts, prel, scan_state->enable_parent); /* Select new plans for this run using 'parts' */ if (scan_state->cur_plans) @@ -527,13 +595,15 @@ explain_append_common(CustomScanState *node, HTAB *children_table, ExplainState /* Construct excess PlanStates */ if (!es->analyze) { - int allocated = 10; - int used = 0; - ChildScanCommon *custom_ps = palloc(allocated * sizeof(ChildScanCommon)); - ChildScanCommon child; + uint32 allocated = INITIAL_ALLOC_NUM, + used = 0; + ChildScanCommon *custom_ps, + child; HASH_SEQ_STATUS seqstat; int i; + custom_ps = (ChildScanCommon *) palloc(allocated * sizeof(ChildScanCommon)); + /* There can't be any nodes since we're not scanning anything */ Assert(!node->custom_ps); @@ -544,7 +614,7 @@ explain_append_common(CustomScanState *node, HTAB *children_table, ExplainState { if (allocated <= used) { - allocated *= 2; + allocated = allocated * ALLOC_EXP + 1; custom_ps = repalloc(custom_ps, allocated * sizeof(ChildScanCommon)); } diff --git a/contrib/pg_pathman/src/nodes_common.h b/contrib/pg_pathman/src/nodes_common.h index 82d4bb9d88..f0423a48e8 100644 --- a/contrib/pg_pathman/src/nodes_common.h +++ b/contrib/pg_pathman/src/nodes_common.h @@ -7,13 +7,26 @@ * * ------------------------------------------------------------------------ */ + #ifndef NODES_COMMON_H #define NODES_COMMON_H +#include "relation_info.h" + +#include "postgres.h" #include "commands/explain.h" -#include "pathman.h" +#include "optimizer/planner.h" + +#if PG_VERSION_NUM >= 90600 +#include "nodes/extensible.h" +#endif +/* + * Common structure for storing selected + * Paths/Plans/PlanStates in a hash table + * or its slice. 
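+ *
+ * 'content_type' tells which member of the union is currently valid;
+ * a rough illustration of how an entry might be filled at execution time
+ * (simplified — the actual entries are built by the RuntimeAppend and
+ * RuntimeMergeAppend machinery):
+ *
+ *		child->content_type = CHILD_PLAN_STATE;
+ *		child->content.plan_state = ExecInitNode(plan, estate, 0);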
+ */ typedef struct { Oid relid; /* partition relid */ @@ -23,7 +36,7 @@ typedef struct CHILD_PATH = 0, CHILD_PLAN, CHILD_PLAN_STATE - } content_type; + } content_type; union { @@ -32,7 +45,7 @@ typedef struct PlanState *plan_state; } content; - int original_order; /* for sorting in EXPLAIN */ + int original_order; /* for sorting in EXPLAIN */ } ChildScanCommonData; typedef ChildScanCommonData *ChildScanCommon; @@ -51,6 +64,13 @@ clear_plan_states(CustomScanState *scan_state) } } +List * get_partitioned_attr_clauses(List *restrictinfo_list, + const PartRelationInfo *prel, + Index partitioned_rel); + +Oid * get_partition_oids(List *ranges, int *n, const PartRelationInfo *prel, + bool include_parent); + Path * create_append_path_common(PlannerInfo *root, AppendPath *inner_append, ParamPathInfo *param_info, diff --git a/contrib/pg_pathman/src/partition_filter.c b/contrib/pg_pathman/src/partition_filter.c new file mode 100644 index 0000000000..51f09923e0 --- /dev/null +++ b/contrib/pg_pathman/src/partition_filter.c @@ -0,0 +1,839 @@ +/* ------------------------------------------------------------------------ + * + * partition_filter.c + * Select partition for INSERT operation + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#include "init.h" +#include "nodes_common.h" +#include "partition_filter.h" +#include "utils.h" + +#include "foreign/fdwapi.h" +#include "foreign/foreign.h" +#include "nodes/nodeFuncs.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + + +#define ALLOC_EXP 2 + + +/* + * We use this struct as an argument for fake + * MemoryContextCallback pf_memcxt_callback() + * in order to attach some additional info to + * EState (estate->es_query_cxt is involved). + */ +typedef struct +{ + int estate_alloc_result_rels; /* number of allocated result rels */ + bool estate_not_modified; /* did we modify EState somehow? */ +} estate_mod_data; + +/* + * Allow INSERTs into any FDW \ postgres_fdw \ no FDWs at all. 
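+ *
+ * The mode is selected with the "pg_pathman.insert_into_fdw" GUC
+ * (registered in init_partition_filter_static_data() below), whose valid
+ * values are 'disabled', 'postgres' and 'any_fdw'.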
+ */ +typedef enum +{ + PF_FDW_INSERT_DISABLED = 0, /* INSERTs into FDWs are prohibited */ + PF_FDW_INSERT_POSTGRES, /* INSERTs into postgres_fdw are OK */ + PF_FDW_INSERT_ANY_FDW /* INSERTs into any FDWs are OK */ +} PF_insert_fdw_mode; + +static const struct config_enum_entry pg_pathman_insert_into_fdw_options[] = { + { "disabled", PF_FDW_INSERT_DISABLED, false }, + { "postgres", PF_FDW_INSERT_POSTGRES, false }, + { "any_fdw", PF_FDW_INSERT_ANY_FDW, false }, + { NULL, 0, false } +}; + + +bool pg_pathman_enable_partition_filter = true; +int pg_pathman_insert_into_fdw = PF_FDW_INSERT_POSTGRES; + +CustomScanMethods partition_filter_plan_methods; +CustomExecMethods partition_filter_exec_methods; + + +static estate_mod_data * fetch_estate_mod_data(EState *estate); +static void partition_filter_visitor(Plan *plan, void *context); +static List * pfilter_build_tlist(List *tlist); +static Index append_rte_to_estate(EState *estate, RangeTblEntry *rte); +static int append_rri_to_estate(EState *estate, ResultRelInfo *rri); +static void prepare_rri_fdw_for_insert(EState *estate, + ResultRelInfoHolder *rri_holder, + void *arg); + + +void +init_partition_filter_static_data(void) +{ + partition_filter_plan_methods.CustomName = "PartitionFilter"; + partition_filter_plan_methods.CreateCustomScanState = partition_filter_create_scan_state; + + partition_filter_exec_methods.CustomName = "PartitionFilter"; + partition_filter_exec_methods.BeginCustomScan = partition_filter_begin; + partition_filter_exec_methods.ExecCustomScan = partition_filter_exec; + partition_filter_exec_methods.EndCustomScan = partition_filter_end; + partition_filter_exec_methods.ReScanCustomScan = partition_filter_rescan; + partition_filter_exec_methods.MarkPosCustomScan = NULL; + partition_filter_exec_methods.RestrPosCustomScan = NULL; + partition_filter_exec_methods.ExplainCustomScan = partition_filter_explain; + + DefineCustomBoolVariable("pg_pathman.enable_partitionfilter", + "Enables the planner's use of PartitionFilter custom node.", + NULL, + &pg_pathman_enable_partition_filter, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + DefineCustomEnumVariable("pg_pathman.insert_into_fdw", + "Allow INSERTS into FDW partitions.", + NULL, + &pg_pathman_insert_into_fdw, + PF_FDW_INSERT_POSTGRES, + pg_pathman_insert_into_fdw_options, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); +} + + +/* + * Add PartitionFilter nodes to the plan tree + */ +void +add_partition_filters(List *rtable, Plan *plan) +{ + if (pg_pathman_enable_partition_filter) + plan_tree_walker(plan, partition_filter_visitor, rtable); +} + + +/* + * Initialize ResultPartsStorage (hash table etc). 
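+ *
+ * A typical call (simplified from partition_filter_begin() below, with
+ * speculative inserts disabled):
+ *
+ *		init_result_parts_storage(&state->result_parts, estate, false,
+ *								  ResultPartsStorageStandard,
+ *								  prepare_rri_fdw_for_insert, NULL);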
+ */ +void +init_result_parts_storage(ResultPartsStorage *parts_storage, + EState *estate, + bool speculative_inserts, + Size table_entry_size, + on_new_rri_holder on_new_rri_holder_cb, + void *on_new_rri_holder_cb_arg) +{ + HASHCTL *result_rels_table_config = &parts_storage->result_rels_table_config; + + memset(result_rels_table_config, 0, sizeof(HASHCTL)); + result_rels_table_config->keysize = sizeof(Oid); + + /* Use sizeof(ResultRelInfoHolder) if table_entry_size is 0 */ + if (table_entry_size == ResultPartsStorageStandard) + result_rels_table_config->entrysize = sizeof(ResultRelInfoHolder); + else + result_rels_table_config->entrysize = table_entry_size; + + parts_storage->result_rels_table = hash_create("ResultRelInfo storage", 10, + result_rels_table_config, + HASH_ELEM | HASH_BLOBS); + parts_storage->estate = estate; + parts_storage->saved_rel_info = NULL; + + parts_storage->on_new_rri_holder_callback = on_new_rri_holder_cb; + parts_storage->callback_arg = on_new_rri_holder_cb_arg; + + /* Currenly ResultPartsStorage is used only for INSERTs */ + parts_storage->command_type = CMD_INSERT; + parts_storage->speculative_inserts = speculative_inserts; + + /* Partitions must remain locked till transaction's end */ + parts_storage->head_open_lock_mode = RowExclusiveLock; + parts_storage->heap_close_lock_mode = NoLock; +} + +/* + * Free ResultPartsStorage (close relations etc). + */ +void +fini_result_parts_storage(ResultPartsStorage *parts_storage, bool close_rels) +{ + /* Close partitions and their indices if asked to */ + if (close_rels) + { + HASH_SEQ_STATUS stat; + ResultRelInfoHolder *rri_holder; /* ResultRelInfo holder */ + + hash_seq_init(&stat, parts_storage->result_rels_table); + while ((rri_holder = (ResultRelInfoHolder *) hash_seq_search(&stat)) != NULL) + { + ExecCloseIndices(rri_holder->result_rel_info); + + heap_close(rri_holder->result_rel_info->ri_RelationDesc, + parts_storage->heap_close_lock_mode); + } + } + + /* Finally destroy hash table */ + hash_destroy(parts_storage->result_rels_table); +} + +/* + * Find a ResultRelInfo for the partition using ResultPartsStorage. 
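+ *
+ * Returns a cached holder, creating and caching a new one on first
+ * access; NULL means the partition has been dropped concurrently, so
+ * callers are expected to raise an error, roughly:
+ *
+ *		rri_holder = scan_result_parts_storage(partid, parts_storage);
+ *		if (rri_holder == NULL)
+ *			elog(ERROR, ERR_PART_ATTR_NO_PART, key_text);
+ *
+ * (key_text is illustrative; the real caller builds it with
+ *  datum_to_cstring(), cf. select_partition_for_insert() below)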
+ */ +ResultRelInfoHolder * +scan_result_parts_storage(Oid partid, ResultPartsStorage *parts_storage) +{ +#define CopyToResultRelInfo(field_name) \ + ( part_result_rel_info->field_name = parts_storage->saved_rel_info->field_name ) + + ResultRelInfoHolder *rri_holder; + bool found; + + rri_holder = hash_search(parts_storage->result_rels_table, + (const void *) &partid, + HASH_ENTER, &found); + + /* If not found, create & cache new ResultRelInfo */ + if (!found) + { + Relation child_rel; + RangeTblEntry *child_rte, + *parent_rte; + Index child_rte_idx; + ResultRelInfo *part_result_rel_info; + + /* Lock partition and check if it exists */ + LockRelationOid(partid, parts_storage->head_open_lock_mode); + if(!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(partid))) + { + UnlockRelationOid(partid, parts_storage->head_open_lock_mode); + return NULL; + } + + parent_rte = rt_fetch(parts_storage->saved_rel_info->ri_RangeTableIndex, + parts_storage->estate->es_range_table); + + /* Open relation and check if it is a valid target */ + child_rel = heap_open(partid, NoLock); + CheckValidResultRel(child_rel, parts_storage->command_type); + + /* Create RangeTblEntry for partition */ + child_rte = makeNode(RangeTblEntry); + + child_rte->rtekind = RTE_RELATION; + child_rte->relid = partid; + child_rte->relkind = child_rel->rd_rel->relkind; + child_rte->eref = parent_rte->eref; + child_rte->requiredPerms = parent_rte->requiredPerms; + child_rte->checkAsUser = parent_rte->checkAsUser; + child_rte->insertedCols = parent_rte->insertedCols; + + /* Check permissions for partition */ + ExecCheckRTPerms(list_make1(child_rte), true); + + /* Append RangeTblEntry to estate->es_range_table */ + child_rte_idx = append_rte_to_estate(parts_storage->estate, child_rte); + + /* Create ResultRelInfo for partition */ + part_result_rel_info = makeNode(ResultRelInfo); + + /* Check that 'saved_rel_info' is set */ + if (!parts_storage->saved_rel_info) + elog(ERROR, "ResultPartsStorage contains no saved_rel_info"); + + InitResultRelInfo(part_result_rel_info, + child_rel, + child_rte_idx, + parts_storage->estate->es_instrument); + + if (parts_storage->command_type != CMD_DELETE) + ExecOpenIndices(part_result_rel_info, parts_storage->speculative_inserts); + + /* Copy necessary fields from saved ResultRelInfo */ + CopyToResultRelInfo(ri_WithCheckOptions); + CopyToResultRelInfo(ri_WithCheckOptionExprs); + CopyToResultRelInfo(ri_junkFilter); + CopyToResultRelInfo(ri_projectReturning); + CopyToResultRelInfo(ri_onConflictSetProj); + CopyToResultRelInfo(ri_onConflictSetWhere); + + /* ri_ConstraintExprs will be initialized by ExecRelCheck() */ + part_result_rel_info->ri_ConstraintExprs = NULL; + + /* Finally fill the ResultRelInfo holder */ + rri_holder->partid = partid; + rri_holder->result_rel_info = part_result_rel_info; + + /* Call on_new_rri_holder_callback() if needed */ + if (parts_storage->on_new_rri_holder_callback) + parts_storage->on_new_rri_holder_callback(parts_storage->estate, + rri_holder, + parts_storage->callback_arg); + + /* Append ResultRelInfo to storage->es_alloc_result_rels */ + append_rri_to_estate(parts_storage->estate, part_result_rel_info); + } + + return rri_holder; +} + +/* + * Find matching partitions for 'value' using PartRelationInfo. 
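+ *
+ * The result is a palloc'd array of partition Oids whose length is
+ * returned via 'nparts'; a sketch of the intended use
+ * (see select_partition_for_insert() below):
+ *
+ *		parts = find_partitions_for_value(value, prel, econtext, &nparts);
+ *		if (nparts == 1)
+ *			selected_partid = parts[0];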
+ */ +Oid * +find_partitions_for_value(Datum value, const PartRelationInfo *prel, + ExprContext *econtext, int *nparts) +{ +#define CopyToTempConst(const_field, attr_field) \ + ( temp_const.const_field = prel->attr_field ) + + Const temp_const; /* temporary const for expr walker */ + WalkerContext wcxt; + List *ranges = NIL; + + /* Prepare dummy Const node */ + NodeSetTag(&temp_const, T_Const); + temp_const.location = -1; + + /* Fill const with value ... */ + temp_const.constvalue = value; + temp_const.constisnull = false; + + /* ... and some other important data */ + CopyToTempConst(consttype, atttype); + CopyToTempConst(consttypmod, atttypmod); + CopyToTempConst(constcollid, attcollid); + CopyToTempConst(constlen, attlen); + CopyToTempConst(constbyval, attbyval); + + InitWalkerContext(&wcxt, prel, econtext, true); + ranges = walk_expr_tree((Expr *) &temp_const, &wcxt)->rangeset; + return get_partition_oids(ranges, nparts, prel, false); +} + + +Plan * +make_partition_filter(Plan *subplan, Oid partitioned_table, + OnConflictAction conflict_action) +{ + CustomScan *cscan = makeNode(CustomScan); + + cscan->scan.plan.startup_cost = subplan->startup_cost; + cscan->scan.plan.total_cost = subplan->total_cost; + cscan->scan.plan.plan_rows = subplan->plan_rows; + cscan->scan.plan.plan_width = subplan->plan_width; + + cscan->methods = &partition_filter_plan_methods; + cscan->custom_plans = list_make1(subplan); + + cscan->scan.plan.targetlist = pfilter_build_tlist(subplan->targetlist); + + /* No relation will be scanned */ + cscan->scan.scanrelid = 0; + cscan->custom_scan_tlist = subplan->targetlist; + + /* Pack partitioned table's Oid and conflict_action */ + cscan->custom_private = list_make2_int(partitioned_table, conflict_action); + + return &cscan->scan.plan; +} + +Node * +partition_filter_create_scan_state(CustomScan *node) +{ + PartitionFilterState *state; + + state = (PartitionFilterState *) palloc0(sizeof(PartitionFilterState)); + NodeSetTag(state, T_CustomScanState); + + state->css.flags = node->flags; + state->css.methods = &partition_filter_exec_methods; + + /* Extract necessary variables */ + state->subplan = (Plan *) linitial(node->custom_plans); + state->partitioned_table = linitial_int(node->custom_private); + state->on_conflict_action = lsecond_int(node->custom_private); + + /* Check boundaries */ + Assert(state->on_conflict_action >= ONCONFLICT_NONE || + state->on_conflict_action <= ONCONFLICT_UPDATE); + + /* There should be exactly one subplan */ + Assert(list_length(node->custom_plans) == 1); + + return (Node *) state; +} + +void +partition_filter_begin(CustomScanState *node, EState *estate, int eflags) +{ + PartitionFilterState *state = (PartitionFilterState *) node; + + /* It's convenient to store PlanState in 'custom_ps' */ + node->custom_ps = list_make1(ExecInitNode(state->subplan, estate, eflags)); + + /* Init ResultRelInfo cache */ + init_result_parts_storage(&state->result_parts, estate, + state->on_conflict_action != ONCONFLICT_NONE, + ResultPartsStorageStandard, prepare_rri_fdw_for_insert, NULL); + + state->warning_triggered = false; +} + +TupleTableSlot * +partition_filter_exec(CustomScanState *node) +{ + PartitionFilterState *state = (PartitionFilterState *) node; + + ExprContext *econtext = node->ss.ps.ps_ExprContext; + EState *estate = node->ss.ps.state; + PlanState *child_ps = (PlanState *) linitial(node->custom_ps); + TupleTableSlot *slot; + + slot = ExecProcNode(child_ps); + + /* Save original ResultRelInfo */ + if (!state->result_parts.saved_rel_info) + 
state->result_parts.saved_rel_info = estate->es_result_relation_info; + + if (!TupIsNull(slot)) + { + MemoryContext old_cxt; + const PartRelationInfo *prel; + ResultRelInfoHolder *rri_holder; + bool isnull; + Datum value; + + /* Fetch PartRelationInfo for this partitioned relation */ + prel = get_pathman_relation_info(state->partitioned_table); + if (!prel) + { + if (!state->warning_triggered) + elog(WARNING, "Relation \"%s\" is not partitioned, " + "PartitionFilter will behave as a normal INSERT", + get_rel_name_or_relid(state->partitioned_table)); + + return slot; + } + + /* Extract partitioned column's value (also check types) */ + Assert(slot->tts_tupleDescriptor-> + attrs[prel->attnum - 1]->atttypid == prel->atttype); + value = slot_getattr(slot, prel->attnum, &isnull); + if (isnull) + elog(ERROR, ERR_PART_ATTR_NULL); + + /* Switch to per-tuple context */ + old_cxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + + /* Search for a matching partition */ + rri_holder = select_partition_for_insert(prel, + &state->result_parts, + value, estate, true); + estate->es_result_relation_info = rri_holder->result_rel_info; + + /* Switch back and clean up per-tuple context */ + MemoryContextSwitchTo(old_cxt); + ResetExprContext(econtext); + + return slot; + } + + return NULL; +} + +void +partition_filter_end(CustomScanState *node) +{ + PartitionFilterState *state = (PartitionFilterState *) node; + + /* Executor will close rels via estate->es_result_relations */ + fini_result_parts_storage(&state->result_parts, false); + + Assert(list_length(node->custom_ps) == 1); + ExecEndNode((PlanState *) linitial(node->custom_ps)); +} + +void +partition_filter_rescan(CustomScanState *node) +{ + Assert(list_length(node->custom_ps) == 1); + ExecReScan((PlanState *) linitial(node->custom_ps)); +} + +void +partition_filter_explain(CustomScanState *node, List *ancestors, ExplainState *es) +{ + /* Nothing to do here now */ +} + +/* + * Smart wrapper for scan_result_parts_storage(). 
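+ *
+ * Finds a partition for 'value' (possibly spawning a new one when
+ * 'spawn_partitions' is set and auto partition creation is enabled) and
+ * returns its cached ResultRelInfo holder, e.g. (as in
+ * partition_filter_exec() above):
+ *
+ *		rri_holder = select_partition_for_insert(prel, &state->result_parts,
+ *												 value, estate, true);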
+ */ +ResultRelInfoHolder * +select_partition_for_insert(const PartRelationInfo *prel, + ResultPartsStorage *parts_storage, + Datum value, EState *estate, + bool spawn_partitions) +{ + MemoryContext old_cxt; + ExprContext *econtext; + ResultRelInfoHolder *rri_holder; + Oid selected_partid = InvalidOid; + Oid *parts; + int nparts; + + econtext = GetPerTupleExprContext(estate); + + /* Search for matching partitions */ + parts = find_partitions_for_value(value, prel, econtext, &nparts); + + if (nparts > 1) + elog(ERROR, ERR_PART_ATTR_MULTIPLE); + else if (nparts == 0) + { + /* + * If auto partition propagation is enabled then try to create + * new partitions for the key + */ + if (prel->auto_partition && IsAutoPartitionEnabled() && spawn_partitions) + { + selected_partid = create_partitions(PrelParentRelid(prel), + value, prel->atttype); + + /* get_pathman_relation_info() will refresh this entry */ + invalidate_pathman_relation_info(PrelParentRelid(prel), NULL); + } + else + elog(ERROR, ERR_PART_ATTR_NO_PART, + datum_to_cstring(value, prel->atttype)); + } + else selected_partid = parts[0]; + + /* Replace parent table with a suitable partition */ + old_cxt = MemoryContextSwitchTo(estate->es_query_cxt); + rri_holder = scan_result_parts_storage(selected_partid, parts_storage); + MemoryContextSwitchTo(old_cxt); + + /* Could not find suitable partition */ + if (rri_holder == NULL) + elog(ERROR, ERR_PART_ATTR_NO_PART, + datum_to_cstring(value, prel->atttype)); + + return rri_holder; +} + +/* + * Callback to be executed on FDW partitions. + */ +static void +prepare_rri_fdw_for_insert(EState *estate, + ResultRelInfoHolder *rri_holder, + void *arg) +{ + ResultRelInfo *rri = rri_holder->result_rel_info; + FdwRoutine *fdw_routine = rri->ri_FdwRoutine; + Oid partid; + + /* Nothing to do if not FDW */ + if (fdw_routine == NULL) + return; + + partid = RelationGetRelid(rri->ri_RelationDesc); + + /* Perform some checks according to 'pg_pathman_insert_into_fdw' */ + switch (pg_pathman_insert_into_fdw) + { + case PF_FDW_INSERT_DISABLED: + elog(ERROR, "INSERTs into FDW partitions are disabled"); + break; + + case PF_FDW_INSERT_POSTGRES: + { + ForeignDataWrapper *fdw; + ForeignServer *fserver; + + /* Check if it's PostgreSQL FDW */ + fserver = GetForeignServer(GetForeignTable(partid)->serverid); + fdw = GetForeignDataWrapper(fserver->fdwid); + if (strcmp("postgres_fdw", fdw->fdwname) != 0) + elog(ERROR, "FDWs other than postgres_fdw are restricted"); + } + break; + + case PF_FDW_INSERT_ANY_FDW: + { + ForeignDataWrapper *fdw; + ForeignServer *fserver; + + fserver = GetForeignServer(GetForeignTable(partid)->serverid); + fdw = GetForeignDataWrapper(fserver->fdwid); + if (strcmp("postgres_fdw", fdw->fdwname) != 0) + elog(WARNING, "unrestricted FDW mode may lead to \"%s\" crashes", + fdw->fdwname); + } + break; /* do nothing */ + + default: + elog(ERROR, "Mode is not implemented yet"); + break; + } + + if (fdw_routine->PlanForeignModify) + { + RangeTblEntry *rte; + ModifyTableState mtstate; + List *fdw_private; + Query query; + PlannedStmt *plan; + TupleDesc tupdesc; + int i, + target_attr; + + /* Fetch RangeTblEntry for partition */ + rte = rt_fetch(rri->ri_RangeTableIndex, estate->es_range_table); + + /* Fetch tuple descriptor */ + tupdesc = RelationGetDescr(rri->ri_RelationDesc); + + /* Create fake Query node */ + memset((void *) &query, 0, sizeof(Query)); + NodeSetTag(&query, T_Query); + + query.commandType = CMD_INSERT; + query.querySource = QSRC_ORIGINAL; + query.resultRelation = 1; + query.rtable = 
list_make1(copyObject(rte)); + query.jointree = makeNode(FromExpr); + + query.targetList = NIL; + query.returningList = NIL; + + /* Generate 'query.targetList' using 'tupdesc' */ + target_attr = 1; + for (i = 0; i < tupdesc->natts; i++) + { + Form_pg_attribute attr; + TargetEntry *te; + Param *param; + + attr = tupdesc->attrs[i]; + + if (attr->attisdropped) + continue; + + param = makeNode(Param); + param->paramkind = PARAM_EXTERN; + param->paramid = target_attr; + param->paramtype = attr->atttypid; + param->paramtypmod = attr->atttypmod; + param->paramcollid = attr->attcollation; + param->location = -1; + + te = makeTargetEntry((Expr *) param, target_attr, + pstrdup(NameStr(attr->attname)), + false); + + query.targetList = lappend(query.targetList, te); + + target_attr++; + } + + /* Create fake ModifyTableState */ + memset((void *) &mtstate, 0, sizeof(ModifyTableState)); + NodeSetTag(&mtstate, T_ModifyTableState); + mtstate.ps.state = estate; + mtstate.operation = CMD_INSERT; + mtstate.resultRelInfo = rri; + mtstate.mt_onconflict = ONCONFLICT_NONE; + + /* Plan fake query in for FDW access to be planned as well */ + elog(DEBUG1, "FDW(%u): plan fake query for fdw_private", partid); + plan = standard_planner(&query, 0, NULL); + + /* Extract fdw_private from useless plan */ + elog(DEBUG1, "FDW(%u): extract fdw_private", partid); + fdw_private = (List *) + linitial(((ModifyTable *) plan->planTree)->fdwPrivLists); + + /* call BeginForeignModify on 'rri' */ + elog(DEBUG1, "FDW(%u): call BeginForeignModify on a fake INSERT node", partid); + fdw_routine->BeginForeignModify(&mtstate, rri, fdw_private, 0, 0); + + /* Report success */ + elog(DEBUG1, "FDW(%u): success", partid); + } +} + +/* + * Used by fetch_estate_mod_data() to find estate_mod_data. + */ +static void +pf_memcxt_callback(void *arg) { elog(DEBUG1, "EState is destroyed"); } + +/* + * Fetch (or create) a estate_mod_data structure we've hidden inside es_query_cxt. + */ +static estate_mod_data * +fetch_estate_mod_data(EState *estate) +{ + MemoryContext estate_mcxt = estate->es_query_cxt; + estate_mod_data *emd_struct; + MemoryContextCallback *cb = estate_mcxt->reset_cbs; + + /* Go through callback list */ + while (cb != NULL) + { + /* This is the dummy callback we're looking for! */ + if (cb->func == pf_memcxt_callback) + return (estate_mod_data *) cb->arg; + + cb = estate_mcxt->reset_cbs->next; + } + + /* Have to create a new one */ + emd_struct = MemoryContextAlloc(estate_mcxt, sizeof(estate_mod_data)); + emd_struct->estate_not_modified = true; + emd_struct->estate_alloc_result_rels = estate->es_num_result_relations; + + cb = MemoryContextAlloc(estate_mcxt, sizeof(MemoryContextCallback)); + cb->func = pf_memcxt_callback; + cb->arg = emd_struct; + + MemoryContextRegisterResetCallback(estate_mcxt, cb); + + return emd_struct; +} + +/* + * Append RangeTblEntry 'rte' to estate->es_range_table. + */ +static Index +append_rte_to_estate(EState *estate, RangeTblEntry *rte) +{ + estate_mod_data *emd_struct = fetch_estate_mod_data(estate); + + /* Copy estate->es_range_table if it's first time expansion */ + if (emd_struct->estate_not_modified) + estate->es_range_table = list_copy(estate->es_range_table); + + estate->es_range_table = lappend(estate->es_range_table, rte); + + /* Update estate_mod_data */ + emd_struct->estate_not_modified = false; + + return list_length(estate->es_range_table); +} + +/* + * Append ResultRelInfo 'rri' to estate->es_result_relations. 
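+ *
+ * The array is grown geometrically (new capacity = old * ALLOC_EXP + 1)
+ * whenever it runs out of space, and the index of the appended entry is
+ * returned, e.g.
+ *
+ *		rri_index = append_rri_to_estate(estate, part_result_rel_info);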
+ */ +static int +append_rri_to_estate(EState *estate, ResultRelInfo *rri) +{ + estate_mod_data *emd_struct = fetch_estate_mod_data(estate); + int result_rels_allocated = emd_struct->estate_alloc_result_rels; + + /* Reallocate estate->es_result_relations if needed */ + if (result_rels_allocated <= estate->es_num_result_relations) + { + ResultRelInfo *rri_array = estate->es_result_relations; + + result_rels_allocated = result_rels_allocated * ALLOC_EXP + 1; + estate->es_result_relations = palloc(result_rels_allocated * + sizeof(ResultRelInfo)); + memcpy(estate->es_result_relations, + rri_array, + estate->es_num_result_relations * sizeof(ResultRelInfo)); + } + + /* + * Append ResultRelInfo to 'es_result_relations' array. + * NOTE: this is probably safe since ResultRelInfo + * contains nothing but pointers to various structs. + */ + estate->es_result_relations[estate->es_num_result_relations] = *rri; + + /* Update estate_mod_data */ + emd_struct->estate_alloc_result_rels = result_rels_allocated; + emd_struct->estate_not_modified = false; + + return estate->es_num_result_relations++; +} + +/* + * Build partition filter's target list pointing to subplan tuple's elements + */ +static List * +pfilter_build_tlist(List *tlist) +{ + List *result_tlist = NIL; + ListCell *lc; + int i = 1; + + foreach (lc, tlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + Var *var = makeVar(INDEX_VAR, /* point to subplan's elements */ + i, /* direct attribute mapping */ + exprType((Node *) tle->expr), + exprTypmod((Node *) tle->expr), + exprCollation((Node *) tle->expr), + 0); + + result_tlist = lappend(result_tlist, + makeTargetEntry((Expr *) var, + i, + NULL, + tle->resjunk)); + i++; /* next resno */ + } + + return result_tlist; +} + +/* + * Add partition filters to ModifyTable node's children. + * + * 'context' should point to the PlannedStmt->rtable. 
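+ *
+ * The visitor is applied to the whole plan tree by add_partition_filters(),
+ * roughly:
+ *
+ *		plan_tree_walker(plan, partition_filter_visitor, rtable);
+ *
+ * so each INSERT subplan of a ModifyTable gets wrapped into a
+ * PartitionFilter node (see make_partition_filter()).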
+ */ +static void +partition_filter_visitor(Plan *plan, void *context) +{ + List *rtable = (List *) context; + ModifyTable *modify_table = (ModifyTable *) plan; + ListCell *lc1, + *lc2; + + /* Skip if not ModifyTable with 'INSERT' command */ + if (!IsA(modify_table, ModifyTable) || modify_table->operation != CMD_INSERT) + return; + + Assert(rtable && IsA(rtable, List)); + + forboth (lc1, modify_table->plans, lc2, modify_table->resultRelations) + { + Index rindex = lfirst_int(lc2); + Oid relid = getrelid(rindex, rtable); + const PartRelationInfo *prel = get_pathman_relation_info(relid); + + /* Check that table is partitioned */ + if (prel) + lfirst(lc1) = make_partition_filter((Plan *) lfirst(lc1), + relid, + modify_table->onConflictAction); + } +} diff --git a/contrib/pg_pathman/src/partition_filter.h b/contrib/pg_pathman/src/partition_filter.h new file mode 100644 index 0000000000..f0cf05845b --- /dev/null +++ b/contrib/pg_pathman/src/partition_filter.h @@ -0,0 +1,139 @@ +/* ------------------------------------------------------------------------ + * + * partition_filter.h + * Select partition for INSERT operation + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#ifndef RUNTIME_INSERT_H +#define RUNTIME_INSERT_H + +#include "relation_info.h" +#include "utils.h" + +#include "postgres.h" +#include "commands/explain.h" +#include "optimizer/planner.h" + +#if PG_VERSION_NUM >= 90600 +#include "nodes/extensible.h" +#endif + + +#define ERR_PART_ATTR_NULL "partitioned column's value should not be NULL" +#define ERR_PART_ATTR_NO_PART "no suitable partition for key '%s'" +#define ERR_PART_ATTR_MULTIPLE "PartitionFilter selected more than one partition" + + +/* + * Single element of 'result_rels_table'. + */ +typedef struct +{ + Oid partid; /* partition's relid */ + ResultRelInfo *result_rel_info; /* cached ResultRelInfo */ +} ResultRelInfoHolder; + +/* + * Callback to be fired at rri_holder creation. + */ +typedef void (*on_new_rri_holder)(EState *estate, + ResultRelInfoHolder *rri_holder, + void *arg); + +/* + * Cached ResultRelInfos of partitions. + */ +typedef struct +{ + ResultRelInfo *saved_rel_info; /* original ResultRelInfo (parent) */ + HTAB *result_rels_table; + HASHCTL result_rels_table_config; + + bool speculative_inserts; /* for ExecOpenIndices() */ + + on_new_rri_holder on_new_rri_holder_callback; + void *callback_arg; + + EState *estate; /* pointer to executor's state */ + + CmdType command_type; /* currenly we only allow INSERT */ + LOCKMODE head_open_lock_mode; + LOCKMODE heap_close_lock_mode; +} ResultPartsStorage; + +/* + * Standard size of ResultPartsStorage entry. 
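+ *
+ * Pass it as 'table_entry_size' to init_result_parts_storage() to get
+ * plain ResultRelInfoHolder entries; a caller may pass a bigger entry
+ * size to keep extra per-partition data in the same hash table, keeping
+ * ResultRelInfoHolder as the first field. A hypothetical sketch:
+ *
+ *		typedef struct
+ *		{
+ *			ResultRelInfoHolder holder;
+ *			int					extra_data;
+ *		} CustomEntry;
+ *
+ *		init_result_parts_storage(..., sizeof(CustomEntry), ...);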
+ */ +#define ResultPartsStorageStandard 0 + +typedef struct +{ + CustomScanState css; + + Oid partitioned_table; + OnConflictAction on_conflict_action; + + Plan *subplan; /* proxy variable to store subplan */ + ResultPartsStorage result_parts; /* partition ResultRelInfo cache */ + + bool warning_triggered; /* warning message counter */ +} PartitionFilterState; + + +extern bool pg_pathman_enable_partition_filter; +extern int pg_pathman_insert_into_fdw; + +extern CustomScanMethods partition_filter_plan_methods; +extern CustomExecMethods partition_filter_exec_methods; + + +void init_partition_filter_static_data(void); + +void add_partition_filters(List *rtable, Plan *plan); + +/* ResultPartsStorage init\fini\scan function */ +void init_result_parts_storage(ResultPartsStorage *parts_storage, + EState *estate, + bool speculative_inserts, + Size table_entry_size, + on_new_rri_holder on_new_rri_holder_cb, + void *on_new_rri_holder_cb_arg); +void fini_result_parts_storage(ResultPartsStorage *parts_storage, + bool close_rels); +ResultRelInfoHolder * scan_result_parts_storage(Oid partid, + ResultPartsStorage *storage); + +/* Find suitable partition using 'value' */ +Oid *find_partitions_for_value(Datum value, const PartRelationInfo *prel, + ExprContext *econtext, int *nparts); + +Plan * make_partition_filter(Plan *subplan, + Oid partitioned_table, + OnConflictAction conflict_action); + +Node * partition_filter_create_scan_state(CustomScan *node); + +void partition_filter_begin(CustomScanState *node, + EState *estate, + int eflags); + +TupleTableSlot * partition_filter_exec(CustomScanState *node); + +void partition_filter_end(CustomScanState *node); + +void partition_filter_rescan(CustomScanState *node); + +void partition_filter_explain(CustomScanState *node, + List *ancestors, + ExplainState *es); + +ResultRelInfoHolder * select_partition_for_insert(const PartRelationInfo *prel, + ResultPartsStorage *parts_storage, + Datum value, EState *estate, + bool spawn_partitions); + +#endif diff --git a/contrib/pg_pathman/src/pathman.h b/contrib/pg_pathman/src/pathman.h index dc8c77f43d..84d71dd9ab 100644 --- a/contrib/pg_pathman/src/pathman.h +++ b/contrib/pg_pathman/src/pathman.h @@ -7,238 +7,188 @@ * * ------------------------------------------------------------------------ */ + #ifndef PATHMAN_H #define PATHMAN_H +#include "relation_info.h" +#include "rangeset.h" + #include "postgres.h" -#include "utils/date.h" -#include "utils/hsearch.h" -#include "utils/snapshot.h" -#include "utils/typcache.h" -#include "nodes/pg_list.h" #include "nodes/makefuncs.h" #include "nodes/primnodes.h" #include "nodes/execnodes.h" #include "optimizer/planner.h" #include "parser/parsetree.h" -#include "storage/dsm.h" -#include "storage/lwlock.h" -/* Check PostgreSQL version */ -#if PG_VERSION_NUM < 90500 - #error "You are trying to build pg_pathman with PostgreSQL version lower than 9.5. Please, check you environment." 
+ +/* Check PostgreSQL version (9.5.4 contains an important fix for BGW) */ +#if PG_VERSION_NUM < 90503 + #error "Cannot build pg_pathman with PostgreSQL version lower than 9.5.3" +#elif PG_VERSION_NUM < 90504 + #warning "It is STRONGLY recommended to use pg_pathman with PostgreSQL 9.5.4 since it contains important fixes" +#endif + +/* Get CString representation of Datum (simple wrapper) */ +#ifdef USE_ASSERT_CHECKING + #include "utils.h" + #define DebugPrintDatum(datum, typid) ( datum_to_cstring((datum), (typid)) ) +#else + #define DebugPrintDatum(datum, typid) ( "[use --enable-cassert]" ) #endif -#define ALL NIL -#define INITIAL_BLOCKS_COUNT 8192 /* - * Partitioning type + * Definitions for the "pathman_config" table. */ -typedef enum PartType -{ - PT_HASH = 1, - PT_RANGE -} PartType; +#define PATHMAN_CONFIG "pathman_config" +#define Natts_pathman_config 4 +#define Anum_pathman_config_partrel 1 /* partitioned relation (regclass) */ +#define Anum_pathman_config_attname 2 /* partitioned column (text) */ +#define Anum_pathman_config_parttype 3 /* partitioning type (1|2) */ +#define Anum_pathman_config_range_interval 4 /* interval for RANGE pt. (text) */ + +/* type modifier (typmod) for 'range_interval' */ +#define PATHMAN_CONFIG_interval_typmod -1 /* - * Dynamic shared memory array + * Definitions for the "pathman_config_params" table. */ -typedef struct DsmArray -{ - dsm_handle segment; - size_t offset; - size_t length; -} DsmArray; +#define PATHMAN_CONFIG_PARAMS "pathman_config_params" +#define Natts_pathman_config_params 4 +#define Anum_pathman_config_params_partrel 1 /* primary key */ +#define Anum_pathman_config_params_enable_parent 2 /* include parent into plan */ +#define Anum_pathman_config_params_auto 3 /* auto partitions creation */ +#define Anum_pathman_config_params_init_callback 4 /* partition action callback */ /* - * Hashtable key for relations + * Definitions for the "pathman_partition_list" view. */ -typedef struct RelationKey -{ - Oid dbid; - Oid relid; -} RelationKey; +#define PATHMAN_PARTITION_LIST "pathman_partition_list" +#define Natts_pathman_partition_list 6 +#define Anum_pathman_pl_parent 1 /* partitioned relation (regclass) */ +#define Anum_pathman_pl_partition 2 /* child partition (regclass) */ +#define Anum_pathman_pl_parttype 3 /* partitioning type (1|2) */ +#define Anum_pathman_pl_partattr 4 /* partitioned column (text) */ +#define Anum_pathman_pl_range_min 5 /* partition's min value */ +#define Anum_pathman_pl_range_max 6 /* partition's max value */ + /* - * PartRelationInfo - * Per-relation partitioning information - * - * oid - parent table oid - * children - list of children oids - * parttype - partitioning type (HASH, LIST or RANGE) - * attnum - attribute number of parent relation + * Cache current PATHMAN_CONFIG relid (set during load_config()). */ -typedef struct PartRelationInfo -{ - RelationKey key; - DsmArray children; - int children_count; - PartType parttype; - Index attnum; - Oid atttype; - Oid cmp_proc; - Oid hash_proc; -} PartRelationInfo; +extern Oid pathman_config_relid; +extern Oid pathman_config_params_relid; /* - * Child relation for HASH partitioning + * Just to clarify our intentions (return the corresponding relid). 
*/ -typedef struct HashRelationKey -{ - int hash; - Oid parent_oid; -} HashRelationKey; - -typedef struct HashRelation -{ - HashRelationKey key; - Oid child_oid; -} HashRelation; +Oid get_pathman_config_relid(void); +Oid get_pathman_config_params_relid(void); /* - * Child relation for RANGE partitioning + * pg_pathman's global state structure. */ -typedef struct RangeEntry +typedef struct PathmanState { - Oid child_oid; -#ifdef HAVE_INT64_TIMESTAMP - int64 min; - int64 max; -#else - double min; - double max; -#endif -} RangeEntry; + LWLock *dsm_init_lock; /* unused */ +} PathmanState; -typedef struct RangeRelation -{ - RelationKey key; - bool by_val; - DsmArray ranges; -} RangeRelation; -typedef struct PathmanState +/* + * Result of search_range_partition_eq(). + */ +typedef enum { - LWLock *load_config_lock; - LWLock *dsm_init_lock; - LWLock *edit_partitions_lock; - DsmArray databases; -} PathmanState; + SEARCH_RANGEREL_OUT_OF_RANGE = 0, + SEARCH_RANGEREL_GAP, + SEARCH_RANGEREL_FOUND +} search_rangerel_result; + /* * The list of partitioned relation relids that must be handled by pg_pathman */ extern List *inheritance_enabled_relids; + /* * This list is used to ensure that partitioned relation isn't used both * with and without ONLY modifiers */ extern List *inheritance_disabled_relids; -extern bool pg_pathman_enable; +/* + * pg_pathman's global state. + */ extern PathmanState *pmstate; -#define PATHMAN_GET_DATUM(value, by_val) ( (by_val) ? (Datum) (value) : PointerGetDatum(&value) ) - -typedef int IndexRange; -#define RANGE_INFINITY 0x7FFF -#define RANGE_LOSSY 0x80000000 - -#define make_irange(lower, upper, lossy) \ - (((lower) & RANGE_INFINITY) << 15 | ((upper) & RANGE_INFINITY) | ((lossy) ? RANGE_LOSSY : 0)) - -#define irange_lower(irange) \ - (((irange) >> 15) & RANGE_INFINITY) - -#define irange_upper(irange) \ - ((irange) & RANGE_INFINITY) - -#define irange_is_lossy(irange) \ - ((irange) & RANGE_LOSSY) - -#define lfirst_irange(lc) ((IndexRange)(lc)->data.int_value) -#define lappend_irange(list, irange) (lappend_int((list), (int)(irange))) -#define lcons_irange(irange, list) lcons_int((int)(irange), (list)) -#define list_make1_irange(irange) lcons_int((int)(irange), NIL) -#define llast_irange(l) (IndexRange)lfirst_int(list_tail(l)) - -/* rangeset.c */ -bool irange_intersects(IndexRange a, IndexRange b); -bool irange_conjuncted(IndexRange a, IndexRange b); -IndexRange irange_union(IndexRange a, IndexRange b); -IndexRange irange_intersect(IndexRange a, IndexRange b); -List *irange_list_union(List *a, List *b); -List *irange_list_intersect(List *a, List *b); -int irange_list_length(List *rangeset); -bool irange_list_find(List *rangeset, int index, bool *lossy); - -/* Dynamic shared memory functions */ -Size get_dsm_shared_size(void); -void init_dsm_config(void); -bool init_dsm_segment(size_t blocks_count, size_t block_size); -void init_dsm_table(size_t block_size, size_t start, size_t end); -void alloc_dsm_array(DsmArray *arr, size_t entry_size, size_t length); -void free_dsm_array(DsmArray *arr); -void resize_dsm_array(DsmArray *arr, size_t entry_size, size_t length); -void *dsm_array_get_pointer(const DsmArray* arr); -dsm_handle get_dsm_array_segment(void); -void attach_dsm_array_segment(void); - -HTAB *relations; -HTAB *range_restrictions; -bool initialization_needed; - -/* initialization functions */ -Size pathman_memsize(void); -void init_shmem_config(void); -void load_config(void); -void create_relations_hashtable(void); -void create_hash_restrictions_hashtable(void); -void 
create_range_restrictions_hashtable(void); -void load_relations(bool reinitialize); -void load_partitions(Oid parent_oid, Snapshot snapshot); -void remove_relation_info(Oid relid); - -/* utility functions */ + int append_child_relation(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte, int index, Oid childOID, List *wrappers); -PartRelationInfo *get_pathman_relation_info(Oid relid, bool *found); -RangeRelation *get_pathman_range_relation(Oid relid, bool *found); -int range_binary_search(const RangeRelation *rangerel, FmgrInfo *cmp_func, Datum value, bool *fountPtr); -char *get_extension_schema(void); -FmgrInfo *get_cmp_func(Oid type1, Oid type2); -Oid create_partitions_bg_worker(Oid relid, Datum value, Oid value_type, bool *crashed); -Oid create_partitions(Oid relid, Datum value, Oid value_type, bool *crashed); -uint32 make_hash(uint32 value, uint32 partitions); + +search_rangerel_result search_range_partition_eq(const Datum value, + FmgrInfo *cmp_func, + const PartRelationInfo *prel, + RangeEntry *out_re); + +uint32 hash_to_part_index(uint32 value, uint32 partitions); + +void handle_modification_query(Query *parse); +void disable_inheritance(Query *parse); +void disable_inheritance_cte(Query *parse); +void disable_inheritance_subselect(Query *parse); /* copied from allpaths.h */ void set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); -void set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte, - PathKey *pathkeyAsc, PathKey *pathkeyDesc); +void set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, + RangeTblEntry *rte, PathKey *pathkeyAsc, + PathKey *pathkeyDesc); typedef struct { - const Node *orig; - List *args; - List *rangeset; - double paramsel; + const Node *orig; /* examined expression */ + List *args; /* extracted from 'orig' */ + List *rangeset; /* IndexRanges representing selected parts */ + bool found_gap; /* were there any gaps? */ + double paramsel; /* estimated selectivity */ } WrapperNode; typedef struct { - const PartRelationInfo *prel; - bool hasLeast, - hasGreatest; - Datum least, - greatest; - - PlanState *pstate; - ExprContext *econtext; + const PartRelationInfo *prel; /* main partitioning structure */ + ExprContext *econtext; /* for ExecEvalExpr() */ + bool for_insert; /* are we in PartitionFilter now? */ } WalkerContext; +/* + * Usual initialization procedure for WalkerContext. + */ +#define InitWalkerContext(context, prel_info, ecxt, for_ins) \ + do { \ + (context)->prel = (prel_info); \ + (context)->econtext = (ecxt); \ + (context)->for_insert = (for_ins); \ + } while (0) + +/* Check that WalkerContext contains ExprContext (plan execution stage) */ +#define WcxtHasExprContext(wcxt) ( (wcxt)->econtext ) + +/* + * Functions for partition creation, use create_partitions(). + */ +Oid create_partitions(Oid relid, Datum value, Oid value_type); +Oid create_partitions_bg_worker(Oid relid, Datum value, Oid value_type); +Oid create_partitions_internal(Oid relid, Datum value, Oid value_type); + +void select_range_partitions(const Datum value, + FmgrInfo *cmp_func, + const RangeEntry *ranges, + const int nranges, + const int strategy, + WrapperNode *result); + +/* Examine expression in order to select partitions. 
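+ * A rough usage sketch (cf. rescan_append_common()):
+ *
+ *		InitWalkerContext(&wcxt, prel, econtext, false);
+ *		wrap = walk_expr_tree(expr, &wcxt);
+ *		ranges = irange_list_intersect(ranges, wrap->rangeset);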
*/ WrapperNode *walk_expr_tree(Expr *expr, WalkerContext *context); -void finish_least_greatest(WrapperNode *wrap, WalkerContext *context); #endif /* PATHMAN_H */ diff --git a/contrib/pg_pathman/src/pathman_workers.c b/contrib/pg_pathman/src/pathman_workers.c new file mode 100644 index 0000000000..c913a71061 --- /dev/null +++ b/contrib/pg_pathman/src/pathman_workers.c @@ -0,0 +1,807 @@ +/*------------------------------------------------------------------------- + * + * pathman_workers.c + * + * There are two purposes of this subsystem: + * + * * Create new partitions for INSERT in separate transaction + * * Process concurrent partitioning operations + * + * Background worker API is used for both cases. + * + * Copyright (c) 2015-2016, Postgres Professional + * + *------------------------------------------------------------------------- + */ + +#include "init.h" +#include "pathman_workers.h" +#include "relation_info.h" +#include "utils.h" + +#include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/pg_type.h" +#include "executor/spi.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "postmaster/bgworker.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/memutils.h" +#include "utils/lsyscache.h" +#include "utils/typcache.h" +#include "utils/resowner.h" +#include "utils/snapmgr.h" + + + +/* Declarations for ConcurrentPartWorker */ +PG_FUNCTION_INFO_V1( partition_table_concurrently ); +PG_FUNCTION_INFO_V1( show_concurrent_part_tasks_internal ); +PG_FUNCTION_INFO_V1( stop_concurrent_part_task ); + + +static void handle_sigterm(SIGNAL_ARGS); +static void bg_worker_load_config(const char *bgw_name); +static void start_bg_worker(const char bgworker_name[BGW_MAXLEN], + bgworker_main_type bgw_main_func, + Datum bgw_arg, bool wait_for_shutdown); + +static void bgw_main_spawn_partitions(Datum main_arg); +static void bgw_main_concurrent_part(Datum main_arg); + + +/* + * Function context for concurrent_part_tasks_internal() SRF. + */ +typedef struct +{ + int cur_idx; /* current slot to be processed */ +} active_workers_cxt; + + +/* + * Slots for concurrent partitioning tasks. + */ +static ConcurrentPartSlot *concurrent_part_slots; + + +/* + * Available workers' names. + */ +static const char *spawn_partitions_bgw = "SpawnPartitionsWorker"; +static const char *concurrent_part_bgw = "ConcurrentPartWorker"; + + +/* + * Estimate amount of shmem needed for concurrent partitioning. + */ +Size +estimate_concurrent_part_task_slots_size(void) +{ + return sizeof(ConcurrentPartSlot) * PART_WORKER_SLOTS; +} + +/* + * Initialize shared memory needed for concurrent partitioning. + */ +void +init_concurrent_part_task_slots(void) +{ + bool found; + Size size = estimate_concurrent_part_task_slots_size(); + int i; + + concurrent_part_slots = (ConcurrentPartSlot *) + ShmemInitStruct("array of ConcurrentPartSlots", size, &found); + + /* Initialize 'concurrent_part_slots' if needed */ + if (!found) + { + memset(concurrent_part_slots, 0, size); + + for (i = 0; i < PART_WORKER_SLOTS; i++) + SpinLockInit(&concurrent_part_slots[i].mutex); + } +} + + +/* + * ------------------------------------------------- + * Common utility functions for background workers + * ------------------------------------------------- + */ + +/* + * Handle SIGTERM in BGW's process. 
+ */ +static void +handle_sigterm(SIGNAL_ARGS) +{ + int save_errno = errno; + + SetLatch(MyLatch); + + if (!proc_exit_inprogress) + { + InterruptPending = true; + ProcDiePending = true; + } + + errno = save_errno; +} + +/* + * Initialize pg_pathman's local config in BGW's process. + */ +static void +bg_worker_load_config(const char *bgw_name) +{ + /* Try to load config */ + if (!load_config()) + elog(ERROR, "%s: could not load pg_pathman's config [%u]", + bgw_name, MyProcPid); + else + elog(LOG, "%s: loaded pg_pathman's config [%u]", + bgw_name, MyProcPid); +} + +/* + * Common function to start background worker. + */ +static void +start_bg_worker(const char bgworker_name[BGW_MAXLEN], + bgworker_main_type bgw_main_func, + Datum bgw_arg, bool wait_for_shutdown) +{ +#define HandleError(condition, new_state) \ + if (condition) { exec_state = (new_state); goto handle_exec_state; } + + /* Execution state to be checked */ + enum + { + BGW_OK = 0, /* everything is fine (default) */ + BGW_COULD_NOT_START, /* could not start worker */ + BGW_PM_DIED /* postmaster died */ + } exec_state = BGW_OK; + + BackgroundWorker worker; + BackgroundWorkerHandle *bgw_handle; + BgwHandleStatus bgw_status; + bool bgw_started; + pid_t pid; + + /* Initialize worker struct */ + memcpy(worker.bgw_name, bgworker_name, BGW_MAXLEN); + worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + worker.bgw_start_time = BgWorkerStart_RecoveryFinished; + worker.bgw_restart_time = BGW_NEVER_RESTART; + worker.bgw_main = bgw_main_func; + worker.bgw_main_arg = bgw_arg; + worker.bgw_notify_pid = MyProcPid; + + /* Start dynamic worker */ + bgw_started = RegisterDynamicBackgroundWorker(&worker, &bgw_handle); + HandleError(bgw_started == false, BGW_COULD_NOT_START); + + /* Wait till the worker starts */ + bgw_status = WaitForBackgroundWorkerStartup(bgw_handle, &pid); + HandleError(bgw_status == BGWH_POSTMASTER_DIED, BGW_PM_DIED); + + /* Wait till the edn if we're asked to */ + if (wait_for_shutdown) + { + /* Wait till the worker finishes job */ + bgw_status = WaitForBackgroundWorkerShutdown(bgw_handle); + HandleError(bgw_status == BGWH_POSTMASTER_DIED, BGW_PM_DIED); + } + +/* end execution */ +handle_exec_state: + + switch (exec_state) + { + case BGW_COULD_NOT_START: + elog(ERROR, "Unable to create background %s for pg_pathman", + bgworker_name); + break; + + case BGW_PM_DIED: + ereport(ERROR, + (errmsg("Postmaster died during the pg_pathman background worker process"), + errhint("More details may be available in the server log."))); + break; + + default: + break; + } +} + + +/* + * -------------------------------------- + * SpawnPartitionsWorker implementation + * -------------------------------------- + */ + +/* + * Create args segment for partitions bgw. 
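+ *
+ * The segment consists of a SpawnPartitionArgs header followed by the
+ * packed partitioning key, hence its size is computed as
+ *
+ *		offsetof(SpawnPartitionArgs, value) +
+ *			datumGetSize(value, typcache->typbyval, typcache->typlen)
+ *
+ * and the worker later unpacks the key with UnpackDatumFromByteArray()
+ * (see bgw_main_spawn_partitions()).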
+ */ +static dsm_segment * +create_partitions_bg_worker_segment(Oid relid, Datum value, Oid value_type) +{ + TypeCacheEntry *typcache; + Size datum_size; + Size segment_size; + dsm_segment *segment; + SpawnPartitionArgs *args; + + typcache = lookup_type_cache(value_type, 0); + + /* Calculate segment size */ + datum_size = datumGetSize(value, typcache->typbyval, typcache->typlen); + segment_size = offsetof(SpawnPartitionArgs, value) + datum_size; + + segment = dsm_create(segment_size, 0); + + /* Initialize BGW args */ + args = (SpawnPartitionArgs *) dsm_segment_address(segment); + + args->userid = GetAuthenticatedUserId(); + + args->result = InvalidOid; + args->dbid = MyDatabaseId; + args->partitioned_table = relid; + + /* Write value-related stuff */ + args->value_type = value_type; + args->value_size = datum_size; + args->value_byval = typcache->typbyval; + + PackDatumToByteArray((void *) args->value, value, + datum_size, args->value_byval); + + return segment; +} + +/* + * Starts background worker that will create new partitions, + * waits till it finishes the job and returns the result (new partition oid) + * + * NB: This function should not be called directly, use create_partitions() instead. + */ +Oid +create_partitions_bg_worker(Oid relid, Datum value, Oid value_type) +{ + dsm_segment *segment; + dsm_handle segment_handle; + SpawnPartitionArgs *bgw_args; + Oid child_oid = InvalidOid; + + /* Create a dsm segment for the worker to pass arguments */ + segment = create_partitions_bg_worker_segment(relid, value, value_type); + segment_handle = dsm_segment_handle(segment); + bgw_args = (SpawnPartitionArgs *) dsm_segment_address(segment); + + /* Start worker and wait for it to finish */ + start_bg_worker(spawn_partitions_bgw, + bgw_main_spawn_partitions, + UInt32GetDatum(segment_handle), + true); + + /* Save the result (partition Oid) */ + child_oid = bgw_args->result; + + /* Free dsm segment */ + dsm_detach(segment); + + if (child_oid == InvalidOid) + ereport(ERROR, + (errmsg("Attempt to spawn new partitions of relation \"%s\" failed", + get_rel_name_or_relid(relid)), + errhint("See server log for more details."))); + + return child_oid; +} + +/* + * Entry point for SpawnPartitionsWorker's process. + */ +static void +bgw_main_spawn_partitions(Datum main_arg) +{ + dsm_handle handle = DatumGetUInt32(main_arg); + dsm_segment *segment; + SpawnPartitionArgs *args; + Datum value; + + /* Establish signal handlers before unblocking signals. */ + pqsignal(SIGTERM, handle_sigterm); + + /* We're now ready to receive signals */ + BackgroundWorkerUnblockSignals(); + + /* Create resource owner */ + CurrentResourceOwner = ResourceOwnerCreate(NULL, spawn_partitions_bgw); + + if (!handle) + elog(ERROR, "%s: invalid dsm_handle [%u]", + spawn_partitions_bgw, MyProcPid); + + /* Attach to dynamic shared memory */ + if ((segment = dsm_attach(handle)) == NULL) + elog(ERROR, "%s: cannot attach to segment [%u]", + spawn_partitions_bgw, MyProcPid); + args = dsm_segment_address(segment); + + /* Establish connection and start transaction */ + BackgroundWorkerInitializeConnectionByOid(args->dbid, args->userid); + + /* Start new transaction (syscache access etc.) 
*/ + StartTransactionCommand(); + + /* Initialize pg_pathman's local config */ + bg_worker_load_config(spawn_partitions_bgw); + + /* Upack Datum from segment to 'value' */ + UnpackDatumFromByteArray(&value, + args->value_size, + args->value_byval, + (const void *) args->value); + +/* Print 'arg->value' for debug purposes */ +#ifdef USE_ASSERT_CHECKING + elog(LOG, "%s: arg->value is '%s' [%u]", + spawn_partitions_bgw, + DebugPrintDatum(value, args->value_type), MyProcPid); +#endif + + /* Create partitions and save the Oid of the last one */ + args->result = create_partitions_internal(args->partitioned_table, + value, /* unpacked Datum */ + args->value_type); + + /* Finish transaction in an appropriate way */ + if (args->result == InvalidOid) + AbortCurrentTransaction(); + else + CommitTransactionCommand(); + + dsm_detach(segment); +} + + +/* + * ------------------------------------- + * ConcurrentPartWorker implementation + * ------------------------------------- + */ + +/* + * Entry point for ConcurrentPartWorker's process. + */ +static void +bgw_main_concurrent_part(Datum main_arg) +{ + int rows; + bool failed; + int failures_count = 0; + char *sql = NULL; + ConcurrentPartSlot *part_slot; + + /* Establish signal handlers before unblocking signals. */ + pqsignal(SIGTERM, handle_sigterm); + + /* We're now ready to receive signals */ + BackgroundWorkerUnblockSignals(); + + /* Create resource owner */ + CurrentResourceOwner = ResourceOwnerCreate(NULL, concurrent_part_bgw); + + /* Update concurrent part slot */ + part_slot = &concurrent_part_slots[DatumGetInt32(main_arg)]; + part_slot->pid = MyProcPid; + + /* Disable auto partition propagation */ + SetAutoPartitionEnabled(false); + + /* Establish connection and start transaction */ + BackgroundWorkerInitializeConnectionByOid(part_slot->dbid, part_slot->userid); + + /* Initialize pg_pathman's local config */ + StartTransactionCommand(); + bg_worker_load_config(concurrent_part_bgw); + CommitTransactionCommand(); + + /* Do the job */ + do + { + MemoryContext old_mcxt; + + Oid types[2] = { OIDOID, INT4OID }; + Datum vals[2] = { part_slot->relid, part_slot->batch_size }; + bool nulls[2] = { false, false }; + + /* Reset loop variables */ + failed = false; + rows = 0; + + /* Start new transaction (syscache access etc.) */ + StartTransactionCommand(); + + /* We'll need this to recover from errors */ + old_mcxt = CurrentMemoryContext; + + SPI_connect(); + PushActiveSnapshot(GetTransactionSnapshot()); + + /* Prepare the query if needed */ + if (sql == NULL) + { + MemoryContext current_mcxt; + + /* + * Allocate as SQL query in top memory context because current + * context will be destroyed after transaction finishes + */ + current_mcxt = MemoryContextSwitchTo(TopMemoryContext); + sql = psprintf("SELECT %s._partition_data_concurrent($1::oid, p_limit:=$2)", + get_namespace_name(get_pathman_schema())); + MemoryContextSwitchTo(current_mcxt); + } + + /* Exec ret = _partition_data_concurrent() */ + PG_TRY(); + { + int ret; + bool isnull; + + ret = SPI_execute_with_args(sql, 2, types, vals, nulls, false, 0); + if (ret == SPI_OK_SELECT) + { + TupleDesc tupdesc = SPI_tuptable->tupdesc; + HeapTuple tuple = SPI_tuptable->vals[0]; + + Assert(SPI_processed == 1); /* there should be 1 result at most */ + + rows = DatumGetInt32(SPI_getbinval(tuple, tupdesc, 1, &isnull)); + + Assert(!isnull); /* ... 
and ofc it must not be NULL */ + } + } + PG_CATCH(); + { + ErrorData *error; + char *sleep_time_str; + + /* Switch to the original context & copy edata */ + MemoryContextSwitchTo(old_mcxt); + error = CopyErrorData(); + FlushErrorState(); + + /* Print messsage for this BGWorker to server log */ + sleep_time_str = datum_to_cstring(Float8GetDatum(part_slot->sleep_time), + FLOAT8OID); + failures_count++; + ereport(LOG, + (errmsg("%s: %s", concurrent_part_bgw, error->message), + errdetail("attempt: %d/%d, sleep time: %s", + failures_count, + PART_WORKER_MAX_ATTEMPTS, + sleep_time_str))); + pfree(sleep_time_str); /* free the time string */ + + FreeErrorData(error); + + /* + * The most common exception we can catch here is a deadlock with + * concurrent user queries. Check that attempts count doesn't exceed + * some reasonable value + */ + if (failures_count >= PART_WORKER_MAX_ATTEMPTS) + { + /* Mark slot as FREE */ + cps_set_status(part_slot, CPS_FREE); + + elog(LOG, + "concurrent partitioning worker has canceled the task because " + "maximum amount of attempts (%d) had been exceeded, " + "see the error message below", + PART_WORKER_MAX_ATTEMPTS); + + return; /* exit quickly */ + } + + /* Set 'failed' flag */ + failed = true; + } + PG_END_TRY(); + + SPI_finish(); + PopActiveSnapshot(); + + if (failed) + { + /* Abort transaction and sleep for a second */ + AbortCurrentTransaction(); + DirectFunctionCall1(pg_sleep, Float8GetDatum(part_slot->sleep_time)); + } + else + { + /* Commit transaction and reset 'failures_count' */ + CommitTransactionCommand(); + failures_count = 0; + + /* Add rows to total_rows */ + SpinLockAcquire(&part_slot->mutex); + part_slot->total_rows += rows; +/* Report debug message */ +#ifdef USE_ASSERT_CHECKING + elog(DEBUG1, "%s: relocated %d rows, total: %lu [%u]", + concurrent_part_bgw, rows, part_slot->total_rows, MyProcPid); +#endif + SpinLockRelease(&part_slot->mutex); + } + + /* If other backend requested to stop us, quit */ + if (cps_check_status(part_slot) == CPS_STOPPING) + break; + } + while(rows > 0 || failed); /* do while there's still rows to be relocated */ + + /* Reclaim the resources */ + pfree(sql); + + /* Mark slot as FREE */ + cps_set_status(part_slot, CPS_FREE); +} + + +/* + * ----------------------------------------------- + * Public interface for the ConcurrentPartWorker + * ----------------------------------------------- + */ + +/* + * Start concurrent partitioning worker to redistribute rows. + * NOTE: this function returns immediately. + */ +Datum +partition_table_concurrently(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int empty_slot_idx = -1, /* do we have a slot for BGWorker? */ + i; + + /* Check if relation is a partitioned table */ + shout_if_prel_is_invalid(relid, + /* We also lock the parent relation */ + get_pathman_relation_info_after_lock(relid, true), + /* Partitioning type does not matter here */ + PT_INDIFFERENT); + /* + * Look for an empty slot and also check that a concurrent + * partitioning operation for this table hasn't been started yet + */ + for (i = 0; i < PART_WORKER_SLOTS; i++) + { + ConcurrentPartSlot *cur_slot = &concurrent_part_slots[i]; + bool keep_this_lock = false; + + /* Lock current slot */ + SpinLockAcquire(&cur_slot->mutex); + + /* Should we take this slot into account? 
(it should be FREE) */ + if (empty_slot_idx < 0 && cur_slot->worker_status == CPS_FREE) + { + empty_slot_idx = i; /* yes, remember this slot */ + keep_this_lock = true; /* also don't unlock it */ + } + + /* Oops, looks like we already have BGWorker for this table */ + if (cur_slot->relid == relid && + cur_slot->dbid == MyDatabaseId && + cur_slot->worker_status != CPS_FREE) + { + /* Unlock current slot */ + SpinLockRelease(&cur_slot->mutex); + + /* Release borrowed slot for new BGWorker too */ + if (empty_slot_idx >= 0 && empty_slot_idx != i) + SpinLockRelease(&concurrent_part_slots[empty_slot_idx].mutex); + + elog(ERROR, + "table \"%s\" is already being partitioned", + get_rel_name(relid)); + } + + /* Normally we don't want to keep it */ + if (!keep_this_lock) + SpinLockRelease(&cur_slot->mutex); + } + + /* Looks like we could not find an empty slot */ + if (empty_slot_idx < 0) + elog(ERROR, "no empty worker slots found"); + else + { + /* Initialize concurrent part slot */ + InitConcurrentPartSlot(&concurrent_part_slots[empty_slot_idx], + GetAuthenticatedUserId(), CPS_WORKING, + MyDatabaseId, relid, 1000, 1.0); + + /* Now we can safely unlock slot for new BGWorker */ + SpinLockRelease(&concurrent_part_slots[empty_slot_idx].mutex); + } + + /* Start worker (we should not wait) */ + start_bg_worker(concurrent_part_bgw, + bgw_main_concurrent_part, + Int32GetDatum(empty_slot_idx), + false); + + /* Tell user everything's fine */ + elog(NOTICE, + "worker started, you can stop it " + "with the following command: select %s('%s');", + CppAsString(stop_concurrent_part_task), + get_rel_name(relid)); + + PG_RETURN_VOID(); +} + +/* + * Return list of active concurrent partitioning workers. + * NOTE: this is a set-returning-function (SRF). + */ +Datum +show_concurrent_part_tasks_internal(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + active_workers_cxt *userctx; + int i; + + /* + * Initialize tuple descriptor & function call context. 
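+	 * (standard SRF protocol: the first call allocates per-query state in
+	 *  multi_call_memory_ctx, subsequent calls only advance 'cur_idx')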
+ */ + if (SRF_IS_FIRSTCALL()) + { + TupleDesc tupdesc; + MemoryContext old_mcxt; + + funcctx = SRF_FIRSTCALL_INIT(); + + old_mcxt = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + userctx = (active_workers_cxt *) palloc(sizeof(active_workers_cxt)); + userctx->cur_idx = 0; + + /* Create tuple descriptor */ + tupdesc = CreateTemplateTupleDesc(Natts_pathman_cp_tasks, false); + + TupleDescInitEntry(tupdesc, Anum_pathman_cp_tasks_userid, + "userid", REGROLEOID, -1, 0); + TupleDescInitEntry(tupdesc, Anum_pathman_cp_tasks_pid, + "pid", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, Anum_pathman_cp_tasks_dbid, + "dbid", OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, Anum_pathman_cp_tasks_relid, + "relid", REGCLASSOID, -1, 0); + TupleDescInitEntry(tupdesc, Anum_pathman_cp_tasks_processed, + "processed", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, Anum_pathman_cp_tasks_status, + "status", TEXTOID, -1, 0); + + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + funcctx->user_fctx = (void *) userctx; + + MemoryContextSwitchTo(old_mcxt); + } + + funcctx = SRF_PERCALL_SETUP(); + userctx = (active_workers_cxt *) funcctx->user_fctx; + + /* Iterate through worker slots */ + for (i = userctx->cur_idx; i < PART_WORKER_SLOTS; i++) + { + ConcurrentPartSlot *cur_slot = &concurrent_part_slots[i]; + HeapTuple htup = NULL; + + HOLD_INTERRUPTS(); + SpinLockAcquire(&cur_slot->mutex); + + if (cur_slot->worker_status != CPS_FREE) + { + Datum values[Natts_pathman_cp_tasks]; + bool isnull[Natts_pathman_cp_tasks] = { 0 }; + + values[Anum_pathman_cp_tasks_userid - 1] = cur_slot->userid; + values[Anum_pathman_cp_tasks_pid - 1] = cur_slot->pid; + values[Anum_pathman_cp_tasks_dbid - 1] = cur_slot->dbid; + values[Anum_pathman_cp_tasks_relid - 1] = cur_slot->relid; + values[Anum_pathman_cp_tasks_processed - 1] = cur_slot->total_rows; + + /* Now build a status string */ + switch(cur_slot->worker_status) + { + case CPS_WORKING: + values[Anum_pathman_cp_tasks_status - 1] = + PointerGetDatum(cstring_to_text("working")); + break; + + case CPS_STOPPING: + values[Anum_pathman_cp_tasks_status - 1] = + PointerGetDatum(cstring_to_text("stopping")); + break; + + default: + values[Anum_pathman_cp_tasks_status - 1] = + PointerGetDatum(cstring_to_text("[unknown]")); + } + + /* Form output tuple */ + htup = heap_form_tuple(funcctx->tuple_desc, values, isnull); + + /* Switch to next worker */ + userctx->cur_idx = i + 1; + } + + SpinLockRelease(&cur_slot->mutex); + RESUME_INTERRUPTS(); + + /* Return tuple if needed */ + if (htup) + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(htup)); + } + + SRF_RETURN_DONE(funcctx); +} + +/* + * Stop the specified concurrent partitioning worker. + * NOTE: worker will stop after it finishes a batch. 
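+ *
+ * Typical invocation from SQL (the table name here is just an example):
+ *     SELECT stop_concurrent_part_task('journal');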
+ */ +Datum +stop_concurrent_part_task(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + bool worker_found = false; + int i; + + for (i = 0; i < PART_WORKER_SLOTS && !worker_found; i++) + { + ConcurrentPartSlot *cur_slot = &concurrent_part_slots[i]; + + HOLD_INTERRUPTS(); + SpinLockAcquire(&cur_slot->mutex); + + if (cur_slot->worker_status != CPS_FREE && + cur_slot->relid == relid && + cur_slot->dbid == MyDatabaseId) + { + elog(NOTICE, "worker will stop after it finishes current batch"); + + /* Change worker's state & set 'worker_found' */ + cur_slot->worker_status = CPS_STOPPING; + worker_found = true; + } + + SpinLockRelease(&cur_slot->mutex); + RESUME_INTERRUPTS(); + } + + if (worker_found) + PG_RETURN_BOOL(true); + else + { + elog(ERROR, "cannot find worker for relation \"%s\"", + get_rel_name_or_relid(relid)); + + PG_RETURN_BOOL(false); /* keep compiler happy */ + } +} diff --git a/contrib/pg_pathman/src/pathman_workers.h b/contrib/pg_pathman/src/pathman_workers.h new file mode 100644 index 0000000000..dfa14d53fb --- /dev/null +++ b/contrib/pg_pathman/src/pathman_workers.h @@ -0,0 +1,181 @@ +/*------------------------------------------------------------------------- + * + * pathman_workers.h + * + * There are two purposes of this subsystem: + * + * * Create new partitions for INSERT in separate transaction + * * Process concurrent partitioning operations + * + * Background worker API is used for both cases. + * + * Copyright (c) 2015-2016, Postgres Professional + * + *------------------------------------------------------------------------- + */ + +#ifndef PATHMAN_WORKERS_H +#define PATHMAN_WORKERS_H + +#include "postgres.h" +#include "storage/spin.h" + + +/* + * Store args, result and execution status of CreatePartitionsWorker. + */ +typedef struct +{ + Oid userid; /* connect as a specified user */ + + Oid result; /* target partition */ + Oid dbid; /* database which stores 'partitioned_table' */ + Oid partitioned_table; + + /* Needed to decode Datum from 'values' */ + Oid value_type; + Size value_size; + bool value_byval; + + /* Store Datum as flexible array */ + uint8 value[FLEXIBLE_ARRAY_MEMBER]; +} SpawnPartitionArgs; + + +typedef enum +{ + CPS_FREE = 0, /* slot is empty */ + CPS_WORKING, /* occupied by live worker */ + CPS_STOPPING /* worker is going to shutdown */ + +} ConcurrentPartSlotStatus; + +/* + * Store args and execution status of a single ConcurrentPartWorker. + */ +typedef struct +{ + slock_t mutex; /* protect slot from race conditions */ + + ConcurrentPartSlotStatus worker_status; /* status of a particular worker */ + + Oid userid; /* connect as a specified user */ + pid_t pid; /* worker's PID */ + Oid dbid; /* database which contains the relation */ + Oid relid; /* table to be partitioned concurrently */ + uint64 total_rows; /* total amount of rows processed */ + + int32 batch_size; /* number of rows in a batch */ + float8 sleep_time; /* how long should we sleep in case of error? 
*/ +} ConcurrentPartSlot; + +#define InitConcurrentPartSlot(slot, user, w_status, db, rel, batch_sz, sleep_t) \ + do { \ + (slot)->userid = (user); \ + (slot)->worker_status = (w_status); \ + (slot)->pid = 0; \ + (slot)->dbid = (db); \ + (slot)->relid = (rel); \ + (slot)->total_rows = 0; \ + (slot)->batch_size = (batch_sz); \ + (slot)->sleep_time = (sleep_t); \ + } while (0) + +static inline ConcurrentPartSlotStatus +cps_check_status(ConcurrentPartSlot *slot) +{ + ConcurrentPartSlotStatus status; + + SpinLockAcquire(&slot->mutex); + status = slot->worker_status; + SpinLockRelease(&slot->mutex); + + return status; +} + +static inline void +cps_set_status(ConcurrentPartSlot *slot, ConcurrentPartSlotStatus status) +{ + SpinLockAcquire(&slot->mutex); + slot->worker_status = status; + SpinLockRelease(&slot->mutex); +} + + + +/* Number of worker slots for concurrent partitioning */ +#define PART_WORKER_SLOTS 10 + +/* Max number of attempts per batch */ +#define PART_WORKER_MAX_ATTEMPTS 60 + + +/* + * Definitions for the "pathman_concurrent_part_tasks" view. + */ +#define PATHMAN_CONCURRENT_PART_TASKS "pathman_concurrent_part_tasks" +#define Natts_pathman_cp_tasks 6 +#define Anum_pathman_cp_tasks_userid 1 +#define Anum_pathman_cp_tasks_pid 2 +#define Anum_pathman_cp_tasks_dbid 3 +#define Anum_pathman_cp_tasks_relid 4 +#define Anum_pathman_cp_tasks_processed 5 +#define Anum_pathman_cp_tasks_status 6 + + +/* + * Concurrent partitioning slots are stored in shmem. + */ +Size estimate_concurrent_part_task_slots_size(void); +void init_concurrent_part_task_slots(void); + + +/* + * Useful datum packing\unpacking functions for BGW. + */ + +static inline void * +PackDatumToByteArray(void *byte_array, Datum datum, Size datum_size, bool typbyval) +{ + if (typbyval) + /* We have to copy all Datum's bytes */ + datum_size = Max(sizeof(Datum), datum_size); + + memcpy((void *) byte_array, + (const void *) (typbyval ? 
+ (Pointer) &datum : /* treat Datum as byte array */ + DatumGetPointer(datum)), /* extract pointer to data */ + datum_size); + + return ((uint8 *) byte_array) + datum_size; +} + +static inline void * +UnpackDatumFromByteArray(Datum *datum, Size datum_size, bool typbyval, + const void *byte_array) +{ + void *dst; + + if (typbyval) + { + /* Write Data to Datum directly */ + dst = datum; + + /* We have to copy all Datum's bytes */ + datum_size = Max(sizeof(Datum), datum_size); + } + else + { + /* Allocate space for Datum's internals */ + dst = palloc(datum_size); + + /* Save pointer to Datum */ + *datum = PointerGetDatum(dst); + } + + memcpy(dst, byte_array, datum_size); + + return ((uint8 *) byte_array) + datum_size; +} + +#endif diff --git a/contrib/pg_pathman/src/pg_compat.c b/contrib/pg_pathman/src/pg_compat.c new file mode 100644 index 0000000000..7474d6897e --- /dev/null +++ b/contrib/pg_pathman/src/pg_compat.c @@ -0,0 +1,114 @@ +/* ------------------------------------------------------------------------ + * + * pg_compat.c + * Compatibility tools + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#include "pg_compat.h" + +#include "optimizer/pathnode.h" +#include "port.h" +#include "utils.h" + +#include + + +void +set_append_rel_size_compat(PlannerInfo *root, RelOptInfo *rel, + Index rti, RangeTblEntry *rte) +{ + double parent_rows = 0; + double parent_size = 0; + ListCell *l; + + foreach(l, root->append_rel_list) + { + AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); + Index childRTindex, + parentRTindex = rti; + RelOptInfo *childrel; + + /* append_rel_list contains all append rels; ignore others */ + if (appinfo->parent_relid != parentRTindex) + continue; + + childRTindex = appinfo->child_relid; + + childrel = find_base_rel(root, childRTindex); + Assert(childrel->reloptkind == RELOPT_OTHER_MEMBER_REL); + + /* + * Accumulate size information from each live child. 
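+		 * The parent's 'width' below is then set to the row-weighted
+		 * average of the children's widths (parent_size / parent_rows).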
+ */ + Assert(childrel->rows > 0); + + parent_rows += childrel->rows; +#if PG_VERSION_NUM >= 90600 + parent_size += childrel->reltarget->width * childrel->rows; +#else + parent_size += childrel->width * childrel->rows; +#endif + } + + rel->rows = parent_rows; +#if PG_VERSION_NUM >= 90600 + rel->reltarget->width = rint(parent_size / parent_rows); +#else + rel->width = rint(parent_size / parent_rows); +#endif + rel->tuples = parent_rows; +} + +extern +void copy_targetlist_compat(RelOptInfo *dest, RelOptInfo *rel) +{ + ListCell *lc; + +#if PG_VERSION_NUM >= 90600 + dest->reltarget->exprs = NIL; + foreach(lc, rel->reltarget->exprs) +#else + dest->reltargetlist = NIL; + foreach(lc, rel->reltargetlist) +#endif + { + Node *new_target; + Node *node; + + node = (Node *) lfirst(lc); + new_target = copyObject(node); + change_varnos(new_target, rel->relid, dest->relid); +#if PG_VERSION_NUM >= 90600 + dest->reltarget->exprs = lappend(dest->reltarget->exprs, new_target); +#else + dest->reltargetlist = lappend(dest->reltargetlist, new_target); +#endif + } +} + +#if PG_VERSION_NUM >= 90600 +/* + * make_result + * Build a Result plan node + */ +Result * +make_result(List *tlist, + Node *resconstantqual, + Plan *subplan) +{ + Result *node = makeNode(Result); + Plan *plan = &node->plan; + + plan->targetlist = tlist; + plan->qual = NIL; + plan->lefttree = subplan; + plan->righttree = NULL; + node->resconstantqual = resconstantqual; + + return node; +} +#endif diff --git a/contrib/pg_pathman/src/pg_compat.h b/contrib/pg_pathman/src/pg_compat.h new file mode 100644 index 0000000000..7bef6778ee --- /dev/null +++ b/contrib/pg_pathman/src/pg_compat.h @@ -0,0 +1,74 @@ +/* ------------------------------------------------------------------------ + * + * pg_compat.h + * Compatibility tools + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#ifndef PG_COMPAT_H +#define PG_COMPAT_H + +#include "postgres.h" + +#include "nodes/relation.h" +#include "nodes/pg_list.h" +#include "optimizer/cost.h" +#include "optimizer/paths.h" + + +extern void set_append_rel_size_compat(PlannerInfo *root, RelOptInfo *rel, + Index rti, RangeTblEntry *rte); +extern void copy_targetlist_compat(RelOptInfo *dest, RelOptInfo *rel); + +#if PG_VERSION_NUM >= 90600 + +#define get_parameterized_joinrel_size_compat(root, rel, outer_path, \ + inner_path, sjinfo, \ + restrict_clauses) \ + get_parameterized_joinrel_size(root, rel, outer_path, \ + inner_path, sjinfo, \ + restrict_clauses) + +#define check_index_predicates_compat(rool, rel) \ + check_index_predicates(root, rel) + +#define create_append_path_compat(rel, subpaths, required_outer, parallel_workers) \ + create_append_path(rel, subpaths, required_outer, parallel_workers) + +#define pull_var_clause_compat(node, aggbehavior, phbehavior) \ + pull_var_clause(node, aggbehavior | phbehavior) + +extern Result *make_result(List *tlist, Node *resconstantqual, Plan *subplan); +#define make_result_compat(root, tlist, resconstantqual, subplan) \ + make_result(tlist, resconstantqual, subplan) + +#else /* PG_VERSION_NUM >= 90500 */ + +#define get_parameterized_joinrel_size_compat(root, rel, \ + outer_path, \ + inner_path, \ + sjinfo, restrict_clauses) \ + get_parameterized_joinrel_size(root, rel, \ + (outer_path)->rows, \ + (inner_path)->rows, \ + sjinfo, restrict_clauses) + +#define check_index_predicates_compat(rool, rel) \ + check_partial_indexes(root, rel) + +#define create_append_path_compat(rel, subpaths, 
required_outer, parallel_workers) \ + create_append_path(rel, subpaths, required_outer) + +#define pull_var_clause_compat(node, aggbehavior, phbehavior) \ + pull_var_clause(node, aggbehavior, phbehavior) + +#define make_result_compat(root, tlist, resconstantqual, subplan) \ + make_result(root, tlist, resconstantqual, subplan) + +#endif + + +#endif /* PG_COMPAT_H */ diff --git a/contrib/pg_pathman/src/pg_pathman.c b/contrib/pg_pathman/src/pg_pathman.c index 09fe38278f..113df1b4bf 100644 --- a/contrib/pg_pathman/src/pg_pathman.c +++ b/contrib/pg_pathman/src/pg_pathman.c @@ -8,72 +8,76 @@ * * ------------------------------------------------------------------------ */ + +#include "pg_compat.h" + #include "pathman.h" +#include "init.h" +#include "hooks.h" +#include "utils.h" +#include "partition_filter.h" +#include "runtimeappend.h" +#include "runtime_merge_append.h" +#include "xact_handling.h" + #include "postgres.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/transam.h" +#include "access/xact.h" +#include "catalog/pg_cast.h" +#include "catalog/pg_type.h" +#include "executor/spi.h" +#include "foreign/fdwapi.h" #include "fmgr.h" #include "miscadmin.h" -#include "nodes/makefuncs.h" -#include "nodes/nodeFuncs.h" -#include "nodes/pg_list.h" -#include "nodes/relation.h" -#include "nodes/primnodes.h" #include "optimizer/clauses.h" -#include "optimizer/paths.h" -#include "optimizer/pathnode.h" -#include "optimizer/planner.h" +#include "optimizer/prep.h" #include "optimizer/restrictinfo.h" #include "optimizer/cost.h" -#include "parser/analyze.h" -#include "utils/hsearch.h" -#include "utils/tqual.h" -#include "utils/rel.h" -#include "utils/elog.h" -#include "utils/array.h" -#include "utils/date.h" -#include "utils/guc.h" +#include "utils/builtins.h" +#include "utils/datum.h" #include "utils/lsyscache.h" -#include "utils/selfuncs.h" #include "utils/memutils.h" -#include "access/heapam.h" -#include "access/nbtree.h" -#include "storage/ipc.h" -#include "catalog/pg_operator.h" -#include "catalog/pg_type.h" -#include "foreign/fdwapi.h" -#include "hooks.h" -#include "utils.h" -#include "runtimeappend.h" -#include "runtime_merge_append.h" +#include "utils/rel.h" +#include "utils/syscache.h" +#include "utils/selfuncs.h" +#include "utils/snapmgr.h" +#include "utils/typcache.h" PG_MODULE_MAGIC; + List *inheritance_disabled_relids = NIL; List *inheritance_enabled_relids = NIL; -bool pg_pathman_enable; PathmanState *pmstate; +Oid pathman_config_relid = InvalidOid; +Oid pathman_config_params_relid = InvalidOid; -/* Original hooks */ -static shmem_startup_hook_type shmem_startup_hook_original = NULL; -static post_parse_analyze_hook_type post_parse_analyze_hook_original = NULL; -static planner_hook_type planner_hook_original = NULL; /* pg module functions */ void _PG_init(void); -/* Hook functions */ -static void pathman_shmem_startup(void); -void pathman_post_parse_analysis_hook(ParseState *pstate, Query *query); -static PlannedStmt * pathman_planner_hook(Query *parse, int cursorOptions, ParamListInfo boundParams); - /* Utility functions */ -static void handle_modification_query(Query *parse); static Node *wrapper_make_expression(WrapperNode *wrap, int index, bool *alwaysTrue); -static void disable_inheritance(Query *parse); -static void disable_inheritance_cte(Query *parse); -static void disable_inheritance_subselect(Query *parse); static bool disable_inheritance_subselect_walker(Node *node, void *context); +/* "Partition creation"-related functions */ +static Datum 
extract_binary_interval_from_text(Datum interval_text, + Oid part_atttype, + Oid *interval_type); +static bool spawn_partitions(Oid partitioned_rel, + Datum value, + Datum leading_bound, + Oid leading_bound_type, + FmgrInfo *cmp_proc, + Datum interval_binary, + Oid interval_type, + bool forward, + Oid *last_partition); + /* Expression tree handlers */ +static WrapperNode *handle_const(const Const *c, WalkerContext *context); static void handle_binary_opexpr(WalkerContext *context, WrapperNode *result, const Node *varnode, const Const *c); static void handle_binary_opexpr_param(const PartRelationInfo *prel, WrapperNode *result, const Node *varnode); static WrapperNode *handle_opexpr(const OpExpr *expr, WalkerContext *context); @@ -101,18 +105,16 @@ static Path *get_cheapest_parameterized_child_path(PlannerInfo *root, RelOptInfo * flinfo is a pointer to an instance of FmgrInfo * arg1, arg2 are Datum instances */ -#define check_lt(flinfo, arg1, arg2) \ - ((int) FunctionCall2(cmp_func, arg1, arg2) < 0) -#define check_le(flinfo, arg1, arg2) \ - ((int) FunctionCall2(cmp_func, arg1, arg2) <= 0) -#define check_eq(flinfo, arg1, arg2) \ - ((int) FunctionCall2(cmp_func, arg1, arg2) == 0) -#define check_ge(flinfo, arg1, arg2) \ - ((int) FunctionCall2(cmp_func, arg1, arg2) >= 0) -#define check_gt(flinfo, arg1, arg2) \ - ((int) FunctionCall2(cmp_func, arg1, arg2) > 0) - -#define WcxtHasExprContext(wcxt) ( (wcxt)->econtext ) +#define check_lt(finfo, arg1, arg2) \ + ((int) FunctionCall2(finfo, arg1, arg2) < 0) +#define check_le(finfo, arg1, arg2) \ + ((int) FunctionCall2(finfo, arg1, arg2) <= 0) +#define check_eq(finfo, arg1, arg2) \ + ((int) FunctionCall2(finfo, arg1, arg2) == 0) +#define check_ge(finfo, arg1, arg2) \ + ((int) FunctionCall2(finfo, arg1, arg2) >= 0) +#define check_gt(finfo, arg1, arg2) \ + ((int) FunctionCall2(finfo, arg1, arg2) > 0) /* We can transform Param into Const provided that 'econtext' is available */ #define IsConstValue(wcxt, node) \ @@ -121,211 +123,66 @@ static Path *get_cheapest_parameterized_child_path(PlannerInfo *root, RelOptInfo #define ExtractConst(wcxt, node) \ ( IsA((node), Param) ? extract_const((wcxt), (Param *) (node)) : ((Const *) (node)) ) + /* - * Entry point + * Set initial values for all Postmaster's forks. */ void _PG_init(void) { + PathmanInitState temp_init_state; + if (!process_shared_preload_libraries_in_progress) { - elog(ERROR, "Pathman module must be initialized in postmaster. " + elog(ERROR, "pg_pathman module must be initialized by Postmaster. 
" "Put the following line to configuration file: " "shared_preload_libraries='pg_pathman'"); - initialization_needed = false; } /* Request additional shared resources */ - RequestAddinShmemSpace(pathman_memsize()); - RequestAddinLWLocks(3); - - set_rel_pathlist_hook_next = set_rel_pathlist_hook; - set_rel_pathlist_hook = pathman_rel_pathlist_hook; - set_join_pathlist_next = set_join_pathlist_hook; - set_join_pathlist_hook = pathman_join_pathlist_hook; - shmem_startup_hook_original = shmem_startup_hook; - shmem_startup_hook = pathman_shmem_startup; - post_parse_analyze_hook_original = post_parse_analyze_hook; - post_parse_analyze_hook = pathman_post_parse_analysis_hook; - planner_hook_original = planner_hook; - planner_hook = pathman_planner_hook; - - /* RuntimeAppend */ - runtimeappend_path_methods.CustomName = "RuntimeAppend"; - runtimeappend_path_methods.PlanCustomPath = create_runtimeappend_plan; - - runtimeappend_plan_methods.CustomName = "RuntimeAppend"; - runtimeappend_plan_methods.CreateCustomScanState = runtimeappend_create_scan_state; - - runtimeappend_exec_methods.CustomName = "RuntimeAppend"; - runtimeappend_exec_methods.BeginCustomScan = runtimeappend_begin; - runtimeappend_exec_methods.ExecCustomScan = runtimeappend_exec; - runtimeappend_exec_methods.EndCustomScan = runtimeappend_end; - runtimeappend_exec_methods.ReScanCustomScan = runtimeappend_rescan; - runtimeappend_exec_methods.MarkPosCustomScan = NULL; - runtimeappend_exec_methods.RestrPosCustomScan = NULL; - runtimeappend_exec_methods.ExplainCustomScan = runtimeappend_explain; - - /* RuntimeMergeAppend */ - runtime_merge_append_path_methods.CustomName = "RuntimeMergeAppend"; - runtime_merge_append_path_methods.PlanCustomPath = create_runtimemergeappend_plan; - - runtime_merge_append_plan_methods.CustomName = "RuntimeMergeAppend"; - runtime_merge_append_plan_methods.CreateCustomScanState = runtimemergeappend_create_scan_state; - - runtime_merge_append_exec_methods.CustomName = "RuntimeMergeAppend"; - runtime_merge_append_exec_methods.BeginCustomScan = runtimemergeappend_begin; - runtime_merge_append_exec_methods.ExecCustomScan = runtimemergeappend_exec; - runtime_merge_append_exec_methods.EndCustomScan = runtimemergeappend_end; - runtime_merge_append_exec_methods.ReScanCustomScan = runtimemergeappend_rescan; - runtime_merge_append_exec_methods.MarkPosCustomScan = NULL; - runtime_merge_append_exec_methods.RestrPosCustomScan = NULL; - runtime_merge_append_exec_methods.ExplainCustomScan = runtimemergeappend_explain; - - DefineCustomBoolVariable("pg_pathman.enable", - "Enables pg_pathman's optimizations during the planner stage", - NULL, - &pg_pathman_enable, - true, - PGC_USERSET, - 0, - NULL, - pg_pathman_enable_assign_hook, - NULL); - - DefineCustomBoolVariable("pg_pathman.enable_runtimeappend", - "Enables the planner's use of RuntimeAppend custom node.", - NULL, - &pg_pathman_enable_runtimeappend, - true, - PGC_USERSET, - 0, - NULL, - NULL, - NULL); - - DefineCustomBoolVariable("pg_pathman.enable_runtimemergeappend", - "Enables the planner's use of RuntimeMergeAppend custom node.", - NULL, - &pg_pathman_enable_runtime_merge_append, - true, - PGC_USERSET, - 0, - NULL, - NULL, - NULL); -} - -PartRelationInfo * -get_pathman_relation_info(Oid relid, bool *found) -{ - RelationKey key; - - key.dbid = MyDatabaseId; - key.relid = relid; - return hash_search(relations, (const void *) &key, HASH_FIND, found); -} - -RangeRelation * -get_pathman_range_relation(Oid relid, bool *found) -{ - RelationKey key; - - key.dbid = MyDatabaseId; 
- key.relid = relid; - return hash_search(range_restrictions, (const void *) &key, HASH_FIND, found); -} - -FmgrInfo * -get_cmp_func(Oid type1, Oid type2) -{ - FmgrInfo *cmp_func; - Oid cmp_proc_oid; - TypeCacheEntry *tce; - - cmp_func = palloc(sizeof(FmgrInfo)); - tce = lookup_type_cache(type1, - TYPECACHE_BTREE_OPFAMILY | TYPECACHE_CMP_PROC | TYPECACHE_CMP_PROC_FINFO); - cmp_proc_oid = get_opfamily_proc(tce->btree_opf, - type1, - type2, - BTORDER_PROC); - fmgr_info(cmp_proc_oid, cmp_func); - return cmp_func; + RequestAddinShmemSpace(estimate_pathman_shmem_size()); + + /* NOTE: we don't need LWLocks now. RequestAddinLWLocks(1); */ + + /* Assign pg_pathman's initial state */ + temp_init_state.initialization_needed = true; + temp_init_state.pg_pathman_enable = true; + + /* Apply initial state */ + restore_pathman_init_state(&temp_init_state); + + /* Initialize 'next' hook pointers */ + set_rel_pathlist_hook_next = set_rel_pathlist_hook; + set_rel_pathlist_hook = pathman_rel_pathlist_hook; + set_join_pathlist_next = set_join_pathlist_hook; + set_join_pathlist_hook = pathman_join_pathlist_hook; + shmem_startup_hook_next = shmem_startup_hook; + shmem_startup_hook = pathman_shmem_startup_hook; + post_parse_analyze_hook_next = post_parse_analyze_hook; + post_parse_analyze_hook = pathman_post_parse_analysis_hook; + planner_hook_next = planner_hook; + planner_hook = pathman_planner_hook; + process_utility_hook_next = ProcessUtility_hook; + ProcessUtility_hook = pathman_process_utility_hook; + + /* Initialize static data for all subsystems */ + init_main_pathman_toggles(); + init_runtimeappend_static_data(); + init_runtime_merge_append_static_data(); + init_partition_filter_static_data(); } /* - * Post parse analysis hook. It makes sure the config is loaded before executing - * any statement, including utility commands + * Disables inheritance for partitioned by pathman relations. + * It must be done to prevent PostgresSQL from exhaustive search. */ void -pathman_post_parse_analysis_hook(ParseState *pstate, Query *query) -{ - if (initialization_needed) - load_config(); - - if (post_parse_analyze_hook_original) - post_parse_analyze_hook_original(pstate, query); - - inheritance_disabled_relids = NIL; - inheritance_enabled_relids = NIL; -} - -/* - * Planner hook. It disables inheritance for tables that have been partitioned - * by pathman to prevent standart PostgreSQL partitioning mechanism from - * handling that tables. - */ -PlannedStmt * -pathman_planner_hook(Query *parse, int cursorOptions, ParamListInfo boundParams) -{ - PlannedStmt *result; - - if (pg_pathman_enable) - { - // inheritance_disabled = false; - switch(parse->commandType) - { - case CMD_SELECT: - disable_inheritance(parse); - break; - case CMD_UPDATE: - case CMD_DELETE: - disable_inheritance_cte(parse); - disable_inheritance_subselect(parse); - handle_modification_query(parse); - break; - default: - break; - } - } - - /* Invoke original hook */ - if (planner_hook_original) - result = planner_hook_original(parse, cursorOptions, boundParams); - else - result = standard_planner(parse, cursorOptions, boundParams); - - list_free(inheritance_disabled_relids); - list_free(inheritance_enabled_relids); - inheritance_disabled_relids = NIL; - inheritance_enabled_relids = NIL; - - return result; -} - -/* - * Disables inheritance for partitioned by pathman relations. It must be done to - * prevent PostgresSQL from full search. 
- */ -static void disable_inheritance(Query *parse) { - ListCell *lc; - RangeTblEntry *rte; - PartRelationInfo *prel; - MemoryContext oldcontext; - bool found; + const PartRelationInfo *prel; + RangeTblEntry *rte; + MemoryContext oldcontext; + ListCell *lc; /* If query contains CTE (WITH statement) then handle subqueries too */ disable_inheritance_cte(parse); @@ -335,18 +192,20 @@ disable_inheritance(Query *parse) foreach(lc, parse->rtable) { - rte = (RangeTblEntry*) lfirst(lc); + rte = (RangeTblEntry *) lfirst(lc); switch(rte->rtekind) { case RTE_RELATION: if (rte->inh) { - /* Look up this relation in pathman relations */ - prel = get_pathman_relation_info(rte->relid, &found); - if (prel != NULL && found) + /* Look up this relation in pathman local cache */ + prel = get_pathman_relation_info(rte->relid); + if (prel) { + /* We'll set this flag later */ rte->inh = false; + /* * Sometimes user uses the ONLY statement and in this case * rte->inh is also false. We should differ the case @@ -399,7 +258,7 @@ disable_inheritance(Query *parse) "with and without ONLY modifier"); } -static void +void disable_inheritance_cte(Query *parse) { ListCell *lc; @@ -413,7 +272,7 @@ disable_inheritance_cte(Query *parse) } } -static void +void disable_inheritance_subselect(Query *parse) { Node *quals; @@ -441,53 +300,48 @@ disable_inheritance_subselect_walker(Node *node, void *context) } /* - * Checks if query is affects only one partition. If true then substitute + * Checks if query affects only one partition. If true then substitute */ -static void +void handle_modification_query(Query *parse) { - PartRelationInfo *prel; - List *ranges; - RangeTblEntry *rte; - WrapperNode *wrap; - Expr *expr; - bool found; - WalkerContext context; + const PartRelationInfo *prel; + List *ranges; + RangeTblEntry *rte; + WrapperNode *wrap; + Expr *expr; + WalkerContext context; Assert(parse->commandType == CMD_UPDATE || parse->commandType == CMD_DELETE); Assert(parse->resultRelation > 0); rte = rt_fetch(parse->resultRelation, parse->rtable); - prel = get_pathman_relation_info(rte->relid, &found); + prel = get_pathman_relation_info(rte->relid); - if (!found) + if (!prel) return; /* Parse syntax tree and extract partition ranges */ - ranges = list_make1_int(make_irange(0, prel->children_count - 1, false)); + ranges = list_make1_irange(make_irange(0, PrelLastChild(prel), false)); expr = (Expr *) eval_const_expressions(NULL, parse->jointree->quals); if (!expr) return; /* Parse syntax tree and extract partition ranges */ - context.prel = prel; - context.econtext = NULL; - context.hasLeast = false; - context.hasGreatest = false; + InitWalkerContext(&context, prel, NULL, false); wrap = walk_expr_tree(expr, &context); - finish_least_greatest(wrap, &context); ranges = irange_list_intersect(ranges, wrap->rangeset); /* If only one partition is affected then substitute parent table with partition */ if (irange_list_length(ranges) == 1) { - IndexRange irange = (IndexRange) linitial_oid(ranges); - if (irange_lower(irange) == irange_upper(irange)) + IndexRange irange = linitial_irange(ranges); + if (irange.ir_lower == irange.ir_upper) { - Oid *children = (Oid *) dsm_array_get_pointer(&prel->children); - rte->relid = children[irange_lower(irange)]; + Oid *children = PrelGetChildrenArray(prel); + rte->relid = children[irange.ir_lower]; rte->inh = false; } } @@ -495,63 +349,6 @@ handle_modification_query(Query *parse) return; } -/* - * Shared memory startup hook - */ -static void -pathman_shmem_startup(void) -{ - /* Allocate shared memory 
objects */ - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - init_dsm_config(); - init_shmem_config(); - LWLockRelease(AddinShmemInitLock); - - /* Invoke original hook if needed */ - if (shmem_startup_hook_original != NULL) - shmem_startup_hook_original(); -} - -void -set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, - Index rti, RangeTblEntry *rte) -{ - double parent_rows = 0; - double parent_size = 0; - ListCell *l; - - foreach(l, root->append_rel_list) - { - AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); - int childRTindex, - parentRTindex = rti; - RelOptInfo *childrel; - - /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != parentRTindex) - continue; - - childRTindex = appinfo->child_relid; - - childrel = find_base_rel(root, childRTindex); - Assert(childrel->reloptkind == RELOPT_OTHER_MEMBER_REL); - - /* - * Accumulate size information from each live child. - */ - Assert(childrel->rows > 0); - - parent_rows += childrel->rows; - parent_size += childrel->width * childrel->rows; - } - - rel->rows = parent_rows; - rel->width = rint(parent_size / parent_rows); - // for (i = 0; i < nattrs; i++) - // rel->attr_widths[i] = rint(parent_attrsizes[i] / parent_rows); - rel->tuples = parent_rows; -} - /* * Creates child relation and adds it to root. * Returns child index in simple_rel_array @@ -560,14 +357,18 @@ int append_child_relation(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte, int index, Oid childOid, List *wrappers) { - RangeTblEntry *childrte; - RelOptInfo *childrel; - Index childRTindex; - AppendRelInfo *appinfo; - Node *node; - ListCell *lc, *lc2; - Relation newrelation; - + RangeTblEntry *childrte; + RelOptInfo *childrel; + Index childRTindex; + AppendRelInfo *appinfo; + ListCell *lc, + *lc2; + Relation newrelation; + PlanRowMark *parent_rowmark; + PlanRowMark *child_rowmark; + AttrNumber i; + + /* FIXME: acquire a suitable lock on partition */ newrelation = heap_open(childOid, NoLock); /* @@ -587,56 +388,75 @@ append_child_relation(PlannerInfo *root, RelOptInfo *rel, Index rti, childrel = build_simple_rel(root, childRTindex, RELOPT_OTHER_MEMBER_REL); /* Copy targetlist */ - childrel->reltargetlist = NIL; - foreach(lc, rel->reltargetlist) - { - Node *new_target; + copy_targetlist_compat(childrel, rel); - node = (Node *) lfirst(lc); - new_target = copyObject(node); - change_varnos(new_target, rel->relid, childrel->relid); - childrel->reltargetlist = lappend(childrel->reltargetlist, new_target); - } + /* Copy attr_needed & attr_widths */ + childrel->attr_needed = (Relids *) + palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); + childrel->attr_widths = (int32 *) + palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); - /* Copy attr_needed (used in build_joinrel_tlist() function) */ - childrel->attr_needed = rel->attr_needed; + for (i = 0; i < rel->max_attr - rel->min_attr + 1; i++) + childrel->attr_needed[i] = bms_copy(rel->attr_needed[i]); - /* Copy restrictions */ + memcpy(childrel->attr_widths, rel->attr_widths, + (rel->max_attr - rel->min_attr + 1) * sizeof(int32)); + + /* + * Copy restrictions. 
If it's not the parent table then copy only those + * restrictions that reference to this partition + */ childrel->baserestrictinfo = NIL; - forboth(lc, wrappers, lc2, rel->baserestrictinfo) + if (rte->relid != childOid) { - bool alwaysTrue; - WrapperNode *wrap = (WrapperNode *) lfirst(lc); - Node *new_clause = wrapper_make_expression(wrap, index, &alwaysTrue); - RestrictInfo *old_rinfo = (RestrictInfo *) lfirst(lc2); - - if (alwaysTrue) + forboth(lc, wrappers, lc2, rel->baserestrictinfo) { - continue; - } - Assert(new_clause); + bool alwaysTrue; + WrapperNode *wrap = (WrapperNode *) lfirst(lc); + Node *new_clause = wrapper_make_expression(wrap, index, &alwaysTrue); + RestrictInfo *old_rinfo = (RestrictInfo *) lfirst(lc2); - if (and_clause((Node *) new_clause)) - { - ListCell *alc; + if (alwaysTrue) + { + continue; + } + Assert(new_clause); + + if (and_clause((Node *) new_clause)) + { + ListCell *alc; + + foreach(alc, ((BoolExpr *) new_clause)->args) + { + Node *arg = (Node *) lfirst(alc); + RestrictInfo *new_rinfo = rebuild_restrictinfo(arg, old_rinfo); - foreach(alc, ((BoolExpr *) new_clause)->args) + change_varnos((Node *)new_rinfo, rel->relid, childrel->relid); + childrel->baserestrictinfo = lappend(childrel->baserestrictinfo, + new_rinfo); + } + } + else { - Node *arg = (Node *) lfirst(alc); - RestrictInfo *new_rinfo = rebuild_restrictinfo(arg, old_rinfo); + RestrictInfo *new_rinfo = rebuild_restrictinfo(new_clause, old_rinfo); + /* Replace old relids with new ones */ change_varnos((Node *)new_rinfo, rel->relid, childrel->relid); + childrel->baserestrictinfo = lappend(childrel->baserestrictinfo, - new_rinfo); + (void *) new_rinfo); } } - else + } + /* If it's the parent table then copy all restrictions */ + else + { + foreach(lc, rel->baserestrictinfo) { - RestrictInfo *new_rinfo = rebuild_restrictinfo(new_clause, old_rinfo); + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + RestrictInfo *new_rinfo = (RestrictInfo *) copyObject(rinfo); - /* Replace old relids with new ones */ change_varnos((Node *)new_rinfo, rel->relid, childrel->relid); - childrel->baserestrictinfo = lappend(childrel->baserestrictinfo, (void *) new_rinfo); } @@ -680,8 +500,35 @@ append_child_relation(PlannerInfo *root, RelOptInfo *rel, Index rti, /* Recalc parent relation tuples count */ rel->tuples += childrel->tuples; + /* Close child relations, but keep locks */ heap_close(newrelation, NoLock); + + /* Create rowmarks required for child rels */ + parent_rowmark = get_plan_rowmark(root->rowMarks, rti); + if (parent_rowmark) + { + child_rowmark = makeNode(PlanRowMark); + + child_rowmark->rti = childRTindex; + child_rowmark->prti = rti; + child_rowmark->rowmarkId = parent_rowmark->rowmarkId; + /* Reselect rowmark type, because relkind might not match parent */ + child_rowmark->markType = select_rowmark_type(childrte, + parent_rowmark->strength); + child_rowmark->allMarkTypes = (1 << child_rowmark->markType); + child_rowmark->strength = parent_rowmark->strength; + child_rowmark->waitPolicy = parent_rowmark->waitPolicy; + child_rowmark->isParent = false; + + /* Include child's rowmark type in parent's allMarkTypes */ + parent_rowmark->allMarkTypes |= child_rowmark->allMarkTypes; + + root->rowMarks = lappend(root->rowMarks, child_rowmark); + + parent_rowmark->isParent = true; + } + return childRTindex; } @@ -710,6 +557,7 @@ wrapper_make_expression(WrapperNode *wrap, int index, bool *alwaysTrue) * sequntially. 
*/ found = irange_list_find(wrap->rangeset, index, &lossy); + /* Return NULL for always true and always false. */ if (!found) return NULL; @@ -722,7 +570,7 @@ wrapper_make_expression(WrapperNode *wrap, int index, bool *alwaysTrue) if (IsA(wrap->orig, BoolExpr)) { const BoolExpr *expr = (const BoolExpr *) wrap->orig; - BoolExpr *result; + BoolExpr *result; if (expr->boolop == OR_EXPR || expr->boolop == AND_EXPR) { @@ -734,7 +582,8 @@ wrapper_make_expression(WrapperNode *wrap, int index, bool *alwaysTrue) Node *arg; bool childAlwaysTrue; - arg = wrapper_make_expression((WrapperNode *)lfirst(lc), index, &childAlwaysTrue); + arg = wrapper_make_expression((WrapperNode *) lfirst(lc), + index, &childAlwaysTrue); #ifdef USE_ASSERT_CHECKING /* * We shouldn't get there for always true clause under OR and @@ -760,17 +609,13 @@ wrapper_make_expression(WrapperNode *wrap, int index, bool *alwaysTrue) result->args = args; result->boolop = expr->boolop; result->location = expr->location; - return (Node *)result; + return (Node *) result; } else - { return copyObject(wrap->orig); - } } else - { return copyObject(wrap->orig); - } } /* @@ -786,85 +631,541 @@ walk_expr_tree(Expr *expr, WalkerContext *context) switch (expr->type) { + /* Useful for INSERT optimization */ + case T_Const: + return handle_const((Const *) expr, context); + /* AND, OR, NOT expressions */ case T_BoolExpr: boolexpr = (BoolExpr *) expr; return handle_boolexpr(boolexpr, context); + /* =, !=, <, > etc. */ case T_OpExpr: opexpr = (OpExpr *) expr; return handle_opexpr(opexpr, context); + /* IN expression */ case T_ScalarArrayOpExpr: arrexpr = (ScalarArrayOpExpr *) expr; return handle_arrexpr(arrexpr, context); + default: - result = (WrapperNode *)palloc(sizeof(WrapperNode)); - result->orig = (const Node *)expr; + result = (WrapperNode *) palloc(sizeof(WrapperNode)); + result->orig = (const Node *) expr; result->args = NIL; - result->rangeset = list_make1_irange(make_irange(0, context->prel->children_count - 1, true)); + result->rangeset = list_make1_irange( + make_irange(0, PrelLastChild(context->prel), true)); result->paramsel = 1.0; - return result; } } +/* + * Append\prepend partitions if there's no partition to store 'value'. + * + * Used by create_partitions_internal(). + * + * NB: 'value' type is not needed since we've already taken + * it into account while searching for the 'cmp_proc'. + */ +static bool +spawn_partitions(Oid partitioned_rel, /* parent's Oid */ + Datum value, /* value to be INSERTed */ + Datum leading_bound, /* current global min\max */ + Oid leading_bound_type, /* type of the boundary */ + FmgrInfo *cmp_proc, /* cmp(value, leading_bound) */ + Datum interval_binary, /* interval in binary form */ + Oid interval_type, /* INTERVALOID or prel->atttype */ + bool forward, /* append\prepend */ + Oid *last_partition) /* result (Oid of the last partition) */ +{ +/* Cache "+"(leading_bound, interval) or "-"(leading_bound, interval) operator */ +#define CacheOperator(finfo, opname, arg1, arg2, is_cached) \ + do { \ + if (!is_cached) \ + { \ + fmgr_info(get_binary_operator_oid((opname), (arg1), (arg2)), \ + (finfo)); \ + is_cached = true; \ + } \ + } while (0) + +/* Use "<" for prepend & ">=" for append */ +#define do_compare(compar, a, b, fwd) \ + ( \ + (fwd) ? \ + check_ge((compar), (a), (b)) : \ + check_lt((compar), (a), (b)) \ + ) + + FmgrInfo interval_move_bound; /* function to move upper\lower boundary */ + bool interval_move_bound_cached = false; /* is it cached already? 
*/ + bool spawned = false; + + Datum cur_part_leading = leading_bound; + + char *query; + + /* Create querty statement */ + query = psprintf("SELECT part::regclass " + "FROM %s.create_single_range_partition($1, $2, $3) AS part", + get_namespace_name(get_pathman_schema())); + + /* Execute comparison function cmp(value, cur_part_leading) */ + while (do_compare(cmp_proc, value, cur_part_leading, forward)) + { + char *nulls = NULL; /* no params are NULL */ + Oid types[3] = { REGCLASSOID, leading_bound_type, leading_bound_type }; + Datum values[3]; + int ret; + + /* Assign the 'following' boundary to current 'leading' value */ + Datum cur_part_following = cur_part_leading; + + CacheOperator(&interval_move_bound, (forward ? "+" : "-"), + leading_bound_type, interval_type, interval_move_bound_cached); + + /* Move leading bound by interval (leading +\- INTERVAL) */ + cur_part_leading = FunctionCall2(&interval_move_bound, + cur_part_leading, + interval_binary); + + /* Fill in 'values' with parent's Oid and correct boundaries... */ + values[0] = partitioned_rel; /* partitioned table's Oid */ + values[1] = forward ? cur_part_following : cur_part_leading; /* value #1 */ + values[2] = forward ? cur_part_leading : cur_part_following; /* value #2 */ + + /* ...and create partition */ + ret = SPI_execute_with_args(query, 3, types, values, nulls, false, 0); + if (ret != SPI_OK_SELECT) + elog(ERROR, "Could not spawn a partition"); + + /* Set 'last_partition' if necessary */ + if (last_partition) + { + HeapTuple htup = SPI_tuptable->vals[0]; + Datum partid; + bool isnull; + + Assert(SPI_processed == 1); + Assert(SPI_tuptable->tupdesc->natts == 1); + partid = SPI_getbinval(htup, SPI_tuptable->tupdesc, 1, &isnull); + + *last_partition = DatumGetObjectId(partid); + } + +#ifdef USE_ASSERT_CHECKING + elog(DEBUG2, "%s partition with following='%s' & leading='%s' [%u]", + (forward ? "Appending" : "Prepending"), + DebugPrintDatum(cur_part_following, leading_bound_type), + DebugPrintDatum(cur_part_leading, leading_bound_type), + MyProcPid); +#endif + + /* We have spawned at least 1 partition */ + spawned = true; + } + + pfree(query); + + return spawned; +} + +/* + * Convert interval from TEXT to binary form using partitioned column's type. + */ +static Datum +extract_binary_interval_from_text(Datum interval_text, /* interval as TEXT */ + Oid part_atttype, /* partitioned column's type */ + Oid *interval_type) /* returned value */ +{ + Datum interval_binary; + const char *interval_cstring; + + interval_cstring = TextDatumGetCString(interval_text); + + /* If 'part_atttype' is a *date type*, cast 'range_interval' to INTERVAL */ + if (is_date_type_internal(part_atttype)) + { + int32 interval_typmod = PATHMAN_CONFIG_interval_typmod; + + /* Convert interval from CSTRING to internal form */ + interval_binary = DirectFunctionCall3(interval_in, + CStringGetDatum(interval_cstring), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(interval_typmod)); + if (interval_type) + *interval_type = INTERVALOID; + } + /* Otherwise cast it to the partitioned column's type */ + else + { + HeapTuple htup; + Oid typein_proc = InvalidOid; + + htup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(part_atttype)); + if (HeapTupleIsValid(htup)) + { + typein_proc = ((Form_pg_type) GETSTRUCT(htup))->typinput; + ReleaseSysCache(htup); + } + else + elog(ERROR, "Cannot find input function for type %u", part_atttype); + + /* + * Convert interval from CSTRING to 'prel->atttype'. + * + * Note: We pass 3 arguments in case + * 'typein_proc' also takes Oid & typmod. 
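+		 * (e.g. int4in() only reads the cstring argument, whereas
+		 *  numeric_in() also consults the type Oid and typmod)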
+ */ + interval_binary = OidFunctionCall3(typein_proc, + CStringGetDatum(interval_cstring), + ObjectIdGetDatum(part_atttype), + Int32GetDatum(-1)); + if (interval_type) + *interval_type = part_atttype; + } + + return interval_binary; +} + +/* + * Append partitions (if needed) and return Oid of the partition to contain value. + * + * NB: This function should not be called directly, use create_partitions() instead. + */ +Oid +create_partitions_internal(Oid relid, Datum value, Oid value_type) +{ + MemoryContext old_mcxt = CurrentMemoryContext; + Oid partid = InvalidOid; /* last created partition (or InvalidOid) */ + + PG_TRY(); + { + const PartRelationInfo *prel; + Datum values[Natts_pathman_config]; + bool isnull[Natts_pathman_config]; + + /* Get both PartRelationInfo & PATHMAN_CONFIG contents for this relation */ + if (pathman_config_contains_relation(relid, values, isnull, NULL)) + { + Oid base_atttype; /* base type of prel->atttype */ + Oid base_value_type; /* base type of value_type */ + + Datum min_rvalue, /* absolute MIN */ + max_rvalue; /* absolute MAX */ + + Oid interval_type = InvalidOid; + Datum interval_binary, /* assigned 'width' of a single partition */ + interval_text; + + FmgrInfo interval_type_cmp; + + /* Fetch PartRelationInfo by 'relid' */ + prel = get_pathman_relation_info(relid); + shout_if_prel_is_invalid(relid, prel, PT_RANGE); + + /* Fetch base types of prel->atttype & value_type */ + base_atttype = getBaseType(prel->atttype); + base_value_type = getBaseType(value_type); + + /* Read max & min range values from PartRelationInfo */ + min_rvalue = PrelGetRangesArray(prel)[0].min; + max_rvalue = PrelGetRangesArray(prel)[PrelLastChild(prel)].max; + + /* Copy datums on order to protect them from cache invalidation */ + min_rvalue = datumCopy(min_rvalue, prel->attbyval, prel->attlen); + max_rvalue = datumCopy(max_rvalue, prel->attbyval, prel->attlen); + + /* Retrieve interval as TEXT from tuple */ + interval_text = values[Anum_pathman_config_range_interval - 1]; + + /* Convert interval to binary representation */ + interval_binary = extract_binary_interval_from_text(interval_text, + base_atttype, + &interval_type); + + /* Fill the FmgrInfo struct with a cmp(value, part_attribute) function */ + fill_type_cmp_fmgr_info(&interval_type_cmp, base_value_type, base_atttype); + + if (SPI_connect() != SPI_OK_CONNECT) + elog(ERROR, "could not connect using SPI"); + + /* while (value >= MAX) ... */ + spawn_partitions(PrelParentRelid(prel), value, max_rvalue, + base_atttype, &interval_type_cmp, interval_binary, + interval_type, true, &partid); + + /* while (value < MIN) ... */ + if (partid == InvalidOid) + spawn_partitions(PrelParentRelid(prel), value, min_rvalue, + base_atttype, &interval_type_cmp, interval_binary, + interval_type, false, &partid); + + SPI_finish(); /* close SPI connection */ + } + else + elog(ERROR, "pg_pathman's config does not contain relation \"%s\"", + get_rel_name_or_relid(relid)); + } + PG_CATCH(); + { + ErrorData *edata; + + /* Switch to the original context & copy edata */ + MemoryContextSwitchTo(old_mcxt); + edata = CopyErrorData(); + FlushErrorState(); + + elog(LOG, "create_partitions_internal(): %s [%u]", + edata->message, MyProcPid); + + FreeErrorData(edata); + + SPI_finish(); /* no problem if not connected */ + + /* Reset 'partid' in case of error */ + partid = InvalidOid; + } + PG_END_TRY(); + + return partid; +} + +/* + * Create RANGE partitions (if needed) using either BGW or current backend. + * + * Returns Oid of the partition to store 'value'. 
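+ *
+ * (Typically reached from the INSERT path when 'value' does not fit into
+ *  any existing partition; see create_partitions_internal() above.)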
+ */ +Oid +create_partitions(Oid relid, Datum value, Oid value_type) +{ + TransactionId rel_xmin; + Oid last_partition = InvalidOid; + + /* Check that table is partitioned and fetch xmin */ + if (pathman_config_contains_relation(relid, NULL, NULL, &rel_xmin)) + { + bool part_in_prev_xact = + TransactionIdPrecedes(rel_xmin, GetCurrentTransactionId()) || + TransactionIdEquals(rel_xmin, FrozenTransactionId); + + /* + * If table has been partitioned in some previous xact AND + * we don't hold any conflicting locks, run BGWorker. + */ + if (part_in_prev_xact && !xact_bgw_conflicting_lock_exists(relid)) + { + elog(DEBUG2, "create_partitions(): chose BGWorker [%u]", MyProcPid); + last_partition = create_partitions_bg_worker(relid, value, value_type); + } + /* Else it'd be better for the current backend to create partitions */ + else + { + elog(DEBUG2, "create_partitions(): chose backend [%u]", MyProcPid); + last_partition = create_partitions_internal(relid, value, value_type); + } + } + else + elog(ERROR, "relation \"%s\" is not partitioned by pg_pathman", + get_rel_name_or_relid(relid)); + + /* Check that 'last_partition' is valid */ + if (last_partition == InvalidOid) + elog(ERROR, "could not create new partitions for relation \"%s\"", + get_rel_name_or_relid(relid)); + + return last_partition; +} + +/* + * Given RangeEntry array and 'value', return selected + * RANGE partitions inside the WrapperNode. + */ void -finish_least_greatest(WrapperNode *wrap, WalkerContext *context) +select_range_partitions(const Datum value, + FmgrInfo *cmp_func, + const RangeEntry *ranges, + const int nranges, + const int strategy, + WrapperNode *result) { - if (context->hasLeast && context->hasGreatest) + const RangeEntry *current_re; + bool lossy = false, + is_less, + is_greater; + +#ifdef USE_ASSERT_CHECKING + bool found = false; + int counter = 0; +#endif + + int i, + startidx = 0, + endidx = nranges - 1, + cmp_min, + cmp_max; + + /* Initial value (no missing partitions found) */ + result->found_gap = false; + + /* Check boundaries */ + if (nranges == 0) + { + result->rangeset = NIL; + return; + } + else { - switch (context->prel->atttype) + Assert(ranges); + Assert(cmp_func); + + /* Corner cases */ + cmp_min = FunctionCall2(cmp_func, value, ranges[startidx].min), + cmp_max = FunctionCall2(cmp_func, value, ranges[endidx].max); + + if ((cmp_min <= 0 && strategy == BTLessStrategyNumber) || + (cmp_min < 0 && (strategy == BTLessEqualStrategyNumber || + strategy == BTEqualStrategyNumber))) { - case INT4OID: - { - int least = DatumGetInt32(context->least), - greatest = DatumGetInt32(context->greatest); - List *rangeset = NIL; + result->rangeset = NIL; + return; + } - if (greatest - least + 1 < context->prel->children_count) - { - int value, - hash; - for (value = least; value <= greatest; value++) - { - hash = make_hash(value, context->prel->children_count); - rangeset = irange_list_union(rangeset, - list_make1_irange(make_irange(hash, hash, true))); - } - wrap->rangeset = irange_list_intersect(wrap->rangeset, - rangeset); - } - } - break; - default: - elog(ERROR, "Invalid datatype: %u", context->prel->atttype); + if (cmp_max >= 0 && (strategy == BTGreaterEqualStrategyNumber || + strategy == BTGreaterStrategyNumber || + strategy == BTEqualStrategyNumber)) + { + result->rangeset = NIL; + return; + } + + if ((cmp_min < 0 && strategy == BTGreaterStrategyNumber) || + (cmp_min <= 0 && strategy == BTGreaterEqualStrategyNumber)) + { + result->rangeset = list_make1_irange(make_irange(startidx, endidx, false)); + return; + } 
+ + if (cmp_max >= 0 && (strategy == BTLessEqualStrategyNumber || + strategy == BTLessStrategyNumber)) + { + result->rangeset = list_make1_irange(make_irange(startidx, endidx, false)); + return; + } + } + + /* Binary search */ + while (true) + { + Assert(cmp_func); + + i = startidx + (endidx - startidx) / 2; + Assert(i >= 0 && i < nranges); + + current_re = &ranges[i]; + + cmp_min = FunctionCall2(cmp_func, value, current_re->min); + cmp_max = FunctionCall2(cmp_func, value, current_re->max); + + is_less = (cmp_min < 0 || (cmp_min == 0 && strategy == BTLessStrategyNumber)); + is_greater = (cmp_max > 0 || (cmp_max >= 0 && strategy != BTLessStrategyNumber)); + + if (!is_less && !is_greater) + { + if (strategy == BTGreaterEqualStrategyNumber && cmp_min == 0) + lossy = false; + else if (strategy == BTLessStrategyNumber && cmp_max == 0) + lossy = false; + else + lossy = true; +#ifdef USE_ASSERT_CHECKING + found = true; +#endif + break; + } + + /* If we still haven't found partition then it doesn't exist */ + if (startidx >= endidx) + { + result->rangeset = NIL; + result->found_gap = true; + return; } + + if (is_less) + endidx = i - 1; + else if (is_greater) + startidx = i + 1; + + /* For debug's sake */ + Assert(++counter < 100); + } + + Assert(found); + + /* Filter partitions */ + switch(strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (lossy) + { + result->rangeset = list_make1_irange(make_irange(i, i, true)); + if (i > 0) + result->rangeset = lcons_irange(make_irange(0, i - 1, false), + result->rangeset); + } + else + { + result->rangeset = list_make1_irange(make_irange(0, i, false)); + } + break; + + case BTEqualStrategyNumber: + result->rangeset = list_make1_irange(make_irange(i, i, true)); + break; + + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (lossy) + { + result->rangeset = list_make1_irange(make_irange(i, i, true)); + if (i < nranges - 1) + result->rangeset = + lappend_irange(result->rangeset, + make_irange(i + 1, + nranges - 1, + false)); + } + else + { + result->rangeset = + list_make1_irange(make_irange(i, + nranges - 1, + false)); + } + break; + + default: + elog(ERROR, "Unknown btree strategy (%u)", strategy); + break; } - context->hasLeast = false; - context->hasGreatest = false; } /* - * This function determines which partitions should appear in query plan + * This function determines which partitions should appear in query plan. 
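+ *
+ * Example: for HASH partitioning an equality clause selects exactly one
+ * partition via hash_to_part_index(); for RANGE partitioning the work is
+ * delegated to select_range_partitions() (see above).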
*/ static void handle_binary_opexpr(WalkerContext *context, WrapperNode *result, const Node *varnode, const Const *c) { - HashRelationKey key; - RangeRelation *rangerel; - Datum value; - int i, - strategy; - uint32 int_value; - bool is_less, - is_greater; - FmgrInfo cmp_func; - Oid cmp_proc_oid; - Oid vartype; - const OpExpr *expr = (const OpExpr *)result->orig; - TypeCacheEntry *tce; + int strategy; + TypeCacheEntry *tce; + FmgrInfo cmp_func; + Oid vartype; + const OpExpr *expr = (const OpExpr *) result->orig; const PartRelationInfo *prel = context->prel; Assert(IsA(varnode, Var) || IsA(varnode, RelabelType)); @@ -873,203 +1174,48 @@ handle_binary_opexpr(WalkerContext *context, WrapperNode *result, ((Var *) varnode)->vartype : ((RelabelType *) varnode)->resulttype; - /* Determine operator type */ - tce = lookup_type_cache(vartype, - TYPECACHE_BTREE_OPFAMILY | TYPECACHE_CMP_PROC | TYPECACHE_CMP_PROC_FINFO); - + tce = lookup_type_cache(vartype, TYPECACHE_BTREE_OPFAMILY); strategy = get_op_opfamily_strategy(expr->opno, tce->btree_opf); - cmp_proc_oid = get_opfamily_proc(tce->btree_opf, - vartype, - c->consttype, - BTORDER_PROC); - fmgr_info(cmp_proc_oid, &cmp_func); + fill_type_cmp_fmgr_info(&cmp_func, + getBaseType(c->consttype), + getBaseType(prel->atttype)); switch (prel->parttype) { case PT_HASH: if (strategy == BTEqualStrategyNumber) { - value = OidFunctionCall1(prel->hash_proc, c->constvalue); - int_value = DatumGetUInt32(value); - key.hash = make_hash(int_value, prel->children_count); - result->rangeset = list_make1_irange(make_irange(key.hash, key.hash, true)); - return; - } - break; - case PT_RANGE: - value = c->constvalue; - rangerel = get_pathman_range_relation(prel->key.relid, NULL); - if (rangerel != NULL) - { - RangeEntry *re; - bool lossy = false; -#ifdef USE_ASSERT_CHECKING - bool found = false; - int counter = 0; -#endif - int startidx = 0, - cmp_min, - cmp_max, - endidx = rangerel->ranges.length - 1; - RangeEntry *ranges = dsm_array_get_pointer(&rangerel->ranges); - bool byVal = rangerel->by_val; - - /* Check boundaries */ - if (rangerel->ranges.length == 0) - { - result->rangeset = NIL; - return; - } - else - { - /* Corner cases */ - cmp_min = FunctionCall2(&cmp_func, value, - PATHMAN_GET_DATUM(ranges[0].min, byVal)), - cmp_max = FunctionCall2(&cmp_func, value, - PATHMAN_GET_DATUM(ranges[rangerel->ranges.length - 1].max, byVal)); - - if ((cmp_min < 0 && - (strategy == BTLessEqualStrategyNumber || - strategy == BTEqualStrategyNumber)) || - (cmp_min <= 0 && strategy == BTLessStrategyNumber)) - { - result->rangeset = NIL; - return; - } - - if (cmp_max >= 0 && (strategy == BTGreaterEqualStrategyNumber || - strategy == BTGreaterStrategyNumber || - strategy == BTEqualStrategyNumber)) - { - result->rangeset = NIL; - return; - } - - if ((cmp_min < 0 && strategy == BTGreaterStrategyNumber) || - (cmp_min <= 0 && strategy == BTGreaterEqualStrategyNumber)) - { - result->rangeset = list_make1_irange(make_irange(startidx, endidx, false)); - return; - } - - if (cmp_max >= 0 && (strategy == BTLessEqualStrategyNumber || - strategy == BTLessStrategyNumber)) - { - result->rangeset = list_make1_irange(make_irange(startidx, endidx, false)); - return; - } - } + Datum value = OidFunctionCall1(prel->hash_proc, c->constvalue); + uint32 idx = hash_to_part_index(DatumGetInt32(value), + PrelChildrenCount(prel)); - /* Binary search */ - while (true) - { - i = startidx + (endidx - startidx) / 2; - Assert(i >= 0 && i < rangerel->ranges.length); - re = &ranges[i]; - cmp_min = FunctionCall2(&cmp_func, 
value, PATHMAN_GET_DATUM(re->min, byVal)); - cmp_max = FunctionCall2(&cmp_func, value, PATHMAN_GET_DATUM(re->max, byVal)); - - is_less = (cmp_min < 0 || (cmp_min == 0 && strategy == BTLessStrategyNumber)); - is_greater = (cmp_max > 0 || (cmp_max >= 0 && strategy != BTLessStrategyNumber)); - - if (!is_less && !is_greater) - { - if (strategy == BTGreaterEqualStrategyNumber && cmp_min == 0) - lossy = false; - else if (strategy == BTLessStrategyNumber && cmp_max == 0) - lossy = false; - else - lossy = true; -#ifdef USE_ASSERT_CHECKING - found = true; -#endif - break; - } + result->rangeset = list_make1_irange(make_irange(idx, idx, true)); - /* If we still didn't find partition then it doesn't exist */ - if (startidx >= endidx) - { - /* Handle case when we hit the gap between partitions */ - if (strategy != BTEqualStrategyNumber) - { - if (strategy == BTLessStrategyNumber || - strategy == BTLessEqualStrategyNumber) - { - if (is_less && i > 0) - i--; - } - if (strategy == BTGreaterStrategyNumber || - strategy == BTGreaterEqualStrategyNumber) - { - if (is_greater && i < rangerel->ranges.length - 1) - i++; - } - lossy = false; - break; - } - result->rangeset = NIL; - return; - } - - if (is_less) - endidx = i - 1; - else if (is_greater) - startidx = i + 1; - - /* For debug's sake */ - Assert(++counter < 100); - } - - Assert(found); + return; /* exit on equal */ + } + break; /* continue to function's end */ - /* Filter partitions */ - switch(strategy) - { - case BTLessStrategyNumber: - case BTLessEqualStrategyNumber: - if (lossy) - { - result->rangeset = list_make1_irange(make_irange(i, i, true)); - if (i > 0) - result->rangeset = lcons_irange( - make_irange(0, i - 1, false), result->rangeset); - } - else - { - result->rangeset = list_make1_irange( - make_irange(0, i, false)); - } - return; - case BTEqualStrategyNumber: - result->rangeset = list_make1_irange(make_irange(i, i, true)); - return; - case BTGreaterEqualStrategyNumber: - case BTGreaterStrategyNumber: - if (lossy) - { - result->rangeset = list_make1_irange(make_irange(i, i, true)); - if (i < prel->children_count - 1) - result->rangeset = lappend_irange(result->rangeset, - make_irange(i + 1, prel->children_count - 1, false)); - } - else - { - result->rangeset = list_make1_irange( - make_irange(i, prel->children_count - 1, false)); - } - return; - } - result->rangeset = list_make1_irange(make_irange(startidx, endidx, true)); + case PT_RANGE: + { + select_range_partitions(c->constvalue, + &cmp_func, + PrelGetRangesArray(context->prel), + PrelChildrenCount(context->prel), + strategy, + result); return; } + + default: + elog(ERROR, "Unknown partitioning type %u", prel->parttype); } - result->rangeset = list_make1_irange(make_irange(0, prel->children_count - 1, true)); + result->rangeset = list_make1_irange(make_irange(0, PrelLastChild(prel), true)); result->paramsel = 1.0; } /* - * Estimate selectivity of parametrized quals. + * Estimate selectivity of parametrized quals. 
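+ * Since the actual value is unknown at planning time, all partitions are
+ * returned and only 'paramsel' is adjusted according to the strategy.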
*/ static void handle_binary_opexpr_param(const PartRelationInfo *prel, @@ -1090,11 +1236,11 @@ handle_binary_opexpr_param(const PartRelationInfo *prel, tce = lookup_type_cache(vartype, TYPECACHE_BTREE_OPFAMILY); strategy = get_op_opfamily_strategy(expr->opno, tce->btree_opf); - result->rangeset = list_make1_irange(make_irange(0, prel->children_count - 1, true)); + result->rangeset = list_make1_irange(make_irange(0, PrelLastChild(prel), true)); if (strategy == BTEqualStrategyNumber) { - result->paramsel = 1.0 / (double) prel->children_count; + result->paramsel = 1.0 / (double) PrelChildrenCount(prel); } else if (prel->parttype == PT_RANGE && strategy > 0) { @@ -1107,73 +1253,58 @@ handle_binary_opexpr_param(const PartRelationInfo *prel, } /* - * Calculates hash value + * Convert hash value to the partition index. */ uint32 -make_hash(uint32 value, uint32 partitions) +hash_to_part_index(uint32 value, uint32 partitions) { return value % partitions; } -/* - * Search for range section. Returns position of the item in array. - * If item wasn't found then function returns closest position and sets - * foundPtr to false. If value is outside the range covered by partitions - * then returns -1. - */ -int -range_binary_search(const RangeRelation *rangerel, FmgrInfo *cmp_func, Datum value, bool *foundPtr) +search_rangerel_result +search_range_partition_eq(const Datum value, + FmgrInfo *cmp_func, + const PartRelationInfo *prel, + RangeEntry *out_re) /* returned RangeEntry */ { - RangeEntry *ranges = dsm_array_get_pointer(&rangerel->ranges); - RangeEntry *re; - bool byVal = rangerel->by_val; - int cmp_min, - cmp_max, - i = 0, - startidx = 0, - endidx = rangerel->ranges.length-1; -#ifdef USE_ASSERT_CHECKING - int counter = 0; -#endif + RangeEntry *ranges; + int nranges; + WrapperNode result; - *foundPtr = false; + ranges = PrelGetRangesArray(prel); + nranges = PrelChildrenCount(prel); - /* Check boundaries */ - cmp_min = FunctionCall2(cmp_func, value, PATHMAN_GET_DATUM(ranges[0].min, byVal)), - cmp_max = FunctionCall2(cmp_func, value, PATHMAN_GET_DATUM(ranges[rangerel->ranges.length - 1].max, byVal)); + select_range_partitions(value, + cmp_func, + ranges, + nranges, + BTEqualStrategyNumber, + &result); - if (cmp_min < 0 || cmp_max >= 0) + if (result.found_gap) { - return -1; + return SEARCH_RANGEREL_GAP; } - - while (true) + else if (result.rangeset == NIL) { - i = startidx + (endidx - startidx) / 2; - Assert(i >= 0 && i < rangerel->ranges.length); - re = &ranges[i]; - cmp_min = FunctionCall2(cmp_func, value, PATHMAN_GET_DATUM(re->min, byVal)); - cmp_max = FunctionCall2(cmp_func, value, PATHMAN_GET_DATUM(re->max, byVal)); + return SEARCH_RANGEREL_OUT_OF_RANGE; + } + else + { + IndexRange irange = linitial_irange(result.rangeset); - if (cmp_min >= 0 && cmp_max < 0) - { - *foundPtr = true; - break; - } + Assert(list_length(result.rangeset) == 1); + Assert(irange.ir_lower == irange.ir_upper); + Assert(irange.ir_valid); - if (startidx >= endidx) - return i; + /* Write result to the 'out_rentry' if necessary */ + if (out_re) + memcpy((void *) out_re, + (const void *) &ranges[irange.ir_lower], + sizeof(RangeEntry)); - if (cmp_min < 0) - endidx = i - 1; - else if (cmp_max >= 0) - startidx = i + 1; - - /* For debug's sake */ - Assert(++counter < 100); + return SEARCH_RANGEREL_FOUND; } - - return i; } static Const * @@ -1188,6 +1319,62 @@ extract_const(WalkerContext *wcxt, Param *param) value, isnull, get_typbyval(param->paramtype)); } +static WrapperNode * +handle_const(const Const *c, WalkerContext *context) 
+{ + const PartRelationInfo *prel = context->prel; + WrapperNode *result = (WrapperNode *) palloc(sizeof(WrapperNode)); + + result->orig = (const Node *) c; + + /* + * Had to add this check for queries like: + * select * from test.hash_rel where txt = NULL; + */ + if (!context->for_insert) + { + result->rangeset = list_make1_irange(make_irange(0, + PrelLastChild(prel), + true)); + result->paramsel = 1.0; + + return result; + } + + switch (prel->parttype) + { + case PT_HASH: + { + Datum value = OidFunctionCall1(prel->hash_proc, c->constvalue); + uint32 idx = hash_to_part_index(DatumGetInt32(value), + PrelChildrenCount(prel)); + result->rangeset = list_make1_irange(make_irange(idx, idx, true)); + } + break; + + case PT_RANGE: + { + TypeCacheEntry *tce; + + tce = lookup_type_cache(c->consttype, TYPECACHE_CMP_PROC_FINFO); + + select_range_partitions(c->constvalue, + &tce->cmp_proc_finfo, + PrelGetRangesArray(context->prel), + PrelChildrenCount(context->prel), + BTEqualStrategyNumber, + result); + } + break; + + default: + elog(ERROR, "Unknown partitioning type %u", prel->parttype); + break; + } + + return result; +} + /* * Operator expression handler */ @@ -1218,7 +1405,7 @@ handle_opexpr(const OpExpr *expr, WalkerContext *context) } } - result->rangeset = list_make1_irange(make_irange(0, prel->children_count - 1, true)); + result->rangeset = list_make1_irange(make_irange(0, PrelLastChild(prel), true)); result->paramsel = 1.0; return result; } @@ -1226,12 +1413,15 @@ handle_opexpr(const OpExpr *expr, WalkerContext *context) /* * Checks if expression is a KEY OP PARAM or PARAM OP KEY, * where KEY is partition key (it could be Var or RelableType) and PARAM is - * whatever. Function returns variable (or RelableType) and param via var_ptr + * whatever. Function returns variable (or RelableType) and param via var_ptr * and param_ptr pointers. If partition key isn't in expression then function * returns false. */ static bool -pull_var_param(const WalkerContext *ctx, const OpExpr *expr, Node **var_ptr, Node **param_ptr) +pull_var_param(const WalkerContext *ctx, + const OpExpr *expr, + Node **var_ptr, + Node **param_ptr) { Node *left = linitial(expr->args), *right = lsecond(expr->args); @@ -1244,14 +1434,14 @@ pull_var_param(const WalkerContext *ctx, const OpExpr *expr, Node **var_ptr, Nod (Var *) left : (Var *) ((RelabelType *) left)->arg; - if (v->varattno == ctx->prel->attnum) + if (v->varoattno == ctx->prel->attnum) { *var_ptr = left; *param_ptr = right; return true; } } - + /* ... 
variable is on the right side */ if (IsA(right, Var) || IsA(right, RelabelType)) { @@ -1259,7 +1449,7 @@ pull_var_param(const WalkerContext *ctx, const OpExpr *expr, Node **var_ptr, Nod (Var *) right : (Var *) ((RelabelType *) right)->arg; - if (v->varattno == ctx->prel->attnum) + if (v->varoattno == ctx->prel->attnum) { *var_ptr = right; *param_ptr = left; @@ -1286,7 +1476,9 @@ handle_boolexpr(const BoolExpr *expr, WalkerContext *context) result->paramsel = 1.0; if (expr->boolop == AND_EXPR) - result->rangeset = list_make1_irange(make_irange(0, prel->children_count - 1, false)); + result->rangeset = list_make1_irange(make_irange(0, + PrelLastChild(prel), + false)); else result->rangeset = NIL; @@ -1299,7 +1491,6 @@ handle_boolexpr(const BoolExpr *expr, WalkerContext *context) switch (expr->boolop) { case OR_EXPR: - // finish_least_greatest(arg, context); result->rangeset = irange_list_union(result->rangeset, arg->rangeset); break; case AND_EXPR: @@ -1307,7 +1498,9 @@ handle_boolexpr(const BoolExpr *expr, WalkerContext *context) result->paramsel *= arg->paramsel; break; default: - result->rangeset = list_make1_irange(make_irange(0, prel->children_count - 1, false)); + result->rangeset = list_make1_irange(make_irange(0, + PrelLastChild(prel), + false)); break; } } @@ -1339,7 +1532,6 @@ handle_arrexpr(const ScalarArrayOpExpr *expr, WalkerContext *context) Node *varnode = (Node *) linitial(expr->args); Var *var; Node *arraynode = (Node *) lsecond(expr->args); - int hash; const PartRelationInfo *prel = context->prel; result->orig = (const Node *)expr; @@ -1354,7 +1546,7 @@ handle_arrexpr(const ScalarArrayOpExpr *expr, WalkerContext *context) var = !IsA(varnode, RelabelType) ? (Var *) varnode : (Var *) ((RelabelType *) varnode)->arg; - if (var->varattno != prel->attnum) + if (var->varoattno != prel->attnum) goto handle_arrexpr_return; } else @@ -1371,8 +1563,6 @@ handle_arrexpr(const ScalarArrayOpExpr *expr, WalkerContext *context) Datum *elem_values; bool *elem_nulls; int i; - Datum value; - uint32 int_value; /* Extract values from array */ arrayval = DatumGetArrayTypeP(((Const *) arraynode)->constvalue); @@ -1388,12 +1578,16 @@ handle_arrexpr(const ScalarArrayOpExpr *expr, WalkerContext *context) /* Construct OIDs list */ for (i = 0; i < num_elems; i++) { + Datum value; + uint32 idx; + /* Invoke base hash function for value type */ value = OidFunctionCall1(prel->hash_proc, elem_values[i]); - int_value = DatumGetUInt32(value); - hash = make_hash(int_value, prel->children_count); + idx = hash_to_part_index(DatumGetInt32(value), PrelChildrenCount(prel)); result->rangeset = irange_list_union(result->rangeset, - list_make1_irange(make_irange(hash, hash, true))); + list_make1_irange(make_irange(idx, + idx, + true))); } /* Free resources */ @@ -1407,7 +1601,7 @@ handle_arrexpr(const ScalarArrayOpExpr *expr, WalkerContext *context) result->paramsel = DEFAULT_INEQ_SEL; handle_arrexpr_return: - result->rangeset = list_make1_irange(make_irange(0, prel->children_count - 1, true)); + result->rangeset = list_make1_irange(make_irange(0, PrelLastChild(prel), true)); return result; } @@ -1427,7 +1621,7 @@ set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) * Test any partial indexes of rel for applicability. We must do this * first since partial unique indexes can affect size estimates. 
*/ - check_partial_indexes(root, rel); + check_index_predicates_compat(root, rel); /* Mark rel with estimated output rows, width, etc */ set_baserel_size_estimates(root, rel); @@ -1457,7 +1651,6 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) path = create_seqscan_path(root, rel, required_outer); #endif add_path(rel, path); - // set_pathkeys(root, rel, path); /* Consider index scans */ create_index_paths(root, rel); @@ -1503,7 +1696,7 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte, PathKey *pathkeyAsc, PathKey *pathkeyDesc) { - int parentRTindex = rti; + Index parentRTindex = rti; List *live_childrels = NIL; List *subpaths = NIL; bool subpaths_valid = true; @@ -1520,7 +1713,7 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, foreach(l, root->append_rel_list) { AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); - int childRTindex; + Index childRTindex; RangeTblEntry *childRTE; RelOptInfo *childrel; ListCell *lcp; @@ -1642,7 +1835,8 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, * if we have zero or one live subpath due to constraint exclusion.) */ if (subpaths_valid) - add_path(rel, (Path *) create_append_path(rel, subpaths, NULL)); + add_path(rel, + (Path *) create_append_path_compat(rel, subpaths, NULL, 0)); /* * Also build unparameterized MergeAppend paths based on the collected @@ -1693,7 +1887,7 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, if (subpaths_valid) add_path(rel, (Path *) - create_append_path(rel, subpaths, required_outer)); + create_append_path_compat(rel, subpaths, required_outer, 0)); } } @@ -1776,9 +1970,6 @@ get_cheapest_parameterized_child_path(PlannerInfo *root, RelOptInfo *rel, return cheapest; } - -//--------------------------------------------------------------- - /* * generate_mergeappend_paths * Generate MergeAppend paths for an append relation @@ -1874,13 +2065,15 @@ generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, { Path *path; - path = (Path *) create_append_path(rel, startup_subpaths, NULL); + path = (Path *) create_append_path_compat(rel, startup_subpaths, + NULL, 0); path->pathkeys = pathkeys; add_path(rel, path); if (startup_neq_total) { - path = (Path *) create_append_path(rel, total_subpaths, NULL); + path = (Path *) create_append_path_compat(rel, total_subpaths, + NULL, 0); path->pathkeys = pathkeys; add_path(rel, path); } @@ -1893,15 +2086,15 @@ generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, */ Path *path; - path = (Path *) create_append_path(rel, - list_reverse(startup_subpaths), NULL); + path = (Path *) create_append_path_compat(rel, + list_reverse(startup_subpaths), NULL, 0); path->pathkeys = pathkeys; add_path(rel, path); if (startup_neq_total) { - path = (Path *) create_append_path(rel, - list_reverse(total_subpaths), NULL); + path = (Path *) create_append_path_compat(rel, + list_reverse(total_subpaths), NULL, 0); path->pathkeys = pathkeys; add_path(rel, path); } @@ -1923,3 +2116,21 @@ generate_mergeappend_paths(PlannerInfo *root, RelOptInfo *rel, } } } + +/* + * Get cached PATHMAN_CONFIG relation Oid. + */ +Oid +get_pathman_config_relid(void) +{ + return pathman_config_relid; +} + +/* + * Get cached PATHMAN_CONFIG_PARAMS relation Oid. 
+ */ +Oid +get_pathman_config_params_relid(void) +{ + return pathman_config_params_relid; +} diff --git a/contrib/pg_pathman/src/pl_funcs.c b/contrib/pg_pathman/src/pl_funcs.c index 51e5158cf1..a7c19b2e3b 100644 --- a/contrib/pg_pathman/src/pl_funcs.c +++ b/contrib/pg_pathman/src/pl_funcs.c @@ -8,252 +8,94 @@ * ------------------------------------------------------------------------ */ +#include "init.h" +#include "utils.h" #include "pathman.h" -#include "utils/lsyscache.h" -#include "utils/typcache.h" -#include "utils/array.h" -#include "utils/snapmgr.h" -#include "utils/memutils.h" +#include "relation_info.h" +#include "xact_handling.h" + +#include "access/htup_details.h" #include "access/nbtree.h" -#include "access/xact.h" +#include "catalog/indexing.h" #include "catalog/pg_type.h" -#include "executor/spi.h" -#include "storage/lmgr.h" -#include "utils.h" +#include "commands/tablespace.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/inval.h" +#include "utils/jsonb.h" +#include "utils/snapmgr.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + +/* Function declarations */ -/* declarations */ PG_FUNCTION_INFO_V1( on_partitions_created ); PG_FUNCTION_INFO_V1( on_partitions_updated ); PG_FUNCTION_INFO_V1( on_partitions_removed ); -PG_FUNCTION_INFO_V1( find_or_create_range_partition); -PG_FUNCTION_INFO_V1( get_range_by_idx ); -PG_FUNCTION_INFO_V1( get_partition_range ); -PG_FUNCTION_INFO_V1( acquire_partitions_lock ); -PG_FUNCTION_INFO_V1( release_partitions_lock ); -PG_FUNCTION_INFO_V1( check_overlap ); -PG_FUNCTION_INFO_V1( get_min_range_value ); -PG_FUNCTION_INFO_V1( get_max_range_value ); -PG_FUNCTION_INFO_V1( get_type_hash_func ); -PG_FUNCTION_INFO_V1( get_hash ); +PG_FUNCTION_INFO_V1( get_parent_of_partition_pl ); +PG_FUNCTION_INFO_V1( get_base_type_pl ); +PG_FUNCTION_INFO_V1( get_attribute_type_pl ); +PG_FUNCTION_INFO_V1( get_rel_tablespace_name ); -/* - * Partition-related operation type. - */ -typedef enum -{ - EV_ON_PART_CREATED = 1, - EV_ON_PART_UPDATED, - EV_ON_PART_REMOVED -} part_event_type; +PG_FUNCTION_INFO_V1( show_partition_list_internal ); -/* - * We have to reset shared memory cache each time a transaction - * containing a partitioning-related operation has been rollbacked, - * hence we need to pass a partitioned table's Oid & some other stuff. - * - * Note: 'relname' cannot be fetched within - * Xact callbacks, so we have to store it here. 
- */ -typedef struct part_abort_arg part_abort_arg; +PG_FUNCTION_INFO_V1( build_update_trigger_func_name ); +PG_FUNCTION_INFO_V1( build_update_trigger_name ); +PG_FUNCTION_INFO_V1( build_check_constraint_name_attnum ); +PG_FUNCTION_INFO_V1( build_check_constraint_name_attname ); -struct part_abort_arg -{ - Oid partitioned_table_relid; - char *relname; +PG_FUNCTION_INFO_V1( is_date_type ); +PG_FUNCTION_INFO_V1( is_attribute_nullable ); - bool is_subxact; /* needed for correct callback removal */ - SubTransactionId subxact_id; /* necessary for detecting specific subxact */ - part_abort_arg *xact_cb_arg; /* points to the parent Xact's arg */ +PG_FUNCTION_INFO_V1( add_to_pathman_config ); +PG_FUNCTION_INFO_V1( invalidate_relcache ); - part_event_type event; /* created | updated | removed partitions */ +PG_FUNCTION_INFO_V1( lock_partitioned_relation ); +PG_FUNCTION_INFO_V1( prevent_relation_modification ); - bool expired; /* set by (Sub)Xact when a job is done */ -}; +PG_FUNCTION_INFO_V1( validate_on_part_init_callback_pl ); +PG_FUNCTION_INFO_V1( invoke_on_partition_created_callback ); +PG_FUNCTION_INFO_V1( debug_capture ); -static part_abort_arg * make_part_abort_arg(Oid partitioned_table, - part_event_type event, - bool is_subxact, - part_abort_arg *xact_cb_arg); - -static void handle_part_event_cancellation(const part_abort_arg *arg); -static void on_xact_abort_callback(XactEvent event, void *arg); -static void on_subxact_abort_callback(SubXactEvent event, SubTransactionId mySubid, - SubTransactionId parentSubid, void *arg); - -static void remove_on_xact_abort_callbacks(void *arg); -static void add_on_xact_abort_callbacks(Oid partitioned_table, part_event_type event); - -static void on_partitions_created_internal(Oid partitioned_table, bool add_callbacks); -static void on_partitions_updated_internal(Oid partitioned_table, bool add_callbacks); -static void on_partitions_removed_internal(Oid partitioned_table, bool add_callbacks); - - -/* Construct part_abort_arg for callbacks in TopTransactionContext. */ -static part_abort_arg * -make_part_abort_arg(Oid partitioned_table, part_event_type event, - bool is_subxact, part_abort_arg *xact_cb_arg) -{ - part_abort_arg *arg = MemoryContextAlloc(TopTransactionContext, - sizeof(part_abort_arg)); - - const char *relname = get_rel_name(partitioned_table); - - /* Fill in Oid & relation name */ - arg->partitioned_table_relid = partitioned_table; - arg->relname = MemoryContextStrdup(TopTransactionContext, relname); - arg->is_subxact = is_subxact; - arg->subxact_id = GetCurrentSubTransactionId(); /* for SubXact callback */ - arg->xact_cb_arg = xact_cb_arg; - arg->event = event; - arg->expired = false; - - return arg; -} - -/* Revert shared memory cache changes iff xact has been aborted. 
*/ -static void -handle_part_event_cancellation(const part_abort_arg *arg) -{ -#define DO_NOT_USE_CALLBACKS false /* just to clarify intentions */ - - switch (arg->event) - { - case EV_ON_PART_CREATED: - { - elog(WARNING, "Partitioning of table '%s' has been aborted, " - "removing partitions from pg_pathman's cache", - arg->relname); - - on_partitions_removed_internal(arg->partitioned_table_relid, - DO_NOT_USE_CALLBACKS); - } - break; - - case EV_ON_PART_UPDATED: - { - elog(WARNING, "All changes in partitioned table " - "'%s' will be discarded", - arg->relname); - - on_partitions_updated_internal(arg->partitioned_table_relid, - DO_NOT_USE_CALLBACKS); - } - break; - - case EV_ON_PART_REMOVED: - { - elog(WARNING, "All changes in partitioned table " - "'%s' will be discarded", - arg->relname); - - on_partitions_created_internal(arg->partitioned_table_relid, - DO_NOT_USE_CALLBACKS); - } - break; - - default: - elog(ERROR, "Unknown event spotted in xact callback"); - } -} /* - * Add & remove xact callbacks + * User context for function show_partition_list_internal(). */ - -static void -remove_on_xact_abort_callbacks(void *arg) +typedef struct { - part_abort_arg *parg = (part_abort_arg *) arg; - - elog(DEBUG2, "remove_on_xact_abort_callbacks() " - "[is_subxact = %s, relname = '%s', event = %u] " - "triggered for relation %u", - (parg->is_subxact ? "true" : "false"), parg->relname, - parg->event, parg->partitioned_table_relid); + Relation pathman_config; + HeapScanDesc pathman_config_scan; + Snapshot snapshot; - /* Is this a SubXact callback or not? */ - if (!parg->is_subxact) - UnregisterXactCallback(on_xact_abort_callback, arg); - else - UnregisterSubXactCallback(on_subxact_abort_callback, arg); + const PartRelationInfo *current_prel; /* selected PartRelationInfo */ - pfree(arg); -} + uint32 child_number; /* child we're looking at */ +} show_partition_list_cxt; -static void -add_on_xact_abort_callbacks(Oid partitioned_table, part_event_type event) -{ - part_abort_arg *xact_cb_arg = make_part_abort_arg(partitioned_table, - event, false, NULL); - RegisterXactCallback(on_xact_abort_callback, (void *) xact_cb_arg); - execute_on_xact_mcxt_reset(TopTransactionContext, - remove_on_xact_abort_callbacks, - xact_cb_arg); +static void on_partitions_created_internal(Oid partitioned_table, bool add_callbacks); +static void on_partitions_updated_internal(Oid partitioned_table, bool add_callbacks); +static void on_partitions_removed_internal(Oid partitioned_table, bool add_callbacks); - /* Register SubXact callback if necessary */ - if (IsSubTransaction()) - { - /* - * SubXact callback's arg contains a pointer to the parent - * Xact callback's arg. This will allow it to 'expire' both - * args and to prevent Xact's callback from doing anything - */ - void *subxact_cb_arg = make_part_abort_arg(partitioned_table, event, - true, xact_cb_arg); - - RegisterSubXactCallback(on_subxact_abort_callback, subxact_cb_arg); - execute_on_xact_mcxt_reset(CurTransactionContext, - remove_on_xact_abort_callbacks, - subxact_cb_arg); - } -} /* - * Xact & SubXact callbacks + * Extracted common check. 
*/ - -static void -on_xact_abort_callback(XactEvent event, void *arg) +static bool +check_relation_exists(Oid relid) { - part_abort_arg *parg = (part_abort_arg *) arg; - - /* Check that this is an aborted Xact & action has not expired yet */ - if ((event == XACT_EVENT_ABORT || event == XACT_EVENT_PARALLEL_ABORT) && - !parg->expired) - { - handle_part_event_cancellation(parg); - - /* Set expiration flag */ - parg->expired = true; - } + return get_rel_type_id(relid) != InvalidOid; } -static void -on_subxact_abort_callback(SubXactEvent event, SubTransactionId mySubid, - SubTransactionId parentSubid, void *arg) -{ - part_abort_arg *parg = (part_abort_arg *) arg; - - Assert(parg->subxact_id != InvalidSubTransactionId); - - /* Check if this is an aborted SubXact we've been waiting for */ - if (event == SUBXACT_EVENT_ABORT_SUB && - mySubid <= parg->subxact_id && !parg->expired) - { - handle_part_event_cancellation(parg); - - /* Now set expiration flags to disable Xact callback */ - parg->xact_cb_arg->expired = true; - parg->expired = true; - } -} /* - * Callbacks + * ---------------------------- + * Partition events callbacks + * ---------------------------- */ static void @@ -262,34 +104,18 @@ on_partitions_created_internal(Oid partitioned_table, bool add_callbacks) elog(DEBUG2, "on_partitions_created() [add_callbacks = %s] " "triggered for relation %u", (add_callbacks ? "true" : "false"), partitioned_table); - - LWLockAcquire(pmstate->load_config_lock, LW_EXCLUSIVE); - load_relations(false); - LWLockRelease(pmstate->load_config_lock); - - /* Register hooks that will clear shmem cache if needed */ - if (add_callbacks) - add_on_xact_abort_callbacks(partitioned_table, EV_ON_PART_CREATED); } static void on_partitions_updated_internal(Oid partitioned_table, bool add_callbacks) { + bool entry_found; + elog(DEBUG2, "on_partitions_updated() [add_callbacks = %s] " "triggered for relation %u", (add_callbacks ? "true" : "false"), partitioned_table); - if (get_pathman_relation_info(partitioned_table, NULL)) - { - LWLockAcquire(pmstate->load_config_lock, LW_EXCLUSIVE); - remove_relation_info(partitioned_table); - load_relations(false); - LWLockRelease(pmstate->load_config_lock); - } - - /* Register hooks that will clear shmem cache if needed */ - if (add_callbacks) - add_on_xact_abort_callbacks(partitioned_table, EV_ON_PART_UPDATED); + invalidate_pathman_relation_info(partitioned_table, &entry_found); } static void @@ -298,19 +124,8 @@ on_partitions_removed_internal(Oid partitioned_table, bool add_callbacks) elog(DEBUG2, "on_partitions_removed() [add_callbacks = %s] " "triggered for relation %u", (add_callbacks ? "true" : "false"), partitioned_table); - - LWLockAcquire(pmstate->load_config_lock, LW_EXCLUSIVE); - remove_relation_info(partitioned_table); - LWLockRelease(pmstate->load_config_lock); - - /* Register hooks that will clear shmem cache if needed */ - if (add_callbacks) - add_on_xact_abort_callbacks(partitioned_table, EV_ON_PART_REMOVED); } -/* - * Thin layer between pure c and pl/PgSQL - */ Datum on_partitions_created(PG_FUNCTION_ARGS) @@ -333,322 +148,736 @@ on_partitions_removed(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } + /* - * Returns partition oid for specified parent relid and value. - * In case when partition isn't exist try to create one. + * ------------------------ + * Various useful getters + * ------------------------ + */ + +/* + * Get parent of a specified partition. 
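+ * Raises an error if the relation is not a partition known to pg_pathman.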
*/ Datum -find_or_create_range_partition(PG_FUNCTION_ARGS) +get_parent_of_partition_pl(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); - Datum value = PG_GETARG_DATUM(1); - Oid value_type = get_fn_expr_argtype(fcinfo->flinfo, 1); - int pos; - bool found; - RangeRelation *rangerel; - RangeEntry *ranges; - TypeCacheEntry *tce; - PartRelationInfo *prel; - Oid cmp_proc_oid; - FmgrInfo cmp_func; - - tce = lookup_type_cache(value_type, - TYPECACHE_EQ_OPR | TYPECACHE_LT_OPR | TYPECACHE_GT_OPR | - TYPECACHE_CMP_PROC | TYPECACHE_CMP_PROC_FINFO); - - prel = get_pathman_relation_info(relid, NULL); - rangerel = get_pathman_range_relation(relid, NULL); - - if (!prel || !rangerel) + Oid partition = PG_GETARG_OID(0); + PartParentSearch parent_search; + Oid parent; + + /* Fetch parent & write down search status */ + parent = get_parent_of_partition(partition, &parent_search); + + /* We MUST be sure :) */ + Assert(parent_search != PPS_NOT_SURE); + + /* It must be parent known by pg_pathman */ + if (parent_search == PPS_ENTRY_PART_PARENT) + PG_RETURN_OID(parent); + else + { + elog(ERROR, "\"%s\" is not a partition", + get_rel_name_or_relid(partition)); + PG_RETURN_NULL(); + } +} - cmp_proc_oid = get_opfamily_proc(tce->btree_opf, - value_type, - prel->atttype, - BTORDER_PROC); - fmgr_info(cmp_proc_oid, &cmp_func); +/* + * Extract basic type of a domain. + */ +Datum +get_base_type_pl(PG_FUNCTION_ARGS) +{ + PG_RETURN_OID(getBaseType(PG_GETARG_OID(0))); +} - ranges = dsm_array_get_pointer(&rangerel->ranges); - pos = range_binary_search(rangerel, &cmp_func, value, &found); +/* + * Get type (as REGTYPE) of a given attribute. + */ +Datum +get_attribute_type_pl(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + text *attname = PG_GETARG_TEXT_P(1); + Oid result; + HeapTuple tp; + + /* NOTE: for now it's the most efficient way */ + tp = SearchSysCacheAttName(relid, text_to_cstring(attname)); + if (HeapTupleIsValid(tp)) + { + Form_pg_attribute att_tup = (Form_pg_attribute) GETSTRUCT(tp); + result = att_tup->atttypid; + ReleaseSysCache(tp); + + PG_RETURN_OID(result); + } + else + elog(ERROR, "Cannot find type name for attribute \"%s\" " + "of relation \"%s\"", + text_to_cstring(attname), get_rel_name_or_relid(relid)); + + PG_RETURN_NULL(); /* keep compiler happy */ +} + +/* + * Return tablespace name for specified relation + */ +Datum +get_rel_tablespace_name(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + Oid tablespace_id; + char *result; + + tablespace_id = get_rel_tablespace(relid); + + /* If tablespace id is InvalidOid then use the default tablespace */ + if (!OidIsValid(tablespace_id)) + { + tablespace_id = GetDefaultTablespace(get_rel_persistence(relid)); + + /* If tablespace is still invalid then use database's default */ + if (!OidIsValid(tablespace_id)) + tablespace_id = MyDatabaseTableSpace; + } + + result = get_tablespace_name(tablespace_id); + PG_RETURN_TEXT_P(cstring_to_text(result)); +} + + +/* + * ---------------------- + * Common purpose VIEWs + * ---------------------- + */ + +/* + * List all existing partitions and their parents. + */ +Datum +show_partition_list_internal(PG_FUNCTION_ARGS) +{ + show_partition_list_cxt *usercxt; + FuncCallContext *funccxt; /* - * If found then just return oid. Else create new partitions + * Initialize tuple descriptor & function call context. 
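+	 * This branch is executed only once, on the first call of the SRF.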
*/ - if (found) - PG_RETURN_OID(ranges[pos].child_oid); - /* - * If not found and value is between first and last partitions - */ - if (!found && pos >= 0) - PG_RETURN_NULL(); - else + if (SRF_IS_FIRSTCALL()) + { + TupleDesc tupdesc; + MemoryContext old_mcxt; + + funccxt = SRF_FIRSTCALL_INIT(); + + old_mcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx); + + usercxt = (show_partition_list_cxt *) palloc(sizeof(show_partition_list_cxt)); + + /* Open PATHMAN_CONFIG with latest snapshot available */ + usercxt->pathman_config = heap_open(get_pathman_config_relid(), + AccessShareLock); + usercxt->snapshot = RegisterSnapshot(GetLatestSnapshot()); + usercxt->pathman_config_scan = heap_beginscan(usercxt->pathman_config, + usercxt->snapshot, 0, NULL); + + usercxt->current_prel = NULL; + + /* Create tuple descriptor */ + tupdesc = CreateTemplateTupleDesc(Natts_pathman_partition_list, false); + + TupleDescInitEntry(tupdesc, Anum_pathman_pl_parent, + "parent", REGCLASSOID, -1, 0); + TupleDescInitEntry(tupdesc, Anum_pathman_pl_partition, + "partition", REGCLASSOID, -1, 0); + TupleDescInitEntry(tupdesc, Anum_pathman_pl_parttype, + "parttype", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, Anum_pathman_pl_partattr, + "partattr", TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, Anum_pathman_pl_range_min, + "range_min", TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, Anum_pathman_pl_range_max, + "range_max", TEXTOID, -1, 0); + + funccxt->tuple_desc = BlessTupleDesc(tupdesc); + funccxt->user_fctx = (void *) usercxt; + + MemoryContextSwitchTo(old_mcxt); + } + + funccxt = SRF_PERCALL_SETUP(); + usercxt = (show_partition_list_cxt *) funccxt->user_fctx; + + /* Iterate through pathman cache */ + for(;;) { - Oid child_oid; - bool crashed = false; + const PartRelationInfo *prel; + HeapTuple htup; + Datum values[Natts_pathman_partition_list]; + bool isnull[Natts_pathman_partition_list] = { 0 }; + char *partattr_cstr; + + /* Fetch next PartRelationInfo if needed */ + if (usercxt->current_prel == NULL) + { + HeapTuple pathman_config_htup; + Datum parent_table; + bool parent_table_isnull; + Oid parent_table_oid; + + pathman_config_htup = heap_getnext(usercxt->pathman_config_scan, + ForwardScanDirection); + if (!HeapTupleIsValid(pathman_config_htup)) + break; - /* Lock config before appending new partitions */ - LWLockAcquire(pmstate->load_config_lock, LW_EXCLUSIVE); + parent_table = heap_getattr(pathman_config_htup, + Anum_pathman_config_partrel, + RelationGetDescr(usercxt->pathman_config), + &parent_table_isnull); - /* Restrict concurrent partition creation */ - LWLockAcquire(pmstate->edit_partitions_lock, LW_EXCLUSIVE); + Assert(parent_table_isnull == false); + parent_table_oid = DatumGetObjectId(parent_table); - /* - * Check if someone else has already created partition. 
- */ - ranges = dsm_array_get_pointer(&rangerel->ranges); - pos = range_binary_search(rangerel, &cmp_func, value, &found); - if (found) + usercxt->current_prel = get_pathman_relation_info(parent_table_oid); + if (usercxt->current_prel == NULL) + continue; + + usercxt->child_number = 0; + } + + /* Alias to 'usercxt->current_prel' */ + prel = usercxt->current_prel; + + /* If we've run out of partitions, switch to the next 'prel' */ + if (usercxt->child_number >= PrelChildrenCount(prel)) { - LWLockRelease(pmstate->edit_partitions_lock); - LWLockRelease(pmstate->load_config_lock); - PG_RETURN_OID(ranges[pos].child_oid); + usercxt->current_prel = NULL; + usercxt->child_number = 0; + + continue; } - /* Start background worker to create new partitions */ - child_oid = create_partitions_bg_worker(relid, value, value_type, &crashed); + partattr_cstr = get_attname(PrelParentRelid(prel), prel->attnum); + if (!partattr_cstr) + { + /* Parent does not exist, go to the next 'prel' */ + usercxt->current_prel = NULL; + continue; + } - /* Release locks */ - if (!crashed) + /* Fill in common values */ + values[Anum_pathman_pl_parent - 1] = PrelParentRelid(prel); + values[Anum_pathman_pl_parttype - 1] = prel->parttype; + values[Anum_pathman_pl_partattr - 1] = CStringGetTextDatum(partattr_cstr); + + switch (prel->parttype) { - LWLockRelease(pmstate->edit_partitions_lock); - LWLockRelease(pmstate->load_config_lock); + case PT_HASH: + { + Oid *children = PrelGetChildrenArray(prel), + child_oid = children[usercxt->child_number]; + + values[Anum_pathman_pl_partition - 1] = child_oid; + isnull[Anum_pathman_pl_range_min - 1] = true; + isnull[Anum_pathman_pl_range_max - 1] = true; + } + break; + + case PT_RANGE: + { + RangeEntry *re; + Datum rmin, + rmax; + + re = &PrelGetRangesArray(prel)[usercxt->child_number]; + + rmin = CStringGetTextDatum(datum_to_cstring(re->min, + prel->atttype)); + rmax = CStringGetTextDatum(datum_to_cstring(re->max, + prel->atttype)); + + values[Anum_pathman_pl_partition - 1] = re->child_oid; + values[Anum_pathman_pl_range_min - 1] = rmin; + values[Anum_pathman_pl_range_max - 1] = rmax; + } + break; + + default: + elog(ERROR, "Unknown partitioning type %u", prel->parttype); } - /* Repeat binary search */ - (void) range_binary_search(rangerel, &cmp_func, value, &found); - if (found) - PG_RETURN_OID(child_oid); + /* Switch to the next child */ + usercxt->child_number++; + + /* Form output tuple */ + htup = heap_form_tuple(funccxt->tuple_desc, values, isnull); + + SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(htup)); } - PG_RETURN_NULL(); + /* Clean resources */ + heap_endscan(usercxt->pathman_config_scan); + UnregisterSnapshot(usercxt->snapshot); + heap_close(usercxt->pathman_config, AccessShareLock); + + SRF_RETURN_DONE(funccxt); } + /* - * Returns range (min, max) as output parameters - * - * first argument is the parent relid - * second is the partition relid - * third and forth are MIN and MAX output parameters + * -------- + * Traits + * -------- */ + Datum -get_partition_range(PG_FUNCTION_ARGS) +is_date_type(PG_FUNCTION_ARGS) { - Oid parent_oid = PG_GETARG_OID(0); - Oid child_oid = PG_GETARG_OID(1); - int nelems = 2; - int i; - bool found = false; - Datum *elems; - PartRelationInfo *prel; - RangeRelation *rangerel; - RangeEntry *ranges; - TypeCacheEntry *tce; - ArrayType *arr; - - prel = get_pathman_relation_info(parent_oid, NULL); - - rangerel = get_pathman_range_relation(parent_oid, NULL); - - if (!prel || !rangerel) - PG_RETURN_NULL(); - - ranges = 
dsm_array_get_pointer(&rangerel->ranges); - tce = lookup_type_cache(prel->atttype, 0); + PG_RETURN_BOOL(is_date_type_internal(PG_GETARG_OID(0))); +} - /* Looking for specified partition */ - for(i=0; iranges.length; i++) - if (ranges[i].child_oid == child_oid) - { - found = true; - break; - } +Datum +is_attribute_nullable(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + text *attname = PG_GETARG_TEXT_P(1); + bool result = true; + HeapTuple tp; - if (found) + tp = SearchSysCacheAttName(relid, text_to_cstring(attname)); + if (HeapTupleIsValid(tp)) { - bool byVal = rangerel->by_val; - - elems = palloc(nelems * sizeof(Datum)); - elems[0] = PATHMAN_GET_DATUM(ranges[i].min, byVal); - elems[1] = PATHMAN_GET_DATUM(ranges[i].max, byVal); - - arr = construct_array(elems, nelems, prel->atttype, - tce->typlen, tce->typbyval, tce->typalign); - PG_RETURN_ARRAYTYPE_P(arr); + Form_pg_attribute att_tup = (Form_pg_attribute) GETSTRUCT(tp); + result = !att_tup->attnotnull; + ReleaseSysCache(tp); } + else + elog(ERROR, "Cannot find type name for attribute \"%s\" " + "of relation \"%s\"", + text_to_cstring(attname), get_rel_name_or_relid(relid)); - PG_RETURN_NULL(); + PG_RETURN_BOOL(result); /* keep compiler happy */ } /* - * Returns N-th range (in form of array) - * - * First argument is the parent relid. - * Second argument is the index of the range (if it is negative then the last - * range will be returned). + * ------------------------ + * Useful string builders + * ------------------------ */ + Datum -get_range_by_idx(PG_FUNCTION_ARGS) +build_update_trigger_func_name(PG_FUNCTION_ARGS) { - Oid parent_oid = PG_GETARG_OID(0); - int idx = PG_GETARG_INT32(1); - PartRelationInfo *prel; - RangeRelation *rangerel; - RangeEntry *ranges; - RangeEntry *re; - Datum *elems; - TypeCacheEntry *tce; + Oid relid = PG_GETARG_OID(0), + nspid; + const char *result; - prel = get_pathman_relation_info(parent_oid, NULL); + /* Check that relation exists */ + if (!check_relation_exists(relid)) + elog(ERROR, "Invalid relation %u", relid); - rangerel = get_pathman_range_relation(parent_oid, NULL); + nspid = get_rel_namespace(relid); + result = psprintf("%s.%s", + quote_identifier(get_namespace_name(nspid)), + quote_identifier(psprintf("%s_upd_trig_func", + get_rel_name(relid)))); - if (!prel || !rangerel || idx >= (int)rangerel->ranges.length) - PG_RETURN_NULL(); + PG_RETURN_TEXT_P(cstring_to_text(result)); +} - tce = lookup_type_cache(prel->atttype, 0); - ranges = dsm_array_get_pointer(&rangerel->ranges); - if (idx >= 0) - re = &ranges[idx]; - else - re = &ranges[rangerel->ranges.length - 1]; +Datum +build_update_trigger_name(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + const char *result; /* trigger's name can't be qualified */ - elems = palloc(2 * sizeof(Datum)); - elems[0] = PATHMAN_GET_DATUM(re->min, rangerel->by_val); - elems[1] = PATHMAN_GET_DATUM(re->max, rangerel->by_val); + /* Check that relation exists */ + if (!check_relation_exists(relid)) + elog(ERROR, "Invalid relation %u", relid); - PG_RETURN_ARRAYTYPE_P( - construct_array(elems, 2, prel->atttype, - tce->typlen, tce->typbyval, tce->typalign)); + result = quote_identifier(psprintf("%s_upd_trig", get_rel_name(relid))); + + PG_RETURN_TEXT_P(cstring_to_text(result)); } -/* - * Returns min value of the first range for relation - */ Datum -get_min_range_value(PG_FUNCTION_ARGS) +build_check_constraint_name_attnum(PG_FUNCTION_ARGS) { - Oid parent_oid = PG_GETARG_OID(0); - PartRelationInfo *prel; - RangeRelation *rangerel; - RangeEntry *ranges; + Oid relid = 
PG_GETARG_OID(0); + AttrNumber attnum = PG_GETARG_INT16(1); + const char *result; - prel = get_pathman_relation_info(parent_oid, NULL); - rangerel = get_pathman_range_relation(parent_oid, NULL); + if (!check_relation_exists(relid)) + elog(ERROR, "Invalid relation %u", relid); - if (!prel || !rangerel || prel->parttype != PT_RANGE || rangerel->ranges.length == 0) - PG_RETURN_NULL(); + /* We explicitly do not support system attributes */ + if (attnum == InvalidAttrNumber || attnum < 0) + elog(ERROR, "Cannot build check constraint name: " + "invalid attribute number %i", attnum); + + result = build_check_constraint_name_internal(relid, attnum); - ranges = dsm_array_get_pointer(&rangerel->ranges); - PG_RETURN_DATUM(PATHMAN_GET_DATUM(ranges[0].min, rangerel->by_val)); + PG_RETURN_TEXT_P(cstring_to_text(quote_identifier(result))); } -/* - * Returns max value of the last range for relation - */ Datum -get_max_range_value(PG_FUNCTION_ARGS) +build_check_constraint_name_attname(PG_FUNCTION_ARGS) { - Oid parent_oid = PG_GETARG_OID(0); - PartRelationInfo *prel; - RangeRelation *rangerel; - RangeEntry *ranges; + Oid relid = PG_GETARG_OID(0); + text *attname = PG_GETARG_TEXT_P(1); + AttrNumber attnum = get_attnum(relid, text_to_cstring(attname)); + const char *result; - prel = get_pathman_relation_info(parent_oid, NULL); - rangerel = get_pathman_range_relation(parent_oid, NULL); + if (!check_relation_exists(relid)) + elog(ERROR, "Invalid relation %u", relid); - if (!prel || !rangerel || prel->parttype != PT_RANGE || rangerel->ranges.length == 0) - PG_RETURN_NULL(); + if (attnum == InvalidAttrNumber) + elog(ERROR, "Relation \"%s\" has no column '%s'", + get_rel_name_or_relid(relid), text_to_cstring(attname)); + + result = build_check_constraint_name_internal(relid, attnum); - ranges = dsm_array_get_pointer(&rangerel->ranges); - PG_RETURN_DATUM(PATHMAN_GET_DATUM(ranges[rangerel->ranges.length-1].max, rangerel->by_val)); + PG_RETURN_TEXT_P(cstring_to_text(quote_identifier(result))); } + +/* + * ------------------------ + * Cache & config updates + * ------------------------ + */ + /* - * Checks if range overlaps with existing partitions. - * Returns TRUE if overlaps and FALSE otherwise. + * Try to add previously partitioned table to PATHMAN_CONFIG. 
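+ * On success the relation's cache entry is refreshed; if the refresh fails,
+ * pg_pathman's init state is restored and the error is re-thrown.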
*/ Datum -check_overlap(PG_FUNCTION_ARGS) +add_to_pathman_config(PG_FUNCTION_ARGS) { - Oid partitioned_table = PG_GETARG_OID(0); + Oid relid; + text *attname; + PartType parttype; - Datum p1 = PG_GETARG_DATUM(1), - p2 = PG_GETARG_DATUM(2); + Relation pathman_config; + Datum values[Natts_pathman_config]; + bool isnull[Natts_pathman_config]; + HeapTuple htup; + CatalogIndexState indstate; - Oid p1_type = get_fn_expr_argtype(fcinfo->flinfo, 1), - p2_type = get_fn_expr_argtype(fcinfo->flinfo, 2); + PathmanInitState init_state; + MemoryContext old_mcxt = CurrentMemoryContext; - FmgrInfo *cmp_func_1, - *cmp_func_2; + if (PG_ARGISNULL(0)) + elog(ERROR, "parent_relid should not be null"); - PartRelationInfo *prel; - RangeRelation *rangerel; - RangeEntry *ranges; - int i; - bool byVal; + if (PG_ARGISNULL(1)) + elog(ERROR, "attname should not be null"); - prel = get_pathman_relation_info(partitioned_table, NULL); - rangerel = get_pathman_range_relation(partitioned_table, NULL); + /* Read parameters */ + relid = PG_GETARG_OID(0); + attname = PG_GETARG_TEXT_P(1); - if (!prel || !rangerel || prel->parttype != PT_RANGE) - PG_RETURN_NULL(); + /* Check that relation exists */ + if (!check_relation_exists(relid)) + elog(ERROR, "Invalid relation %u", relid); + + if (get_attnum(relid, text_to_cstring(attname)) == InvalidAttrNumber) + elog(ERROR, "Relation \"%s\" has no column '%s'", + get_rel_name_or_relid(relid), text_to_cstring(attname)); + + /* Select partitioning type using 'range_interval' */ + parttype = PG_ARGISNULL(2) ? PT_HASH : PT_RANGE; + + /* + * Initialize columns (partrel, attname, parttype, range_interval). + */ + values[Anum_pathman_config_partrel - 1] = ObjectIdGetDatum(relid); + isnull[Anum_pathman_config_partrel - 1] = false; + + values[Anum_pathman_config_attname - 1] = PointerGetDatum(attname); + isnull[Anum_pathman_config_attname - 1] = false; - /* Get comparison functions */ - cmp_func_1 = get_cmp_func(p1_type, prel->atttype); - cmp_func_2 = get_cmp_func(p2_type, prel->atttype); + values[Anum_pathman_config_parttype - 1] = Int32GetDatum(parttype); + isnull[Anum_pathman_config_parttype - 1] = false; - byVal = rangerel->by_val; - ranges = (RangeEntry *) dsm_array_get_pointer(&rangerel->ranges); - for (i = 0; i < rangerel->ranges.length; i++) + values[Anum_pathman_config_range_interval - 1] = PG_GETARG_DATUM(2); + isnull[Anum_pathman_config_range_interval - 1] = PG_ARGISNULL(2); + + /* Insert new row into PATHMAN_CONFIG */ + pathman_config = heap_open(get_pathman_config_relid(), RowExclusiveLock); + htup = heap_form_tuple(RelationGetDescr(pathman_config), values, isnull); + simple_heap_insert(pathman_config, htup); + indstate = CatalogOpenIndexes(pathman_config); + CatalogIndexInsert(indstate, htup); + CatalogCloseIndexes(indstate); + heap_close(pathman_config, RowExclusiveLock); + + /* Now try to create a PartRelationInfo */ + PG_TRY(); { - int c1 = FunctionCall2(cmp_func_1, p1, - PATHMAN_GET_DATUM(ranges[i].max, byVal)); - int c2 = FunctionCall2(cmp_func_2, p2, - PATHMAN_GET_DATUM(ranges[i].min, byVal)); + /* Some flags might change during refresh attempt */ + save_pathman_init_state(&init_state); - if (c1 < 0 && c2 > 0) - PG_RETURN_BOOL(true); + refresh_pathman_relation_info(relid, parttype, text_to_cstring(attname)); } + PG_CATCH(); + { + ErrorData *edata; + + /* Switch to the original context & copy edata */ + MemoryContextSwitchTo(old_mcxt); + edata = CopyErrorData(); + FlushErrorState(); + + /* We have to restore all changed flags */ + restore_pathman_init_state(&init_state); + + 
/* Show error message */ + elog(ERROR, "%s", edata->message); + + FreeErrorData(edata); + } + PG_END_TRY(); + + PG_RETURN_BOOL(true); +} + + +/* + * Invalidate relcache for a specified relation. + */ +Datum +invalidate_relcache(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); - PG_RETURN_BOOL(false); + if (check_relation_exists(relid)) + CacheInvalidateRelcacheByRelid(relid); + + PG_RETURN_VOID(); } + /* - * Acquire partitions lock + * -------------------------- + * Special locking routines + * -------------------------- + */ + +/* + * Acquire appropriate lock on a partitioned relation. */ Datum -acquire_partitions_lock(PG_FUNCTION_ARGS) +lock_partitioned_relation(PG_FUNCTION_ARGS) { - LWLockAcquire(pmstate->edit_partitions_lock, LW_EXCLUSIVE); - PG_RETURN_NULL(); + Oid relid = PG_GETARG_OID(0); + + /* Lock partitioned relation till transaction's end */ + xact_lock_partitioned_rel(relid, false); + + PG_RETURN_VOID(); } +/* + * Lock relation exclusively & check for current isolation level. + */ Datum -release_partitions_lock(PG_FUNCTION_ARGS) +prevent_relation_modification(PG_FUNCTION_ARGS) { - LWLockRelease(pmstate->edit_partitions_lock); - PG_RETURN_NULL(); + Oid relid = PG_GETARG_OID(0); + + /* + * Check that isolation level is READ COMMITTED. + * Else we won't be able to see new rows + * which could slip through locks. + */ + if (!xact_is_level_read_committed()) + ereport(ERROR, + (errmsg("Cannot perform blocking partitioning operation"), + errdetail("Expected READ COMMITTED isolation level"))); + + /* + * Check if table is being modified + * concurrently in a separate transaction. + */ + if (!xact_lock_rel_exclusive(relid, true)) + ereport(ERROR, + (errmsg("Cannot perform blocking partitioning operation"), + errdetail("Table \"%s\" is being modified concurrently", + get_rel_name_or_relid(relid)))); + + PG_RETURN_VOID(); } + +/* + * ------------------------------------------- + * User-defined partition creation callbacks + * ------------------------------------------- + */ + /* - * Returns hash function OID for specified type + * Checks that callback function meets specific requirements. + * It must have the only JSONB argument and BOOL return type. */ Datum -get_type_hash_func(PG_FUNCTION_ARGS) +validate_on_part_init_callback_pl(PG_FUNCTION_ARGS) { - TypeCacheEntry *tce; - Oid type_oid = PG_GETARG_OID(0); + validate_on_part_init_cb(PG_GETARG_OID(0), true); - tce = lookup_type_cache(type_oid, TYPECACHE_HASH_PROC); - PG_RETURN_OID(tce->hash_proc); + PG_RETURN_VOID(); } +/* + * Builds JSONB object containing new partition parameters + * and invokes the callback. 
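+ * The callback receives a single JSONB argument; for RANGE partitions the
+ * object additionally carries 'range_min' and 'range_max'.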
+ */ Datum -get_hash(PG_FUNCTION_ARGS) +invoke_on_partition_created_callback(PG_FUNCTION_ARGS) { - uint32 value = PG_GETARG_UINT32(0), - part_count = PG_GETARG_UINT32(1); +#define JSB_INIT_VAL(value, val_type, val_cstring) \ + do { \ + (value)->type = jbvString; \ + (value)->val.string.len = strlen(val_cstring); \ + (value)->val.string.val = val_cstring; \ + pushJsonbValue(&jsonb_state, val_type, (value)); \ + } while (0) + +#define ARG_PARENT 0 /* parent table */ +#define ARG_CHILD 1 /* partition */ +#define ARG_CALLBACK 2 /* callback to be invoked */ +#define ARG_RANGE_START 3 /* start_value */ +#define ARG_RANGE_END 4 /* end_value */ + + Oid parent_oid = PG_GETARG_OID(ARG_PARENT), + partition_oid = PG_GETARG_OID(ARG_CHILD); + PartType part_type; + + Oid cb_oid = PG_GETARG_OID(ARG_CALLBACK); + FmgrInfo cb_flinfo; + FunctionCallInfoData cb_fcinfo; + + JsonbParseState *jsonb_state = NULL; + JsonbValue *result, + key, + val; + + /* If there's no callback function specified, we're done */ + if (cb_oid == InvalidOid) + PG_RETURN_VOID(); + + if (PG_ARGISNULL(ARG_PARENT)) + elog(ERROR, "parent_relid should not be null"); + + if (PG_ARGISNULL(ARG_CHILD)) + elog(ERROR, "partition should not be null"); + + switch (PG_NARGS()) + { + case 3: + part_type = PT_HASH; + break; + + case 5: + { + if (PG_ARGISNULL(ARG_RANGE_START) || PG_ARGISNULL(ARG_RANGE_START)) + elog(ERROR, "both bounds must be provided for RANGE partition"); + + part_type = PT_RANGE; + } + break; + + default: + elog(ERROR, "error in function \"%s\"", + CppAsString(invoke_on_partition_created_callback)); + } + + /* Build JSONB according to partitioning type */ + switch (part_type) + { + case PT_HASH: + { + pushJsonbValue(&jsonb_state, WJB_BEGIN_OBJECT, NULL); + + JSB_INIT_VAL(&key, WJB_KEY, "parent"); + JSB_INIT_VAL(&val, WJB_VALUE, get_rel_name_or_relid(parent_oid)); + JSB_INIT_VAL(&key, WJB_KEY, "partition"); + JSB_INIT_VAL(&val, WJB_VALUE, get_rel_name_or_relid(partition_oid)); + JSB_INIT_VAL(&key, WJB_KEY, "parttype"); + JSB_INIT_VAL(&val, WJB_VALUE, PartTypeToCString(PT_HASH)); + + result = pushJsonbValue(&jsonb_state, WJB_END_OBJECT, NULL); + } + break; + + case PT_RANGE: + { + char *start_value, + *end_value; + Oid type = get_fn_expr_argtype(fcinfo->flinfo, ARG_RANGE_START); + + /* Convert min & max to CSTRING */ + start_value = datum_to_cstring(PG_GETARG_DATUM(ARG_RANGE_START), type); + end_value = datum_to_cstring(PG_GETARG_DATUM(ARG_RANGE_END), type); + + pushJsonbValue(&jsonb_state, WJB_BEGIN_OBJECT, NULL); + + JSB_INIT_VAL(&key, WJB_KEY, "parent"); + JSB_INIT_VAL(&val, WJB_VALUE, get_rel_name_or_relid(parent_oid)); + JSB_INIT_VAL(&key, WJB_KEY, "partition"); + JSB_INIT_VAL(&val, WJB_VALUE, get_rel_name_or_relid(partition_oid)); + JSB_INIT_VAL(&key, WJB_KEY, "parttype"); + JSB_INIT_VAL(&val, WJB_VALUE, PartTypeToCString(PT_RANGE)); + JSB_INIT_VAL(&key, WJB_KEY, "range_min"); + JSB_INIT_VAL(&val, WJB_VALUE, start_value); + JSB_INIT_VAL(&key, WJB_KEY, "range_max"); + JSB_INIT_VAL(&val, WJB_VALUE, end_value); + + result = pushJsonbValue(&jsonb_state, WJB_END_OBJECT, NULL); + } + break; + + default: + elog(ERROR, "Unknown partitioning type %u", part_type); + break; + } + + /* Validate the callback's signature */ + validate_on_part_init_cb(cb_oid, true); + + fmgr_info(cb_oid, &cb_flinfo); + + InitFunctionCallInfoData(cb_fcinfo, &cb_flinfo, 1, InvalidOid, NULL, NULL); + cb_fcinfo.arg[0] = PointerGetDatum(JsonbValueToJsonb(result)); + cb_fcinfo.argnull[0] = false; + + /* Invoke the callback */ + FunctionCallInvoke(&cb_fcinfo); + + 
PG_RETURN_VOID(); +} + + +/* + * ------- + * DEBUG + * ------- + */ + +/* + * NOTE: used for DEBUG, set breakpoint here. + */ +Datum +debug_capture(PG_FUNCTION_ARGS) +{ + static float8 sleep_time = 0; + DirectFunctionCall1(pg_sleep, Float8GetDatum(sleep_time)); + + /* Write something (doesn't really matter) */ + elog(WARNING, "debug_capture [%u]", MyProcPid); - PG_RETURN_UINT32(make_hash(value, part_count)); + PG_RETURN_VOID(); } diff --git a/contrib/pg_pathman/src/pl_hash_funcs.c b/contrib/pg_pathman/src/pl_hash_funcs.c new file mode 100644 index 0000000000..6dc0916fbb --- /dev/null +++ b/contrib/pg_pathman/src/pl_hash_funcs.c @@ -0,0 +1,46 @@ +/* ------------------------------------------------------------------------ + * + * pl_hash_funcs.c + * Utility C functions for stored HASH procedures + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#include "pathman.h" + +#include "utils/typcache.h" + + +/* Function declarations */ + +PG_FUNCTION_INFO_V1( get_type_hash_func ); +PG_FUNCTION_INFO_V1( get_hash_part_idx ); + + +/* + * Returns hash function's OID for a specified type. + */ +Datum +get_type_hash_func(PG_FUNCTION_ARGS) +{ + TypeCacheEntry *tce; + Oid type_oid = PG_GETARG_OID(0); + + tce = lookup_type_cache(type_oid, TYPECACHE_HASH_PROC); + + PG_RETURN_OID(tce->hash_proc); +} + +/* + * Wrapper for hash_to_part_index(). + */ +Datum +get_hash_part_idx(PG_FUNCTION_ARGS) +{ + uint32 value = PG_GETARG_UINT32(0), + part_count = PG_GETARG_UINT32(1); + + PG_RETURN_UINT32(hash_to_part_index(value, part_count)); +} diff --git a/contrib/pg_pathman/src/pl_range_funcs.c b/contrib/pg_pathman/src/pl_range_funcs.c new file mode 100644 index 0000000000..bd71ce0979 --- /dev/null +++ b/contrib/pg_pathman/src/pl_range_funcs.c @@ -0,0 +1,278 @@ +/* ------------------------------------------------------------------------ + * + * pl_range_funcs.c + * Utility C functions for stored RANGE procedures + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#include "pathman.h" +#include "relation_info.h" +#include "utils.h" + +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + + +/* Function declarations */ + +PG_FUNCTION_INFO_V1( find_or_create_range_partition); +PG_FUNCTION_INFO_V1( check_overlap ); + +PG_FUNCTION_INFO_V1( get_part_range_by_oid ); +PG_FUNCTION_INFO_V1( get_part_range_by_idx ); + +PG_FUNCTION_INFO_V1( build_range_condition ); + + +/* + * ----------------------------- + * Partition creation & checks + * ----------------------------- + */ + +/* + * Returns partition oid for specified parent relid and value. + * In case when partition doesn't exist try to create one. 
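+ * Returns NULL if the value falls into a gap between existing partitions.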
+ */ +Datum +find_or_create_range_partition(PG_FUNCTION_ARGS) +{ + Oid parent_oid = PG_GETARG_OID(0); + Datum value = PG_GETARG_DATUM(1); + Oid value_type = get_fn_expr_argtype(fcinfo->flinfo, 1); + const PartRelationInfo *prel; + FmgrInfo cmp_func; + RangeEntry found_rentry; + search_rangerel_result search_state; + + prel = get_pathman_relation_info(parent_oid); + shout_if_prel_is_invalid(parent_oid, prel, PT_RANGE); + + fill_type_cmp_fmgr_info(&cmp_func, + getBaseType(value_type), + getBaseType(prel->atttype)); + + /* Use available PartRelationInfo to find partition */ + search_state = search_range_partition_eq(value, &cmp_func, prel, + &found_rentry); + + /* + * If found then just return oid, else create new partitions + */ + if (search_state == SEARCH_RANGEREL_FOUND) + PG_RETURN_OID(found_rentry.child_oid); + /* + * If not found and value is between first and last partitions + */ + else if (search_state == SEARCH_RANGEREL_GAP) + PG_RETURN_NULL(); + else + { + Oid child_oid = create_partitions(parent_oid, value, value_type); + + /* get_pathman_relation_info() will refresh this entry */ + invalidate_pathman_relation_info(parent_oid, NULL); + + PG_RETURN_OID(child_oid); + } +} + +/* + * Checks if range overlaps with existing partitions. + * Returns TRUE if overlaps and FALSE otherwise. + */ +Datum +check_overlap(PG_FUNCTION_ARGS) +{ + Oid parent_oid = PG_GETARG_OID(0); + + Datum p1 = PG_GETARG_DATUM(1), + p2 = PG_GETARG_DATUM(2); + + Oid p1_type = get_fn_expr_argtype(fcinfo->flinfo, 1), + p2_type = get_fn_expr_argtype(fcinfo->flinfo, 2), + part_type; + + FmgrInfo cmp_func_1, + cmp_func_2; + + uint32 i; + RangeEntry *ranges; + const PartRelationInfo *prel; + + prel = get_pathman_relation_info(parent_oid); + shout_if_prel_is_invalid(parent_oid, prel, PT_RANGE); + + part_type = getBaseType(prel->atttype); + + /* Fetch comparison functions */ + fill_type_cmp_fmgr_info(&cmp_func_1, getBaseType(p1_type), part_type); + fill_type_cmp_fmgr_info(&cmp_func_2, getBaseType(p2_type), part_type); + + ranges = PrelGetRangesArray(prel); + for (i = 0; i < PrelChildrenCount(prel); i++) + { + int c1 = FunctionCall2(&cmp_func_1, p1, ranges[i].max); + int c2 = FunctionCall2(&cmp_func_2, p2, ranges[i].min); + + if (c1 < 0 && c2 > 0) + PG_RETURN_BOOL(true); + } + + PG_RETURN_BOOL(false); +} + + +/* + * ------------------------ + * Various useful getters + * ------------------------ + */ + +/* + * Returns range entry (min, max) (in form of array). + * + * arg #1 is the parent's Oid. + * arg #2 is the partition's Oid. 
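+ *
+ * The result is a two-element {min, max} array of the parent's partitioning
+ * key type; an ERROR is raised if the relation is not a RANGE partition
+ * known to pg_pathman.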
+ */ +Datum +get_part_range_by_oid(PG_FUNCTION_ARGS) +{ + Oid partition_relid = InvalidOid, + parent_relid; + PartParentSearch parent_search; + uint32 i; + RangeEntry *ranges; + const PartRelationInfo *prel; + + if (PG_ARGISNULL(0)) + elog(ERROR, "'partition_relid' should not be NULL"); + else + partition_relid = PG_GETARG_OID(0); + + parent_relid = get_parent_of_partition(partition_relid, &parent_search); + if (parent_search != PPS_ENTRY_PART_PARENT) + elog(ERROR, "relation \"%s\" is not a partition", + get_rel_name_or_relid(partition_relid)); + + prel = get_pathman_relation_info(parent_relid); + shout_if_prel_is_invalid(parent_relid, prel, PT_RANGE); + + ranges = PrelGetRangesArray(prel); + + /* Look for the specified partition */ + for (i = 0; i < PrelChildrenCount(prel); i++) + if (ranges[i].child_oid == partition_relid) + { + ArrayType *arr; + Datum elems[2] = { ranges[i].min, ranges[i].max }; + + arr = construct_array(elems, 2, prel->atttype, + prel->attlen, prel->attbyval, + prel->attalign); + + PG_RETURN_ARRAYTYPE_P(arr); + } + + /* No partition found, report error */ + elog(ERROR, "relation \"%s\" has no partition \"%s\"", + get_rel_name_or_relid(parent_relid), + get_rel_name_or_relid(partition_relid)); + + PG_RETURN_NULL(); /* keep compiler happy */ +} + +/* + * Returns N-th range entry (min, max) (in form of array). + * + * arg #1 is the parent's Oid. + * arg #2 is the index of the range + * (if it is negative then the last range will be returned). + */ +Datum +get_part_range_by_idx(PG_FUNCTION_ARGS) +{ + Oid parent_relid = InvalidOid; + int partition_idx = 0; + Datum elems[2]; + RangeEntry *ranges; + const PartRelationInfo *prel; + + if (PG_ARGISNULL(0)) + elog(ERROR, "'parent_relid' should not be NULL"); + else + parent_relid = PG_GETARG_OID(0); + + if (PG_ARGISNULL(1)) + elog(ERROR, "'partition_idx' should not be NULL"); + else + partition_idx = PG_GETARG_INT32(1); + + prel = get_pathman_relation_info(parent_relid); + shout_if_prel_is_invalid(parent_relid, prel, PT_RANGE); + + /* Now we have to deal with 'idx' */ + if (partition_idx < -1) + { + elog(ERROR, "negative indices other than -1 (last partition) are not allowed"); + } + else if (partition_idx == -1) + { + partition_idx = PrelLastChild(prel); + } + else if (((uint32) abs(partition_idx)) >= PrelChildrenCount(prel)) + { + elog(ERROR, "partition #%d does not exist (total amount is %u)", + partition_idx, PrelChildrenCount(prel)); + } + + ranges = PrelGetRangesArray(prel); + + elems[0] = ranges[partition_idx].min; + elems[1] = ranges[partition_idx].max; + + PG_RETURN_ARRAYTYPE_P(construct_array(elems, 2, + prel->atttype, + prel->attlen, + prel->attbyval, + prel->attalign)); +} + + +/* + * ------------------------ + * Useful string builders + * ------------------------ + */ + +/* Build range condition for a CHECK CONSTRAINT. 
*/ +Datum +build_range_condition(PG_FUNCTION_ARGS) +{ + text *attname = PG_GETARG_TEXT_P(0); + + Datum min_bound = PG_GETARG_DATUM(1), + max_bound = PG_GETARG_DATUM(2); + + Oid min_bound_type = get_fn_expr_argtype(fcinfo->flinfo, 1), + max_bound_type = get_fn_expr_argtype(fcinfo->flinfo, 2); + + char *result; + + /* This is not going to trigger (not now, at least), just for the safety */ + if (min_bound_type != max_bound_type) + elog(ERROR, "cannot build range condition: " + "boundaries should be of the same type"); + + /* Create range condition CSTRING */ + result = psprintf("%1$s >= '%2$s' AND %1$s < '%3$s'", + text_to_cstring(attname), + datum_to_cstring(min_bound, min_bound_type), + datum_to_cstring(max_bound, max_bound_type)); + + PG_RETURN_TEXT_P(cstring_to_text(result)); +} diff --git a/contrib/pg_pathman/src/rangeset.c b/contrib/pg_pathman/src/rangeset.c index cfc6c7072d..beff56de32 100644 --- a/contrib/pg_pathman/src/rangeset.c +++ b/contrib/pg_pathman/src/rangeset.c @@ -1,69 +1,49 @@ /* ------------------------------------------------------------------------ * * rangeset.c - * Index range functions + * IndexRange functions * * Copyright (c) 2015-2016, Postgres Professional * * ------------------------------------------------------------------------ */ -#include "pathman.h" + +#include "rangeset.h" /* Check if two ranges are intersecting */ bool irange_intersects(IndexRange a, IndexRange b) { - return (irange_lower(a) <= irange_upper(b)) && - (irange_lower(b) <= irange_upper(a)); + return (a.ir_lower <= b.ir_upper) && + (b.ir_lower <= a.ir_upper); } /* Check if two ranges are conjuncted */ bool irange_conjuncted(IndexRange a, IndexRange b) { - return (irange_lower(a) - 1 <= irange_upper(b)) && - (irange_lower(b) - 1 <= irange_upper(a)); + return (a.ir_lower - 1 <= b.ir_upper) && + (b.ir_lower - 1 <= a.ir_upper); } /* Make union of two ranges. They should have the same lossiness. */ IndexRange irange_union(IndexRange a, IndexRange b) { - Assert(irange_is_lossy(a) == irange_is_lossy(b)); - return make_irange(Min(irange_lower(a), irange_lower(b)), - Max(irange_upper(a), irange_upper(b)), - irange_is_lossy(a)); + Assert(a.ir_lossy == b.ir_lossy); + return make_irange(Min(a.ir_lower, b.ir_lower), + Max(a.ir_upper, b.ir_upper), + a.ir_lossy); } /* Get intersection of two ranges */ IndexRange irange_intersect(IndexRange a, IndexRange b) { - return make_irange(Max(irange_lower(a), irange_lower(b)), - Min(irange_upper(a), irange_upper(b)), - irange_is_lossy(a) || irange_is_lossy(b)); -} - -#ifdef NOT_USED -/* Print range list in debug purposes */ -static char * -print_irange(List *l) -{ - ListCell *c; - StringInfoData str; - - initStringInfo(&str); - - foreach (c, l) - { - IndexRange ir = lfirst_irange(c); - - appendStringInfo(&str, "[%d,%d]%c ", irange_lower(ir), irange_upper(ir), - irange_is_lossy(ir) ? 'l' : 'e'); - } - return str.data; + return make_irange(Max(a.ir_lower, b.ir_lower), + Min(a.ir_upper, b.ir_upper), + a.ir_lossy || b.ir_lossy); } -#endif /* * Make union of two index rage lists. 
@@ -74,7 +54,7 @@ irange_list_union(List *a, List *b) ListCell *ca, *cb; List *result = NIL; - IndexRange cur = 0; + IndexRange cur = InvalidIndexRange; bool have_cur = false; ca = list_head(a); @@ -82,12 +62,12 @@ irange_list_union(List *a, List *b) while (ca || cb) { - IndexRange next = 0; + IndexRange next = InvalidIndexRange; /* Fetch next range with lesser lower bound */ if (ca && cb) { - if (irange_lower(lfirst_irange(ca)) <= irange_lower(lfirst_irange(cb))) + if (lfirst_irange(ca).ir_lower <= lfirst_irange(cb).ir_lower) { next = lfirst_irange(ca); ca = lnext(ca); @@ -122,25 +102,25 @@ irange_list_union(List *a, List *b) /* * Ranges are conjuncted, try to unify them. */ - if (irange_is_lossy(next) == irange_is_lossy(cur)) + if (next.ir_lossy == cur.ir_lossy) { cur = irange_union(next, cur); } else { - if (!irange_is_lossy(cur)) + if (!cur.ir_lossy) { result = lappend_irange(result, cur); - cur = make_irange(irange_upper(cur) + 1, - irange_upper(next), - irange_is_lossy(next)); + cur = make_irange(cur.ir_upper + 1, + next.ir_upper, + next.ir_lossy); } else { - result = lappend_irange(result, - make_irange(irange_lower(cur), - irange_lower(next) - 1, - irange_is_lossy(cur))); + result = lappend_irange(result, + make_irange(cur.ir_lower, + next.ir_lower - 1, + cur.ir_lossy)); cur = next; } } @@ -196,10 +176,10 @@ irange_list_intersect(List *a, List *b) if (result != NIL) { last = llast_irange(result); - if (irange_conjuncted(last, intersect) && - irange_is_lossy(last) == irange_is_lossy(intersect)) + if (irange_conjuncted(last, intersect) && + last.ir_lossy == intersect.ir_lossy) { - llast_int(result) = irange_union(last, intersect); + llast(result) = alloc_irange(irange_union(last, intersect)); } else { @@ -217,9 +197,9 @@ irange_list_intersect(List *a, List *b) * which lists to fetch, since lower bound of next range is greater (or * equal) to upper bound of current. */ - if (irange_upper(ra) <= irange_upper(rb)) + if (ra.ir_upper <= rb.ir_upper) ca = lnext(ca); - if (irange_upper(ra) >= irange_upper(rb)) + if (ra.ir_upper >= rb.ir_upper) cb = lnext(cb); } return result; @@ -235,7 +215,7 @@ irange_list_length(List *rangeset) foreach (lc, rangeset) { IndexRange irange = lfirst_irange(lc); - result += irange_upper(irange) - irange_lower(irange) + 1; + result += irange.ir_upper - irange.ir_lower + 1; } return result; } @@ -249,10 +229,10 @@ irange_list_find(List *rangeset, int index, bool *lossy) foreach (lc, rangeset) { IndexRange irange = lfirst_irange(lc); - if (index >= irange_lower(irange) && index <= irange_upper(irange)) + if (index >= irange.ir_lower && index <= irange.ir_upper) { if (lossy) - *lossy = irange_is_lossy(irange) ? true : false; + *lossy = irange.ir_lossy; return true; } } diff --git a/contrib/pg_pathman/src/rangeset.h b/contrib/pg_pathman/src/rangeset.h new file mode 100644 index 0000000000..ffe7f31fc8 --- /dev/null +++ b/contrib/pg_pathman/src/rangeset.h @@ -0,0 +1,75 @@ +/* ------------------------------------------------------------------------ + * + * rangeset.h + * IndexRange functions + * + * Copyright (c) 2015-2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#ifndef PATHMAN_RANGESET_H +#define PATHMAN_RANGESET_H + + +#include "pathman.h" +#include "nodes/pg_list.h" + + +/* + * IndexRange contains a set of selected partitions. + */ +typedef struct { + bool ir_valid : 1; + bool ir_lossy : 1; /* should we use IndexScan? 
*/ + uint32 ir_lower : 31; /* lower bound */ + uint32 ir_upper : 31; /* upper bound */ +} IndexRange; + + +#define RANGE_MASK 0xEFFFFFFF +#define InvalidIndexRange { false, false, 0, 0 } + + +inline static IndexRange +make_irange(uint32 lower, uint32 upper, bool lossy) +{ + IndexRange result; + + result.ir_valid = true; + result.ir_lossy = lossy; + result.ir_lower = (lower & RANGE_MASK); + result.ir_upper = (upper & RANGE_MASK); + + return result; +} + +inline static IndexRange * +alloc_irange(IndexRange irange) +{ + IndexRange *result = (IndexRange *) palloc(sizeof(IndexRange)); + + memcpy((void *) result, (void *) &irange, sizeof(IndexRange)); + + return result; +} + +#define lfirst_irange(lc) ( *(IndexRange *) lfirst(lc) ) +#define lappend_irange(list, irange) ( lappend((list), alloc_irange(irange)) ) +#define lcons_irange(irange, list) ( lcons(alloc_irange(irange), (list)) ) +#define list_make1_irange(irange) ( lcons(alloc_irange(irange), NIL) ) +#define llast_irange(list) ( lfirst_irange(list_tail(list)) ) +#define linitial_irange(list) ( lfirst_irange(list_head(list)) ) + + +/* rangeset.c */ +bool irange_intersects(IndexRange a, IndexRange b); +bool irange_conjuncted(IndexRange a, IndexRange b); +IndexRange irange_union(IndexRange a, IndexRange b); +IndexRange irange_intersect(IndexRange a, IndexRange b); +List *irange_list_union(List *a, List *b); +List *irange_list_intersect(List *a, List *b); +int irange_list_length(List *rangeset); +bool irange_list_find(List *rangeset, int index, bool *lossy); + +#endif diff --git a/contrib/pg_pathman/src/relation_info.c b/contrib/pg_pathman/src/relation_info.c new file mode 100644 index 0000000000..7028726535 --- /dev/null +++ b/contrib/pg_pathman/src/relation_info.c @@ -0,0 +1,703 @@ +/* ------------------------------------------------------------------------ + * + * relation_info.c + * Data structures describing partitioned relations + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#include "relation_info.h" +#include "init.h" +#include "utils.h" +#include "xact_handling.h" + +#include "access/htup_details.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "catalog/indexing.h" +#include "catalog/pg_inherits.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/hsearch.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" +#include "utils/typcache.h" + + +/* + * We delay all invalidation jobs received in relcache hook. + */ +static List *delayed_invalidation_parent_rels = NIL; +static List *delayed_invalidation_vague_rels = NIL; +static bool delayed_shutdown = false; /* pathman was dropped */ + + +/* Add unique Oid to list, allocate in TopMemoryContext */ +#define list_add_unique(list, oid) \ + do { \ + MemoryContext old_mcxt = MemoryContextSwitchTo(TopMemoryContext); \ + list = list_append_unique_oid(list, ObjectIdGetDatum(oid)); \ + MemoryContextSwitchTo(old_mcxt); \ + } while (0) + +#define free_invalidation_list(list) \ + do { \ + list_free(list); \ + list = NIL; \ + } while (0) + + +static bool try_perform_parent_refresh(Oid parent); +static Oid try_syscache_parent_search(Oid partition, PartParentSearch *status); +static Oid get_parent_of_partition_internal(Oid partition, + PartParentSearch *status, + HASHACTION action); + + +/* + * refresh\invalidate\get\remove PartRelationInfo functions. 
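+ *
+ * A typical read path (sketch only; the names are the real ones used by the
+ * callers in pl_range_funcs.c):
+ *
+ *     const PartRelationInfo *prel = get_pathman_relation_info(relid);
+ *     shout_if_prel_is_invalid(relid, prel, PT_RANGE);
+ *
+ * get_pathman_relation_info() lazily rebuilds an invalidated entry from
+ * PATHMAN_CONFIG, so callers normally don't have to trigger a refresh
+ * themselves.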
+ */ + +/* Create or update PartRelationInfo in local cache. Might emit ERROR. */ +const PartRelationInfo * +refresh_pathman_relation_info(Oid relid, + PartType partitioning_type, + const char *part_column_name) +{ + const LOCKMODE lockmode = AccessShareLock; + const TypeCacheEntry *typcache; + Oid *prel_children; + uint32 prel_children_count = 0, + i; + bool found; + PartRelationInfo *prel; + Datum param_values[Natts_pathman_config_params]; + bool param_isnull[Natts_pathman_config_params]; + + prel = (PartRelationInfo *) hash_search(partitioned_rels, + (const void *) &relid, + HASH_ENTER, &found); + elog(DEBUG2, + found ? + "Refreshing record for relation %u in pg_pathman's cache [%u]" : + "Creating new record for relation %u in pg_pathman's cache [%u]", + relid, MyProcPid); + + /* + * NOTE: Trick clang analyzer (first access without NULL pointer check). + * Access to field 'valid' results in a dereference of a null pointer. + */ + prel->cmp_proc = InvalidOid; + + /* Clear outdated resources */ + if (found && PrelIsValid(prel)) + { + /* Free these arrays iff they're not NULL */ + FreeChildrenArray(prel); + FreeRangesArray(prel); + } + + /* First we assume that this entry is invalid */ + prel->valid = false; + + /* Make both arrays point to NULL */ + prel->children = NULL; + prel->ranges = NULL; + + /* Set partitioning type */ + prel->parttype = partitioning_type; + + /* Initialize PartRelationInfo using syscache & typcache */ + prel->attnum = get_attnum(relid, part_column_name); + + /* Attribute number sanity check */ + if (prel->attnum == InvalidAttrNumber) + elog(ERROR, "Relation \"%s\" has no column \"%s\"", + get_rel_name_or_relid(relid), part_column_name); + + /* Fetch atttypid, atttypmod, and attcollation in a single cache lookup */ + get_atttypetypmodcoll(relid, prel->attnum, + &prel->atttype, &prel->atttypmod, &prel->attcollid); + + /* Fetch HASH & CMP fuctions and other stuff from type cache */ + typcache = lookup_type_cache(prel->atttype, + TYPECACHE_CMP_PROC | TYPECACHE_HASH_PROC); + + prel->attbyval = typcache->typbyval; + prel->attlen = typcache->typlen; + prel->attalign = typcache->typalign; + + prel->cmp_proc = typcache->cmp_proc; + prel->hash_proc = typcache->hash_proc; + + LockRelationOid(relid, lockmode); + prel_children = find_inheritance_children_array(relid, lockmode, + &prel_children_count); + UnlockRelationOid(relid, lockmode); + + /* If there's no children at all, remove this entry */ + if (prel_children_count == 0) + { + remove_pathman_relation_info(relid); + return NULL; + } + + /* + * Fill 'prel' with partition info, raise ERROR if anything is wrong. + * This way PartRelationInfo will remain 'invalid', and 'get' procedure + * will try to refresh it again (and again), until the error is fixed + * by user manually (i.e. invalid check constraints etc). 
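+	 * (Note: prel->valid is only set to true at the very end of this
+	 * function, so any ERROR raised here leaves the entry invalid.)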
+ */ + fill_prel_with_partitions(prel_children, prel_children_count, prel); + + /* Add "partition+parent" tuple to cache */ + for (i = 0; i < prel_children_count; i++) + cache_parent_of_partition(prel_children[i], relid); + + pfree(prel_children); + + /* Read additional parameters ('enable_parent' and 'auto' at the moment) */ + if (read_pathman_params(relid, param_values, param_isnull)) + { + prel->enable_parent = param_values[Anum_pathman_config_params_enable_parent - 1]; + prel->auto_partition = param_values[Anum_pathman_config_params_auto - 1]; + prel->init_callback = param_values[Anum_pathman_config_params_init_callback - 1]; + } + /* Else set default values if they cannot be found */ + else + { + prel->enable_parent = false; + prel->auto_partition = true; + prel->init_callback = InvalidOid; + } + + /* We've successfully built a cache entry */ + prel->valid = true; + + return prel; +} + +/* Invalidate PartRelationInfo cache entry. Create new entry if 'found' is NULL. */ +void +invalidate_pathman_relation_info(Oid relid, bool *found) +{ + bool prel_found; + HASHACTION action = found ? HASH_FIND : HASH_ENTER; + PartRelationInfo *prel; + + prel = hash_search(partitioned_rels, + (const void *) &relid, + action, &prel_found); + + if ((action == HASH_FIND || + (action == HASH_ENTER && prel_found)) && PrelIsValid(prel)) + { + FreeChildrenArray(prel); + FreeRangesArray(prel); + + prel->valid = false; /* now cache entry is invalid */ + } + /* Handle invalid PartRelationInfo */ + else if (prel) + { + prel->children = NULL; + prel->ranges = NULL; + + prel->valid = false; /* now cache entry is invalid */ + } + + /* Set 'found' if necessary */ + if (found) *found = prel_found; + + elog(DEBUG2, + "Invalidating record for relation %u in pg_pathman's cache [%u]", + relid, MyProcPid); +} + +/* Get PartRelationInfo from local cache. */ +const PartRelationInfo * +get_pathman_relation_info(Oid relid) +{ + const PartRelationInfo *prel = hash_search(partitioned_rels, + (const void *) &relid, + HASH_FIND, NULL); + + /* Refresh PartRelationInfo if needed */ + if (prel && !PrelIsValid(prel)) + { + Datum values[Natts_pathman_config]; + bool isnull[Natts_pathman_config]; + + /* Check that PATHMAN_CONFIG table contains this relation */ + if (pathman_config_contains_relation(relid, values, isnull, NULL)) + { + PartType part_type; + const char *attname; + + /* We can't use 'part_type' & 'attname' from invalid prel */ + part_type = DatumGetPartType(values[Anum_pathman_config_parttype - 1]); + attname = TextDatumGetCString(values[Anum_pathman_config_attname - 1]); + + /* Refresh partitioned table cache entry (might turn NULL) */ + /* TODO: possible refactoring, pass found 'prel' instead of searching */ + prel = refresh_pathman_relation_info(relid, + part_type, + attname); + } + /* Else clear remaining cache entry */ + else remove_pathman_relation_info(relid); + } + + elog(DEBUG2, + "Fetching %s record for relation %u from pg_pathman's cache [%u]", + (prel ? "live" : "NULL"), relid, MyProcPid); + + return prel; +} + +/* Acquire lock on a table and try to get PartRelationInfo */ +const PartRelationInfo * +get_pathman_relation_info_after_lock(Oid relid, bool unlock_if_not_found) +{ + const PartRelationInfo *prel; + + /* Restrict concurrent partition creation (it's dangerous) */ + xact_lock_partitioned_rel(relid, false); + + prel = get_pathman_relation_info(relid); + if (!prel && unlock_if_not_found) + xact_unlock_partitioned_rel(relid); + + return prel; +} + +/* Remove PartRelationInfo from local cache. 
*/ +void +remove_pathman_relation_info(Oid relid) +{ + PartRelationInfo *prel = hash_search(partitioned_rels, + (const void *) &relid, + HASH_FIND, NULL); + if (prel && PrelIsValid(prel)) + { + /* Free these arrays iff they're not NULL */ + FreeChildrenArray(prel); + FreeRangesArray(prel); + } + + /* Now let's remove the entry completely */ + hash_search(partitioned_rels, + (const void *) &relid, + HASH_REMOVE, NULL); + + elog(DEBUG2, + "Removing record for relation %u in pg_pathman's cache [%u]", + relid, MyProcPid); +} + + +/* + * Functions for delayed invalidation. + */ + +/* Add new delayed pathman shutdown job (DROP EXTENSION) */ +void +delay_pathman_shutdown(void) +{ + delayed_shutdown = true; +} + +/* Add new delayed invalidation job for a [ex-]parent relation */ +void +delay_invalidation_parent_rel(Oid parent) +{ + list_add_unique(delayed_invalidation_parent_rels, parent); +} + +/* Add new delayed invalidation job for a vague relation */ +void +delay_invalidation_vague_rel(Oid vague_rel) +{ + list_add_unique(delayed_invalidation_vague_rels, vague_rel); +} + +/* Finish all pending invalidation jobs if possible */ +void +finish_delayed_invalidation(void) +{ + /* Exit early if there's nothing to do */ + if (delayed_invalidation_parent_rels == NIL && + delayed_invalidation_vague_rels == NIL && + delayed_shutdown == false) + { + return; + } + + /* Check that current state is transactional */ + if (IsTransactionState()) + { + ListCell *lc; + + /* Handle the probable 'DROP EXTENSION' case */ + if (delayed_shutdown) + { + Oid cur_pathman_config_relid; + + /* Unset 'shutdown' flag */ + delayed_shutdown = false; + + /* Get current PATHMAN_CONFIG relid */ + cur_pathman_config_relid = get_relname_relid(PATHMAN_CONFIG, + get_pathman_schema()); + + /* Check that PATHMAN_CONFIG table has indeed been dropped */ + if (cur_pathman_config_relid == InvalidOid || + cur_pathman_config_relid != get_pathman_config_relid()) + { + /* Ok, let's unload pg_pathman's config */ + unload_config(); + + /* Disregard all remaining invalidation jobs */ + free_invalidation_list(delayed_invalidation_parent_rels); + free_invalidation_list(delayed_invalidation_vague_rels); + + /* No need to continue, exit */ + return; + } + } + + /* Process relations that are (or were) definitely partitioned */ + foreach (lc, delayed_invalidation_parent_rels) + { + Oid parent = lfirst_oid(lc); + + /* Skip if it's a TOAST table */ + if (IsToastNamespace(get_rel_namespace(parent))) + continue; + + if (!pathman_config_contains_relation(parent, NULL, NULL, NULL)) + remove_pathman_relation_info(parent); + else + /* get_pathman_relation_info() will refresh this entry */ + invalidate_pathman_relation_info(parent, NULL); + } + + /* Process all other vague cases */ + foreach (lc, delayed_invalidation_vague_rels) + { + Oid vague_rel = lfirst_oid(lc); + + /* Skip if it's a TOAST table */ + if (IsToastNamespace(get_rel_namespace(vague_rel))) + continue; + + /* It might be a partitioned table or a partition */ + if (!try_perform_parent_refresh(vague_rel)) + { + PartParentSearch search; + Oid parent; + + parent = get_parent_of_partition(vague_rel, &search); + + switch (search) + { + /* It's still parent */ + case PPS_ENTRY_PART_PARENT: + try_perform_parent_refresh(parent); + break; + + /* It *might have been* parent before (not in PATHMAN_CONFIG) */ + case PPS_ENTRY_PARENT: + remove_pathman_relation_info(parent); + break; + + /* How come we still don't know?? 
*/ + case PPS_NOT_SURE: + elog(ERROR, "Unknown table status, this should never happen"); + break; + + default: + break; + } + } + } + + free_invalidation_list(delayed_invalidation_parent_rels); + free_invalidation_list(delayed_invalidation_vague_rels); + } +} + + +/* + * cache\forget\get PartParentInfo functions. + */ + +/* Create "partition+parent" pair in local cache */ +void +cache_parent_of_partition(Oid partition, Oid parent) +{ + bool found; + PartParentInfo *ppar; + + ppar = hash_search(parent_cache, + (const void *) &partition, + HASH_ENTER, &found); + + elog(DEBUG2, + found ? + "Refreshing record for child %u in pg_pathman's cache [%u]" : + "Creating new record for child %u in pg_pathman's cache [%u]", + partition, MyProcPid); + + ppar->child_rel = partition; + ppar->parent_rel = parent; +} + +/* Remove "partition+parent" pair from cache & return parent's Oid */ +Oid +forget_parent_of_partition(Oid partition, PartParentSearch *status) +{ + return get_parent_of_partition_internal(partition, status, HASH_REMOVE); +} + +/* Return partition parent's Oid */ +Oid +get_parent_of_partition(Oid partition, PartParentSearch *status) +{ + return get_parent_of_partition_internal(partition, status, HASH_FIND); +} + +/* + * Get [and remove] "partition+parent" pair from cache, + * also check syscache if 'status' is provided. + * + * "status == NULL" implies that we don't care about + * neither syscache nor PATHMAN_CONFIG table contents. + */ +static Oid +get_parent_of_partition_internal(Oid partition, + PartParentSearch *status, + HASHACTION action) +{ + const char *action_str; /* "Fetching"\"Resetting" */ + Oid parent; + PartParentInfo *ppar = hash_search(parent_cache, + (const void *) &partition, + HASH_FIND, NULL); + + /* Set 'action_str' */ + switch (action) + { + case HASH_REMOVE: + action_str = "Resetting"; + break; + + case HASH_FIND: + action_str = "Fetching"; + break; + + default: + elog(ERROR, "Unexpected HTAB action %u", action); + } + + elog(DEBUG2, + "%s %s record for child %u from pg_pathman's cache [%u]", + action_str, (ppar ? 
"live" : "NULL"), partition, MyProcPid); + + if (ppar) + { + if (status) *status = PPS_ENTRY_PART_PARENT; + parent = ppar->parent_rel; + + /* Remove entry if necessary */ + if (action == HASH_REMOVE) + hash_search(parent_cache, + (const void *) &partition, + HASH_REMOVE, NULL); + } + /* Try fetching parent from syscache if 'status' is provided */ + else if (status) + parent = try_syscache_parent_search(partition, status); + else + parent = InvalidOid; /* we don't have to set status */ + + return parent; +} + +/* Try to find parent of a partition using syscache & PATHMAN_CONFIG */ +static Oid +try_syscache_parent_search(Oid partition, PartParentSearch *status) +{ + if (!IsTransactionState()) + { + /* We could not perform search */ + if (status) *status = PPS_NOT_SURE; + + return InvalidOid; + } + else + { + Relation relation; + Snapshot snapshot; + ScanKeyData key[1]; + SysScanDesc scan; + HeapTuple inheritsTuple; + Oid parent = InvalidOid; + + /* At first we assume parent does not exist (not a partition) */ + if (status) *status = PPS_ENTRY_NOT_FOUND; + + relation = heap_open(InheritsRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_inherits_inhrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(partition)); + + snapshot = RegisterSnapshot(GetLatestSnapshot()); + scan = systable_beginscan(relation, InheritsRelidSeqnoIndexId, + true, NULL, 1, key); + + while ((inheritsTuple = systable_getnext(scan)) != NULL) + { + parent = ((Form_pg_inherits) GETSTRUCT(inheritsTuple))->inhparent; + + /* + * NB: don't forget that 'inh' flag does not immediately + * mean that this is a pg_pathman's partition. It might + * be just a casual inheriting table. + */ + if (status) *status = PPS_ENTRY_PARENT; + + /* Check that PATHMAN_CONFIG contains this table */ + if (pathman_config_contains_relation(parent, NULL, NULL, NULL)) + { + /* We've found the entry, update status */ + if (status) *status = PPS_ENTRY_PART_PARENT; + } + + break; /* there should be no more rows */ + } + + systable_endscan(scan); + UnregisterSnapshot(snapshot); + heap_close(relation, AccessShareLock); + + return parent; + } +} + +/* + * Try to refresh cache entry for relation 'parent'. + * + * Return true on success. + */ +static bool +try_perform_parent_refresh(Oid parent) +{ + Datum values[Natts_pathman_config]; + bool isnull[Natts_pathman_config]; + + if (pathman_config_contains_relation(parent, values, isnull, NULL)) + { + text *attname; + PartType parttype; + + parttype = DatumGetPartType(values[Anum_pathman_config_parttype - 1]); + attname = DatumGetTextP(values[Anum_pathman_config_attname - 1]); + + /* If anything went wrong, return false (actually, it might throw ERROR) */ + if (!PrelIsValid(refresh_pathman_relation_info(parent, parttype, + text_to_cstring(attname)))) + return false; + } + /* Not a partitioned relation */ + else return false; + + return true; +} + +/* + * Safe PartType wrapper. + */ +PartType +DatumGetPartType(Datum datum) +{ + uint32 val = DatumGetUInt32(datum); + + if (val < 1 || val > 2) + elog(ERROR, "Unknown partitioning type %u", val); + + return (PartType) val; +} + +char * +PartTypeToCString(PartType parttype) +{ + static char *hash_str = "1", + *range_str = "2"; + + switch (parttype) + { + case PT_HASH: + return hash_str; + + case PT_RANGE: + return range_str; + + default: + elog(ERROR, "Unknown partitioning type %u", parttype); + return NULL; /* keep compiler happy */ + } +} + +/* + * Common PartRelationInfo checks. Emit ERROR if anything is wrong. 
+ */ +void +shout_if_prel_is_invalid(Oid parent_oid, + const PartRelationInfo *prel, + PartType expected_part_type) +{ + if (!prel) + elog(ERROR, "relation \"%s\" has no partitions", + get_rel_name_or_relid(parent_oid)); + + if (!PrelIsValid(prel)) + elog(ERROR, "pg_pathman's cache contains invalid entry " + "for relation \"%s\" [%u]", + get_rel_name_or_relid(parent_oid), + MyProcPid); + + /* Check partitioning type unless it's "indifferent" */ + if (expected_part_type != PT_INDIFFERENT && + expected_part_type != prel->parttype) + { + char *expected_str; + + switch (expected_part_type) + { + case PT_HASH: + expected_str = "HASH"; + break; + + case PT_RANGE: + expected_str = "RANGE"; + break; + + default: + elog(ERROR, + "expected_str selection not implemented for type %d", + expected_part_type); + } + + elog(ERROR, "relation \"%s\" is not partitioned by %s", + get_rel_name_or_relid(parent_oid), + expected_str); + } +} diff --git a/contrib/pg_pathman/src/relation_info.h b/contrib/pg_pathman/src/relation_info.h new file mode 100644 index 0000000000..5b50005a9c --- /dev/null +++ b/contrib/pg_pathman/src/relation_info.h @@ -0,0 +1,200 @@ +/* ------------------------------------------------------------------------ + * + * relation_info.h + * Data structures describing partitioned relations + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#ifndef RELATION_INFO_H +#define RELATION_INFO_H + +#include "postgres.h" +#include "access/attnum.h" +#include "port/atomics.h" + + +/* + * Partitioning type. + */ +typedef enum +{ + PT_INDIFFERENT = 0, /* for part type traits (virtual type) */ + PT_HASH, + PT_RANGE +} PartType; + +/* + * Child relation info for RANGE partitioning + */ +typedef struct +{ + Oid child_oid; + + Datum min, + max; +} RangeEntry; + +/* + * PartRelationInfo + * Per-relation partitioning information + */ +typedef struct +{ + Oid key; /* partitioned table's Oid */ + bool valid; /* is this entry valid? */ + bool enable_parent; /* include parent to the plan */ + bool auto_partition; /* auto partition creation */ + Oid init_callback; /* callback for partition creation */ + + uint32 children_count; + Oid *children; /* Oids of child partitions */ + RangeEntry *ranges; /* per-partition range entry or NULL */ + + PartType parttype; /* partitioning type (HASH | RANGE) */ + AttrNumber attnum; /* partitioned column's index */ + Oid atttype; /* partitioned column's type */ + int32 atttypmod; /* partitioned column type modifier */ + bool attbyval; /* is partitioned column stored by value? */ + int16 attlen; /* length of the partitioned column's type */ + int attalign; /* alignment of the part column's type */ + Oid attcollid; /* collation of the partitioned column */ + + Oid cmp_proc, /* comparison fuction for 'atttype' */ + hash_proc; /* hash function for 'atttype' */ +} PartRelationInfo; + +/* + * RelParentInfo + * Cached parent of the specified partition. + * Allows us to quickly search for PartRelationInfo. + */ +typedef struct +{ + Oid child_rel; /* key */ + Oid parent_rel; +} PartParentInfo; + +/* + * PartParentSearch + * Represents status of a specific cached entry. + * Returned by [for]get_parent_of_partition(). 
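+ *
+ * Typical lookup, mirroring finish_delayed_invalidation() in relation_info.c
+ * (illustration only, not a new API):
+ *
+ *     PartParentSearch search;
+ *     Oid parent = get_parent_of_partition(partition_relid, &search);
+ *
+ *     if (search == PPS_ENTRY_PART_PARENT)
+ *         ... 'parent' is registered in PATHMAN_CONFIG ...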
+ */ +typedef enum +{ + PPS_ENTRY_NOT_FOUND = 0, + PPS_ENTRY_PARENT, /* entry was found, but pg_pathman doesn't know it */ + PPS_ENTRY_PART_PARENT, /* entry is parent and is known by pg_pathman */ + PPS_NOT_SURE /* can't determine (not transactional state) */ +} PartParentSearch; + + +/* + * PartRelationInfo field access macros. + */ + +#define PrelParentRelid(prel) ( (prel)->key ) + +#define PrelGetChildrenArray(prel) ( (prel)->children ) + +#define PrelGetRangesArray(prel) ( (prel)->ranges ) + +#define PrelChildrenCount(prel) ( (prel)->children_count ) + +#define PrelIsValid(prel) ( (prel) && (prel)->valid ) + +inline static uint32 +PrelLastChild(const PartRelationInfo *prel) +{ + Assert(PrelIsValid(prel)); + + if (PrelChildrenCount(prel) == 0) + elog(ERROR, "pg_pathman's cache entry for relation %u has 0 children", + PrelParentRelid(prel)); + + return PrelChildrenCount(prel) - 1; /* last partition */ +} + + +const PartRelationInfo *refresh_pathman_relation_info(Oid relid, + PartType partitioning_type, + const char *part_column_name); +void invalidate_pathman_relation_info(Oid relid, bool *found); +void remove_pathman_relation_info(Oid relid); +const PartRelationInfo *get_pathman_relation_info(Oid relid); +const PartRelationInfo *get_pathman_relation_info_after_lock(Oid relid, + bool unlock_if_not_found); + +void delay_pathman_shutdown(void); +void delay_invalidation_parent_rel(Oid parent); +void delay_invalidation_vague_rel(Oid vague_rel); +void finish_delayed_invalidation(void); + +void cache_parent_of_partition(Oid partition, Oid parent); +Oid forget_parent_of_partition(Oid partition, PartParentSearch *status); +Oid get_parent_of_partition(Oid partition, PartParentSearch *status); + +PartType DatumGetPartType(Datum datum); +char * PartTypeToCString(PartType parttype); + +void shout_if_prel_is_invalid(Oid parent_oid, + const PartRelationInfo *prel, + PartType expected_part_type); + + +/* + * Useful static functions for freeing memory. 
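+ *
+ * Both helpers expect a valid entry (they Assert on PrelIsValid) and are
+ * called from the refresh/invalidate/remove paths in relation_info.c.
+ * FreeRangesArray() additionally pfrees the min/max datums of by-reference
+ * key types.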
+ */ + +static inline void +FreeChildrenArray(PartRelationInfo *prel) +{ + uint32 i; + + Assert(PrelIsValid(prel)); + + /* Remove relevant PartParentInfos */ + if ((prel)->children) + { + for (i = 0; i < PrelChildrenCount(prel); i++) + { + Oid child = (prel)->children[i]; + + /* If it's *always been* relid's partition, free cache */ + if (PrelParentRelid(prel) == get_parent_of_partition(child, NULL)) + forget_parent_of_partition(child, NULL); + } + + pfree((prel)->children); + (prel)->children = NULL; + } +} + +static inline void +FreeRangesArray(PartRelationInfo *prel) +{ + uint32 i; + + Assert(PrelIsValid(prel)); + + /* Remove RangeEntries array */ + if ((prel)->ranges) + { + /* Remove persistent entries if not byVal */ + if (!(prel)->attbyval) + { + for (i = 0; i < PrelChildrenCount(prel); i++) + { + pfree(DatumGetPointer((prel)->ranges[i].min)); + pfree(DatumGetPointer((prel)->ranges[i].max)); + } + } + + pfree((prel)->ranges); + (prel)->ranges = NULL; + } +} + +#endif diff --git a/contrib/pg_pathman/src/runtime_merge_append.c b/contrib/pg_pathman/src/runtime_merge_append.c index a0a9da4f13..e021d3ce83 100644 --- a/contrib/pg_pathman/src/runtime_merge_append.c +++ b/contrib/pg_pathman/src/runtime_merge_append.c @@ -4,25 +4,31 @@ * RuntimeMergeAppend node's function definitions and global variables * * Copyright (c) 2016, Postgres Professional + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California * * ------------------------------------------------------------------------ */ -#include "postgres.h" -#include "runtime_merge_append.h" +#include "pg_compat.h" + +#include "runtime_merge_append.h" #include "pathman.h" +#include "postgres.h" #include "catalog/pg_collation.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" +#include "nodes/plannodes.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" -#include "optimizer/restrictinfo.h" #include "optimizer/planmain.h" #include "optimizer/tlist.h" #include "optimizer/var.h" #include "utils/builtins.h" +#include "utils/guc.h" #include "utils/lsyscache.h" +#include "utils/typcache.h" #include "utils/memutils.h" #include "utils/ruleutils.h" @@ -112,18 +118,18 @@ static void pack_runtimemergeappend_private(CustomScan *cscan, MergeAppendGuts *mag) { List *runtimemergeappend_private = NIL; - List *sortColIdx = NIL, - *sortOperators = NIL, - *collations = NIL, - *nullsFirst = NIL; + List *sortColIdx = NIL, + *sortOperators = NIL, + *collations = NIL, + *nullsFirst = NIL; int i; for (i = 0; i < mag->numCols; i++) { - sortColIdx = lappend_int(sortColIdx, mag->sortColIdx[i]); - sortOperators = lappend_oid(sortOperators, mag->sortOperators[i]); - collations = lappend_oid(collations, mag->collations[i]); - nullsFirst = lappend_int(nullsFirst, mag->nullsFirst[i]); + sortColIdx = lappend_int(sortColIdx, mag->sortColIdx[i]); + sortOperators = lappend_oid(sortOperators, mag->sortOperators[i]); + collations = lappend_oid(collations, mag->collations[i]); + nullsFirst = lappend_int(nullsFirst, mag->nullsFirst[i]); } runtimemergeappend_private = list_make2(makeInteger(mag->numCols), @@ -132,7 +138,14 @@ pack_runtimemergeappend_private(CustomScan *cscan, MergeAppendGuts *mag) collations, nullsFirst)); - /* Append RuntimeMergeAppend's data to the 'custom_private' */ + /* + * Append RuntimeMergeAppend's data to the 'custom_private' (2nd). 
+ * + * This way some sort of hierarchy is maintained in 'custom_private': + * inherited structure (in this case RuntimeAppend) is stored first, + * so we can think of pack\unpack functions as 'constructors' to some + * extent. + */ cscan->custom_private = lappend(cscan->custom_private, runtimemergeappend_private); } @@ -167,15 +180,45 @@ unpack_runtimemergeappend_private(RuntimeMergeAppendState *scan_state, runtimemergeappend_private = lsecond(cscan->custom_private); scan_state->numCols = intVal(linitial(runtimemergeappend_private)); - sortColIdx = linitial(lsecond(runtimemergeappend_private)); - sortOperators = lsecond(lsecond(runtimemergeappend_private)); - collations = lthird(lsecond(runtimemergeappend_private)); - nullsFirst = lfourth(lsecond(runtimemergeappend_private)); + sortColIdx = linitial(lsecond(runtimemergeappend_private)); + sortOperators = lsecond(lsecond(runtimemergeappend_private)); + collations = lthird(lsecond(runtimemergeappend_private)); + nullsFirst = lfourth(lsecond(runtimemergeappend_private)); + + FillStateField(sortColIdx, AttrNumber, lfirst_int); + FillStateField(sortOperators, Oid, lfirst_oid); + FillStateField(collations, Oid, lfirst_oid); + FillStateField(nullsFirst, bool, lfirst_int); +} - FillStateField(sortColIdx, AttrNumber, lfirst_int); - FillStateField(sortOperators, Oid, lfirst_oid); - FillStateField(collations, Oid, lfirst_oid); - FillStateField(nullsFirst, bool, lfirst_int); +void +init_runtime_merge_append_static_data(void) +{ + runtime_merge_append_path_methods.CustomName = "RuntimeMergeAppend"; + runtime_merge_append_path_methods.PlanCustomPath = create_runtimemergeappend_plan; + + runtime_merge_append_plan_methods.CustomName = "RuntimeMergeAppend"; + runtime_merge_append_plan_methods.CreateCustomScanState = runtimemergeappend_create_scan_state; + + runtime_merge_append_exec_methods.CustomName = "RuntimeMergeAppend"; + runtime_merge_append_exec_methods.BeginCustomScan = runtimemergeappend_begin; + runtime_merge_append_exec_methods.ExecCustomScan = runtimemergeappend_exec; + runtime_merge_append_exec_methods.EndCustomScan = runtimemergeappend_end; + runtime_merge_append_exec_methods.ReScanCustomScan = runtimemergeappend_rescan; + runtime_merge_append_exec_methods.MarkPosCustomScan = NULL; + runtime_merge_append_exec_methods.RestrPosCustomScan = NULL; + runtime_merge_append_exec_methods.ExplainCustomScan = runtimemergeappend_explain; + + DefineCustomBoolVariable("pg_pathman.enable_runtimemergeappend", + "Enables the planner's use of RuntimeMergeAppend custom node.", + NULL, + &pg_pathman_enable_runtime_merge_append, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); } Path * @@ -419,7 +462,8 @@ runtimemergeappend_rescan(CustomScanState *node) * initialize sort-key information */ scan_state->ms_nkeys = scan_state->numCols; - scan_state->ms_sortkeys = palloc0(sizeof(SortSupportData) * scan_state->numCols); + scan_state->ms_sortkeys = (SortSupport) + palloc0(sizeof(SortSupportData) * scan_state->numCols); for (i = 0; i < scan_state->numCols; i++) { @@ -706,9 +750,9 @@ prepare_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, continue; sortexpr = em->em_expr; - exprvars = pull_var_clause((Node *) sortexpr, - PVC_INCLUDE_AGGREGATES, - PVC_INCLUDE_PLACEHOLDERS); + exprvars = pull_var_clause_compat((Node *) sortexpr, + PVC_INCLUDE_AGGREGATES, + PVC_INCLUDE_PLACEHOLDERS); foreach(k, exprvars) { if (!tlist_member_ignore_relabel(lfirst(k), tlist)) @@ -732,8 +776,8 @@ prepare_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List 
*pathkeys, { /* copy needed so we don't modify input's tlist below */ tlist = copyObject(tlist); - lefttree = (Plan *) make_result(root, tlist, NULL, - lefttree); + lefttree = (Plan *) make_result_compat(root, tlist, NULL, + lefttree); } /* Don't bother testing is_projection_capable_plan again */ diff --git a/contrib/pg_pathman/src/runtime_merge_append.h b/contrib/pg_pathman/src/runtime_merge_append.h index 2a657c0643..8dd8dcb116 100644 --- a/contrib/pg_pathman/src/runtime_merge_append.h +++ b/contrib/pg_pathman/src/runtime_merge_append.h @@ -9,13 +9,15 @@ * * ------------------------------------------------------------------------ */ + #ifndef RUNTIME_MERGE_APPEND_H #define RUNTIME_MERGE_APPEND_H -#include "postgres.h" #include "runtimeappend.h" #include "pathman.h" +#include "postgres.h" + typedef struct { @@ -49,7 +51,10 @@ extern CustomScanMethods runtime_merge_append_plan_methods; extern CustomExecMethods runtime_merge_append_exec_methods; -Path * create_runtimemergeappend_path(PlannerInfo *root, AppendPath *inner_append, +void init_runtime_merge_append_static_data(void); + +Path * create_runtimemergeappend_path(PlannerInfo *root, + AppendPath *inner_append, ParamPathInfo *param_info, double sel); @@ -59,7 +64,9 @@ Plan * create_runtimemergeappend_plan(PlannerInfo *root, RelOptInfo *rel, Node * runtimemergeappend_create_scan_state(CustomScan *node); -void runtimemergeappend_begin(CustomScanState *node, EState *estate, int eflags); +void runtimemergeappend_begin(CustomScanState *node, + EState *estate, + int eflags); TupleTableSlot * runtimemergeappend_exec(CustomScanState *node); @@ -67,6 +74,8 @@ void runtimemergeappend_end(CustomScanState *node); void runtimemergeappend_rescan(CustomScanState *node); -void runtimemergeappend_explain(CustomScanState *node, List *ancestors, ExplainState *es); +void runtimemergeappend_explain(CustomScanState *node, + List *ancestors, + ExplainState *es); #endif diff --git a/contrib/pg_pathman/src/runtimeappend.c b/contrib/pg_pathman/src/runtimeappend.c index 838d887ae1..7260ab2c0c 100644 --- a/contrib/pg_pathman/src/runtimeappend.c +++ b/contrib/pg_pathman/src/runtimeappend.c @@ -7,10 +7,12 @@ * * ------------------------------------------------------------------------ */ + +#include "runtimeappend.h" + #include "postgres.h" #include "utils/memutils.h" -#include "runtimeappend.h" -#include "pathman.h" +#include "utils/guc.h" bool pg_pathman_enable_runtimeappend = true; @@ -20,6 +22,36 @@ CustomScanMethods runtimeappend_plan_methods; CustomExecMethods runtimeappend_exec_methods; +void +init_runtimeappend_static_data(void) +{ + runtimeappend_path_methods.CustomName = "RuntimeAppend"; + runtimeappend_path_methods.PlanCustomPath = create_runtimeappend_plan; + + runtimeappend_plan_methods.CustomName = "RuntimeAppend"; + runtimeappend_plan_methods.CreateCustomScanState = runtimeappend_create_scan_state; + + runtimeappend_exec_methods.CustomName = "RuntimeAppend"; + runtimeappend_exec_methods.BeginCustomScan = runtimeappend_begin; + runtimeappend_exec_methods.ExecCustomScan = runtimeappend_exec; + runtimeappend_exec_methods.EndCustomScan = runtimeappend_end; + runtimeappend_exec_methods.ReScanCustomScan = runtimeappend_rescan; + runtimeappend_exec_methods.MarkPosCustomScan = NULL; + runtimeappend_exec_methods.RestrPosCustomScan = NULL; + runtimeappend_exec_methods.ExplainCustomScan = runtimeappend_explain; + + DefineCustomBoolVariable("pg_pathman.enable_runtimeappend", + "Enables the planner's use of RuntimeAppend custom node.", + NULL, + 
&pg_pathman_enable_runtimeappend, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); +} + Path * create_runtimeappend_path(PlannerInfo *root, AppendPath *inner_append, diff --git a/contrib/pg_pathman/src/runtimeappend.h b/contrib/pg_pathman/src/runtimeappend.h index 9e37c4ba6e..55c1320e99 100644 --- a/contrib/pg_pathman/src/runtimeappend.h +++ b/contrib/pg_pathman/src/runtimeappend.h @@ -7,24 +7,25 @@ * * ------------------------------------------------------------------------ */ + #ifndef RUNTIME_APPEND_H #define RUNTIME_APPEND_H +#include "pathman.h" +#include "nodes_common.h" + #include "postgres.h" #include "optimizer/paths.h" #include "optimizer/pathnode.h" #include "commands/explain.h" -#include "pathman.h" -#include "nodes_common.h" - typedef struct { CustomPath cpath; - Oid relid; /* relid of the partitioned table */ + Oid relid; /* relid of the partitioned table */ - ChildScanCommon *children; /* all available plans */ + ChildScanCommon *children; /* all available plans */ int nchildren; } RuntimeAppendPath; @@ -32,7 +33,6 @@ typedef struct { CustomScanState css; Oid relid; /* relid of the partitioned table */ - PartRelationInfo *prel; /* Restrictions to be checked during ReScan and Exec */ List *custom_exprs; @@ -46,6 +46,9 @@ typedef struct ChildScanCommon *cur_plans; int ncur_plans; + /* Should we include parent table? Cached for prepared statements */ + bool enable_parent; + /* Index of the selected plan state */ int running_idx; @@ -53,13 +56,18 @@ typedef struct TupleTableSlot *slot; } RuntimeAppendState; + extern bool pg_pathman_enable_runtimeappend; extern CustomPathMethods runtimeappend_path_methods; extern CustomScanMethods runtimeappend_plan_methods; extern CustomExecMethods runtimeappend_exec_methods; -Path * create_runtimeappend_path(PlannerInfo *root, AppendPath *inner_append, + +void init_runtimeappend_static_data(void); + +Path * create_runtimeappend_path(PlannerInfo *root, + AppendPath *inner_append, ParamPathInfo *param_info, double sel); @@ -69,7 +77,9 @@ Plan * create_runtimeappend_plan(PlannerInfo *root, RelOptInfo *rel, Node * runtimeappend_create_scan_state(CustomScan *node); -void runtimeappend_begin(CustomScanState *node, EState *estate, int eflags); +void runtimeappend_begin(CustomScanState *node, + EState *estate, + int eflags); TupleTableSlot * runtimeappend_exec(CustomScanState *node); @@ -77,6 +87,8 @@ void runtimeappend_end(CustomScanState *node); void runtimeappend_rescan(CustomScanState *node); -void runtimeappend_explain(CustomScanState *node, List *ancestors, ExplainState *es); +void runtimeappend_explain(CustomScanState *node, + List *ancestors, + ExplainState *es); #endif diff --git a/contrib/pg_pathman/src/utils.c b/contrib/pg_pathman/src/utils.c index fec54bb1fe..b22c1fd0de 100644 --- a/contrib/pg_pathman/src/utils.c +++ b/contrib/pg_pathman/src/utils.c @@ -7,59 +7,45 @@ * * ------------------------------------------------------------------------ */ + #include "utils.h" -#include "nodes/nodeFuncs.h" -#include "nodes/makefuncs.h" + +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "access/sysattr.h" +#include "access/xact.h" +#include "catalog/heap.h" +#include "catalog/pg_type.h" +#include "catalog/pg_extension.h" +#include "catalog/pg_proc.h" +#include "commands/extension.h" +#include "miscadmin.h" #include "optimizer/var.h" #include "optimizer/restrictinfo.h" -#include "parser/parse_param.h" +#include "parser/parse_oper.h" #include "utils/builtins.h" -#include "utils/memutils.h" -#include "rewrite/rewriteManip.h" 
-#include "catalog/heap.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" +#include "utils/typcache.h" -static bool clause_contains_params_walker(Node *node, void *context); -static void change_varnos_in_restrinct_info(RestrictInfo *rinfo, change_varno_context *context); -static bool change_varno_walker(Node *node, change_varno_context *context); +#define TABLEOID_STR(subst) ( "pathman_tableoid" subst ) +#define TABLEOID_STR_BASE_LEN ( sizeof(TABLEOID_STR("")) - 1 ) -/* - * Execute 'cb_proc' on 'xact_context' reset. - */ -void -execute_on_xact_mcxt_reset(MemoryContext xact_context, - MemoryContextCallbackFunction cb_proc, - void *arg) -{ - MemoryContextCallback *mcxt_cb = MemoryContextAlloc(xact_context, - sizeof(MemoryContextCallback)); - - /* Initialize MemoryContextCallback */ - mcxt_cb->arg = arg; - mcxt_cb->func = cb_proc; - mcxt_cb->next = NULL; - - MemoryContextRegisterResetCallback(xact_context, mcxt_cb); -} +static bool clause_contains_params_walker(Node *node, void *context); +static void change_varnos_in_restrinct_info(RestrictInfo *rinfo, + change_varno_context *context); +static bool change_varno_walker(Node *node, change_varno_context *context); +static List *get_tableoids_list(List *tlist); +static void lock_rows_visitor(Plan *plan, void *context); +static bool rowmark_add_tableoids_walker(Node *node, void *context); /* - * Returns the same list in reversed order. + * Check whether clause contains PARAMs or not */ -List * -list_reverse(List *l) -{ - List *result = NIL; - ListCell *lc; - - foreach (lc, l) - { - result = lcons(lfirst(lc), result); - } - return result; -} - bool clause_contains_params(Node *clause) { @@ -80,7 +66,99 @@ clause_contains_params_walker(Node *node, void *context) context); } -/* NOTE: Used for debug */ +/* + * Extract target entries with resnames beginning with TABLEOID_STR + * and var->varoattno == TableOidAttributeNumber + */ +static List * +get_tableoids_list(List *tlist) +{ + List *result = NIL; + ListCell *lc; + + foreach (lc, tlist) + { + TargetEntry *te = (TargetEntry *) lfirst(lc); + Var *var = (Var *) te->expr; + + if (!IsA(var, Var)) + continue; + + /* Check that column name begins with TABLEOID_STR & it's tableoid */ + if (var->varoattno == TableOidAttributeNumber && + (te->resname && strlen(te->resname) > TABLEOID_STR_BASE_LEN) && + 0 == strncmp(te->resname, TABLEOID_STR(""), TABLEOID_STR_BASE_LEN)) + { + result = lappend(result, te); + } + } + + return result; +} + +/* + * Find 'TABLEOID_STR%u' attributes that were manually + * created for partitioned tables and replace Oids + * (used for '%u') with expected rc->rowmarkIds + */ +static void +lock_rows_visitor(Plan *plan, void *context) +{ + List *rtable = (List *) context; + LockRows *lock_rows = (LockRows *) plan; + Plan *lock_child = outerPlan(plan); + List *tableoids; + ListCell *lc; + + if (!IsA(lock_rows, LockRows)) + return; + + Assert(rtable && IsA(rtable, List) && lock_child); + + /* Select tableoid attributes that must be renamed */ + tableoids = get_tableoids_list(lock_child->targetlist); + if (!tableoids) + return; /* this LockRows has nothing to do with partitioned table */ + + foreach (lc, lock_rows->rowMarks) + { + PlanRowMark *rc = (PlanRowMark *) lfirst(lc); + Oid parent_oid = getrelid(rc->rti, rtable); + ListCell *mark_lc; + List *finished_tes = NIL; /* postprocessed target entries */ + + foreach (mark_lc, tableoids) + { + TargetEntry *te = (TargetEntry *) lfirst(mark_lc); + const char *cur_oid_str = 
&(te->resname[TABLEOID_STR_BASE_LEN]); + Datum cur_oid_datum; + + cur_oid_datum = DirectFunctionCall1(oidin, CStringGetDatum(cur_oid_str)); + + if (DatumGetObjectId(cur_oid_datum) == parent_oid) + { + char resname[64]; + + /* Replace 'TABLEOID_STR:Oid' with 'tableoid:rowmarkId' */ + snprintf(resname, sizeof(resname), "tableoid%u", rc->rowmarkId); + te->resname = pstrdup(resname); + + finished_tes = lappend(finished_tes, te); + } + } + + /* Remove target entries that have been processed in this step */ + foreach (mark_lc, finished_tes) + tableoids = list_delete_ptr(tableoids, lfirst(mark_lc)); + + if (list_length(tableoids) == 0) + break; /* nothing to do */ + } +} + +/* + * Print Bitmapset as cstring. + */ #ifdef __GNUC__ __attribute__((unused)) #endif @@ -98,6 +176,11 @@ bms_print(Bitmapset *bms) return str.data; } +/* + * Copied from util/plancat.c + * + * Build a targetlist representing the columns of the specified index. + */ List * build_index_tlist(PlannerInfo *root, IndexOptInfo *index, Relation heapRelation) @@ -153,33 +236,41 @@ build_index_tlist(PlannerInfo *root, IndexOptInfo *index, } /* - * We should ensure that 'rel->baserestrictinfo' or 'ppi->ppi_clauses' contain - * Var which corresponds to partition attribute before creating RuntimeXXX - * paths since they are used by create_scan_plan() to form 'scan_clauses' - * that are passed to create_customscan_plan(). + * Get BTORDER_PROC for two types described by Oids */ -bool -check_rinfo_for_partitioned_attr(List *rinfo, Index varno, AttrNumber varattno) +void +fill_type_cmp_fmgr_info(FmgrInfo *finfo, Oid type1, Oid type2) { - List *vars; - List *clauses; - ListCell *lc; + Oid cmp_proc_oid; + TypeCacheEntry *tce; - clauses = get_actual_clauses(rinfo); + tce = lookup_type_cache(type1, TYPECACHE_BTREE_OPFAMILY); - vars = pull_var_clause((Node *) clauses, - PVC_REJECT_AGGREGATES, - PVC_REJECT_PLACEHOLDERS); + cmp_proc_oid = get_opfamily_proc(tce->btree_opf, + type1, + type2, + BTORDER_PROC); - foreach (lc, vars) - { - Var *var = (Var *) lfirst(lc); + if (cmp_proc_oid == InvalidOid) + elog(ERROR, "missing comparison function for types %s & %s", + format_type_be(type1), format_type_be(type2)); - if (var->varno == varno && var->varoattno == varattno) - return true; - } + fmgr_info(cmp_proc_oid, finfo); + + return; +} + +List * +list_reverse(List *l) +{ + List *result = NIL; + ListCell *lc; - return false; + foreach (lc, l) + { + result = lcons(lfirst(lc), result); + } + return result; } /* @@ -282,7 +373,6 @@ change_varnos_in_restrinct_info(RestrictInfo *rinfo, change_varno_context *conte change_varno_walker(node, context); } - /* TODO: find some elegant way to do this */ if (bms_is_member(context->old_varno, rinfo->clause_relids)) { rinfo->clause_relids = bms_del_member(rinfo->clause_relids, context->old_varno); @@ -299,3 +389,340 @@ change_varnos_in_restrinct_info(RestrictInfo *rinfo, change_varno_context *conte rinfo->right_relids = bms_add_member(rinfo->right_relids, context->new_varno); } } + +/* + * Basic plan tree walker + * + * 'visitor' is applied right before return + */ +void +plan_tree_walker(Plan *plan, + void (*visitor) (Plan *plan, void *context), + void *context) +{ + ListCell *l; + + if (plan == NULL) + return; + + check_stack_depth(); + + /* Plan-type-specific fixes */ + switch (nodeTag(plan)) + { + case T_SubqueryScan: + plan_tree_walker(((SubqueryScan *) plan)->subplan, visitor, context); + break; + + case T_CustomScan: + foreach(l, ((CustomScan *) plan)->custom_plans) + plan_tree_walker((Plan *) lfirst(l), visitor, 
context); + break; + + case T_ModifyTable: + foreach (l, ((ModifyTable *) plan)->plans) + plan_tree_walker((Plan *) lfirst(l), visitor, context); + break; + + /* Since they look alike */ + case T_MergeAppend: + case T_Append: + foreach(l, ((Append *) plan)->appendplans) + plan_tree_walker((Plan *) lfirst(l), visitor, context); + break; + + case T_BitmapAnd: + foreach(l, ((BitmapAnd *) plan)->bitmapplans) + plan_tree_walker((Plan *) lfirst(l), visitor, context); + break; + + case T_BitmapOr: + foreach(l, ((BitmapOr *) plan)->bitmapplans) + plan_tree_walker((Plan *) lfirst(l), visitor, context); + break; + + default: + break; + } + + plan_tree_walker(plan->lefttree, visitor, context); + plan_tree_walker(plan->righttree, visitor, context); + + /* Apply visitor to the current node */ + visitor(plan, context); +} + +static bool +rowmark_add_tableoids_walker(Node *node, void *context) +{ + if (node == NULL) + return false; + + if (IsA(node, Query)) + { + Query *parse = (Query *) node; + ListCell *lc; + + /* Generate 'tableoid' for partitioned table rowmark */ + foreach (lc, parse->rowMarks) + { + RowMarkClause *rc = (RowMarkClause *) lfirst(lc); + Oid parent = getrelid(rc->rti, parse->rtable); + Var *var; + TargetEntry *tle; + char resname[64]; + + /* Check that table is partitioned */ + if (!get_pathman_relation_info(parent)) + continue; + + var = makeVar(rc->rti, + TableOidAttributeNumber, + OIDOID, + -1, + InvalidOid, + 0); + + /* Use parent's Oid as TABLEOID_STR's key (%u) */ + snprintf(resname, sizeof(resname), TABLEOID_STR("%u"), parent); + + tle = makeTargetEntry((Expr *) var, + list_length(parse->targetList) + 1, + pstrdup(resname), + true); + + /* There's no problem here since new attribute is junk */ + parse->targetList = lappend(parse->targetList, tle); + } + + return query_tree_walker((Query *) node, + rowmark_add_tableoids_walker, + NULL, 0); + } + + return expression_tree_walker(node, rowmark_add_tableoids_walker, NULL); +} + +/* + * Add missing 'TABLEOID_STR%u' junk attributes for inherited partitions + * + * This is necessary since preprocess_targetlist() heavily + * depends on the 'inh' flag which we have to unset. + * + * postprocess_lock_rows() will later transform 'TABLEOID_STR:Oid' + * relnames into 'tableoid:rowmarkId'. + */ +void +rowmark_add_tableoids(Query *parse) +{ + rowmark_add_tableoids_walker((Node *) parse, NULL); +} + +/* + * Final rowmark processing for partitioned tables + */ +void +postprocess_lock_rows(List *rtable, Plan *plan) +{ + plan_tree_walker(plan, lock_rows_visitor, rtable); +} + +/* + * Returns pg_pathman schema's Oid or InvalidOid if that's not possible. 
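+ * (i.e. when there is no active transaction state or the extension is not installed).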
+ */
+Oid
+get_pathman_schema(void)
+{
+	Oid				result;
+	Relation		rel;
+	SysScanDesc		scandesc;
+	HeapTuple		tuple;
+	ScanKeyData		entry[1];
+	Oid				ext_schema;
+
+	/* It's impossible to fetch pg_pathman's schema now */
+	if (!IsTransactionState())
+		return InvalidOid;
+
+	ext_schema = get_extension_oid("pg_pathman", true);
+	if (ext_schema == InvalidOid)
+		return InvalidOid; /* exit if pg_pathman does not exist */
+
+	ScanKeyInit(&entry[0],
+				ObjectIdAttributeNumber,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(ext_schema));
+
+	rel = heap_open(ExtensionRelationId, AccessShareLock);
+	scandesc = systable_beginscan(rel, ExtensionOidIndexId, true,
+								  NULL, 1, entry);
+
+	tuple = systable_getnext(scandesc);
+
+	/* We assume that there can be at most one matching tuple */
+	if (HeapTupleIsValid(tuple))
+		result = ((Form_pg_extension) GETSTRUCT(tuple))->extnamespace;
+	else
+		result = InvalidOid;
+
+	systable_endscan(scandesc);
+
+	heap_close(rel, AccessShareLock);
+
+	return result;
+}
+
+/*
+ * Check if this is a "date"-related type.
+ */
+bool
+is_date_type_internal(Oid typid)
+{
+	return typid == TIMESTAMPOID ||
+		   typid == TIMESTAMPTZOID ||
+		   typid == DATEOID;
+}
+
+/*
+ * Check if this is a string type.
+ */
+bool
+is_string_type_internal(Oid typid)
+{
+	return typid == TEXTOID ||
+		   typid == CSTRINGOID;
+}
+
+
+/*
+ * Try to find binary operator.
+ *
+ * Returns operator function's Oid or throws an ERROR on InvalidOid.
+ */
+Oid
+get_binary_operator_oid(char *oprname, Oid arg1, Oid arg2)
+{
+	Oid			funcid = InvalidOid;
+	Operator	op;
+
+	op = oper(NULL, list_make1(makeString(oprname)), arg1, arg2, true, -1);
+	if (op)
+	{
+		funcid = oprfuncid(op);
+		ReleaseSysCache(op);
+	}
+	else
+		elog(ERROR, "Cannot find operator \"%s\"(%u, %u)", oprname, arg1, arg2);
+
+	return funcid;
+}
+
+/*
+ * Get CSTRING representation of Datum using the type Oid.
+ */
+char *
+datum_to_cstring(Datum datum, Oid typid)
+{
+	char	   *result;
+	HeapTuple	tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid));
+
+	if (HeapTupleIsValid(tup))
+	{
+		Form_pg_type typtup = (Form_pg_type) GETSTRUCT(tup);
+		result = OidOutputFunctionCall(typtup->typoutput, datum);
+		ReleaseSysCache(tup);
+	}
+	else
+		result = pstrdup("[error]");
+
+	return result;
+}
+
+/*
+ * Try to get relname or at least relid as cstring.
+ */
+char *
+get_rel_name_or_relid(Oid relid)
+{
+	char *relname = get_rel_name(relid);
+
+	if (!relname)
+		return DatumGetCString(DirectFunctionCall1(oidout,
+												   ObjectIdGetDatum(relid)));
+	return relname;
+}
+
+/*
+ * Try to get opname or at least opid as cstring.
+ */
+char *
+get_op_name_or_opid(Oid opid)
+{
+	char *opname = get_opname(opid);
+
+	if (!opname)
+		return DatumGetCString(DirectFunctionCall1(oidout,
+												   ObjectIdGetDatum(opid)));
+	return opname;
+}
+
+
+#if PG_VERSION_NUM < 90600
+/*
+ * Returns the relpersistence associated with a given relation.
+ *
+ * NOTE: this function is implemented in 9.6
+ */
+char
+get_rel_persistence(Oid relid)
+{
+	HeapTuple	tp;
+	Form_pg_class reltup;
+	char		result;
+
+	tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid));
+	if (!HeapTupleIsValid(tp))
+		elog(ERROR, "cache lookup failed for relation %u", relid);
+
+	reltup = (Form_pg_class) GETSTRUCT(tp);
+	result = reltup->relpersistence;
+	ReleaseSysCache(tp);
+
+	return result;
+}
+#endif
+
+/*
+ * Checks that callback function meets specific requirements.
+ * It must take a single JSONB argument and return VOID.
+ */ +bool +validate_on_part_init_cb(Oid procid, bool emit_error) +{ + HeapTuple tp; + Form_pg_proc functup; + bool is_ok = true; + + if (procid == InvalidOid) + return true; + + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(procid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for function %u", procid); + + functup = (Form_pg_proc) GETSTRUCT(tp); + + if (functup->pronargs != 1 || + functup->proargtypes.values[0] != JSONBOID || + functup->prorettype != VOIDOID) + is_ok = false; + + ReleaseSysCache(tp); + + if (emit_error && !is_ok) + elog(ERROR, + "Callback function must have the following signature: " + "callback(arg JSONB) RETURNS VOID"); + + return is_ok; +} diff --git a/contrib/pg_pathman/src/utils.h b/contrib/pg_pathman/src/utils.h index 74b87d7b50..1b2af2ee56 100644 --- a/contrib/pg_pathman/src/utils.h +++ b/contrib/pg_pathman/src/utils.h @@ -7,8 +7,11 @@ * * ------------------------------------------------------------------------ */ -#ifndef UTILS_H -#define UTILS_H + +#ifndef PATHMAN_UTILS_H +#define PATHMAN_UTILS_H + +#include "pathman.h" #include "postgres.h" #include "utils/rel.h" @@ -18,33 +21,56 @@ typedef struct { - RelOptInfo *child; - RelOptInfo *parent; - int sublevels_up; -} ReplaceVarsContext; - -typedef struct -{ - Oid old_varno; - Oid new_varno; + Oid old_varno; + Oid new_varno; } change_varno_context; -void execute_on_xact_mcxt_reset(MemoryContext xact_context, - MemoryContextCallbackFunction cb_proc, - void *arg); +/* + * Plan tree modification. + */ +void plan_tree_walker(Plan *plan, + void (*visitor) (Plan *plan, void *context), + void *context); +List * build_index_tlist(PlannerInfo *root, + IndexOptInfo *index, + Relation heapRelation); +void change_varnos(Node *node, Oid old_varno, Oid new_varno); -List * list_reverse(List *l); +/* + * Rowmark processing. + */ +void rowmark_add_tableoids(Query *parse); +void postprocess_lock_rows(List *rtable, Plan *plan); +/* + * Various traits. + */ bool clause_contains_params(Node *clause); +bool is_date_type_internal(Oid typid); +bool is_string_type_internal(Oid typid); +bool validate_on_part_init_cb(Oid procid, bool emit_error); -List * build_index_tlist(PlannerInfo *root, IndexOptInfo *index, - Relation heapRelation); +/* + * Misc. + */ +Oid get_pathman_schema(void); +List * list_reverse(List *l); -bool check_rinfo_for_partitioned_attr(List *rinfo, - Index varno, - AttrNumber varattno); +#if PG_VERSION_NUM < 90600 +char get_rel_persistence(Oid relid); +#endif -void change_varnos(Node *node, Oid old_varno, Oid new_varno); +/* + * Handy execution-stage functions. + */ +char * get_rel_name_or_relid(Oid relid); +char * get_op_name_or_opid(Oid opid); + +Oid get_binary_operator_oid(char *opname, Oid arg1, Oid arg2); +void fill_type_cmp_fmgr_info(FmgrInfo *finfo, + Oid type1, + Oid type2); +char * datum_to_cstring(Datum datum, Oid typid); #endif diff --git a/contrib/pg_pathman/src/worker.c b/contrib/pg_pathman/src/worker.c deleted file mode 100644 index 0edf2d5c4b..0000000000 --- a/contrib/pg_pathman/src/worker.c +++ /dev/null @@ -1,216 +0,0 @@ -#include "pathman.h" -#include "miscadmin.h" -#include "postmaster/bgworker.h" -#include "catalog/pg_type.h" -#include "executor/spi.h" -#include "storage/dsm.h" -#include "access/xact.h" -#include "utils/snapmgr.h" -#include "utils/typcache.h" - -/*------------------------------------------------------------------------- - * - * worker.c - * - * The purpose of this module is to create partitions in a separate - * transaction. 
To do so we create a separate background worker, - * pass arguments to it (see PartitionArgs) and gather the result - * (which is the new partition oid). - * - *------------------------------------------------------------------------- - */ - -static dsm_segment *segment; - -static void bg_worker_main(Datum main_arg); - -typedef struct PartitionArgs -{ - Oid dbid; - Oid relid; - #ifdef HAVE_INT64_TIMESTAMP - int64 value; - #else - double value; - #endif - Oid value_type; - bool by_val; - Oid result; - bool crashed; -} PartitionArgs; - -/* - * Starts background worker that will create new partitions, - * waits till it finishes the job and returns the result (new partition oid) - */ -Oid -create_partitions_bg_worker(Oid relid, Datum value, Oid value_type, bool *crashed) -{ - BackgroundWorker worker; - BackgroundWorkerHandle *worker_handle; - BgwHandleStatus status; - dsm_segment *segment; - dsm_handle segment_handle; - pid_t pid; - PartitionArgs *args; - Oid child_oid; - TypeCacheEntry *tce; - - /* Create a dsm segment for the worker to pass arguments */ - segment = dsm_create(sizeof(PartitionArgs), 0); - segment_handle = dsm_segment_handle(segment); - - tce = lookup_type_cache(value_type, 0); - - /* Fill arguments structure */ - args = (PartitionArgs *) dsm_segment_address(segment); - args->dbid = MyDatabaseId; - args->relid = relid; - if (tce->typbyval) - args->value = value; - else - memcpy(&args->value, DatumGetPointer(value), sizeof(args->value)); - args->by_val = tce->typbyval; - args->value_type = value_type; - args->result = 0; - - /* Initialize worker struct */ - worker.bgw_flags = BGWORKER_SHMEM_ACCESS | - BGWORKER_BACKEND_DATABASE_CONNECTION; - worker.bgw_start_time = BgWorkerStart_RecoveryFinished; - worker.bgw_restart_time = BGW_NEVER_RESTART; - worker.bgw_main = bg_worker_main; - worker.bgw_main_arg = Int32GetDatum(segment_handle); - worker.bgw_notify_pid = MyProcPid; - - /* Start dynamic worker */ - if (!RegisterDynamicBackgroundWorker(&worker, &worker_handle)) - { - elog(WARNING, "Unable to create background worker for pg_pathman"); - } - - status = WaitForBackgroundWorkerStartup(worker_handle, &pid); - if (status == BGWH_POSTMASTER_DIED) - { - ereport(WARNING, - (errmsg("Postmaster died during the pg_pathman background worker process"), - errhint("More details may be available in the server log."))); - } - - /* Wait till the worker finishes its job */ - status = WaitForBackgroundWorkerShutdown(worker_handle); - if (status == BGWH_POSTMASTER_DIED) - { - ereport(WARNING, - (errmsg("Postmaster died during the pg_pathman background worker process"), - errhint("More details may be available in the server log."))); - } - *crashed = args->crashed; - child_oid = args->result; - - /* Free dsm segment */ - dsm_detach(segment); - - return child_oid; -} - -/* - * Main worker routine. 
Accepts dsm_handle as an argument - */ -static void -bg_worker_main(Datum main_arg) -{ - PartitionArgs *args; - dsm_handle handle = DatumGetInt32(main_arg); - - /* Create resource owner */ - CurrentResourceOwner = ResourceOwnerCreate(NULL, "CreatePartitionsWorker"); - - /* Attach to dynamic shared memory */ - if (!handle) - { - ereport(WARNING, - (errmsg("pg_pathman worker: invalid dsm_handle"))); - } - segment = dsm_attach(handle); - args = dsm_segment_address(segment); - - /* Establish connection and start transaction */ - BackgroundWorkerInitializeConnectionByOid(args->dbid, InvalidOid); - StartTransactionCommand(); - SPI_connect(); - PushActiveSnapshot(GetTransactionSnapshot()); - - /* Create partitions */ - args->result = create_partitions(args->relid, PATHMAN_GET_DATUM(args->value, args->by_val), args->value_type, &args->crashed); - - /* Cleanup */ - SPI_finish(); - PopActiveSnapshot(); - CommitTransactionCommand(); - - dsm_detach(segment); -} - -/* - * Create partitions and return an OID of the partition that contain value - */ -Oid -create_partitions(Oid relid, Datum value, Oid value_type, bool *crashed) -{ - int ret; - RangeEntry *ranges; - Datum vals[2]; - Oid oids[] = {OIDOID, value_type}; - bool nulls[] = {false, false}; - char *sql; - bool found; - int pos; - PartRelationInfo *prel; - RangeRelation *rangerel; - FmgrInfo cmp_func; - char *schema; - - *crashed = false; - schema = get_extension_schema(); - - prel = get_pathman_relation_info(relid, NULL); - rangerel = get_pathman_range_relation(relid, NULL); - - /* Comparison function */ - cmp_func = *get_cmp_func(value_type, prel->atttype); - - vals[0] = ObjectIdGetDatum(relid); - vals[1] = value; - - /* Perform PL procedure */ - sql = psprintf("SELECT %s.append_partitions_on_demand_internal($1, $2)", - schema); - PG_TRY(); - { - ret = SPI_execute_with_args(sql, 2, oids, vals, nulls, false, 0); - if (ret > 0) - { - /* Update relation info */ - free_dsm_array(&rangerel->ranges); - free_dsm_array(&prel->children); - load_partitions(relid, GetCatalogSnapshot(relid)); - } - } - PG_CATCH(); - { - elog(WARNING, "Attempt to create new partitions failed"); - if (crashed != NULL) - *crashed = true; - return 0; - } - PG_END_TRY(); - - /* Repeat binary search */ - ranges = dsm_array_get_pointer(&rangerel->ranges); - pos = range_binary_search(rangerel, &cmp_func, value, &found); - if (found) - return ranges[pos].child_oid; - - return 0; -} diff --git a/contrib/pg_pathman/src/xact_handling.c b/contrib/pg_pathman/src/xact_handling.c new file mode 100644 index 0000000000..44d9195bd0 --- /dev/null +++ b/contrib/pg_pathman/src/xact_handling.c @@ -0,0 +1,190 @@ +/* ------------------------------------------------------------------------ + * + * xact_handling.c + * Transaction-specific locks and other functions + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#include "xact_handling.h" + +#include "postgres.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "miscadmin.h" +#include "storage/lmgr.h" + + +static inline void SetLocktagRelationOid(LOCKTAG *tag, Oid relid); +static inline bool do_we_hold_the_lock(Oid relid, LOCKMODE lockmode); + + +/* + * Lock certain partitioned relation to disable concurrent access. 
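+ * (uses ShareUpdateExclusiveLock, which conflicts with itself but does not block ordinary reads and writes).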
+ */ +bool +xact_lock_partitioned_rel(Oid relid, bool nowait) +{ + if (nowait) + { + if (ConditionalLockRelationOid(relid, ShareUpdateExclusiveLock)) + return true; + return false; + } + else + LockRelationOid(relid, ShareUpdateExclusiveLock); + + return true; +} + +/* + * Unlock partitioned relation. + */ +void +xact_unlock_partitioned_rel(Oid relid) +{ + UnlockRelationOid(relid, ShareUpdateExclusiveLock); +} + +/* + * Lock relation exclusively (SELECTs are possible). + */ +bool +xact_lock_rel_exclusive(Oid relid, bool nowait) +{ + if (nowait) + { + if (ConditionalLockRelationOid(relid, ExclusiveLock)) + return true; + return false; + } + else + LockRelationOid(relid, ExclusiveLock); + + return true; +} + +/* + * Unlock relation (exclusive lock). + */ +void +xact_unlock_rel_exclusive(Oid relid) +{ + UnlockRelationOid(relid, ExclusiveLock); +} + +/* + * Check whether we already hold a lock that + * might conflict with partition spawning BGW. + */ +bool +xact_bgw_conflicting_lock_exists(Oid relid) +{ + LOCKMODE lockmode; + + /* Try each lock >= ShareUpdateExclusiveLock */ + for (lockmode = ShareUpdateExclusiveLock; + lockmode <= AccessExclusiveLock; + lockmode++) + { + if (do_we_hold_the_lock(relid, lockmode)) + return true; + } + + return false; +} + + +/* + * Check if current transaction's level is READ COMMITTED. + */ +bool +xact_is_level_read_committed(void) +{ + if (XactIsoLevel <= XACT_READ_COMMITTED) + return true; + + return false; +} + +/* + * Check if 'stmt' is BEGIN\ROLLBACK etc transaction statement. + */ +bool +xact_is_transaction_stmt(Node *stmt) +{ + if (!stmt) + return false; + + if (IsA(stmt, TransactionStmt)) + return true; + + return false; +} + +/* + * Check if 'stmt' is SET TRANSACTION statement. + */ +bool +xact_is_set_transaction_stmt(Node *stmt) +{ + if (!stmt) + return false; + + if (IsA(stmt, VariableSetStmt)) + { + VariableSetStmt *var_set_stmt = (VariableSetStmt *) stmt; + + /* special case for SET TRANSACTION ... */ + if (var_set_stmt->kind == VAR_SET_MULTI) + return true; + } + + return false; +} + +/* + * Do we hold the specified lock? + */ +static inline bool +do_we_hold_the_lock(Oid relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + /* Create a tag for lock */ + SetLocktagRelationOid(&tag, relid); + + /* If lock is alredy held, release it one time (decrement) */ + switch (LockAcquire(&tag, lockmode, false, true)) + { + case LOCKACQUIRE_ALREADY_HELD: + LockRelease(&tag, lockmode, false); + return true; + + case LOCKACQUIRE_OK: + LockRelease(&tag, lockmode, false); + return false; + + default: + return false; + } +} + +/* + * SetLocktagRelationOid + * Set up a locktag for a relation, given only relation OID + */ +static inline void +SetLocktagRelationOid(LOCKTAG *tag, Oid relid) +{ + Oid dbid; + + if (IsSharedRelation(relid)) + dbid = InvalidOid; + else + dbid = MyDatabaseId; + + SET_LOCKTAG_RELATION(*tag, dbid, relid); +} diff --git a/contrib/pg_pathman/src/xact_handling.h b/contrib/pg_pathman/src/xact_handling.h new file mode 100644 index 0000000000..b5f8ed3c6c --- /dev/null +++ b/contrib/pg_pathman/src/xact_handling.h @@ -0,0 +1,36 @@ +/* ------------------------------------------------------------------------ + * + * xact_handling.h + * Transaction-specific locks and other functions + * + * Copyright (c) 2016, Postgres Professional + * + * ------------------------------------------------------------------------ + */ + +#ifndef XACT_HANDLING_H +#define XACT_HANDLING_H + +#include "pathman.h" + +#include "postgres.h" + + +/* + * Transaction locks. 
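+ * (see xact_handling.c: the partitioned relation is locked with
+ * ShareUpdateExclusiveLock, the "exclusive" variants use ExclusiveLock).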
+ */
+bool xact_lock_partitioned_rel(Oid relid, bool nowait);
+void xact_unlock_partitioned_rel(Oid relid);
+
+bool xact_lock_rel_exclusive(Oid relid, bool nowait);
+void xact_unlock_rel_exclusive(Oid relid);
+
+/*
+ * Utility checks.
+ */
+bool xact_bgw_conflicting_lock_exists(Oid relid);
+bool xact_is_level_read_committed(void);
+bool xact_is_transaction_stmt(Node *stmt);
+bool xact_is_set_transaction_stmt(Node *stmt);
+
+#endif
diff --git a/contrib/pg_pathman/tests/README.md b/contrib/pg_pathman/tests/README.md
new file mode 100644
index 0000000000..8d07cc445b
--- /dev/null
+++ b/contrib/pg_pathman/tests/README.md
@@ -0,0 +1,34 @@
+# Tests
+
+This directory contains scripts to test some features which cannot be tested
+with regression tests alone.
+
+## Running
+
+First of all you need to install the `testgres` python module, which contains
+useful functions to start postgres clusters and run queries:
+
+```
+pip install testgres
+```
+
+To run the tests, execute:
+
+```
+python -m unittest partitioning_test
+```
+
+from the current directory. If you want to run the tests against a specific
+postgres build, specify the path to your pg_config executable by setting the
+PG_CONFIG environment variable:
+
+```
+export PG_CONFIG=/path/to/pg_config
+```
+
+To test FDW features you need to install the postgres_fdw contrib module first.
+If you want to skip the FDW tests, set the FDW_DISABLED environment variable:
+
+```
+export FDW_DISABLED=1
+```
diff --git a/contrib/pg_pathman/tests/__init__.py b/contrib/pg_pathman/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/contrib/pg_pathman/tests/partitioning_test.py b/contrib/pg_pathman/tests/partitioning_test.py
new file mode 100644
index 0000000000..c71c9b9ba5
--- /dev/null
+++ b/contrib/pg_pathman/tests/partitioning_test.py
@@ -0,0 +1,411 @@
+# coding: utf-8
+"""
+ partitioning_test.py
+        Tests concurrent partitioning worker with simultaneous update queries
+
+ Copyright (c) 2015-2016, Postgres Professional
+"""
+
+import unittest
+from testgres import get_new_node, stop_all
+import time
+import os
+
+
+def if_fdw_enabled(func):
+    """Skip the decorated test if the FDW_DISABLED environment variable is set to 1"""
+    def wrapper(*args, **kwargs):
+        if os.environ.get('FDW_DISABLED') != '1':
+            func(*args, **kwargs)
+        else:
+            print('Warning: FDW features tests are disabled, skipping...')
+    return wrapper
+
+
+class PartitioningTests(unittest.TestCase):
+
+    def setUp(self):
+        self.setup_cmd = [
+            # 'create extension pg_pathman',
+            'create table abc(id serial, t text)',
+            'insert into abc select generate_series(1, 300000)',
+            'select create_hash_partitions(\'abc\', \'id\', 3, partition_data := false)',
+        ]
+
+    def tearDown(self):
+        stop_all()
+        # clean_all()
+
+    def start_new_pathman_cluster(self, name='test', allows_streaming=False):
+        node = get_new_node(name)
+        node.init(allows_streaming=allows_streaming)
+        node.append_conf(
+            'postgresql.conf',
+            'shared_preload_libraries=\'pg_pathman\'\n')
+        node.start()
+        node.psql('postgres', 'create extension pg_pathman')
+        return node
+
+    def init_test_data(self, node):
+        """Initialize pg_pathman extension and test data"""
+        for cmd in self.setup_cmd:
+            node.safe_psql('postgres', cmd)
+
+    def catchup_replica(self, master, replica):
+        """Wait until replica synchronizes with master"""
+        master.poll_query_until(
+            'postgres',
+            'SELECT pg_current_xlog_location() <= replay_location '
+            'FROM pg_stat_replication WHERE application_name = \'%s\''
+            % replica.name)
+
+    def printlog(self, logfile):
+        with open(logfile, 'r') as log:
+            for line in log.readlines():
+                print(line)
+
+    def test_concurrent(self):
+        """Tests concurrent partitioning"""
+        try:
+            node = self.start_new_pathman_cluster()
+            self.init_test_data(node)
+
+            node.psql(
+                'postgres',
+                'select partition_table_concurrently(\'abc\')')
+
+            while True:
+                # update some rows to check for deadlocks
+                node.safe_psql(
+                    'postgres',
+                    '''
+                    update abc set t = 'test'
+                    where id in (select (random() * 300000)::int
+                    from generate_series(1, 3000))
+                    ''')
+
+                count = node.execute(
+                    'postgres',
+                    'select count(*) from pathman_concurrent_part_tasks')
+
+                # if there are no active workers then the work is done
+                if count[0][0] == 0:
+                    break
+                time.sleep(1)
+
+            data = node.execute('postgres', 'select count(*) from only abc')
+            self.assertEqual(data[0][0], 0)
+            data = node.execute('postgres', 'select count(*) from abc')
+            self.assertEqual(data[0][0], 300000)
+
+            node.stop()
+        except Exception, e:
+            self.printlog(node.logs_dir + '/postgresql.log')
+            raise e
+
+    def test_replication(self):
+        """Tests how pg_pathman works with replication"""
+        node = get_new_node('master')
+        replica = get_new_node('repl')
+
+        try:
+            # initialize master server
+            node = self.start_new_pathman_cluster(allows_streaming=True)
+            node.backup('my_backup')
+
+            # initialize replica from backup
+            replica.init_from_backup(node, 'my_backup', has_streaming=True)
+            replica.start()
+
+            # initialize pg_pathman extension and some test data
+            self.init_test_data(node)
+
+            # wait until replica catches up
+            self.catchup_replica(node, replica)
+
+            # check that results are equal
+            self.assertEqual(
+                node.psql('postgres', 'explain (costs off) select * from abc'),
+                replica.psql('postgres', 'explain (costs off) select * from abc')
+            )
+
+            # enable parent and see if it is enabled in replica
+            node.psql('postgres', 'select enable_parent(\'abc\')')
+
+            self.catchup_replica(node, replica)
+            self.assertEqual(
+                node.psql('postgres', 'explain (costs off) select * from abc'),
+                replica.psql('postgres', 'explain (costs off) select * from abc')
+            )
+            self.assertEqual(
+                node.psql('postgres', 'select * from abc'),
+                replica.psql('postgres', 'select * from abc')
+            )
+            self.assertEqual(
+                node.execute('postgres', 'select count(*) from abc')[0][0],
+                300000
+            )
+
+            # check that direct UPDATE in pathman_config_params invalidates
+            # cache
+            node.psql(
+                'postgres',
+                'update pathman_config_params set enable_parent = false')
+            self.catchup_replica(node, replica)
+            self.assertEqual(
+                node.psql('postgres', 'explain (costs off) select * from abc'),
+                replica.psql('postgres', 'explain (costs off) select * from abc')
+            )
+            self.assertEqual(
+                node.psql('postgres', 'select * from abc'),
+                replica.psql('postgres', 'select * from abc')
+            )
+            self.assertEqual(
+                node.execute('postgres', 'select count(*) from abc')[0][0],
+                0
+            )
+        except Exception, e:
+            self.printlog(node.logs_dir + '/postgresql.log')
+            self.printlog(replica.logs_dir + '/postgresql.log')
+            raise e
+
+    def test_locks(self):
+        """Test that a session trying to create new partitions waits for other
+        sessions if they are doing the same"""
+
+        import threading
+        import time
+
+        class Flag:
+            def __init__(self, value):
+                self.flag = value
+
+            def set(self, value):
+                self.flag = value
+
+            def get(self):
+                return self.flag
+
+        # There is one flag for each thread which shows whether the thread has
+        # done its work
+        flags = [Flag(False) for i in xrange(3)]
+
+        # All threads synchronize through this lock
+        lock = threading.Lock()
+
+        # Define thread function
+        def
add_partition(node, flag, query): + """ We expect that this query will wait until another session + commits or rolls back""" + node.safe_psql('postgres', query) + with lock: + flag.set(True) + + # Initialize master server + node = get_new_node('master') + + try: + node.init() + node.append_conf( + 'postgresql.conf', + 'shared_preload_libraries=\'pg_pathman\'\n') + node.start() + node.safe_psql( + 'postgres', + 'create extension pg_pathman; ' + + 'create table abc(id serial, t text); ' + + 'insert into abc select generate_series(1, 100000); ' + + 'select create_range_partitions(\'abc\', \'id\', 1, 50000);' + ) + + # Start transaction that will create partition + con = node.connect() + con.begin() + con.execute('select append_range_partition(\'abc\')') + + # Start threads that suppose to add new partitions and wait some + # time + query = [ + 'select prepend_range_partition(\'abc\')', + 'select append_range_partition(\'abc\')', + 'select add_range_partition(\'abc\', 500000, 550000)', + ] + threads = [] + for i in range(3): + thread = threading.Thread( + target=add_partition, + args=(node, flags[i], query[i])) + threads.append(thread) + thread.start() + time.sleep(3) + + # This threads should wait until current transaction finished + with lock: + for i in range(3): + self.assertEqual(flags[i].get(), False) + + # Commit transaction. Since then other sessions can create + # partitions + con.commit() + + # Now wait until each thread finishes + for thread in threads: + thread.join() + + # Check flags, it should be true which means that threads are + # finished + with lock: + for i in range(3): + self.assertEqual(flags[i].get(), True) + + # Check that all partitions are created + self.assertEqual( + node.safe_psql( + 'postgres', + 'select count(*) from pg_inherits where inhparent=\'abc\'::regclass' + ), + '6\n' + ) + except Exception, e: + self.printlog(node.logs_dir + '/postgresql.log') + raise e + + def test_tablespace(self): + """Check tablespace support""" + + def check_tablespace(node, tablename, tablespace): + res = node.execute( + 'postgres', + 'select get_rel_tablespace_name(\'{}\')'.format(tablename)) + if len(res) == 0: + return False + + return res[0][0] == tablespace + + node = get_new_node('master') + node.init() + node.append_conf( + 'postgresql.conf', + 'shared_preload_libraries=\'pg_pathman\'\n') + node.start() + node.psql('postgres', 'create extension pg_pathman') + + # create tablespace + path = os.path.join(node.data_dir, 'test_space_location') + os.mkdir(path) + node.psql( + 'postgres', + 'create tablespace test_space location \'{}\''.format(path)) + + # create table in this tablespace + node.psql( + 'postgres', + 'create table abc(a serial, b int) tablespace test_space') + + # create three partitions. 
Expect that they will be created in the
+        # same tablespace as the parent table
+        node.psql(
+            'postgres',
+            'select create_range_partitions(\'abc\', \'a\', 1, 10, 3)')
+        self.assertTrue(check_tablespace(node, 'abc', 'test_space'))
+
+        # check tablespace for appended partition
+        node.psql(
+            'postgres',
+            'select append_range_partition(\'abc\', \'abc_appended\')')
+        self.assertTrue(check_tablespace(node, 'abc_appended', 'test_space'))
+
+        # check tablespace for prepended partition
+        node.psql(
+            'postgres',
+            'select prepend_range_partition(\'abc\', \'abc_prepended\')')
+        self.assertTrue(check_tablespace(node, 'abc_prepended', 'test_space'))
+
+        # check tablespace for added partition
+        node.psql(
+            'postgres',
+            'select add_range_partition(\'abc\', 41, 51, \'abc_added\')')
+        self.assertTrue(check_tablespace(node, 'abc_added', 'test_space'))
+
+        # now let's specify tablespace explicitly
+        node.psql(
+            'postgres',
+            'select append_range_partition(\'abc\', \'abc_appended_2\', \'pg_default\')')
+        node.psql(
+            'postgres',
+            'select prepend_range_partition(\'abc\', \'abc_prepended_2\', \'pg_default\')')
+        node.psql(
+            'postgres',
+            'select add_range_partition(\'abc\', 61, 71, \'abc_added_2\', \'pg_default\')')
+        self.assertTrue(check_tablespace(node, 'abc_appended_2', 'pg_default'))
+        self.assertTrue(check_tablespace(node, 'abc_prepended_2', 'pg_default'))
+        self.assertTrue(check_tablespace(node, 'abc_added_2', 'pg_default'))
+
+    @if_fdw_enabled
+    def test_foreign_table(self):
+        """Test foreign tables"""
+
+        # Start master server
+        master = get_new_node('test')
+        master.init()
+        master.append_conf(
+            'postgresql.conf',
+            'shared_preload_libraries=\'pg_pathman, postgres_fdw\'\n')
+        master.start()
+        master.psql('postgres', 'create extension pg_pathman')
+        master.psql('postgres', 'create extension postgres_fdw')
+        master.psql(
+            'postgres',
+            '''create table abc(id serial, name text);
+            select create_range_partitions('abc', 'id', 0, 10, 2)''')
+
+        # Current user name (needed for user mapping)
+        username = master.execute('postgres', 'select current_user')[0][0]
+
+        # Start foreign server
+        fserv = get_new_node('fserv')
+        fserv.init().start()
+        fserv.safe_psql('postgres', 'create table ftable(id serial, name text)')
+        fserv.safe_psql('postgres', 'insert into ftable values (25, \'foreign\')')
+
+        # Create foreign table and attach it to partitioned table
+        master.safe_psql(
+            'postgres',
+            '''create server fserv
+            foreign data wrapper postgres_fdw
+            options (dbname 'postgres', host '127.0.0.1', port '{}')'''.format(fserv.port)
+        )
+        master.safe_psql(
+            'postgres',
+            '''create user mapping for {0}
+            server fserv
+            options (user '{0}')'''.format(username)
+        )
+        master.safe_psql(
+            'postgres',
+            '''import foreign schema public limit to (ftable)
+            from server fserv into public'''
+        )
+        master.safe_psql(
+            'postgres',
+            'select attach_range_partition(\'abc\', \'ftable\', 20, 30)')
+
+        # Check that the table is attached to the partitioned table
+        self.assertEqual(
+            master.safe_psql('postgres', 'select * from ftable'),
+            '25|foreign\n'
+        )
+
+        # Check that we can successfully insert new data into foreign partition
+        master.safe_psql('postgres', 'insert into abc values (26, \'part\')')
+        self.assertEqual(
+            master.safe_psql('postgres', 'select * from ftable order by id'),
+            '25|foreign\n26|part\n'
+        )
+
+        # Testing drop partitions (including foreign partitions)
+        master.safe_psql('postgres', 'select drop_partitions(\'abc\')')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git
a/contrib/pg_pathman/travis/pg-travis-test.sh b/contrib/pg_pathman/travis/pg-travis-test.sh index 1ad78bd2d4..44552ae35c 100644 --- a/contrib/pg_pathman/travis/pg-travis-test.sh +++ b/contrib/pg_pathman/travis/pg-travis-test.sh @@ -6,7 +6,8 @@ sudo apt-get update # required packages -packages="postgresql-$PGVERSION postgresql-server-dev-$PGVERSION postgresql-common" +apt_packages="postgresql-$PGVERSION postgresql-server-dev-$PGVERSION postgresql-common python-pip python-dev build-essential" +pip_packages="testgres" # exit code status=0 @@ -25,7 +26,7 @@ echo 'exit 0' | sudo tee /etc/init.d/postgresql sudo chmod a+x /etc/init.d/postgresql # install required packages -sudo apt-get -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" -y install -qq $packages +sudo apt-get -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" -y install -qq $apt_packages # create cluster 'test' sudo pg_createcluster --start $PGVERSION test -p 55435 -- -A trust @@ -73,4 +74,23 @@ PGPORT=55435 make installcheck USE_PGXS=1 PGUSER=postgres PG_CONFIG=$config_path # show diff if it exists if test -f regression.diffs; then cat regression.diffs; fi + +set +u + +# create a virtual environment and activate it +virtualenv /tmp/envs/pg_pathman +source /tmp/envs/pg_pathman/bin/activate + +# install pip packages +pip install $pip_packages + +# set permission to write postgres locks +sudo chmod a+w /var/run/postgresql/ + +# run python tests +cd tests +PG_CONFIG=$config_path python -m unittest partitioning_test || status=$? + +set -u + exit $status diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index d94739af51..9866a345b3 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -15347,6 +15347,21 @@ SET search_path TO schema , schema, .. boolean does current user have privilege for tablespace + + has_type_privilege(user, + type, + privilege) + + boolean + does user have privilege for type + + + has_type_privilege(type, + privilege) + + boolean + does current user have privilege for type + pg_has_role(user, role, @@ -15405,6 +15420,9 @@ SET search_path TO schema , schema, .. has_tablespace_privilege + + has_type_privilege + pg_has_role @@ -15559,6 +15577,18 @@ SELECT has_function_privilege('joeuser', 'myfunc(int, text)', 'execute'); CREATE. + + has_type_privilege checks whether a user + can access a type in a particular way. + Its argument possibilities + are analogous to has_table_privilege. + When specifying a type by a text string rather than by OID, + the allowed input is the same as for the regtype data type + (see ). + The desired access privilege type must evaluate to + USAGE. + + pg_has_role checks whether a user can access a role in a particular way. diff --git a/doc/src/sgml/pgpathman.sgml b/doc/src/sgml/pgpathman.sgml index 36f93c9a6d..78f4c689c8 100644 --- a/doc/src/sgml/pgpathman.sgml +++ b/doc/src/sgml/pgpathman.sgml @@ -94,12 +94,12 @@ WHERE id = 150 - LIST-patitioning; + LIST-partitioning; - HASH-patitioning by non integer attribtes. + HASH-partitioning by non-integer attributes. diff --git a/doc/src/sgml/problems.sgml b/doc/src/sgml/problems.sgml index 02d509956b..60b41f3ff2 100644 --- a/doc/src/sgml/problems.sgml +++ b/doc/src/sgml/problems.sgml @@ -295,15 +295,24 @@ Where to Report Bugs - In general, send bug reports to the bug report our support email - address at + In general, send bug reports to our support email address at bugs@postgrespro.ru. 
You are requested to use a descriptive subject for your email message, perhaps parts of the error message. - Do not send bug reports to any of the user mailing lists, such as + Do not send bug reports specific to Postgres Pro + to the PostgreSQL support email address, + as Postgres Pro is not supported by + the PostgreSQL community. + But you can send reports to bugs@postgresql.org + for any bugs related to PostgreSQL. + + + + Even if your bug is not specific to Postgres Pro, + do not send bug reports to any of the user mailing lists, such as pgsql-sql@postgresql.org or pgsql-general@postgresql.org. These mailing lists are for answering @@ -316,9 +325,10 @@ the developers' mailing list pgsql-hackers@postgresql.org. This list is for discussing the development of PostgreSQL, and it would be nice - if we could keep the bug reports separate. We might choose to take up a + if the community could keep the bug reports separate. + The community might choose to take up a discussion about your bug report on pgsql-hackers, - if the problem needs more review. + if the PostgreSQL-related problem needs more review. diff --git a/doc/src/sgml/sepgsql.sgml b/doc/src/sgml/sepgsql.sgml index 4758d21d28..c1221c56d6 100644 --- a/doc/src/sgml/sepgsql.sgml +++ b/doc/src/sgml/sepgsql.sgml @@ -753,7 +753,7 @@ ERROR: SELinux: security policy violation External Resources - SE-&productname; Introduction + SE-PostgreSQL Introduction This wiki page provides a brief overview, security design, architecture, diff --git a/doc/src/sgml/sr_plan.sgml b/doc/src/sgml/sr_plan.sgml index 34136d18b2..7437da5d4d 100644 --- a/doc/src/sgml/sr_plan.sgml +++ b/doc/src/sgml/sr_plan.sgml @@ -10,7 +10,7 @@ sr_plan is an extension which allows to save query execution plans and use these plans for all repetitions of same query, instead of - optimizing identical query again and again/ + optimizing identical query again and again. sr_plan looks like Oracle Outline system. 
It can be used to lock diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index e44d7d09e1..179bf125c5 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -2237,23 +2237,18 @@ get_object_namespace(const ObjectAddress *address) int read_objtype_from_string(const char *objtype) { - ObjectType type; int i; for (i = 0; i < lengthof(ObjectTypeMap); i++) { if (strcmp(ObjectTypeMap[i].tm_name, objtype) == 0) - { - type = ObjectTypeMap[i].tm_type; - break; - } + return ObjectTypeMap[i].tm_type; } - if (i >= lengthof(ObjectTypeMap)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unrecognized object type \"%s\"", objtype))); + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized object type \"%s\"", objtype))); - return type; + return -1; /* keep compiler quiet */ } /* diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index dd8d2e9ff3..936a7ccae5 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -20,6 +20,9 @@ #include #include #include +#ifdef HAVE_SYS_SELECT_H +#include +#endif #include "libpq/auth.h" #include "libpq/crypt.h" diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 3c862feb92..0440f4a1d4 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -28,6 +28,9 @@ #include #include #include +#ifdef HAVE_SYS_SELECT_H +#include +#endif #include "pgstat.h" diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index f16a63aade..5db878f9b4 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -4618,10 +4618,17 @@ SubPostmasterMain(int argc, char *argv[]) /* Setup essential subsystems (to ensure elog() behaves sanely) */ InitializeGUCOptions(); + /* Check we got appropriate args */ + if (argc < 3) + elog(FATAL, "invalid subpostmaster invocation"); + /* Read in the variables file */ memset(&port, 0, sizeof(Port)); read_backend_variables(argv[2], &port); + /* Close the postmaster's sockets (as soon as we know them) */ + ClosePostmasterPorts(strcmp(argv[1], "--forklog") == 0); + /* * Set reference point for stack-depth checking */ @@ -4639,15 +4646,21 @@ SubPostmasterMain(int argc, char *argv[]) errmsg("out of memory"))); #endif - /* Check we got appropriate args */ - if (argc < 3) - elog(FATAL, "invalid subpostmaster invocation"); - /* * If appropriate, physically re-attach to shared memory segment. We want * to do this before going any further to ensure that we can attach at the * same address the postmaster used. On the other hand, if we choose not * to re-attach, we may have other cleanup to do. + * + * If testing EXEC_BACKEND on Linux, you should run this as root before + * starting the postmaster: + * + * echo 0 >/proc/sys/kernel/randomize_va_space + * + * This prevents using randomized stack and code addresses that cause the + * child process's memory map to be different from the parent's, making it + * sometimes impossible to attach to shared memory at the desired address. + * Return the setting to its old value (usually '1' or '2') when finished. 
*/ if (strcmp(argv[1], "--forkbackend") == 0 || strcmp(argv[1], "--forkavlauncher") == 0 || @@ -4693,9 +4706,6 @@ SubPostmasterMain(int argc, char *argv[]) { Assert(argc == 3); /* shouldn't be any more args */ - /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); - /* * Need to reinitialize the SSL library in the backend, since the * context structures contain function pointers and cannot be passed @@ -4726,17 +4736,7 @@ SubPostmasterMain(int argc, char *argv[]) /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ InitProcess(); - /* - * Attach process to shared data structures. If testing EXEC_BACKEND - * on Linux, you must run this as root before starting the postmaster: - * - * echo 0 >/proc/sys/kernel/randomize_va_space - * - * This prevents a randomized stack base address that causes child - * shared memory to be at a different address than the parent, making - * it impossible to attached to shared memory. Return the value to - * '1' when finished. - */ + /* Attach process to shared data structures */ CreateSharedMemoryAndSemaphores(false, 0); /* And run the backend */ @@ -4744,9 +4744,6 @@ SubPostmasterMain(int argc, char *argv[]) } if (strcmp(argv[1], "--forkboot") == 0) { - /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); - /* Restore basic shared memory pointers */ InitShmemAccess(UsedShmemSegAddr); @@ -4760,9 +4757,6 @@ SubPostmasterMain(int argc, char *argv[]) } if (strcmp(argv[1], "--forkavlauncher") == 0) { - /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); - /* Restore basic shared memory pointers */ InitShmemAccess(UsedShmemSegAddr); @@ -4776,9 +4770,6 @@ SubPostmasterMain(int argc, char *argv[]) } if (strcmp(argv[1], "--forkavworker") == 0) { - /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); - /* Restore basic shared memory pointers */ InitShmemAccess(UsedShmemSegAddr); @@ -4797,9 +4788,6 @@ SubPostmasterMain(int argc, char *argv[]) /* do this as early as possible; in particular, before InitProcess() */ IsBackgroundWorker = true; - /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); - /* Restore basic shared memory pointers */ InitShmemAccess(UsedShmemSegAddr); @@ -4817,27 +4805,18 @@ SubPostmasterMain(int argc, char *argv[]) } if (strcmp(argv[1], "--forkarch") == 0) { - /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); - /* Do not want to attach to shared memory */ PgArchiverMain(argc, argv); /* does not return */ } if (strcmp(argv[1], "--forkcol") == 0) { - /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); - /* Do not want to attach to shared memory */ PgstatCollectorMain(argc, argv); /* does not return */ } if (strcmp(argv[1], "--forklog") == 0) { - /* Close the postmaster's sockets */ - ClosePostmasterPorts(true); - /* Do not want to attach to shared memory */ SysLoggerMain(argc, argv); /* does not return */ diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 1f49c4a3bc..7234c49c37 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -20,7 +20,9 @@ #include #include #include - +#ifdef HAVE_SYS_SELECT_H +#include +#endif #ifdef HAVE_LIBZ #include #endif diff --git a/src/bin/pg_basebackup/pg_recvlogical.c b/src/bin/pg_basebackup/pg_recvlogical.c index 73625256ac..5907d607b0 100644 --- a/src/bin/pg_basebackup/pg_recvlogical.c +++ b/src/bin/pg_basebackup/pg_recvlogical.c @@ -15,6 +15,9 @@ #include #include #include +#ifdef HAVE_SYS_SELECT_H +#include 
+#endif /* local includes */ #include "streamutil.h" diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c index f8bd551ef9..406c01bfcc 100644 --- a/src/bin/pg_basebackup/receivelog.c +++ b/src/bin/pg_basebackup/receivelog.c @@ -16,6 +16,9 @@ #include #include +#ifdef HAVE_SYS_SELECT_H +#include +#endif /* local includes */ #include "receivelog.h" diff --git a/src/bin/pg_dump/parallel.c b/src/bin/pg_dump/parallel.c index 51a8eee369..ce3a06ae81 100644 --- a/src/bin/pg_dump/parallel.c +++ b/src/bin/pg_dump/parallel.c @@ -59,6 +59,10 @@ #include "postgres_fe.h" +#ifdef HAVE_SYS_SELECT_H +#include +#endif + #include "parallel.h" #include "pg_backup_utils.h" diff --git a/src/bin/pg_xlogdump/pg_xlogdump.c b/src/bin/pg_xlogdump/pg_xlogdump.c index dbaf727cd8..49e9a34b52 100644 --- a/src/bin/pg_xlogdump/pg_xlogdump.c +++ b/src/bin/pg_xlogdump/pg_xlogdump.c @@ -249,6 +249,7 @@ XLogDumpXLogRead(const char *directory, TimeLineID timeline_id, if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo)) { char fname[MAXFNAMELEN]; + int tries; /* Switch to another logfile segment */ if (sendFile >= 0) @@ -258,7 +259,30 @@ XLogDumpXLogRead(const char *directory, TimeLineID timeline_id, XLogFileName(fname, timeline_id, sendSegNo); - sendFile = fuzzy_open_file(directory, fname); + /* + * In follow mode there is a short period of time after the + * server has written the end of the previous file before the + * new file is available. So we loop for 5 seconds looking + * for the file to appear before giving up. + */ + for (tries = 0; tries < 10; tries++) + { + sendFile = fuzzy_open_file(directory, fname); + if (sendFile >= 0) + break; + if (errno == ENOENT) + { + int save_errno = errno; + + /* File not there yet, try again */ + pg_usleep(500 * 1000); + + errno = save_errno; + continue; + } + /* Any other error, fall through and fail */ + break; + } if (sendFile < 0) fatal_error("could not find file \"%s\": %s", diff --git a/src/bin/scripts/vacuumdb.c b/src/bin/scripts/vacuumdb.c index f99be3bf7a..2125f42c99 100644 --- a/src/bin/scripts/vacuumdb.c +++ b/src/bin/scripts/vacuumdb.c @@ -12,6 +12,10 @@ #include "postgres_fe.h" +#ifdef HAVE_SYS_SELECT_H +#include +#endif + #include "common.h" #include "dumputils.h" diff --git a/src/port/pgsleep.c b/src/port/pgsleep.c index 89a12b9da7..3f84d8f240 100644 --- a/src/port/pgsleep.c +++ b/src/port/pgsleep.c @@ -14,6 +14,9 @@ #include #include +#ifdef HAVE_SYS_SELECT_H +#include +#endif /* * In a Windows backend, we don't use this implementation, but rather diff --git a/src/test/examples/testlibpq2.c b/src/test/examples/testlibpq2.c index 850993f6e8..07c6317a21 100644 --- a/src/test/examples/testlibpq2.c +++ b/src/test/examples/testlibpq2.c @@ -34,6 +34,10 @@ #include #include #include +#ifdef HAVE_SYS_SELECT_H +#include +#endif + #include "libpq-fe.h" static void diff --git a/src/test/modules/worker_spi/worker_spi.c b/src/test/modules/worker_spi/worker_spi.c index fcb34ca198..7c655f9021 100644 --- a/src/test/modules/worker_spi/worker_spi.c +++ b/src/test/modules/worker_spi/worker_spi.c @@ -292,6 +292,7 @@ worker_spi_main(Datum main_arg) SPI_finish(); PopActiveSnapshot(); CommitTransactionCommand(); + pgstat_report_stat(false); pgstat_report_activity(STATE_IDLE, NULL); }