diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 880e897796a1e..8b68dafc2611c 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -370,6 +370,15 @@ apw_load_buffers(void) apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0; apw_state->prewarmed_blocks = 0; + /* Don't prewarm more than we can fit. */ + if (num_elements > NBuffers) + { + num_elements = NBuffers; + ereport(LOG, + (errmsg("autoprewarm capping prewarmed blocks to %d (shared_buffers size)", + NBuffers))); + } + /* Get the info position of the first block of the next database. */ while (apw_state->prewarm_start_idx < num_elements) { @@ -410,10 +419,6 @@ apw_load_buffers(void) apw_state->database = current_db; Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx); - /* If we've run out of free buffers, don't launch another worker. */ - if (!have_free_buffer()) - break; - /* * Likewise, don't launch if we've already been told to shut down. * (The launch would fail anyway, but we might as well skip it.) @@ -462,12 +467,6 @@ apw_read_stream_next_block(ReadStream *stream, { BlockInfoRecord blk = p->block_info[p->pos]; - if (!have_free_buffer()) - { - p->pos = apw_state->prewarm_stop_idx; - return InvalidBlockNumber; - } - if (blk.tablespace != p->tablespace) return InvalidBlockNumber; @@ -523,10 +522,10 @@ autoprewarm_database_main(Datum main_arg) blk = block_info[i]; /* - * Loop until we run out of blocks to prewarm or until we run out of free + * Loop until we run out of blocks to prewarm or until we run out of * buffers. */ - while (i < apw_state->prewarm_stop_idx && have_free_buffer()) + while (i < apw_state->prewarm_stop_idx) { Oid tablespace = blk.tablespace; RelFileNumber filenumber = blk.filenumber; @@ -568,14 +567,13 @@ autoprewarm_database_main(Datum main_arg) /* * We have a relation; now let's loop until we find a valid fork of - * the relation or we run out of free buffers. Once we've read from - * all valid forks or run out of options, we'll close the relation and + * the relation or we run out of buffers. Once we've read from all + * valid forks or run out of options, we'll close the relation and * move on. */ while (i < apw_state->prewarm_stop_idx && blk.tablespace == tablespace && - blk.filenumber == filenumber && - have_free_buffer()) + blk.filenumber == filenumber) { ForkNumber forknum = blk.forknum; BlockNumber nblocks; diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 78b8367d28935..18d727d77907a 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -5086,13 +5086,13 @@ SELECT ft1.c1 FROM ft1 JOIN ft2 on ft1.c1 = ft2.c1 WHERE -- =================================================================== EXPLAIN (verbose, costs off) INSERT INTO ft2 (c1,c2,c3) SELECT c1+1000,c2+100, c3 || c3 FROM ft2 LIMIT 20; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ Insert on public.ft2 Remote SQL: INSERT INTO "S 1"."T 1"("C 1", c2, c3, c4, c5, c6, c7, c8) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) Batch Size: 1 - -> Subquery Scan on "*SELECT*" - Output: "*SELECT*"."?column?", "*SELECT*"."?column?_1", NULL::integer, "*SELECT*"."?column?_2", NULL::timestamp with time zone, NULL::timestamp without time zone, NULL::character varying(10), 'ft2 '::character(10), NULL::user_enum + -> Subquery Scan on unnamed_subquery + Output: unnamed_subquery."?column?", unnamed_subquery."?column?_1", NULL::integer, unnamed_subquery."?column?_2", NULL::timestamp with time zone, NULL::timestamp without time zone, NULL::character varying(10), 'ft2 '::character(10), NULL::user_enum -> Foreign Scan on public.ft2 ft2_1 Output: (ft2_1.c1 + 1000), (ft2_1.c2 + 100), (ft2_1.c3 || ft2_1.c3) Remote SQL: SELECT "C 1", c2, c3 FROM "S 1"."T 1" LIMIT 20::bigint diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0a4b3e55ba5ed..2a3685f474a96 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -7383,6 +7383,11 @@ local0.* /var/log/postgresql + debug_print_raw_parse (boolean) + + debug_print_raw_parse configuration parameter + + debug_print_parse (boolean) debug_print_parse configuration parameter @@ -7401,8 +7406,8 @@ local0.* /var/log/postgresql These parameters enable various debugging output to be emitted. - When set, they print the resulting parse tree, the query rewriter - output, or the execution plan for each executed query. + When set, they print the resulting raw parse tree, the parse tree, the query + rewriter output, or the execution plan for each executed query. These messages are emitted at LOG message level, so by default they will appear in the server log but will not be sent to the client. You can change that by adjusting @@ -7422,7 +7427,8 @@ local0.* /var/log/postgresql When set, debug_pretty_print indents the messages - produced by debug_print_parse, + produced by debug_print_raw_parse, + debug_print_parse, debug_print_rewritten, or debug_print_plan. This results in more readable but much longer output than the compact format used when diff --git a/doc/src/sgml/func/func-datetime.sgml b/doc/src/sgml/func/func-datetime.sgml index 482fe45f42ebc..98dd60aa9a7ec 100644 --- a/doc/src/sgml/func/func-datetime.sgml +++ b/doc/src/sgml/func/func-datetime.sgml @@ -928,6 +928,36 @@ + + + + random + + random ( min date, max date ) + date + + + random ( min timestamp, max timestamp ) + timestamp + + + random ( min timestamptz, max timestamptz ) + timestamptz + + + Returns a random value in the range + min <= x <= max. + + + random('1979-02-08'::date,'2025-07-03'::date) + 1983-04-21 + + + random('2000-01-01'::timestamptz, now()) + 2015-09-27 09:11:33.732707+00 + + + diff --git a/doc/src/sgml/func/func-math.sgml b/doc/src/sgml/func/func-math.sgml index 7528dc4cea4b9..fd821c0e70677 100644 --- a/doc/src/sgml/func/func-math.sgml +++ b/doc/src/sgml/func/func-math.sgml @@ -1151,7 +1151,8 @@ The random() and random_normal() - functions listed in use a + functions listed in and + use a deterministic pseudo-random number generator. It is fast but not suitable for cryptographic applications; see the module for a more diff --git a/doc/src/sgml/rules.sgml b/doc/src/sgml/rules.sgml index 8467d961fd0a0..282dcd722d495 100644 --- a/doc/src/sgml/rules.sgml +++ b/doc/src/sgml/rules.sgml @@ -60,6 +60,7 @@ SQL statement where the single parts that it is built from are stored separately. These query trees can be shown in the server log if you set the configuration parameters + debug_print_raw_parse, debug_print_parse, debug_print_rewritten, or debug_print_plan. The rule actions are also diff --git a/doc/src/sgml/xfunc.sgml b/doc/src/sgml/xfunc.sgml index da21ef5689184..04bf919b34384 100644 --- a/doc/src/sgml/xfunc.sgml +++ b/doc/src/sgml/xfunc.sgml @@ -3668,11 +3668,14 @@ LWLockRelease(AddinShmemInitLock); shmem_startup_hook provides a convenient place for the initialization code, but it is not strictly required that all such code - be placed in this hook. Each backend will execute the registered - shmem_startup_hook shortly after it attaches to shared - memory. Note that add-ins should still acquire + be placed in this hook. On Windows (and anywhere else where + EXEC_BACKEND is defined), each backend executes the + registered shmem_startup_hook shortly after it + attaches to shared memory, so add-ins should still acquire AddinShmemInitLock within this hook, as shown in the - example above. + example above. On other platforms, only the postmaster process executes + the shmem_startup_hook, and each backend automatically + inherits the pointers to shared memory. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index e3e7307ef5f79..4c5ae205a7a60 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2647,9 +2647,6 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, */ if (all_frozen_set) { - Assert(PageIsAllVisible(page)); - Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer)); - /* * It's fine to use InvalidTransactionId here - this is only used * when HEAP_INSERT_FROZEN is specified, which intentionally diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c index 5d48f071f53a7..cf843277938de 100644 --- a/src/backend/access/heap/heapam_xlog.c +++ b/src/backend/access/heap/heapam_xlog.c @@ -295,7 +295,6 @@ heap_xlog_visible(XLogReaderState *record) LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); reln = CreateFakeRelcacheEntry(rlocator); - visibilitymap_pin(reln, blkno, &vmbuffer); visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, xlrec->snapshotConflictHorizon, vmbits); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 932701d8420dc..981d9380a925c 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -2121,8 +2121,11 @@ lazy_scan_prune(LVRelState *vacrel, else if (all_visible_according_to_vm && !PageIsAllVisible(page) && visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer) != 0) { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - vacrel->relname, blkno); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + vacrel->relname, blkno))); + visibilitymap_clear(vacrel->rel, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); } @@ -2143,8 +2146,11 @@ lazy_scan_prune(LVRelState *vacrel, */ else if (presult.lpdead_items > 0 && PageIsAllVisible(page)) { - elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", - vacrel->relname, blkno); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", + vacrel->relname, blkno))); + PageClearAllVisible(page); MarkBufferDirty(buf); visibilitymap_clear(vacrel->rel, blkno, vmbuffer, diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 953ad4a484399..7306c16f05cd3 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -255,7 +255,8 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, uint8 status; #ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); + elog(DEBUG1, "vm_set flags 0x%02X for %s %d", + flags, RelationGetRelationName(rel), heapBlk); #endif Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); @@ -269,6 +270,8 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); + Assert(!BufferIsValid(heapBuf) || BufferIsExclusiveLocked(heapBuf)); + /* Check that we have the right VM page pinned */ if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index d7ebd889aea71..5d3fcd62c9443 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -246,6 +246,7 @@ SimpleLruAutotuneBuffers(int divisor, int max) * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks. * bank_tranche_id: tranche ID to use for the bank LWLocks. * sync_handler: which set of functions to use to handle sync requests + * long_segment_names: use short or long segment names */ void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, @@ -644,7 +645,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid) shared->page_number[slotno] == pageno && shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) { - /* See comments for SlruRecentlyUsed macro */ + /* See comments for SlruRecentlyUsed() */ SlruRecentlyUsed(shared, slotno); /* update the stats counter of pages found in the SLRU */ diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 7918176fc588e..d8e2fce2c99b7 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -103,6 +103,7 @@ #include "storage/proc.h" #include "storage/procarray.h" #include "utils/builtins.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/timestamp.h" @@ -2332,12 +2333,17 @@ RecordTransactionCommitPrepared(TransactionId xid, replorigin = (replorigin_session_origin != InvalidRepOriginId && replorigin_session_origin != DoNotReplicateId); + /* Load the injection point before entering the critical section */ + INJECTION_POINT_LOAD("commit-after-delay-checkpoint"); + START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + INJECTION_POINT_CACHED("commit-after-delay-checkpoint", NULL); + /* * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible before * commit time is written. @@ -2809,3 +2815,58 @@ LookupGXactBySubid(Oid subid) return found; } + +/* + * TwoPhaseGetXidByLockingProc + * Return the oldest transaction ID from prepared transactions that are + * currently in the commit critical section. + * + * This function only considers transactions in the currently connected + * database. If no matching transactions are found, it returns + * InvalidTransactionId. + */ +TransactionId +TwoPhaseGetOldestXidInCommit(void) +{ + TransactionId oldestRunningXid = InvalidTransactionId; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (int i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGPROC *commitproc; + TransactionId xid; + + if (!gxact->valid) + continue; + + if (gxact->locking_backend == INVALID_PROC_NUMBER) + continue; + + /* + * Get the backend that is handling the transaction. It's safe to + * access this backend while holding TwoPhaseStateLock, as the backend + * can only be destroyed after either removing or unlocking the + * current global transaction, both of which require an exclusive + * TwoPhaseStateLock. + */ + commitproc = GetPGProcByNumber(gxact->locking_backend); + + if (MyDatabaseId != commitproc->databaseId) + continue; + + if ((commitproc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0) + continue; + + xid = XidFromFullTransactionId(gxact->fxid); + + if (!TransactionIdIsValid(oldestRunningXid) || + TransactionIdPrecedes(xid, oldestRunningXid)) + oldestRunningXid = xid; + } + + LWLockRelease(TwoPhaseStateLock); + + return oldestRunningXid; +} diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7ffb217915190..0baf0ac6160af 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8385,6 +8385,14 @@ xlog_redo(XLogReaderState *record) checkPoint.ThisTimeLineID, replayTLI))); RecoveryRestartPoint(&checkPoint, record); + + /* + * After replaying a checkpoint record, free all smgr objects. + * Otherwise we would never do so for dropped relations, as the + * startup does not process shared invalidation messages or call + * AtEOXact_SMgr(). + */ + smgrdestroyall(); } else if (info == XLOG_CHECKPOINT_ONLINE) { @@ -8438,6 +8446,14 @@ xlog_redo(XLogReaderState *record) checkPoint.ThisTimeLineID, replayTLI))); RecoveryRestartPoint(&checkPoint, record); + + /* + * After replaying a checkpoint record, free all smgr objects. + * Otherwise we would never do so for dropped relations, as the + * startup does not process shared invalidation messages or call + * AtEOXact_SMgr(). + */ + smgrdestroyall(); } else if (info == XLOG_OVERWRITE_CONTRECORD) { diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index f23ec8969c27d..346319338a0ee 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -4834,10 +4834,10 @@ check_recovery_target_lsn(char **newval, void **extra, GucSource source) { XLogRecPtr lsn; XLogRecPtr *myextra; - bool have_error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - lsn = pg_lsn_in_internal(*newval, &have_error); - if (have_error) + lsn = pg_lsn_in_safe(*newval, (Node *) &escontext); + if (escontext.error_occurred) return false; myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr)); diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 82cf65fae737a..750d262fccade 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -854,7 +854,17 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, pgstat_create_subscription(subid); - if (opts.enabled) + /* + * Notify the launcher to start the apply worker if the subscription is + * enabled, or to create the conflict detection slot if retain_dead_tuples + * is enabled. + * + * Creating the conflict detection slot is essential even when the + * subscription is not enabled. This ensures that dead tuples are + * retained, which is necessary for accurately identifying the type of + * conflict during replication. + */ + if (opts.enabled || opts.retaindeadtuples) ApplyLauncherWakeupAtCommit(); ObjectAddressSet(myself, SubscriptionRelationId, subid); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 082a3575d621e..3be2e051d32fb 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -42,6 +42,7 @@ #include "catalog/pg_foreign_table.h" #include "catalog/pg_inherits.h" #include "catalog/pg_largeobject.h" +#include "catalog/pg_largeobject_metadata.h" #include "catalog/pg_namespace.h" #include "catalog/pg_opclass.h" #include "catalog/pg_policy.h" @@ -2389,12 +2390,15 @@ truncate_check_rel(Oid relid, Form_pg_class reltuple) /* * Most system catalogs can't be truncated at all, or at least not unless * allow_system_table_mods=on. As an exception, however, we allow - * pg_largeobject to be truncated as part of pg_upgrade, because we need - * to change its relfilenode to match the old cluster, and allowing a - * TRUNCATE command to be executed is the easiest way of doing that. + * pg_largeobject and pg_largeobject_metadata to be truncated as part of + * pg_upgrade, because we need to change its relfilenode to match the old + * cluster, and allowing a TRUNCATE command to be executed is the easiest + * way of doing that. */ if (!allowSystemTableMods && IsSystemClass(relid, reltuple) - && (!IsBinaryUpgrade || relid != LargeObjectRelationId)) + && (!IsBinaryUpgrade || + (relid != LargeObjectRelationId && + relid != LargeObjectMetadataRelationId))) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied: \"%s\" is a system catalog", diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index b540074935386..75087204f0c69 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -156,6 +156,12 @@ execTuplesHashPrepare(int numCols, * * Note that the keyColIdx, hashfunctions, and collations arrays must be * allocated in storage that will live as long as the hashtable does. + * + * LookupTupleHashEntry, FindTupleHashEntry, and related functions may leak + * memory in the tempcxt. It is caller's responsibility to reset that context + * reasonably often, typically once per tuple. (We do it that way, rather + * than managing an extra context within the hashtable, because in many cases + * the caller can specify a tempcxt that it needs to reset per-tuple anyway.) */ TupleHashTable BuildTupleHashTable(PlanState *parent, diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c index 97455b1ed4a5b..630d708d2a3f0 100644 --- a/src/backend/executor/functions.c +++ b/src/backend/executor/functions.c @@ -2483,7 +2483,7 @@ check_sql_stmt_retval(List *queryTreeList, rte = makeNode(RangeTblEntry); rte->rtekind = RTE_SUBQUERY; rte->subquery = parse; - rte->eref = rte->alias = makeAlias("*SELECT*", colnames); + rte->eref = makeAlias("unnamed_subquery", colnames); rte->lateral = false; rte->inh = false; rte->inFromCl = true; diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 377e016d73225..a4f3d30f307cc 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -267,7 +267,6 @@ #include "utils/acl.h" #include "utils/builtins.h" #include "utils/datum.h" -#include "utils/dynahash.h" #include "utils/expandeddatum.h" #include "utils/injection_point.h" #include "utils/logtape.h" @@ -2115,7 +2114,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize, npartitions = (int) dpartitions; /* ceil(log2(npartitions)) */ - partition_bits = my_log2(npartitions); + partition_bits = pg_ceil_log2_32(npartitions); /* make sure that we don't exhaust the hash bits */ if (partition_bits + used_bits >= 32) diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 8d2201ab67fa5..a3415db4e20f5 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -36,7 +36,6 @@ #include "executor/nodeHashjoin.h" #include "miscadmin.h" #include "port/pg_bitutils.h" -#include "utils/dynahash.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/syscache.h" @@ -340,7 +339,7 @@ MultiExecParallelHash(HashState *node) */ hashtable->curbatch = -1; hashtable->nbuckets = pstate->nbuckets; - hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets); hashtable->totalTuples = pstate->total_tuples; /* @@ -480,7 +479,7 @@ ExecHashTableCreate(HashState *state) &nbuckets, &nbatch, &num_skew_mcvs); /* nbuckets must be a power of 2 */ - log2_nbuckets = my_log2(nbuckets); + log2_nbuckets = pg_ceil_log2_32(nbuckets); Assert(nbuckets == (1 << log2_nbuckets)); /* @@ -3499,7 +3498,7 @@ ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable, int batchno) dsa_get_address(hashtable->area, hashtable->batches[batchno].shared->buckets); hashtable->nbuckets = hashtable->parallel_state->nbuckets; - hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets); hashtable->current_chunk = NULL; hashtable->current_chunk_shared = InvalidDsaPointer; hashtable->batches[batchno].at_least_one_chunk = false; diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index b0c4e2c0d32a4..4c5647ac38a1c 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3402,7 +3402,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * the tuple moved, and setting our current * resultRelInfo to that. */ - if (ItemPointerIndicatesMovedPartitions(&context->tmfd.ctid)) + if (ItemPointerIndicatesMovedPartitions(tupleid)) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("tuple to be merged was already moved to another partition due to concurrent update"))); @@ -3450,12 +3450,13 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (ItemPointerIsValid(&lockedtid)) UnlockTuple(resultRelInfo->ri_RelationDesc, &lockedtid, InplaceUpdateTupleLock); - LockTuple(resultRelInfo->ri_RelationDesc, &context->tmfd.ctid, + LockTuple(resultRelInfo->ri_RelationDesc, tupleid, InplaceUpdateTupleLock); - lockedtid = context->tmfd.ctid; + lockedtid = *tupleid; } + if (!table_tuple_fetch_row_version(resultRelationDesc, - &context->tmfd.ctid, + tupleid, SnapshotAny, resultRelInfo->ri_oldTupleSlot)) elog(ERROR, "failed to fetch the target tuple"); diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index f7f6fc2da0b95..53fb56f7388e8 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -102,6 +102,7 @@ ExecHashSubPlan(SubPlanState *node, ExprContext *econtext, bool *isNull) { + bool result = false; SubPlan *subplan = node->subplan; PlanState *planstate = node->planstate; TupleTableSlot *slot; @@ -132,14 +133,6 @@ ExecHashSubPlan(SubPlanState *node, node->projLeft->pi_exprContext = econtext; slot = ExecProject(node->projLeft); - /* - * Note: because we are typically called in a per-tuple context, we have - * to explicitly clear the projected tuple before returning. Otherwise, - * we'll have a double-free situation: the per-tuple context will probably - * be reset before we're called again, and then the tuple slot will think - * it still needs to free the tuple. - */ - /* * If the LHS is all non-null, probe for an exact match in the main hash * table. If we find one, the result is TRUE. Otherwise, scan the @@ -161,19 +154,10 @@ ExecHashSubPlan(SubPlanState *node, slot, node->cur_eq_comp, node->lhs_hash_expr) != NULL) - { - ExecClearTuple(slot); - return BoolGetDatum(true); - } - if (node->havenullrows && - findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + result = true; + else if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } - ExecClearTuple(slot); - return BoolGetDatum(false); } /* @@ -186,34 +170,31 @@ ExecHashSubPlan(SubPlanState *node, * aren't provably unequal to the LHS; if so, the result is UNKNOWN. * Otherwise, the result is FALSE. */ - if (node->hashnulls == NULL) - { - ExecClearTuple(slot); - return BoolGetDatum(false); - } - if (slotAllNulls(slot)) - { - ExecClearTuple(slot); + else if (node->hashnulls == NULL) + /* just return FALSE */ ; + else if (slotAllNulls(slot)) *isNull = true; - return BoolGetDatum(false); - } /* Scan partly-null table first, since more likely to get a match */ - if (node->havenullrows && - findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + else if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } - if (node->havehashrows && - findPartialMatch(node->hashtable, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + else if (node->havehashrows && + findPartialMatch(node->hashtable, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } + + /* + * Note: because we are typically called in a per-tuple context, we have + * to explicitly clear the projected tuple before returning. Otherwise, + * we'll have a double-free situation: the per-tuple context will probably + * be reset before we're called again, and then the tuple slot will think + * it still needs to free the tuple. + */ ExecClearTuple(slot); - return BoolGetDatum(false); + + /* Also must reset the innerecontext after each hashtable lookup. */ + ResetExprContext(node->innerecontext); + + return BoolGetDatum(result); } /* @@ -548,7 +529,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) 0, node->planstate->state->es_query_cxt, node->hashtablecxt, - node->hashtempcxt, + innerecontext->ecxt_per_tuple_memory, false); if (!subplan->unknownEqFalse) @@ -577,7 +558,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) 0, node->planstate->state->es_query_cxt, node->hashtablecxt, - node->hashtempcxt, + innerecontext->ecxt_per_tuple_memory, false); } else @@ -639,7 +620,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) /* * Reset innerecontext after each inner tuple to free any memory used - * during ExecProject. + * during ExecProject and hashtable lookup. */ ResetExprContext(innerecontext); } @@ -858,7 +839,6 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) sstate->hashtable = NULL; sstate->hashnulls = NULL; sstate->hashtablecxt = NULL; - sstate->hashtempcxt = NULL; sstate->innerecontext = NULL; sstate->keyColIdx = NULL; sstate->tab_eq_funcoids = NULL; @@ -914,11 +894,6 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) AllocSetContextCreate(CurrentMemoryContext, "Subplan HashTable Context", ALLOCSET_DEFAULT_SIZES); - /* and a small one for the hash tables to use as temp storage */ - sstate->hashtempcxt = - AllocSetContextCreate(CurrentMemoryContext, - "Subplan HashTable Temp Context", - ALLOCSET_SMALL_SIZES); /* and a short-lived exprcontext for function evaluation */ sstate->innerecontext = CreateExprContext(estate); diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index d71ed958e31b3..fae18548e074e 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1397,7 +1397,7 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, */ nsitem = addRangeTableEntryForSubquery(pstate, subselect, - makeAlias("ANY_subquery", NIL), + NULL, use_lateral, false); rte = nsitem->p_rte; diff --git a/src/backend/parser/README b/src/backend/parser/README index e0c986a41efea..e26eb437a9f35 100644 --- a/src/backend/parser/README +++ b/src/backend/parser/README @@ -20,6 +20,7 @@ parse_cte.c handle Common Table Expressions (WITH clauses) parse_expr.c handle expressions like col, col + 3, x = 3 or x = 4 parse_enr.c handle ephemeral named rels (trigger transition tables, ...) parse_func.c handle functions, table.column and column identifiers +parse_jsontable.c handle JSON_TABLE parse_merge.c handle MERGE parse_node.c create nodes for various structures parse_oper.c handle operators in expressions diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 34f7c17f576ef..b9763ea17144c 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -777,7 +777,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) */ nsitem = addRangeTableEntryForSubquery(pstate, selectQuery, - makeAlias("*SELECT*", NIL), + NULL, false, false); addNSItemToQuery(pstate, nsitem, true, false, false); @@ -2100,7 +2100,6 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt, { /* Process leaf SELECT */ Query *selectQuery; - char selectName[32]; ParseNamespaceItem *nsitem; RangeTblRef *rtr; ListCell *tl; @@ -2156,11 +2155,9 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt, /* * Make the leaf query be a subquery in the top-level rangetable. */ - snprintf(selectName, sizeof(selectName), "*SELECT* %d", - list_length(pstate->p_rtable) + 1); nsitem = addRangeTableEntryForSubquery(pstate, selectQuery, - makeAlias(selectName, NIL), + NULL, false, false); diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index afcf54169c3b3..e96b38a59d503 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1461,7 +1461,6 @@ expandTableLikeClause(RangeVar *heapRel, TableLikeClause *table_like_clause) char *ccname = constr->check[ccnum].ccname; char *ccbin = constr->check[ccnum].ccbin; bool ccenforced = constr->check[ccnum].ccenforced; - bool ccvalid = constr->check[ccnum].ccvalid; bool ccnoinherit = constr->check[ccnum].ccnoinherit; Node *ccbin_node; bool found_whole_row; @@ -1492,7 +1491,7 @@ expandTableLikeClause(RangeVar *heapRel, TableLikeClause *table_like_clause) n->conname = pstrdup(ccname); n->location = -1; n->is_enforced = ccenforced; - n->initially_valid = ccvalid; + n->initially_valid = ccenforced; /* sic */ n->is_no_inherit = ccnoinherit; n->raw_expr = NULL; n->cooked_expr = nodeToString(ccbin_node); diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c index d3356bc84ee0c..e6da4028d392e 100644 --- a/src/backend/replication/logical/tablesync.c +++ b/src/backend/replication/logical/tablesync.c @@ -1788,6 +1788,32 @@ AllTablesyncsReady(void) return has_subrels && (table_states_not_ready == NIL); } +/* + * Return whether the subscription currently has any relations. + * + * Note: Unlike HasSubscriptionRelations(), this function relies on cached + * information for subscription relations. Additionally, it should not be + * invoked outside of apply or tablesync workers, as MySubscription must be + * initialized first. + */ +bool +HasSubscriptionRelationsCached(void) +{ + bool started_tx; + bool has_subrels; + + /* We need up-to-date subscription tables info here */ + has_subrels = FetchTableStates(&started_tx); + + if (started_tx) + { + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + return has_subrels; +} + /* * Update the two_phase state of the specified subscription in pg_subscription. */ diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index f1ebd63e792ee..ee6ac22329fdc 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -276,7 +276,6 @@ #include "storage/procarray.h" #include "tcop/tcopprot.h" #include "utils/acl.h" -#include "utils/dynahash.h" #include "utils/guc.h" #include "utils/inval.h" #include "utils/lsyscache.h" @@ -3266,12 +3265,18 @@ FindDeletedTupleInLocalRel(Relation localrel, Oid localidxoid, /* * Obtain the information from the leader apply worker as only the - * leader manages conflict retention (see + * leader manages oldest_nonremovable_xid (see * maybe_advance_nonremovable_xid() for details). */ LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); leader = logicalrep_worker_find(MyLogicalRepWorker->subid, InvalidOid, false); + if (!leader) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not detect conflict as the leader apply worker has exited"))); + } SpinLockAcquire(&leader->relmutex); oldestxmin = leader->oldest_nonremovable_xid; @@ -4595,11 +4600,28 @@ wait_for_local_flush(RetainDeadTuplesData *rdt_data) * workers is complex and not worth the effort, so we simply return if not * all tables are in the READY state. * - * It is safe to add new tables with initial states to the subscription - * after this check because any changes applied to these tables should - * have a WAL position greater than the rdt_data->remote_lsn. + * Advancing the transaction ID is necessary even when no tables are + * currently subscribed, to avoid retaining dead tuples unnecessarily. + * While it might seem safe to skip all phases and directly assign + * candidate_xid to oldest_nonremovable_xid during the + * RDT_GET_CANDIDATE_XID phase in such cases, this is unsafe. If users + * concurrently add tables to the subscription, the apply worker may not + * process invalidations in time. Consequently, + * HasSubscriptionRelationsCached() might miss the new tables, leading to + * premature advancement of oldest_nonremovable_xid. + * + * Performing the check during RDT_WAIT_FOR_LOCAL_FLUSH is safe, as + * invalidations are guaranteed to be processed before applying changes + * from newly added tables while waiting for the local flush to reach + * remote_lsn. + * + * Additionally, even if we check for subscription tables during + * RDT_GET_CANDIDATE_XID, they might be dropped before reaching + * RDT_WAIT_FOR_LOCAL_FLUSH. Therefore, it's still necessary to verify + * subscription tables at this stage to prevent unnecessary tuple + * retention. */ - if (!AllTablesyncsReady()) + if (HasSubscriptionRelationsCached() && !AllTablesyncsReady()) { TimestampTz now; @@ -5092,7 +5114,7 @@ subxact_info_read(Oid subid, TransactionId xid) len = sizeof(SubXactInfo) * subxact_data.nsubxacts; /* we keep the maximum as a power of 2 */ - subxact_data.nsubxacts_max = 1 << my_log2(subxact_data.nsubxacts); + subxact_data.nsubxacts_max = 1 << pg_ceil_log2_32(subxact_data.nsubxacts); /* * Allocate subxact information in the logical streaming context. We need diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index e3dce9dc68d04..59822f22b8d06 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -51,6 +51,7 @@ #include "access/timeline.h" #include "access/transam.h" +#include "access/twophase.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" @@ -2719,6 +2720,7 @@ ProcessStandbyPSRequestMessage(void) { XLogRecPtr lsn = InvalidXLogRecPtr; TransactionId oldestXidInCommit; + TransactionId oldestGXidInCommit; FullTransactionId nextFullXid; FullTransactionId fullOldestXidInCommit; WalSnd *walsnd = MyWalSnd; @@ -2746,6 +2748,16 @@ ProcessStandbyPSRequestMessage(void) * ones replicated. */ oldestXidInCommit = GetOldestActiveTransactionId(true, false); + oldestGXidInCommit = TwoPhaseGetOldestXidInCommit(); + + /* + * Update the oldest xid for standby transmission if an older prepared + * transaction exists and is currently in commit phase. + */ + if (TransactionIdIsValid(oldestGXidInCommit) && + TransactionIdPrecedes(oldestGXidInCommit, oldestXidInCommit)) + oldestXidInCommit = oldestGXidInCommit; + nextFullXid = ReadNextFullTransactionId(); fullOldestXidInCommit = FullTransactionIdFromAllowableAt(nextFullXid, oldestXidInCommit); diff --git a/src/backend/rewrite/rewriteSearchCycle.c b/src/backend/rewrite/rewriteSearchCycle.c index 9f95d4dc1b0e8..5202ef43d1068 100644 --- a/src/backend/rewrite/rewriteSearchCycle.c +++ b/src/backend/rewrite/rewriteSearchCycle.c @@ -282,8 +282,8 @@ rewriteSearchAndCycle(CommonTableExpr *cte) newrte = makeNode(RangeTblEntry); newrte->rtekind = RTE_SUBQUERY; - newrte->alias = makeAlias("*TLOCRN*", cte->ctecolnames); - newrte->eref = newrte->alias; + newrte->alias = NULL; + newrte->eref = makeAlias("*TLOCRN*", cte->ctecolnames); newsubquery = copyObject(rte1->subquery); IncrementVarSublevelsUp((Node *) newsubquery, 1, 1); newrte->subquery = newsubquery; @@ -379,8 +379,8 @@ rewriteSearchAndCycle(CommonTableExpr *cte) ewcl = lappend(ewcl, makeString(cte->cycle_clause->cycle_mark_column)); ewcl = lappend(ewcl, makeString(cte->cycle_clause->cycle_path_column)); } - newrte->alias = makeAlias("*TROCRN*", ewcl); - newrte->eref = newrte->alias; + newrte->alias = NULL; + newrte->eref = makeAlias("*TROCRN*", ewcl); /* * Find the reference to the recursive CTE in the right UNION subquery's diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index af0b99243c614..3c3d2d315c6f4 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -986,10 +986,9 @@ build_sorted_items(StatsBuildData *data, int *nitems, { int i, j, - len, nrows; int nvalues = data->numrows * numattrs; - + Size len; SortItem *items; Datum *values; bool *isnull; @@ -997,14 +996,16 @@ build_sorted_items(StatsBuildData *data, int *nitems, int *typlen; /* Compute the total amount of memory we need (both items and values). */ - len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool)); + len = MAXALIGN(data->numrows * sizeof(SortItem)) + + nvalues * (sizeof(Datum) + sizeof(bool)); /* Allocate the memory and split it into the pieces. */ ptr = palloc0(len); /* items to sort */ items = (SortItem *) ptr; - ptr += data->numrows * sizeof(SortItem); + /* MAXALIGN ensures that the following Datums are suitably aligned */ + ptr += MAXALIGN(data->numrows * sizeof(SortItem)); /* values and null flags */ values = (Datum *) ptr; diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index a182fcd660ccb..119f31b5d6584 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -128,11 +128,11 @@ independently. If it is necessary to lock more than one partition at a time, they must be locked in partition-number order to avoid risk of deadlock. * A separate system-wide spinlock, buffer_strategy_lock, provides mutual -exclusion for operations that access the buffer free list or select -buffers for replacement. A spinlock is used here rather than a lightweight -lock for efficiency; no other locks of any sort should be acquired while -buffer_strategy_lock is held. This is essential to allow buffer replacement -to happen in multiple backends with reasonable concurrency. +exclusion for operations that select buffers for replacement. A spinlock is +used here rather than a lightweight lock for efficiency; no other locks of any +sort should be acquired while buffer_strategy_lock is held. This is essential +to allow buffer replacement to happen in multiple backends with reasonable +concurrency. * Each buffer header contains a spinlock that must be taken when examining or changing fields of that buffer header. This allows operations such as @@ -158,18 +158,8 @@ unset by sleeping on the buffer's condition variable. Normal Buffer Replacement Strategy ---------------------------------- -There is a "free list" of buffers that are prime candidates for replacement. -In particular, buffers that are completely free (contain no valid page) are -always in this list. We could also throw buffers into this list if we -consider their pages unlikely to be needed soon; however, the current -algorithm never does that. The list is singly-linked using fields in the -buffer headers; we maintain head and tail pointers in global variables. -(Note: although the list links are in the buffer headers, they are -considered to be protected by the buffer_strategy_lock, not the buffer-header -spinlocks.) To choose a victim buffer to recycle when there are no free -buffers available, we use a simple clock-sweep algorithm, which avoids the -need to take system-wide locks during common operations. It works like -this: +To choose a victim buffer to recycle we use a simple clock-sweep algorithm. It +works like this: Each buffer header contains a usage counter, which is incremented (up to a small limit value) whenever the buffer is pinned. (This requires only the @@ -184,20 +174,14 @@ The algorithm for a process that needs to obtain a victim buffer is: 1. Obtain buffer_strategy_lock. -2. If buffer free list is nonempty, remove its head buffer. Release -buffer_strategy_lock. If the buffer is pinned or has a nonzero usage count, -it cannot be used; ignore it go back to step 1. Otherwise, pin the buffer, -and return it. +2. Select the buffer pointed to by nextVictimBuffer, and circularly advance +nextVictimBuffer for next time. Release buffer_strategy_lock. -3. Otherwise, the buffer free list is empty. Select the buffer pointed to by -nextVictimBuffer, and circularly advance nextVictimBuffer for next time. -Release buffer_strategy_lock. - -4. If the selected buffer is pinned or has a nonzero usage count, it cannot +3. If the selected buffer is pinned or has a nonzero usage count, it cannot be used. Decrement its usage count (if nonzero), reacquire buffer_strategy_lock, and return to step 3 to examine the next buffer. -5. Pin the selected buffer, and return. +4. Pin the selected buffer, and return. (Note that if the selected buffer is dirty, we will have to write it out before we can recycle it; if someone else pins the buffer meanwhile we will @@ -211,9 +195,9 @@ Buffer Ring Replacement Strategy When running a query that needs to access a large number of pages just once, such as VACUUM or a large sequential scan, a different strategy is used. A page that has been touched only by such a scan is unlikely to be needed -again soon, so instead of running the normal clock sweep algorithm and +again soon, so instead of running the normal clock-sweep algorithm and blowing out the entire buffer cache, a small ring of buffers is allocated -using the normal clock sweep algorithm and those buffers are reused for the +using the normal clock-sweep algorithm and those buffers are reused for the whole scan. This also implies that much of the write traffic caused by such a statement will be done by the backend itself and not pushed off onto other processes. @@ -234,7 +218,7 @@ the ring strategy effectively degrades to the normal strategy. VACUUM uses a ring like sequential scans, however, the size of this ring is controlled by the vacuum_buffer_usage_limit GUC. Dirty pages are not removed -from the ring. Instead, WAL is flushed if needed to allow reuse of the +from the ring. Instead, the WAL is flushed if needed to allow reuse of the buffers. Before introducing the buffer ring strategy in 8.3, VACUUM's buffers were sent to the freelist, which was effectively a buffer ring of 1 buffer, resulting in excessive WAL flushing. diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index ed1dc488a42b4..6fd3a6bbac5ea 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -128,20 +128,11 @@ BufferManagerShmemInit(void) pgaio_wref_clear(&buf->io_wref); - /* - * Initially link all the buffers together as unused. Subsequent - * management of this list is done by freelist.c. - */ - buf->freeNext = i + 1; - LWLockInitialize(BufferDescriptorGetContentLock(buf), LWTRANCHE_BUFFER_CONTENT); ConditionVariableInit(BufferDescriptorGetIOCV(buf)); } - - /* Correct last entry of linked list */ - GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; } /* Init other shared buffer-management stuff */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 350cc0402aa8f..fe470de63f20c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2094,12 +2094,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ UnpinBuffer(victim_buf_hdr); - /* - * The victim buffer we acquired previously is clean and unused, let - * it be found again quickly - */ - StrategyFreeBuffer(victim_buf_hdr); - /* remaining code should match code at top of routine */ existing_buf_hdr = GetBufferDescriptor(existing_buf_id); @@ -2158,8 +2152,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } /* - * InvalidateBuffer -- mark a shared buffer invalid and return it to the - * freelist. + * InvalidateBuffer -- mark a shared buffer invalid. * * The buffer header spinlock must be held at entry. We drop it before * returning. (This is sane because the caller must have locked the @@ -2257,11 +2250,6 @@ InvalidateBuffer(BufferDesc *buf) * Done with mapping lock. */ LWLockRelease(oldPartitionLock); - - /* - * Insert the buffer at the head of the list of free buffers. - */ - StrategyFreeBuffer(buf); } /* @@ -2679,11 +2667,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, { BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1); - /* - * The victim buffer we acquired previously is clean and unused, - * let it be found again quickly - */ - StrategyFreeBuffer(buf_hdr); UnpinBuffer(buf_hdr); } @@ -2756,12 +2739,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, valid = PinBuffer(existing_hdr, strategy); LWLockRelease(partition_lock); - - /* - * The victim buffer we acquired previously is clean and unused, - * let it be found again quickly - */ - StrategyFreeBuffer(victim_buf_hdr); UnpinBuffer(victim_buf_hdr); buffers[i] = BufferDescriptorGetBuffer(existing_hdr); @@ -3608,7 +3585,7 @@ BufferSync(int flags) * This is called periodically by the background writer process. * * Returns true if it's appropriate for the bgwriter process to go into - * low-power hibernation mode. (This happens if the strategy clock sweep + * low-power hibernation mode. (This happens if the strategy clock-sweep * has been "lapped" and no buffer allocations have occurred recently, * or if the bgwriter has been effectively disabled by setting * bgwriter_lru_maxpages to 0.) @@ -3658,8 +3635,8 @@ BgBufferSync(WritebackContext *wb_context) uint32 new_recent_alloc; /* - * Find out where the freelist clock sweep currently is, and how many - * buffer allocations have happened since our last call. + * Find out where the clock-sweep currently is, and how many buffer + * allocations have happened since our last call. */ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); @@ -3679,8 +3656,8 @@ BgBufferSync(WritebackContext *wb_context) /* * Compute strategy_delta = how many buffers have been scanned by the - * clock sweep since last time. If first time through, assume none. Then - * see if we are still ahead of the clock sweep, and if so, how many + * clock-sweep since last time. If first time through, assume none. Then + * see if we are still ahead of the clock-sweep, and if so, how many * buffers we could scan before we'd catch up with it and "lap" it. Note: * weird-looking coding of xxx_passes comparisons are to avoid bogus * behavior when the passes counts wrap around. diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 01909be027258..7d59a92bd1a88 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -33,25 +33,17 @@ typedef struct slock_t buffer_strategy_lock; /* - * Clock sweep hand: index of next buffer to consider grabbing. Note that + * clock-sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to * get an actual buffer, it needs to be used modulo NBuffers. */ pg_atomic_uint32 nextVictimBuffer; - int firstFreeBuffer; /* Head of list of unused buffers */ - int lastFreeBuffer; /* Tail of list of unused buffers */ - - /* - * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is, - * when the list is empty) - */ - /* * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. */ - uint32 completePasses; /* Complete cycles of the clock sweep */ + uint32 completePasses; /* Complete cycles of the clock-sweep */ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ /* @@ -163,23 +155,6 @@ ClockSweepTick(void) return victim; } -/* - * have_free_buffer -- a lockless check to see if there is a free buffer in - * buffer pool. - * - * If the result is true that will become stale once free buffers are moved out - * by other operations, so the caller who strictly want to use a free buffer - * should not call this. - */ -bool -have_free_buffer(void) -{ - if (StrategyControl->firstFreeBuffer >= 0) - return true; - else - return false; -} - /* * StrategyGetBuffer * @@ -249,69 +224,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r */ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); - /* - * First check, without acquiring the lock, whether there's buffers in the - * freelist. Since we otherwise don't require the spinlock in every - * StrategyGetBuffer() invocation, it'd be sad to acquire it here - - * uselessly in most cases. That obviously leaves a race where a buffer is - * put on the freelist but we don't see the store yet - but that's pretty - * harmless, it'll just get used during the next buffer acquisition. - * - * If there's buffers on the freelist, acquire the spinlock to pop one - * buffer of the freelist. Then check whether that buffer is usable and - * repeat if not. - * - * Note that the freeNext fields are considered to be protected by the - * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to - * manipulate them without holding the spinlock. - */ - if (StrategyControl->firstFreeBuffer >= 0) - { - while (true) - { - /* Acquire the spinlock to remove element from the freelist */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - - if (StrategyControl->firstFreeBuffer < 0) - { - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - break; - } - - buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer); - Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); - - /* Unconditionally remove buffer from freelist */ - StrategyControl->firstFreeBuffer = buf->freeNext; - buf->freeNext = FREENEXT_NOT_IN_LIST; - - /* - * Release the lock so someone else can access the freelist while - * we check out this buffer. - */ - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - - /* - * If the buffer is pinned or has a nonzero usage_count, we cannot - * use it; discard it and retry. (This can only happen if VACUUM - * put a valid buffer in the freelist and then someone else used - * it before we got to it. It's probably impossible altogether as - * of 8.3, but we'd better check anyway.) - */ - local_buf_state = LockBufHdr(buf); - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 - && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0) - { - if (strategy != NULL) - AddBufferToRing(strategy, buf); - *buf_state = local_buf_state; - return buf; - } - UnlockBufHdr(buf, local_buf_state); - } - } - - /* Nothing on the freelist, so run the "clock sweep" algorithm */ + /* Use the "clock sweep" algorithm to find a free buffer */ trycounter = NBuffers; for (;;) { @@ -356,29 +269,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r } } -/* - * StrategyFreeBuffer: put a buffer on the freelist - */ -void -StrategyFreeBuffer(BufferDesc *buf) -{ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - - /* - * It is possible that we are told to put something in the freelist that - * is already in it; don't screw up the list if so. - */ - if (buf->freeNext == FREENEXT_NOT_IN_LIST) - { - buf->freeNext = StrategyControl->firstFreeBuffer; - if (buf->freeNext < 0) - StrategyControl->lastFreeBuffer = buf->buf_id; - StrategyControl->firstFreeBuffer = buf->buf_id; - } - - SpinLockRelease(&StrategyControl->buffer_strategy_lock); -} - /* * StrategySyncStart -- tell BgBufferSync where to start syncing * @@ -504,14 +394,7 @@ StrategyInitialize(bool init) SpinLockInit(&StrategyControl->buffer_strategy_lock); - /* - * Grab the whole linked list of free buffers for our strategy. We - * assume it was previously set up by BufferManagerShmemInit(). - */ - StrategyControl->firstFreeBuffer = 0; - StrategyControl->lastFreeBuffer = NBuffers - 1; - - /* Initialize the clock sweep pointer */ + /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); /* Clear statistics */ @@ -759,7 +642,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) * * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, * since our own previous usage of the ring element would have left it - * there, but it might've been decremented by clock sweep since then). A + * there, but it might've been decremented by clock-sweep since then). A * higher usage_count indicates someone else has touched the buffer, so we * shouldn't re-use it. */ diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3c0d20f4659d2..04fef13409b02 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -229,7 +229,7 @@ GetLocalVictimBuffer(void) ResourceOwnerEnlarge(CurrentResourceOwner); /* - * Need to get a new buffer. We use a clock sweep algorithm (essentially + * Need to get a new buffer. We use a clock-sweep algorithm (essentially * the same as what freelist.c does now...) */ trycounter = NLocBuffer; diff --git a/src/backend/storage/page/meson.build b/src/backend/storage/page/meson.build index c3e4a805862a9..112f00ff36552 100644 --- a/src/backend/storage/page/meson.build +++ b/src/backend/storage/page/meson.build @@ -1,7 +1,15 @@ # Copyright (c) 2022-2025, PostgreSQL Global Development Group +checksum_backend_lib = static_library('checksum_backend_lib', + 'checksum.c', + dependencies: backend_build_deps, + kwargs: internal_lib_args, + c_args: vectorize_cflags + unroll_loops_cflags, +) + +backend_link_with += checksum_backend_lib + backend_sources += files( 'bufpage.c', - 'checksum.c', 'itemptr.c', ) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 0cecd4649020f..d356830f756be 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -649,6 +649,10 @@ pg_parse_query(const char *query_string) TRACE_POSTGRESQL_QUERY_PARSE_DONE(query_string); + if (Debug_print_raw_parse) + elog_node_display(LOG, "raw parse tree", raw_parsetree_list, + Debug_pretty_print); + return raw_parsetree_list; } @@ -3697,7 +3701,10 @@ set_debug_options(int debug_flag, GucContext context, GucSource source) if (debug_flag >= 2) SetConfigOption("log_statement", "all", context, source); if (debug_flag >= 3) + { + SetConfigOption("debug_print_raw_parse", "true", context, source); SetConfigOption("debug_print_parse", "true", context, source); + } if (debug_flag >= 4) SetConfigOption("debug_print_plan", "true", context, source); if (debug_flag >= 5) diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index ffb5b8cce3441..f8e91484e36be 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -1975,6 +1975,17 @@ pgstat_read_statsfile(void) header = pgstat_init_entry(key.kind, p); dshash_release_lock(pgStatLocal.shared_hash, p); + if (header == NULL) + { + /* + * It would be tempting to switch this ERROR to a + * WARNING, but it would mean that all the statistics + * are discarded when the environment fails on OOM. + */ + elog(ERROR, "could not allocate entry %u/%u/%" PRIu64 " of type %c", + key.kind, key.dboid, + key.objid, t); + } if (!read_chunk(fpin, pgstat_get_entry_data(key.kind, header), diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c index 62de347445365..9dc3212f7dd01 100644 --- a/src/backend/utils/activity/pgstat_shmem.c +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -289,6 +289,13 @@ pgstat_detach_shmem(void) * ------------------------------------------------------------ */ +/* + * Initialize entry newly-created. + * + * Returns NULL in the event of an allocation failure, so as callers can + * take cleanup actions as the entry initialized is already inserted in the + * shared hashtable. + */ PgStatShared_Common * pgstat_init_entry(PgStat_Kind kind, PgStatShared_HashEntry *shhashent) @@ -311,7 +318,12 @@ pgstat_init_entry(PgStat_Kind kind, pg_atomic_init_u32(&shhashent->generation, 0); shhashent->dropped = false; - chunk = dsa_allocate0(pgStatLocal.dsa, pgstat_get_kind_info(kind)->shared_size); + chunk = dsa_allocate_extended(pgStatLocal.dsa, + pgstat_get_kind_info(kind)->shared_size, + DSA_ALLOC_ZERO | DSA_ALLOC_NO_OOM); + if (chunk == InvalidDsaPointer) + return NULL; + shheader = dsa_get_address(pgStatLocal.dsa, chunk); shheader->magic = 0xdeadbeef; @@ -509,6 +521,20 @@ pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, uint64 objid, bool create, if (!shfound) { shheader = pgstat_init_entry(kind, shhashent); + if (shheader == NULL) + { + /* + * Failed the allocation of a new entry, so clean up the + * shared hashtable before giving up. + */ + dshash_delete_entry(pgStatLocal.shared_hash, shhashent); + + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating entry %u/%u/%" PRIu64 ".", + key.kind, key.dboid, key.objid))); + } pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); if (created_entry != NULL) diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 7ad453314c307..78e19ac39ac17 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -6389,12 +6389,12 @@ numeric_to_char(PG_FUNCTION_ARGS) if (IS_ROMAN(&Num)) { int32 intvalue; - bool err; + ErrorSaveContext escontext = {T_ErrorSaveContext}; /* Round and convert to int */ - intvalue = numeric_int4_opt_error(value, &err); + intvalue = numeric_int4_safe(value, (Node *) &escontext); /* On overflow, just use PG_INT32_MAX; int_to_roman will cope */ - if (err) + if (escontext.error_occurred) intvalue = PG_INT32_MAX; numstr = int_to_roman(intvalue); } diff --git a/src/backend/utils/adt/jsonbsubs.c b/src/backend/utils/adt/jsonbsubs.c index de64d49851251..e8626d3b4fc6e 100644 --- a/src/backend/utils/adt/jsonbsubs.c +++ b/src/backend/utils/adt/jsonbsubs.c @@ -51,7 +51,7 @@ jsonb_subscript_transform(SubscriptingRef *sbsref, /* * Transform and convert the subscript expressions. Jsonb subscripting - * does not support slices, look only and the upper index. + * does not support slices, look only at the upper index. */ foreach(idx, indirection) { diff --git a/src/backend/utils/adt/jsonpath_exec.c b/src/backend/utils/adt/jsonpath_exec.c index 5a56253522357..8156695e97e09 100644 --- a/src/backend/utils/adt/jsonpath_exec.c +++ b/src/backend/utils/adt/jsonpath_exec.c @@ -252,7 +252,8 @@ typedef JsonPathBool (*JsonPathPredicateCallback) (JsonPathItem *jsp, JsonbValue *larg, JsonbValue *rarg, void *param); -typedef Numeric (*BinaryArithmFunc) (Numeric num1, Numeric num2, bool *error); +typedef Numeric (*BinaryArithmFunc) (Numeric num1, Numeric num2, + Node *escontext); static JsonPathExecResult executeJsonPath(JsonPath *path, void *vars, JsonPathGetVarCallback getVar, @@ -808,23 +809,23 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, case jpiAdd: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_add_opt_error, found); + numeric_add_safe, found); case jpiSub: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_sub_opt_error, found); + numeric_sub_safe, found); case jpiMul: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_mul_opt_error, found); + numeric_mul_safe, found); case jpiDiv: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_div_opt_error, found); + numeric_div_safe, found); case jpiMod: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_mod_opt_error, found); + numeric_mod_safe, found); case jpiPlus: return executeUnaryArithmExpr(cxt, jsp, jb, NULL, found); @@ -1269,11 +1270,12 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jb->type == jbvNumeric) { - bool have_error; + ErrorSaveContext escontext = {T_ErrorSaveContext}; int64 val; - val = numeric_int8_opt_error(jb->val.numeric, &have_error); - if (have_error) + val = numeric_int8_safe(jb->val.numeric, + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("argument \"%s\" of jsonpath item method .%s() is invalid for type %s", @@ -1466,7 +1468,6 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, Datum dtypmod; int32 precision; int32 scale = 0; - bool have_error; bool noerr; ArrayType *arrtypmod; Datum datums[2]; @@ -1478,9 +1479,9 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (elem.type != jpiNumeric) elog(ERROR, "invalid jsonpath item type for .decimal() precision"); - precision = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + precision = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("precision of jsonpath item method .%s() is out of range for type integer", @@ -1492,9 +1493,9 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (elem.type != jpiNumeric) elog(ERROR, "invalid jsonpath item type for .decimal() scale"); - scale = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + scale = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("scale of jsonpath item method .%s() is out of range for type integer", @@ -1550,11 +1551,12 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jb->type == jbvNumeric) { - bool have_error; int32 val; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - val = numeric_int4_opt_error(jb->val.numeric, &have_error); - if (have_error) + val = numeric_int4_safe(jb->val.numeric, + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("argument \"%s\" of jsonpath item method .%s() is invalid for type %s", @@ -2149,11 +2151,11 @@ executeBinaryArithmExpr(JsonPathExecContext *cxt, JsonPathItem *jsp, } else { - bool error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - res = func(lval->val.numeric, rval->val.numeric, &error); + res = func(lval->val.numeric, rval->val.numeric, (Node *) &escontext); - if (error) + if (escontext.error_occurred) return jperError; } @@ -2433,7 +2435,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jsp->type != jpiDatetime && jsp->type != jpiDate && jsp->content.arg) { - bool have_error; + ErrorSaveContext escontext = {T_ErrorSaveContext}; jspGetArg(jsp, &elem); @@ -2441,9 +2443,9 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, elog(ERROR, "invalid jsonpath item type for %s argument", jspOperationName(jsp->type)); - time_precision = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + time_precision = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION), errmsg("time precision of jsonpath item method .%s() is out of range for type integer", @@ -3462,7 +3464,7 @@ getArrayIndex(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbValue *jb, JsonValueList found = {0}; JsonPathExecResult res = executeItem(cxt, jsp, jb, &found); Datum numeric_index; - bool have_error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; if (jperIsError(res)) return res; @@ -3477,10 +3479,10 @@ getArrayIndex(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbValue *jb, NumericGetDatum(jbv->val.numeric), Int32GetDatum(0)); - *index = numeric_int4_opt_error(DatumGetNumeric(numeric_index), - &have_error); + *index = numeric_int4_safe(DatumGetNumeric(numeric_index), + (Node *) &escontext); - if (have_error) + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_INVALID_SQL_JSON_SUBSCRIPT), errmsg("jsonpath array subscript is out of integer range")))); diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index dac372c3bea3b..12fa0c209127c 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -1,5 +1,15 @@ # Copyright (c) 2022-2025, PostgreSQL Global Development Group +# Some code in numeric.c benefits from auto-vectorization +numeric_backend_lib = static_library('numeric_backend_lib', + 'numeric.c', + dependencies: backend_build_deps, + kwargs: internal_lib_args, + c_args: vectorize_cflags, +) + +backend_link_with += numeric_backend_lib + backend_sources += files( 'acl.c', 'amutils.c', @@ -61,7 +71,6 @@ backend_sources += files( 'network_gist.c', 'network_selfuncs.c', 'network_spgist.c', - 'numeric.c', 'numutils.c', 'oid.c', 'oracle_compat.c', diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index b6287f5d97305..76269918593d7 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -517,7 +517,7 @@ static void numericvar_deserialize(StringInfo buf, NumericVar *var); static Numeric duplicate_numeric(Numeric num); static Numeric make_result(const NumericVar *var); -static Numeric make_result_opt_error(const NumericVar *var, bool *have_error); +static Numeric make_result_safe(const NumericVar *var, Node *escontext); static bool apply_typmod(NumericVar *var, int32 typmod, Node *escontext); static bool apply_typmod_special(Numeric num, int32 typmod, Node *escontext); @@ -717,7 +717,6 @@ numeric_in(PG_FUNCTION_ARGS) */ NumericVar value; int base; - bool have_error; init_var(&value); @@ -776,12 +775,7 @@ numeric_in(PG_FUNCTION_ARGS) if (!apply_typmod(&value, typmod, escontext)) PG_RETURN_NULL(); - res = make_result_opt_error(&value, &have_error); - - if (have_error) - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value overflows numeric format"))); + res = make_result_safe(&value, escontext); free_var(&value); } @@ -2874,20 +2868,18 @@ numeric_add(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_add_opt_error(num1, num2, NULL); + res = numeric_add_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_add_opt_error() - + * numeric_add_safe() - * - * Internal version of numeric_add(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_add() with support for soft error reporting. */ Numeric -numeric_add_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_add_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -2931,7 +2923,7 @@ numeric_add_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); add_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -2951,21 +2943,19 @@ numeric_sub(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_sub_opt_error(num1, num2, NULL); + res = numeric_sub_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_sub_opt_error() - + * numeric_sub_safe() - * - * Internal version of numeric_sub(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_sub() with support for soft error reporting. */ Numeric -numeric_sub_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_sub_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3009,7 +2999,7 @@ numeric_sub_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); sub_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -3029,21 +3019,19 @@ numeric_mul(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_mul_opt_error(num1, num2, NULL); + res = numeric_mul_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_mul_opt_error() - + * numeric_mul_safe() - * - * Internal version of numeric_mul(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_mul() with support for soft error reporting. */ Numeric -numeric_mul_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_mul_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3130,7 +3118,7 @@ numeric_mul_opt_error(Numeric num1, Numeric num2, bool *have_error) if (result.dscale > NUMERIC_DSCALE_MAX) round_var(&result, NUMERIC_DSCALE_MAX); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -3150,21 +3138,19 @@ numeric_div(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_div_opt_error(num1, num2, NULL); + res = numeric_div_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_div_opt_error() - + * numeric_div_safe() - * - * Internal version of numeric_div(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_div() with support for soft error reporting. */ Numeric -numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_div_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3172,9 +3158,6 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) Numeric res; int rscale; - if (have_error) - *have_error = false; - /* * Handle NaN and infinities */ @@ -3189,15 +3172,7 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) switch (numeric_sign_internal(num2)) { case 0: - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - break; + goto division_by_zero; case 1: return make_result(&const_pinf); case -1: @@ -3212,15 +3187,7 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) switch (numeric_sign_internal(num2)) { case 0: - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - break; + goto division_by_zero; case 1: return make_result(&const_ninf); case -1: @@ -3251,25 +3218,25 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) */ rscale = select_div_scale(&arg1, &arg2); - /* - * If "have_error" is provided, check for division by zero here - */ - if (have_error && (arg2.ndigits == 0 || arg2.digits[0] == 0)) - { - *have_error = true; - return NULL; - } + /* Check for division by zero */ + if (arg2.ndigits == 0 || arg2.digits[0] == 0) + goto division_by_zero; /* * Do the divide and return the result */ div_var(&arg1, &arg2, &result, rscale, true, true); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); return res; + +division_by_zero: + ereturn(escontext, NULL, + errcode(ERRCODE_DIVISION_BY_ZERO), + errmsg("division by zero")); } @@ -3374,30 +3341,25 @@ numeric_mod(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_mod_opt_error(num1, num2, NULL); + res = numeric_mod_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_mod_opt_error() - + * numeric_mod_safe() - * - * Internal version of numeric_mod(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_mod() with support for soft error reporting. */ Numeric -numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_mod_safe(Numeric num1, Numeric num2, Node *escontext) { Numeric res; NumericVar arg1; NumericVar arg2; NumericVar result; - if (have_error) - *have_error = false; - /* * Handle NaN and infinities. We follow POSIX fmod() on this, except that * POSIX treats x-is-infinite and y-is-zero identically, raising EDOM and @@ -3410,16 +3372,8 @@ numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) if (NUMERIC_IS_INF(num1)) { if (numeric_sign_internal(num2) == 0) - { - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - } + goto division_by_zero; + /* Inf % any nonzero = NaN */ return make_result(&const_nan); } @@ -3432,22 +3386,22 @@ numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); - /* - * If "have_error" is provided, check for division by zero here - */ - if (have_error && (arg2.ndigits == 0 || arg2.digits[0] == 0)) - { - *have_error = true; - return NULL; - } + /* Check for division by zero */ + if (arg2.ndigits == 0 || arg2.digits[0] == 0) + goto division_by_zero; mod_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, NULL); + res = make_result_safe(&result, escontext); free_var(&result); return res; + +division_by_zero: + ereturn(escontext, NULL, + errcode(ERRCODE_DIVISION_BY_ZERO), + errmsg("division by zero")); } @@ -4404,52 +4358,34 @@ int4_numeric(PG_FUNCTION_ARGS) PG_RETURN_NUMERIC(int64_to_numeric(val)); } +/* + * Internal version of int4_numeric() with support for soft error reporting. + */ int32 -numeric_int4_opt_error(Numeric num, bool *have_error) +numeric_int4_safe(Numeric num, Node *escontext) { NumericVar x; int32 result; - if (have_error) - *have_error = false; - if (NUMERIC_IS_SPECIAL(num)) { - if (have_error) - { - *have_error = true; - return 0; - } + if (NUMERIC_IS_NAN(num)) + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert NaN to %s", "integer"))); else - { - if (NUMERIC_IS_NAN(num)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert NaN to %s", "integer"))); - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert infinity to %s", "integer"))); - } + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert infinity to %s", "integer"))); } /* Convert to variable format, then convert to int4 */ init_var_from_num(num, &x); if (!numericvar_to_int32(&x, &result)) - { - if (have_error) - { - *have_error = true; - return 0; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); - } - } + ereturn(escontext, 0, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range"))); return result; } @@ -4459,7 +4395,7 @@ numeric_int4(PG_FUNCTION_ARGS) { Numeric num = PG_GETARG_NUMERIC(0); - PG_RETURN_INT32(numeric_int4_opt_error(num, NULL)); + PG_RETURN_INT32(numeric_int4_safe(num, NULL)); } /* @@ -4492,52 +4428,34 @@ int8_numeric(PG_FUNCTION_ARGS) PG_RETURN_NUMERIC(int64_to_numeric(val)); } +/* + * Internal version of int8_numeric() with support for soft error reporting. + */ int64 -numeric_int8_opt_error(Numeric num, bool *have_error) +numeric_int8_safe(Numeric num, Node *escontext) { NumericVar x; int64 result; - if (have_error) - *have_error = false; - if (NUMERIC_IS_SPECIAL(num)) { - if (have_error) - { - *have_error = true; - return 0; - } + if (NUMERIC_IS_NAN(num)) + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert NaN to %s", "bigint"))); else - { - if (NUMERIC_IS_NAN(num)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert NaN to %s", "bigint"))); - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert infinity to %s", "bigint"))); - } + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert infinity to %s", "bigint"))); } /* Convert to variable format, then convert to int8 */ init_var_from_num(num, &x); if (!numericvar_to_int64(&x, &result)) - { - if (have_error) - { - *have_error = true; - return 0; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("bigint out of range"))); - } - } + ereturn(escontext, 0, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("bigint out of range"))); return result; } @@ -4547,7 +4465,7 @@ numeric_int8(PG_FUNCTION_ARGS) { Numeric num = PG_GETARG_NUMERIC(0); - PG_RETURN_INT64(numeric_int8_opt_error(num, NULL)); + PG_RETURN_INT64(numeric_int8_safe(num, NULL)); } @@ -7583,16 +7501,13 @@ duplicate_numeric(Numeric num) } /* - * make_result_opt_error() - + * make_result_safe() - * * Create the packed db numeric format in palloc()'d memory from * a variable. This will handle NaN and Infinity cases. - * - * If "have_error" isn't NULL, on overflow *have_error is set to true and - * NULL is returned. This is helpful when caller needs to handle errors. */ static Numeric -make_result_opt_error(const NumericVar *var, bool *have_error) +make_result_safe(const NumericVar *var, Node *escontext) { Numeric result; NumericDigit *digits = var->digits; @@ -7601,9 +7516,6 @@ make_result_opt_error(const NumericVar *var, bool *have_error) int n; Size len; - if (have_error) - *have_error = false; - if ((sign & NUMERIC_SIGN_MASK) == NUMERIC_SPECIAL) { /* @@ -7676,19 +7588,9 @@ make_result_opt_error(const NumericVar *var, bool *have_error) /* Check for overflow of int16 fields */ if (NUMERIC_WEIGHT(result) != weight || NUMERIC_DSCALE(result) != var->dscale) - { - if (have_error) - { - *have_error = true; - return NULL; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value overflows numeric format"))); - } - } + ereturn(escontext, NULL, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value overflows numeric format"))); dump_numeric("make_result()", result); return result; @@ -7698,12 +7600,12 @@ make_result_opt_error(const NumericVar *var, bool *have_error) /* * make_result() - * - * An interface to make_result_opt_error() without "have_error" argument. + * An interface to make_result_safe() without "escontext" argument. */ static Numeric make_result(const NumericVar *var) { - return make_result_opt_error(var, NULL); + return make_result_safe(var, NULL); } diff --git a/src/backend/utils/adt/pg_lsn.c b/src/backend/utils/adt/pg_lsn.c index 12de2446f5b69..e1ec5f3bc69cf 100644 --- a/src/backend/utils/adt/pg_lsn.c +++ b/src/backend/utils/adt/pg_lsn.c @@ -25,8 +25,11 @@ * Formatting and conversion routines. *---------------------------------------------------------*/ +/* + * Internal version of pg_lsn_in() with support for soft error reporting. + */ XLogRecPtr -pg_lsn_in_internal(const char *str, bool *have_error) +pg_lsn_in_safe(const char *str, Node *escontext) { int len1, len2; @@ -34,22 +37,14 @@ pg_lsn_in_internal(const char *str, bool *have_error) off; XLogRecPtr result; - Assert(have_error != NULL); - *have_error = false; - /* Sanity check input format. */ len1 = strspn(str, "0123456789abcdefABCDEF"); if (len1 < 1 || len1 > MAXPG_LSNCOMPONENT || str[len1] != '/') - { - *have_error = true; - return InvalidXLogRecPtr; - } + goto syntax_error; + len2 = strspn(str + len1 + 1, "0123456789abcdefABCDEF"); if (len2 < 1 || len2 > MAXPG_LSNCOMPONENT || str[len1 + 1 + len2] != '\0') - { - *have_error = true; - return InvalidXLogRecPtr; - } + goto syntax_error; /* Decode result. */ id = (uint32) strtoul(str, NULL, 16); @@ -57,6 +52,12 @@ pg_lsn_in_internal(const char *str, bool *have_error) result = ((uint64) id << 32) | off; return result; + +syntax_error: + ereturn(escontext, InvalidXLogRecPtr, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s: \"%s\"", + "pg_lsn", str))); } Datum @@ -64,14 +65,8 @@ pg_lsn_in(PG_FUNCTION_ARGS) { char *str = PG_GETARG_CSTRING(0); XLogRecPtr result; - bool have_error = false; - - result = pg_lsn_in_internal(str, &have_error); - if (have_error) - ereturn(fcinfo->context, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s: \"%s\"", - "pg_lsn", str))); + + result = pg_lsn_in_safe(str, fcinfo->context); PG_RETURN_LSN(result); } diff --git a/src/backend/utils/adt/pseudorandomfuncs.c b/src/backend/utils/adt/pseudorandomfuncs.c index e7b8045f92508..1d2a981491bf5 100644 --- a/src/backend/utils/adt/pseudorandomfuncs.c +++ b/src/backend/utils/adt/pseudorandomfuncs.c @@ -17,6 +17,7 @@ #include "common/pg_prng.h" #include "miscadmin.h" +#include "utils/date.h" #include "utils/fmgrprotos.h" #include "utils/numeric.h" #include "utils/timestamp.h" @@ -25,6 +26,18 @@ static pg_prng_state prng_state; static bool prng_seed_set = false; +/* + * Macro for checking the range bounds of random(min, max) functions. Throws + * an error if they're the wrong way round. + */ +#define CHECK_RANGE_BOUNDS(rmin, rmax) \ + do { \ + if ((rmin) > (rmax)) \ + ereport(ERROR, \ + errcode(ERRCODE_INVALID_PARAMETER_VALUE), \ + errmsg("lower bound must be less than or equal to upper bound")); \ + } while (0) + /* * initialize_prng() - * @@ -129,10 +142,7 @@ int4random(PG_FUNCTION_ARGS) int32 rmax = PG_GETARG_INT32(1); int32 result; - if (rmin > rmax) - ereport(ERROR, - errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("lower bound must be less than or equal to upper bound")); + CHECK_RANGE_BOUNDS(rmin, rmax); initialize_prng(); @@ -153,10 +163,7 @@ int8random(PG_FUNCTION_ARGS) int64 rmax = PG_GETARG_INT64(1); int64 result; - if (rmin > rmax) - ereport(ERROR, - errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("lower bound must be less than or equal to upper bound")); + CHECK_RANGE_BOUNDS(rmin, rmax); initialize_prng(); @@ -177,9 +184,90 @@ numeric_random(PG_FUNCTION_ARGS) Numeric rmax = PG_GETARG_NUMERIC(1); Numeric result; + /* Leave range bound checking to random_numeric() */ + initialize_prng(); result = random_numeric(&prng_state, rmin, rmax); PG_RETURN_NUMERIC(result); } + + +/* + * date_random() - + * + * Returns a random date chosen uniformly in the specified range. + */ +Datum +date_random(PG_FUNCTION_ARGS) +{ + int32 rmin = (int32) PG_GETARG_DATEADT(0); + int32 rmax = (int32) PG_GETARG_DATEADT(1); + DateADT result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (DATE_IS_NOBEGIN(rmin) || DATE_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (DateADT) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_DATEADT(result); +} + +/* + * timestamp_random() - + * + * Returns a random timestamp chosen uniformly in the specified range. + */ +Datum +timestamp_random(PG_FUNCTION_ARGS) +{ + int64 rmin = (int64) PG_GETARG_TIMESTAMP(0); + int64 rmax = (int64) PG_GETARG_TIMESTAMP(1); + Timestamp result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (TIMESTAMP_IS_NOBEGIN(rmin) || TIMESTAMP_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (Timestamp) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_TIMESTAMP(result); +} + +/* + * timestamptz_random() - + * + * Returns a random timestamptz chosen uniformly in the specified range. + */ +Datum +timestamptz_random(PG_FUNCTION_ARGS) +{ + int64 rmin = (int64) PG_GETARG_TIMESTAMPTZ(0); + int64 rmax = (int64) PG_GETARG_TIMESTAMPTZ(1); + TimestampTz result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (TIMESTAMP_IS_NOBEGIN(rmin) || TIMESTAMP_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (TimestampTz) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_TIMESTAMPTZ(result); +} diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 3e5f9dc1458e1..156a4830ffda6 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -5629,11 +5629,11 @@ timestamp_part_common(PG_FUNCTION_ARGS, bool retnumeric) case DTK_JULIAN: if (retnumeric) - PG_RETURN_NUMERIC(numeric_add_opt_error(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), - numeric_div_opt_error(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), - int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), - NULL), - NULL)); + PG_RETURN_NUMERIC(numeric_add_safe(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), + numeric_div_safe(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), + int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), + NULL), + NULL)); else PG_RETURN_FLOAT8(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) + ((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + @@ -5685,11 +5685,11 @@ timestamp_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(timestamp - epoch, 6); else { - result = numeric_div_opt_error(numeric_sub_opt_error(int64_to_numeric(timestamp), - int64_to_numeric(epoch), - NULL), - int64_to_numeric(1000000), - NULL); + result = numeric_div_safe(numeric_sub_safe(int64_to_numeric(timestamp), + int64_to_numeric(epoch), + NULL), + int64_to_numeric(1000000), + NULL); result = DatumGetNumeric(DirectFunctionCall2(numeric_round, NumericGetDatum(result), Int32GetDatum(6))); @@ -5903,11 +5903,11 @@ timestamptz_part_common(PG_FUNCTION_ARGS, bool retnumeric) case DTK_JULIAN: if (retnumeric) - PG_RETURN_NUMERIC(numeric_add_opt_error(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), - numeric_div_opt_error(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), - int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), - NULL), - NULL)); + PG_RETURN_NUMERIC(numeric_add_safe(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), + numeric_div_safe(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), + int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), + NULL), + NULL)); else PG_RETURN_FLOAT8(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) + ((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + @@ -5956,11 +5956,11 @@ timestamptz_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(timestamp - epoch, 6); else { - result = numeric_div_opt_error(numeric_sub_opt_error(int64_to_numeric(timestamp), - int64_to_numeric(epoch), - NULL), - int64_to_numeric(1000000), - NULL); + result = numeric_div_safe(numeric_sub_safe(int64_to_numeric(timestamp), + int64_to_numeric(epoch), + NULL), + int64_to_numeric(1000000), + NULL); result = DatumGetNumeric(DirectFunctionCall2(numeric_round, NumericGetDatum(result), Int32GetDatum(6))); @@ -6247,9 +6247,9 @@ interval_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(val, 6); else result = - numeric_add_opt_error(int64_div_fast_to_numeric(interval->time, 6), - int64_to_numeric(secs_from_day_month), - NULL); + numeric_add_safe(int64_div_fast_to_numeric(interval->time, 6), + int64_to_numeric(secs_from_day_month), + NULL); PG_RETURN_NUMERIC(result); } diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 1aeee5be42acd..ac94b9e93c6e3 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -102,7 +102,6 @@ #include "port/pg_bitutils.h" #include "storage/shmem.h" #include "storage/spin.h" -#include "utils/dynahash.h" #include "utils/memutils.h" @@ -281,6 +280,7 @@ static bool init_htab(HTAB *hashp, int64 nelem); pg_noreturn static void hash_corrupted(HTAB *hashp); static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr); +static int my_log2(int64 num); static int64 next_pow2_int64(int64 num); static int next_pow2_int(int64 num); static void register_seq_scan(HTAB *hashp); @@ -1813,7 +1813,7 @@ hash_corrupted(HTAB *hashp) } /* calculate ceil(log base 2) of num */ -int +static int my_log2(int64 num) { /* diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index a157cec3c4d00..0da01627cfec1 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -414,6 +414,12 @@ ifdef => 'DEBUG_NODE_TESTS_ENABLED', }, +{ name => 'debug_print_raw_parse', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', + short_desc => 'Logs each query\'s raw parse tree.', + variable => 'Debug_print_raw_parse', + boot_val => 'false', +}, + { name => 'debug_print_parse', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', short_desc => 'Logs each query\'s parse tree.', variable => 'Debug_print_parse', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 787933a9e5acd..00c8376cf4ded 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -507,6 +507,7 @@ bool AllowAlterSystem = true; bool log_duration = false; bool Debug_print_plan = false; bool Debug_print_parse = false; +bool Debug_print_raw_parse = false; bool Debug_print_rewritten = false; bool Debug_pretty_print = true; diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index a9d8293474af5..26c0869356485 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -581,6 +581,7 @@ # - What to Log - +#debug_print_raw_parse = off #debug_print_parse = off #debug_print_rewritten = off #debug_print_plan = off diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index bea793456f969..b4c45ad803e94 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -1131,6 +1131,23 @@ main(int argc, char **argv) shdepend->dataObj->filtercond = "WHERE classid = 'pg_largeobject'::regclass " "AND dbid = (SELECT oid FROM pg_database " " WHERE datname = current_database())"; + + /* + * If upgrading from v16 or newer, only dump large objects with + * comments/seclabels. For these upgrades, pg_upgrade can copy/link + * pg_largeobject_metadata's files (which is usually faster) but we + * still need to dump LOs with comments/seclabels here so that the + * subsequent COMMENT and SECURITY LABEL commands work. pg_upgrade + * can't copy/link the files from older versions because aclitem + * (needed by pg_largeobject_metadata.lomacl) changed its storage + * format in v16. + */ + if (fout->remoteVersion >= 160000) + lo_metadata->dataObj->filtercond = "WHERE oid IN " + "(SELECT objoid FROM pg_description " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) " " + "UNION SELECT objoid FROM pg_seclabel " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) ")"; } /* @@ -3629,26 +3646,32 @@ dumpDatabase(Archive *fout) /* * pg_largeobject comes from the old system intact, so set its * relfrozenxids, relminmxids and relfilenode. + * + * pg_largeobject_metadata also comes from the old system intact for + * upgrades from v16 and newer, so set its relfrozenxids, relminmxids, and + * relfilenode, too. pg_upgrade can't copy/link the files from older + * versions because aclitem (needed by pg_largeobject_metadata.lomacl) + * changed its storage format in v16. */ if (dopt->binary_upgrade) { PGresult *lo_res; PQExpBuffer loFrozenQry = createPQExpBuffer(); PQExpBuffer loOutQry = createPQExpBuffer(); + PQExpBuffer lomOutQry = createPQExpBuffer(); PQExpBuffer loHorizonQry = createPQExpBuffer(); + PQExpBuffer lomHorizonQry = createPQExpBuffer(); int ii_relfrozenxid, ii_relfilenode, ii_oid, ii_relminmxid; - /* - * pg_largeobject - */ if (fout->remoteVersion >= 90300) appendPQExpBuffer(loFrozenQry, "SELECT relfrozenxid, relminmxid, relfilenode, oid\n" "FROM pg_catalog.pg_class\n" - "WHERE oid IN (%u, %u);\n", - LargeObjectRelationId, LargeObjectLOidPNIndexId); + "WHERE oid IN (%u, %u, %u, %u);\n", + LargeObjectRelationId, LargeObjectLOidPNIndexId, + LargeObjectMetadataRelationId, LargeObjectMetadataOidIndexId); else appendPQExpBuffer(loFrozenQry, "SELECT relfrozenxid, 0 AS relminmxid, relfilenode, oid\n" "FROM pg_catalog.pg_class\n" @@ -3663,35 +3686,57 @@ dumpDatabase(Archive *fout) ii_oid = PQfnumber(lo_res, "oid"); appendPQExpBufferStr(loHorizonQry, "\n-- For binary upgrade, set pg_largeobject relfrozenxid and relminmxid\n"); + appendPQExpBufferStr(lomHorizonQry, "\n-- For binary upgrade, set pg_largeobject_metadata relfrozenxid and relminmxid\n"); appendPQExpBufferStr(loOutQry, "\n-- For binary upgrade, preserve pg_largeobject and index relfilenodes\n"); + appendPQExpBufferStr(lomOutQry, "\n-- For binary upgrade, preserve pg_largeobject_metadata and index relfilenodes\n"); for (int i = 0; i < PQntuples(lo_res); ++i) { Oid oid; RelFileNumber relfilenumber; + PQExpBuffer horizonQry; + PQExpBuffer outQry; + + oid = atooid(PQgetvalue(lo_res, i, ii_oid)); + relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode)); - appendPQExpBuffer(loHorizonQry, "UPDATE pg_catalog.pg_class\n" + if (oid == LargeObjectRelationId || + oid == LargeObjectLOidPNIndexId) + { + horizonQry = loHorizonQry; + outQry = loOutQry; + } + else + { + horizonQry = lomHorizonQry; + outQry = lomOutQry; + } + + appendPQExpBuffer(horizonQry, "UPDATE pg_catalog.pg_class\n" "SET relfrozenxid = '%u', relminmxid = '%u'\n" "WHERE oid = %u;\n", atooid(PQgetvalue(lo_res, i, ii_relfrozenxid)), atooid(PQgetvalue(lo_res, i, ii_relminmxid)), atooid(PQgetvalue(lo_res, i, ii_oid))); - oid = atooid(PQgetvalue(lo_res, i, ii_oid)); - relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode)); - - if (oid == LargeObjectRelationId) - appendPQExpBuffer(loOutQry, + if (oid == LargeObjectRelationId || + oid == LargeObjectMetadataRelationId) + appendPQExpBuffer(outQry, "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n", relfilenumber); - else if (oid == LargeObjectLOidPNIndexId) - appendPQExpBuffer(loOutQry, + else if (oid == LargeObjectLOidPNIndexId || + oid == LargeObjectMetadataOidIndexId) + appendPQExpBuffer(outQry, "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", relfilenumber); } appendPQExpBufferStr(loOutQry, "TRUNCATE pg_catalog.pg_largeobject;\n"); + appendPQExpBufferStr(lomOutQry, + "TRUNCATE pg_catalog.pg_largeobject_metadata;\n"); + appendPQExpBufferStr(loOutQry, loHorizonQry->data); + appendPQExpBufferStr(lomOutQry, lomHorizonQry->data); ArchiveEntry(fout, nilCatalogId, createDumpId(), ARCHIVE_OPTS(.tag = "pg_largeobject", @@ -3699,11 +3744,20 @@ dumpDatabase(Archive *fout) .section = SECTION_PRE_DATA, .createStmt = loOutQry->data)); + if (fout->remoteVersion >= 160000) + ArchiveEntry(fout, nilCatalogId, createDumpId(), + ARCHIVE_OPTS(.tag = "pg_largeobject_metadata", + .description = "pg_largeobject_metadata", + .section = SECTION_PRE_DATA, + .createStmt = lomOutQry->data)); + PQclear(lo_res); destroyPQExpBuffer(loFrozenQry); destroyPQExpBuffer(loHorizonQry); + destroyPQExpBuffer(lomHorizonQry); destroyPQExpBuffer(loOutQry); + destroyPQExpBuffer(lomOutQry); } PQclear(res); diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index f83d2b5d30955..69fcf593caec9 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -3,8 +3,7 @@ PGFILEDESC = "pg_upgrade - an in-place binary upgrade utility" PGAPPICON = win32 -# required for 003_upgrade_logical_replication_slots.pl -EXTRA_INSTALL=contrib/test_decoding +EXTRA_INSTALL=contrib/test_decoding src/test/modules/dummy_seclabel subdir = src/bin/pg_upgrade top_builddir = ../../.. diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index c39eb077c2fae..7ce0827016803 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -498,7 +498,10 @@ get_rel_infos_query(void) * * pg_largeobject contains user data that does not appear in pg_dump * output, so we have to copy that system table. It's easiest to do that - * by treating it as a user table. + * by treating it as a user table. We can do the same for + * pg_largeobject_metadata for upgrades from v16 and newer. pg_upgrade + * can't copy/link the files from older versions because aclitem (needed + * by pg_largeobject_metadata.lomacl) changed its storage format in v16. */ appendPQExpBuffer(&query, "WITH regular_heap (reloid, indtable, toastheap) AS ( " @@ -514,10 +517,12 @@ get_rel_infos_query(void) " 'binary_upgrade', 'pg_toast') AND " " c.oid >= %u::pg_catalog.oid) OR " " (n.nspname = 'pg_catalog' AND " - " relname IN ('pg_largeobject') ))), ", + " relname IN ('pg_largeobject'%s) ))), ", (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? ", " CppAsString2(RELKIND_SEQUENCE) : "", - FirstNormalObjectId); + FirstNormalObjectId, + (GET_MAJOR_VERSION(old_cluster.major_version) >= 1600) ? + ", 'pg_largeobject_metadata'" : ""); /* * Add a CTE that collects OIDs of toast tables belonging to the tables diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index d5cd5bf0b3a6b..490e98fa26f2a 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -29,9 +29,9 @@ * We control all assignments of pg_enum.oid because these oids are stored * in user tables as enum values. * - * We control all assignments of pg_authid.oid for historical reasons (the - * oids used to be stored in pg_largeobject_metadata, which is now copied via - * SQL commands), that might change at some point in the future. + * We control all assignments of pg_authid.oid because the oids are stored in + * pg_largeobject_metadata, which is copied via file transfer for upgrades + * from v16 and newer. * * We control all assignments of pg_database.oid because we want the directory * names to match between the old and new cluster. diff --git a/src/bin/pg_upgrade/t/006_transfer_modes.pl b/src/bin/pg_upgrade/t/006_transfer_modes.pl index 348f402146234..2f68f0b56aa61 100644 --- a/src/bin/pg_upgrade/t/006_transfer_modes.pl +++ b/src/bin/pg_upgrade/t/006_transfer_modes.pl @@ -45,6 +45,22 @@ sub test_mode $old->append_conf('postgresql.conf', "allow_in_place_tablespaces = true"); } + # We can only test security labels if both the old and new installations + # have dummy_seclabel. + my $test_seclabel = 1; + $old->start; + if (!$old->check_extension('dummy_seclabel')) + { + $test_seclabel = 0; + } + $old->stop; + $new->start; + if (!$new->check_extension('dummy_seclabel')) + { + $test_seclabel = 0; + } + $new->stop; + # Create a small variety of simple test objects on the old cluster. We'll # check that these reach the new version after upgrading. $old->start; @@ -83,6 +99,29 @@ sub test_mode $old->safe_psql('testdb3', "CREATE TABLE test6 AS SELECT generate_series(607, 711)"); } + + # While we are here, test handling of large objects. + $old->safe_psql('postgres', q| + CREATE ROLE regress_lo_1; + CREATE ROLE regress_lo_2; + + SELECT lo_from_bytea(4532, '\xffffff00'); + COMMENT ON LARGE OBJECT 4532 IS 'test'; + + SELECT lo_from_bytea(4533, '\x0f0f0f0f'); + ALTER LARGE OBJECT 4533 OWNER TO regress_lo_1; + GRANT SELECT ON LARGE OBJECT 4533 TO regress_lo_2; + |); + + if ($test_seclabel) + { + $old->safe_psql('postgres', q| + CREATE EXTENSION dummy_seclabel; + + SELECT lo_from_bytea(4534, '\x00ffffff'); + SECURITY LABEL ON LARGE OBJECT 4534 IS 'classified'; + |); + } $old->stop; my $result = command_ok_or_fails_like( @@ -132,6 +171,34 @@ sub test_mode $result = $new->safe_psql('testdb3', "SELECT COUNT(*) FROM test6"); is($result, '105', "test6 data after pg_upgrade $mode"); } + + # Tests for large objects + $result = $new->safe_psql('postgres', "SELECT lo_get(4532)"); + is($result, '\xffffff00', "LO contents after upgrade"); + $result = $new->safe_psql('postgres', + "SELECT obj_description(4532, 'pg_largeobject')"); + is($result, 'test', "comment on LO after pg_upgrade"); + + $result = $new->safe_psql('postgres', "SELECT lo_get(4533)"); + is($result, '\x0f0f0f0f', "LO contents after upgrade"); + $result = $new->safe_psql('postgres', + "SELECT lomowner::regrole FROM pg_largeobject_metadata WHERE oid = 4533"); + is($result, 'regress_lo_1', "LO owner after upgrade"); + $result = $new->safe_psql('postgres', + "SELECT lomacl FROM pg_largeobject_metadata WHERE oid = 4533"); + is($result, '{regress_lo_1=rw/regress_lo_1,regress_lo_2=r/regress_lo_1}', + "LO ACL after upgrade"); + + if ($test_seclabel) + { + $result = $new->safe_psql('postgres', "SELECT lo_get(4534)"); + is($result, '\x00ffffff', "LO contents after upgrade"); + $result = $new->safe_psql('postgres', q| + SELECT label FROM pg_seclabel WHERE objoid = 4534 + AND classoid = 'pg_largeobject'::regclass + |); + is($result, 'classified', "seclabel on LO after pg_upgrade"); + } $new->stop; } diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 277df6b3cf0b3..d4c0625b63228 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -284,7 +284,6 @@ typedef struct xl_heap_update */ typedef struct xl_heap_prune { - uint8 reason; uint8 flags; /* diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 1c9e802a6b128..b2ce35e2a3407 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -121,7 +121,9 @@ typedef enum TU_UpdateIndexes /* * When table_tuple_update, table_tuple_delete, or table_tuple_lock fail * because the target tuple is already outdated, they fill in this struct to - * provide information to the caller about what happened. + * provide information to the caller about what happened. When those functions + * succeed, the contents of this struct should not be relied upon, except for + * `traversed`, which may be set in both success and failure cases. * * ctid is the target's ctid link: it is the same as the target's TID if the * target was deleted, or the location of the replacement tuple if the target @@ -137,6 +139,9 @@ typedef enum TU_UpdateIndexes * tuple); otherwise cmax is zero. (We make this restriction because * HeapTupleHeaderGetCmax doesn't work for tuples outdated in other * transactions.) + * + * traversed indicates if an update chain was followed in order to try to lock + * the target tuple. (This may be set in both success and failure cases.) */ typedef struct TM_FailureData { @@ -1508,7 +1513,7 @@ table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, * * Input parameters: * relation: relation containing tuple (caller must hold suitable lock) - * tid: TID of tuple to lock + * tid: TID of tuple to lock (updated if an update chain was followed) * snapshot: snapshot to use for visibility determinations * cid: current command ID (used for visibility test, and stored into * tuple's cmax if lock is successful) @@ -1533,8 +1538,10 @@ table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, * TM_WouldBlock: lock couldn't be acquired and wait_policy is skip * * In the failure cases other than TM_Invisible and TM_Deleted, the routine - * fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax. See - * comments for struct TM_FailureData for additional info. + * fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax. + * Additionally, in both success and failure cases, tmfd->traversed is set if + * an update chain was followed. See comments for struct TM_FailureData for + * additional info. */ static inline TM_Result table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 509bdad9a5d55..64463e9f4afb4 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -68,4 +68,6 @@ extern void TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid_res, int szgid); extern bool LookupGXactBySubid(Oid subid); +extern TransactionId TwoPhaseGetOldestXidInCommit(void); + #endif /* TWOPHASE_H */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 836369f163ef5..ef0d0f92165eb 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202509021 +#define CATALOG_VERSION_NO 202509091 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 118d6da1ace0e..03e82d28c8767 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3503,6 +3503,18 @@ proname => 'random', provolatile => 'v', proparallel => 'r', prorettype => 'numeric', proargtypes => 'numeric numeric', proargnames => '{min,max}', prosrc => 'numeric_random' }, +{ oid => '6431', descr => 'random date in range', + proname => 'random', provolatile => 'v', proparallel => 'r', + prorettype => 'date', proargtypes => 'date date', + proargnames => '{min,max}', prosrc => 'date_random' }, +{ oid => '6432', descr => 'random timestamp in range', + proname => 'random', provolatile => 'v', proparallel => 'r', + prorettype => 'timestamp', proargtypes => 'timestamp timestamp', + proargnames => '{min,max}', prosrc => 'timestamp_random' }, +{ oid => '6433', descr => 'random timestamptz in range', + proname => 'random', provolatile => 'v', proparallel => 'r', + prorettype => 'timestamptz', proargtypes => 'timestamptz timestamptz', + proargnames => '{min,max}', prosrc => 'timestamptz_random' }, { oid => '1599', descr => 'set random seed', proname => 'setseed', provolatile => 'v', proparallel => 'r', prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' }, diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index de782014b2d41..71857feae4823 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1020,7 +1020,6 @@ typedef struct SubPlanState bool havehashrows; /* true if hashtable is not empty */ bool havenullrows; /* true if hashnulls is not empty */ MemoryContext hashtablecxt; /* memory context containing hash tables */ - MemoryContext hashtempcxt; /* temp memory context for hash tables */ ExprContext *innerecontext; /* econtext for computing inner tuples */ int numCols; /* number of columns being hashed */ /* each of the remaining fields is an array of length numCols: */ diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index 62ea1a0058081..de00380261279 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -272,6 +272,7 @@ extern void ReplicationOriginNameForLogicalRep(Oid suboid, Oid relid, char *originname, Size szoriginname); extern bool AllTablesyncsReady(void); +extern bool HasSubscriptionRelationsCached(void); extern void UpdateTwoPhaseState(Oid suboid, char new_state); extern void process_syncing_tables(XLogRecPtr current_lsn); diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 52a71b138f736..dfd614f7ca449 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -80,8 +80,8 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32, * The maximum allowed value of usage_count represents a tradeoff between * accuracy and speed of the clock-sweep buffer management algorithm. A * large value (comparable to NBuffers) would approximate LRU semantics. - * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of - * clock sweeps to find a free buffer, so in practice we don't want the + * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of the + * clock-sweep hand to find a free buffer, so in practice we don't want the * value to be very large. */ #define BM_MAX_USAGE_COUNT 5 @@ -217,8 +217,7 @@ BufMappingPartitionLockByIndex(uint32 index) * single atomic variable. This layout allow us to do some operations in a * single atomic operation, without actually acquiring and releasing spinlock; * for instance, increase or decrease refcount. buf_id field never changes - * after initialization, so does not need locking. freeNext is protected by - * the buffer_strategy_lock not buffer header lock. The LWLock can take care + * after initialization, so does not need locking. The LWLock can take care * of itself. The buffer header lock is *not* used to control access to the * data in the buffer! * @@ -264,7 +263,6 @@ typedef struct BufferDesc pg_atomic_uint32 state; int wait_backend_pgprocno; /* backend of pin-count waiter */ - int freeNext; /* link in freelist chain */ PgAioWaitRef io_wref; /* set iff AIO is in progress */ LWLock content_lock; /* to lock access to buffer contents */ @@ -360,13 +358,6 @@ BufferDescriptorGetContentLock(const BufferDesc *bdesc) return (LWLock *) (&bdesc->content_lock); } -/* - * The freeNext field is either the index of the next freelist entry, - * or one of these special values: - */ -#define FREENEXT_END_OF_LIST (-1) -#define FREENEXT_NOT_IN_LIST (-2) - /* * Functions for acquiring/releasing a shared buffer header's spinlock. Do * not apply these to local buffers! @@ -444,7 +435,6 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag extern IOContext IOContextForStrategy(BufferAccessStrategy strategy); extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring); -extern void StrategyFreeBuffer(BufferDesc *buf); extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring); @@ -453,7 +443,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); -extern bool have_free_buffer(void); /* buf_table.c */ extern Size BufTableShmemSize(int size); diff --git a/src/include/utils/dynahash.h b/src/include/utils/dynahash.h deleted file mode 100644 index a4362d3f65e59..0000000000000 --- a/src/include/utils/dynahash.h +++ /dev/null @@ -1,20 +0,0 @@ -/*------------------------------------------------------------------------- - * - * dynahash.h - * POSTGRES dynahash.h file definitions - * - * - * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * IDENTIFICATION - * src/include/utils/dynahash.h - * - *------------------------------------------------------------------------- - */ -#ifndef DYNAHASH_H -#define DYNAHASH_H - -extern int my_log2(int64 num); - -#endif /* DYNAHASH_H */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 72981053e610f..f21ec37da8933 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -106,7 +106,7 @@ typedef enum * will show as "default" in pg_settings. If there is a specific reason not * to want that, use source == PGC_S_OVERRIDE. * - * NB: see GucSource_Names in guc.c if you change this. + * NB: see GucSource_Names in guc_tables.c if you change this. */ typedef enum { @@ -247,6 +247,7 @@ typedef enum /* GUC vars that are actually defined in guc_tables.c, rather than elsewhere */ extern PGDLLIMPORT bool Debug_print_plan; extern PGDLLIMPORT bool Debug_print_parse; +extern PGDLLIMPORT bool Debug_print_raw_parse; extern PGDLLIMPORT bool Debug_print_rewritten; extern PGDLLIMPORT bool Debug_pretty_print; diff --git a/src/include/utils/numeric.h b/src/include/utils/numeric.h index 9e79fc376cbea..215f1ea4f53b4 100644 --- a/src/include/utils/numeric.h +++ b/src/include/utils/numeric.h @@ -17,6 +17,9 @@ #include "common/pg_prng.h" #include "fmgr.h" +/* forward declaration to avoid node.h include */ +typedef struct Node Node; + /* * Limits on the precision and scale specifiable in a NUMERIC typmod. The * precision is strictly positive, but the scale may be positive or negative. @@ -91,18 +94,13 @@ extern char *numeric_normalize(Numeric num); extern Numeric int64_to_numeric(int64 val); extern Numeric int64_div_fast_to_numeric(int64 val1, int log10val2); -extern Numeric numeric_add_opt_error(Numeric num1, Numeric num2, - bool *have_error); -extern Numeric numeric_sub_opt_error(Numeric num1, Numeric num2, - bool *have_error); -extern Numeric numeric_mul_opt_error(Numeric num1, Numeric num2, - bool *have_error); -extern Numeric numeric_div_opt_error(Numeric num1, Numeric num2, - bool *have_error); -extern Numeric numeric_mod_opt_error(Numeric num1, Numeric num2, - bool *have_error); -extern int32 numeric_int4_opt_error(Numeric num, bool *have_error); -extern int64 numeric_int8_opt_error(Numeric num, bool *have_error); +extern Numeric numeric_add_safe(Numeric num1, Numeric num2, Node *escontext); +extern Numeric numeric_sub_safe(Numeric num1, Numeric num2, Node *escontext); +extern Numeric numeric_mul_safe(Numeric num1, Numeric num2, Node *escontext); +extern Numeric numeric_div_safe(Numeric num1, Numeric num2, Node *escontext); +extern Numeric numeric_mod_safe(Numeric num1, Numeric num2, Node *escontext); +extern int32 numeric_int4_safe(Numeric num, Node *escontext); +extern int64 numeric_int8_safe(Numeric num, Node *escontext); extern Numeric random_numeric(pg_prng_state *state, Numeric rmin, Numeric rmax); diff --git a/src/include/utils/pg_lsn.h b/src/include/utils/pg_lsn.h index ae198af745029..461a4fdcba954 100644 --- a/src/include/utils/pg_lsn.h +++ b/src/include/utils/pg_lsn.h @@ -18,6 +18,9 @@ #include "access/xlogdefs.h" #include "fmgr.h" +/* forward declaration to avoid node.h include */ +typedef struct Node Node; + static inline XLogRecPtr DatumGetLSN(Datum X) { @@ -33,6 +36,6 @@ LSNGetDatum(XLogRecPtr X) #define PG_GETARG_LSN(n) DatumGetLSN(PG_GETARG_DATUM(n)) #define PG_RETURN_LSN(x) return LSNGetDatum(x) -extern XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error); +extern XLogRecPtr pg_lsn_in_safe(const char *str, Node *escontext); #endif /* PG_LSN_H */ diff --git a/src/test/isolation/expected/merge-match-recheck.out b/src/test/isolation/expected/merge-match-recheck.out index 90300f1db5ab3..4250b85af2d3c 100644 --- a/src/test/isolation/expected/merge-match-recheck.out +++ b/src/test/isolation/expected/merge-match-recheck.out @@ -271,6 +271,151 @@ key|balance|status|val step c1: COMMIT; +starting permutation: update1 update6 merge_bal c2 select1 c1 +step update1: UPDATE target t SET balance = balance + 10, val = t.val || ' updated by update1' WHERE t.key = 1; +step update6: UPDATE target t SET balance = balance - 100, val = t.val || ' updated by update6' WHERE t.key = 1; +step merge_bal: + MERGE INTO target t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal: <... completed> +step select1: SELECT * FROM target; +key|balance|status|val +---+-------+------+------------------------------------------------- + 1| 140|s1 |setup updated by update1 updated by update6 when1 +(1 row) + +step c1: COMMIT; + +starting permutation: update1_pa update6_pa merge_bal_pa c2 select1_pa c1 +step update1_pa: UPDATE target_pa t SET balance = balance + 10, val = t.val || ' updated by update1_pa' WHERE t.key = 1; +step update6_pa: UPDATE target_pa t SET balance = balance - 100, val = t.val || ' updated by update6_pa' WHERE t.key = 1; +step merge_bal_pa: + MERGE INTO target_pa t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal_pa: <... completed> +step select1_pa: SELECT * FROM target_pa; +key|balance|status|val +---+-------+------+------------------------------------------------------- + 1| 140|s1 |setup updated by update1_pa updated by update6_pa when1 +(1 row) + +step c1: COMMIT; + +starting permutation: update1_tg update6_tg merge_bal_tg c2 select1_tg c1 +s2: NOTICE: Update: (1,160,s1,setup) -> (1,170,s1,"setup updated by update1_tg") +step update1_tg: UPDATE target_tg t SET balance = balance + 10, val = t.val || ' updated by update1_tg' WHERE t.key = 1; +s2: NOTICE: Update: (1,170,s1,"setup updated by update1_tg") -> (1,70,s1,"setup updated by update1_tg updated by update6_tg") +step update6_tg: UPDATE target_tg t SET balance = balance - 100, val = t.val || ' updated by update6_tg' WHERE t.key = 1; +step merge_bal_tg: + WITH t AS ( + MERGE INTO target_tg t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3' + RETURNING t.* + ) + SELECT * FROM t; + +step c2: COMMIT; +s1: NOTICE: Update: (1,70,s1,"setup updated by update1_tg updated by update6_tg") -> (1,140,s1,"setup updated by update1_tg updated by update6_tg when1") +step merge_bal_tg: <... completed> +key|balance|status|val +---+-------+------+------------------------------------------------------- + 1| 140|s1 |setup updated by update1_tg updated by update6_tg when1 +(1 row) + +step select1_tg: SELECT * FROM target_tg; +key|balance|status|val +---+-------+------+------------------------------------------------------- + 1| 140|s1 |setup updated by update1_tg updated by update6_tg when1 +(1 row) + +step c1: COMMIT; + +starting permutation: update7 update6 merge_bal c2 select1 c1 +step update7: UPDATE target t SET balance = 350, val = t.val || ' updated by update7' WHERE t.key = 1; +step update6: UPDATE target t SET balance = balance - 100, val = t.val || ' updated by update6' WHERE t.key = 1; +step merge_bal: + MERGE INTO target t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal: <... completed> +step select1: SELECT * FROM target; +key|balance|status|val +---+-------+------+------------------------------------------------- + 1| 2000|s1 |setup updated by update7 updated by update6 when3 +(1 row) + +step c1: COMMIT; + +starting permutation: update1_pa_move merge_bal_pa c2 c1 +step update1_pa_move: UPDATE target_pa t SET balance = 210, val = t.val || ' updated by update1_pa_move' WHERE t.key = 1; +step merge_bal_pa: + MERGE INTO target_pa t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal_pa: <... completed> +ERROR: tuple to be locked was already moved to another partition due to concurrent update +step c1: COMMIT; + +starting permutation: update1_pa update1_pa_move merge_bal_pa c2 c1 +step update1_pa: UPDATE target_pa t SET balance = balance + 10, val = t.val || ' updated by update1_pa' WHERE t.key = 1; +step update1_pa_move: UPDATE target_pa t SET balance = 210, val = t.val || ' updated by update1_pa_move' WHERE t.key = 1; +step merge_bal_pa: + MERGE INTO target_pa t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal_pa: <... completed> +ERROR: tuple to be locked was already moved to another partition due to concurrent update +step c1: COMMIT; + starting permutation: update1 merge_delete c2 select1 c1 step update1: UPDATE target t SET balance = balance + 10, val = t.val || ' updated by update1' WHERE t.key = 1; step merge_delete: diff --git a/src/test/isolation/specs/merge-match-recheck.spec b/src/test/isolation/specs/merge-match-recheck.spec index 15226e40c9efc..6e7a776d17e5a 100644 --- a/src/test/isolation/specs/merge-match-recheck.spec +++ b/src/test/isolation/specs/merge-match-recheck.spec @@ -146,6 +146,8 @@ setup BEGIN ISOLATION LEVEL READ COMMITTED; } step "update1" { UPDATE target t SET balance = balance + 10, val = t.val || ' updated by update1' WHERE t.key = 1; } +step "update1_pa" { UPDATE target_pa t SET balance = balance + 10, val = t.val || ' updated by update1_pa' WHERE t.key = 1; } +step "update1_pa_move" { UPDATE target_pa t SET balance = 210, val = t.val || ' updated by update1_pa_move' WHERE t.key = 1; } step "update1_tg" { UPDATE target_tg t SET balance = balance + 10, val = t.val || ' updated by update1_tg' WHERE t.key = 1; } step "update2" { UPDATE target t SET status = 's2', val = t.val || ' updated by update2' WHERE t.key = 1; } step "update2_tg" { UPDATE target_tg t SET status = 's2', val = t.val || ' updated by update2_tg' WHERE t.key = 1; } @@ -153,6 +155,10 @@ step "update3" { UPDATE target t SET status = 's3', val = t.val || ' updated by step "update3_tg" { UPDATE target_tg t SET status = 's3', val = t.val || ' updated by update3_tg' WHERE t.key = 1; } step "update5" { UPDATE target t SET status = 's5', val = t.val || ' updated by update5' WHERE t.key = 1; } step "update5_tg" { UPDATE target_tg t SET status = 's5', val = t.val || ' updated by update5_tg' WHERE t.key = 1; } +step "update6" { UPDATE target t SET balance = balance - 100, val = t.val || ' updated by update6' WHERE t.key = 1; } +step "update6_pa" { UPDATE target_pa t SET balance = balance - 100, val = t.val || ' updated by update6_pa' WHERE t.key = 1; } +step "update6_tg" { UPDATE target_tg t SET balance = balance - 100, val = t.val || ' updated by update6_tg' WHERE t.key = 1; } +step "update7" { UPDATE target t SET balance = 350, val = t.val || ' updated by update7' WHERE t.key = 1; } step "update_bal1" { UPDATE target t SET balance = 50, val = t.val || ' updated by update_bal1' WHERE t.key = 1; } step "update_bal1_pa" { UPDATE target_pa t SET balance = 50, val = t.val || ' updated by update_bal1_pa' WHERE t.key = 1; } step "update_bal1_tg" { UPDATE target_tg t SET balance = 50, val = t.val || ' updated by update_bal1_tg' WHERE t.key = 1; } @@ -179,6 +185,18 @@ permutation "update_bal1" "merge_bal" "c2" "select1" "c1" permutation "update_bal1_pa" "merge_bal_pa" "c2" "select1_pa" "c1" permutation "update_bal1_tg" "merge_bal_tg" "c2" "select1_tg" "c1" +# merge_bal sees row concurrently updated twice and rechecks WHEN conditions, different check passes, so final balance = 140 +permutation "update1" "update6" "merge_bal" "c2" "select1" "c1" +permutation "update1_pa" "update6_pa" "merge_bal_pa" "c2" "select1_pa" "c1" +permutation "update1_tg" "update6_tg" "merge_bal_tg" "c2" "select1_tg" "c1" + +# merge_bal sees row concurrently updated twice, first update would cause all checks to fail, second update causes different check to pass, so final balance = 2000 +permutation "update7" "update6" "merge_bal" "c2" "select1" "c1" + +# merge_bal sees concurrently updated row moved to new partition, so fails +permutation "update1_pa_move" "merge_bal_pa" "c2" "c1" +permutation "update1_pa" "update1_pa_move" "merge_bal_pa" "c2" "c1" + # merge_delete sees concurrently updated row and rechecks WHEN conditions, but recheck passes and row is deleted permutation "update1" "merge_delete" "c2" "select1" "c1" permutation "update1_tg" "merge_delete_tg" "c2" "select1_tg" "c1" diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c index 8c0367eeee424..e963466aef1cd 100644 --- a/src/test/modules/test_slru/test_slru.c +++ b/src/test/modules/test_slru/test_slru.c @@ -219,8 +219,8 @@ test_slru_shmem_startup(void) */ const bool long_segment_names = true; const char slru_dir_name[] = "pg_test_slru"; - int test_tranche_id; - int test_buffer_tranche_id; + int test_tranche_id = -1; + int test_buffer_tranche_id = -1; if (prev_shmem_startup_hook) prev_shmem_startup_hook(); @@ -231,10 +231,18 @@ test_slru_shmem_startup(void) */ (void) MakePGDirectory(slru_dir_name); - /* initialize the SLRU facility */ - test_tranche_id = LWLockNewTrancheId("test_slru_tranche"); - - test_buffer_tranche_id = LWLockNewTrancheId("test_buffer_tranche"); + /* + * Initialize the SLRU facility. In EXEC_BACKEND builds, the + * shmem_startup_hook is called in the postmaster and in each backend, but + * we only need to generate the LWLock tranches once. Note that these + * tranche ID variables are not used by SimpleLruInit() when + * IsUnderPostmaster is true. + */ + if (!IsUnderPostmaster) + { + test_tranche_id = LWLockNewTrancheId("test_slru_tranche"); + test_buffer_tranche_id = LWLockNewTrancheId("test_buffer_tranche"); + } TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically; SimpleLruInit(TestSlruCtl, "TestSLRU", diff --git a/src/test/regress/expected/create_table_like.out b/src/test/regress/expected/create_table_like.out index 29a779c2e9072..d3c35c148475d 100644 --- a/src/test/regress/expected/create_table_like.out +++ b/src/test/regress/expected/create_table_like.out @@ -320,6 +320,7 @@ DROP TABLE inhz; -- including storage and comments CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) ENFORCED PRIMARY KEY, b text CHECK (length(b) > 100) NOT ENFORCED); +ALTER TABLE ctlt1 ADD CONSTRAINT cc CHECK (length(b) > 100) NOT VALID; CREATE INDEX ctlt1_b_key ON ctlt1 (b); CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b)); CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1; @@ -378,6 +379,7 @@ SELECT conname, description FROM pg_description, pg_constraint c WHERE classoid CREATE TABLE ctlt1_inh (LIKE ctlt1 INCLUDING CONSTRAINTS INCLUDING COMMENTS) INHERITS (ctlt1); NOTICE: merging column "a" with inherited definition NOTICE: merging column "b" with inherited definition +NOTICE: merging constraint "cc" with inherited definition NOTICE: merging constraint "ctlt1_a_check" with inherited definition NOTICE: merging constraint "ctlt1_b_check" with inherited definition \d+ ctlt1_inh @@ -387,6 +389,7 @@ NOTICE: merging constraint "ctlt1_b_check" with inherited definition a | text | | not null | | main | | A b | text | | | | extended | | B Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Not-null constraints: @@ -409,6 +412,7 @@ NOTICE: merging multiple inherited definitions of column "a" b | text | | | | extended | | c | text | | | | external | | Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED "ctlt3_a_check" CHECK (length(a) < 5) @@ -430,6 +434,7 @@ NOTICE: merging column "a" with inherited definition Indexes: "ctlt13_like_expr_idx" btree ((a || c)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED "ctlt3_a_check" CHECK (length(a) < 5) @@ -456,6 +461,7 @@ Indexes: "ctlt_all_b_idx" btree (b) "ctlt_all_expr_idx" btree ((a || b)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Statistics objects: @@ -499,6 +505,7 @@ Indexes: "pg_attrdef_b_idx" btree (b) "pg_attrdef_expr_idx" btree ((a || b)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Statistics objects: @@ -524,6 +531,7 @@ Indexes: "ctlt1_b_idx" btree (b) "ctlt1_expr_idx" btree ((a || b)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Statistics objects: diff --git a/src/test/regress/expected/memoize.out b/src/test/regress/expected/memoize.out index 150dc1b44cf62..fbcaf113266c5 100644 --- a/src/test/regress/expected/memoize.out +++ b/src/test/regress/expected/memoize.out @@ -545,15 +545,15 @@ EXPLAIN (COSTS OFF) SELECT * FROM tab_anti t1 WHERE t1.a IN (SELECT a FROM tab_anti t2 WHERE t2.b IN (SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0)); - QUERY PLAN -------------------------------------------------- + QUERY PLAN +--------------------------------------------------- Nested Loop Semi Join -> Seq Scan on tab_anti t1 -> Nested Loop Semi Join Join Filter: (t1.a = t2.a) -> Seq Scan on tab_anti t2 - -> Subquery Scan on "ANY_subquery" - Filter: (t2.b = "ANY_subquery".b) + -> Subquery Scan on unnamed_subquery + Filter: (t2.b = unnamed_subquery.b) -> Result One-Time Filter: (t2.a > 1) -> Seq Scan on tab_anti t3 diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index d1966cd7d829f..68ecd95180920 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -4763,7 +4763,7 @@ select min(a) over (partition by a order by a) from part_abc where a >= stable_o QUERY PLAN ---------------------------------------------------------------------------------------------- Append - -> Subquery Scan on "*SELECT* 1_1" + -> Subquery Scan on unnamed_subquery_2 -> WindowAgg Window: w1 AS (PARTITION BY part_abc.a ORDER BY part_abc.a) -> Append @@ -4780,7 +4780,7 @@ select min(a) over (partition by a order by a) from part_abc where a >= stable_o -> Index Scan using part_abc_3_2_a_idx on part_abc_3_2 part_abc_4 Index Cond: (a >= (stable_one() + 1)) Filter: (d <= stable_one()) - -> Subquery Scan on "*SELECT* 2" + -> Subquery Scan on unnamed_subquery_1 -> WindowAgg Window: w1 AS (PARTITION BY part_abc_5.a ORDER BY part_abc_5.a) -> Append diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out index 43cf88a36341b..7f17b2a1b12f8 100644 --- a/src/test/regress/expected/random.out +++ b/src/test/regress/expected/random.out @@ -536,3 +536,90 @@ SELECT n, random(0, trim_scale(abs(1 - 10.0^(-n)))) FROM generate_series(-20, 20 20 | 0.60795101234744211935 (41 rows) +-- random dates +SELECT random('1979-02-08'::date,'2025-07-03'::date) AS random_date_multiple_years; + random_date_multiple_years +---------------------------- + 04-09-1986 +(1 row) + +SELECT random('4714-11-24 BC'::date,'5874897-12-31 AD'::date) AS random_date_maximum_range; + random_date_maximum_range +--------------------------- + 10-02-2898131 +(1 row) + +SELECT random('1979-02-08'::date,'1979-02-08'::date) AS random_date_empty_range; + random_date_empty_range +------------------------- + 02-08-1979 +(1 row) + +SELECT random('2024-12-31'::date, '2024-01-01'::date); -- fail +ERROR: lower bound must be less than or equal to upper bound +SELECT random('-infinity'::date, '2024-01-01'::date); -- fail +ERROR: lower and upper bounds must be finite +SELECT random('2024-12-31'::date, 'infinity'::date); -- fail +ERROR: lower and upper bounds must be finite +-- random timestamps +SELECT random('1979-02-08'::timestamp,'2025-07-03'::timestamp) AS random_timestamp_multiple_years; + random_timestamp_multiple_years +--------------------------------- + Fri Jan 27 18:52:05.366009 2017 +(1 row) + +SELECT random('4714-11-24 BC'::timestamp,'294276-12-31 23:59:59.999999'::timestamp) AS random_timestamp_maximum_range; + random_timestamp_maximum_range +----------------------------------- + Wed Mar 28 00:45:36.180395 226694 +(1 row) + +SELECT random('2024-07-01 12:00:00.000001'::timestamp, '2024-07-01 12:00:00.999999'::timestamp) AS random_narrow_range; + random_narrow_range +--------------------------------- + Mon Jul 01 12:00:00.999286 2024 +(1 row) + +SELECT random('1979-02-08'::timestamp,'1979-02-08'::timestamp) AS random_timestamp_empty_range; + random_timestamp_empty_range +------------------------------ + Thu Feb 08 00:00:00 1979 +(1 row) + +SELECT random('2024-12-31'::timestamp, '2024-01-01'::timestamp); -- fail +ERROR: lower bound must be less than or equal to upper bound +SELECT random('-infinity'::timestamp, '2024-01-01'::timestamp); -- fail +ERROR: lower and upper bounds must be finite +SELECT random('2024-12-31'::timestamp, 'infinity'::timestamp); -- fail +ERROR: lower and upper bounds must be finite +-- random timestamps with timezone +SELECT random('1979-02-08 +01'::timestamptz,'2025-07-03 +02'::timestamptz) AS random_timestamptz_multiple_years; + random_timestamptz_multiple_years +------------------------------------- + Tue Jun 14 04:41:16.652896 2016 PDT +(1 row) + +SELECT random('4714-11-24 BC +00'::timestamptz,'294276-12-31 23:59:59.999999 +00'::timestamptz) AS random_timestamptz_maximum_range; + random_timestamptz_maximum_range +-------------------------------------- + Wed Mar 26 14:07:16.980265 31603 PDT +(1 row) + +SELECT random('2024-07-01 12:00:00.000001 +04'::timestamptz, '2024-07-01 12:00:00.999999 +04'::timestamptz) AS random_timestamptz_narrow_range; + random_timestamptz_narrow_range +------------------------------------- + Mon Jul 01 01:00:00.835808 2024 PDT +(1 row) + +SELECT random('1979-02-08 +05'::timestamptz,'1979-02-08 +05'::timestamptz) AS random_timestamptz_empty_range; + random_timestamptz_empty_range +-------------------------------- + Wed Feb 07 11:00:00 1979 PST +(1 row) + +SELECT random('2024-01-01 +06'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +ERROR: lower bound must be less than or equal to upper bound +SELECT random('-infinity'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +ERROR: lower and upper bounds must be finite +SELECT random('2024-01-01 +06'::timestamptz, 'infinity'::timestamptz); -- fail +ERROR: lower and upper bounds must be finite diff --git a/src/test/regress/expected/rangefuncs.out b/src/test/regress/expected/rangefuncs.out index c21be83aa4aaf..30241e22da270 100644 --- a/src/test/regress/expected/rangefuncs.out +++ b/src/test/regress/expected/rangefuncs.out @@ -2130,10 +2130,10 @@ select testrngfunc(); explain (verbose, costs off) select * from testrngfunc(); - QUERY PLAN ----------------------------------------------------------- - Subquery Scan on "*SELECT*" - Output: "*SELECT*"."?column?", "*SELECT*"."?column?_1" + QUERY PLAN +---------------------------------------------------------------------- + Subquery Scan on unnamed_subquery + Output: unnamed_subquery."?column?", unnamed_subquery."?column?_1" -> Unique Output: (1), (2) -> Sort diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c16dff05bc12e..7a1c216a0b1b7 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1692,14 +1692,14 @@ select * from int4_tbl o where (f1, f1) in ------------------------------------------------------------------- Nested Loop Semi Join Output: o.f1 - Join Filter: (o.f1 = "ANY_subquery".f1) + Join Filter: (o.f1 = unnamed_subquery.f1) -> Seq Scan on public.int4_tbl o Output: o.f1 -> Materialize - Output: "ANY_subquery".f1, "ANY_subquery".g - -> Subquery Scan on "ANY_subquery" - Output: "ANY_subquery".f1, "ANY_subquery".g - Filter: ("ANY_subquery".f1 = "ANY_subquery".g) + Output: unnamed_subquery.f1, unnamed_subquery.g + -> Subquery Scan on unnamed_subquery + Output: unnamed_subquery.f1, unnamed_subquery.g + Filter: (unnamed_subquery.f1 = unnamed_subquery.g) -> Result Output: i.f1, ((generate_series(1, 50)) / 10) -> ProjectSet @@ -2867,8 +2867,8 @@ ON B.hundred in (SELECT min(c.hundred) FROM tenk2 C WHERE c.odd = b.odd); -> Memoize Cache Key: b.hundred, b.odd Cache Mode: binary - -> Subquery Scan on "ANY_subquery" - Filter: (b.hundred = "ANY_subquery".min) + -> Subquery Scan on unnamed_subquery + Filter: (b.hundred = unnamed_subquery.min) -> Result InitPlan 1 -> Limit diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out index 96962817ed45a..d3ea433db1577 100644 --- a/src/test/regress/expected/union.out +++ b/src/test/regress/expected/union.out @@ -942,7 +942,7 @@ SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1; ERROR: column "q2" does not exist LINE 1: ... int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1... ^ -DETAIL: There is a column named "q2" in table "*SELECT* 2", but it cannot be referenced from this part of the query. +DETAIL: There is a column named "q2" in table "unnamed_subquery", but it cannot be referenced from this part of the query. -- But this should work: SELECT q1 FROM int8_tbl EXCEPT (((SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1))) ORDER BY 1; q1 @@ -1338,14 +1338,14 @@ where q2 = q2; ---------------------------------------------------------- Unique -> Merge Append - Sort Key: "*SELECT* 1".q1 - -> Subquery Scan on "*SELECT* 1" + Sort Key: unnamed_subquery.q1 + -> Subquery Scan on unnamed_subquery -> Unique -> Sort Sort Key: i81.q1, i81.q2 -> Seq Scan on int8_tbl i81 Filter: (q2 IS NOT NULL) - -> Subquery Scan on "*SELECT* 2" + -> Subquery Scan on unnamed_subquery_1 -> Unique -> Sort Sort Key: i82.q1, i82.q2 @@ -1374,14 +1374,14 @@ where -q1 = q2; -------------------------------------------------------- Unique -> Merge Append - Sort Key: "*SELECT* 1".q1 - -> Subquery Scan on "*SELECT* 1" + Sort Key: unnamed_subquery.q1 + -> Subquery Scan on unnamed_subquery -> Unique -> Sort Sort Key: i81.q1, i81.q2 -> Seq Scan on int8_tbl i81 Filter: ((- q1) = q2) - -> Subquery Scan on "*SELECT* 2" + -> Subquery Scan on unnamed_subquery_1 -> Unique -> Sort Sort Key: i82.q1, i82.q2 diff --git a/src/test/regress/sql/create_table_like.sql b/src/test/regress/sql/create_table_like.sql index bf8702116a74b..93389b57dbf95 100644 --- a/src/test/regress/sql/create_table_like.sql +++ b/src/test/regress/sql/create_table_like.sql @@ -130,6 +130,7 @@ DROP TABLE inhz; -- including storage and comments CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) ENFORCED PRIMARY KEY, b text CHECK (length(b) > 100) NOT ENFORCED); +ALTER TABLE ctlt1 ADD CONSTRAINT cc CHECK (length(b) > 100) NOT VALID; CREATE INDEX ctlt1_b_key ON ctlt1 (b); CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b)); CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1; diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql index ebfa7539ede25..890f14687ef98 100644 --- a/src/test/regress/sql/random.sql +++ b/src/test/regress/sql/random.sql @@ -277,3 +277,29 @@ SELECT random(-1e30, 1e30) FROM generate_series(1, 10); SELECT random(-0.4, 0.4) FROM generate_series(1, 10); SELECT random(0, 1 - 1e-30) FROM generate_series(1, 10); SELECT n, random(0, trim_scale(abs(1 - 10.0^(-n)))) FROM generate_series(-20, 20) n; + +-- random dates +SELECT random('1979-02-08'::date,'2025-07-03'::date) AS random_date_multiple_years; +SELECT random('4714-11-24 BC'::date,'5874897-12-31 AD'::date) AS random_date_maximum_range; +SELECT random('1979-02-08'::date,'1979-02-08'::date) AS random_date_empty_range; +SELECT random('2024-12-31'::date, '2024-01-01'::date); -- fail +SELECT random('-infinity'::date, '2024-01-01'::date); -- fail +SELECT random('2024-12-31'::date, 'infinity'::date); -- fail + +-- random timestamps +SELECT random('1979-02-08'::timestamp,'2025-07-03'::timestamp) AS random_timestamp_multiple_years; +SELECT random('4714-11-24 BC'::timestamp,'294276-12-31 23:59:59.999999'::timestamp) AS random_timestamp_maximum_range; +SELECT random('2024-07-01 12:00:00.000001'::timestamp, '2024-07-01 12:00:00.999999'::timestamp) AS random_narrow_range; +SELECT random('1979-02-08'::timestamp,'1979-02-08'::timestamp) AS random_timestamp_empty_range; +SELECT random('2024-12-31'::timestamp, '2024-01-01'::timestamp); -- fail +SELECT random('-infinity'::timestamp, '2024-01-01'::timestamp); -- fail +SELECT random('2024-12-31'::timestamp, 'infinity'::timestamp); -- fail + +-- random timestamps with timezone +SELECT random('1979-02-08 +01'::timestamptz,'2025-07-03 +02'::timestamptz) AS random_timestamptz_multiple_years; +SELECT random('4714-11-24 BC +00'::timestamptz,'294276-12-31 23:59:59.999999 +00'::timestamptz) AS random_timestamptz_maximum_range; +SELECT random('2024-07-01 12:00:00.000001 +04'::timestamptz, '2024-07-01 12:00:00.999999 +04'::timestamptz) AS random_timestamptz_narrow_range; +SELECT random('1979-02-08 +05'::timestamptz,'1979-02-08 +05'::timestamptz) AS random_timestamptz_empty_range; +SELECT random('2024-01-01 +06'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +SELECT random('-infinity'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +SELECT random('2024-01-01 +06'::timestamptz, 'infinity'::timestamptz); -- fail diff --git a/src/test/subscription/Makefile b/src/test/subscription/Makefile index 50b65d8f6ea21..9d97e7d5c0d6d 100644 --- a/src/test/subscription/Makefile +++ b/src/test/subscription/Makefile @@ -13,9 +13,11 @@ subdir = src/test/subscription top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -EXTRA_INSTALL = contrib/hstore +EXTRA_INSTALL = contrib/hstore \ + src/test/modules/injection_points export with_icu +export enable_injection_points check: $(prove_check) diff --git a/src/test/subscription/meson.build b/src/test/subscription/meson.build index 586ffba434e11..20b4e523d9307 100644 --- a/src/test/subscription/meson.build +++ b/src/test/subscription/meson.build @@ -5,7 +5,10 @@ tests += { 'sd': meson.current_source_dir(), 'bd': meson.current_build_dir(), 'tap': { - 'env': {'with_icu': icu.found() ? 'yes' : 'no'}, + 'env': { + 'with_icu': icu.found() ? 'yes' : 'no', + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, 'tests': [ 't/001_rep_changes.pl', 't/002_types.pl', diff --git a/src/test/subscription/t/035_conflicts.pl b/src/test/subscription/t/035_conflicts.pl index 51b23a39fa935..db0d5b464e825 100644 --- a/src/test/subscription/t/035_conflicts.pl +++ b/src/test/subscription/t/035_conflicts.pl @@ -386,6 +386,195 @@ .*Remote row \(2, 4\); replica identity full \(2, 2\)/, 'update target row was deleted in tab'); +############################################################################### +# Check that the xmin value of the conflict detection slot can be advanced when +# the subscription has no tables. +############################################################################### + +# Remove the table from the publication +$node_B->safe_psql('postgres', "ALTER PUBLICATION tap_pub_B DROP TABLE tab"); + +$node_A->safe_psql('postgres', + "ALTER SUBSCRIPTION $subname_AB REFRESH PUBLICATION"); + +# Remember the next transaction ID to be assigned +$next_xid = $node_A->safe_psql('postgres', "SELECT txid_current() + 1;"); + +# Confirm that the xmin value is advanced to the latest nextXid. If no +# transactions are running, the apply worker selects nextXid as the candidate +# for the non-removable xid. See GetOldestActiveTransactionId(). +ok( $node_A->poll_query_until( + 'postgres', + "SELECT xmin = $next_xid from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the xmin value of slot 'pg_conflict_detection' is updated on Node A"); + +# Re-add the table to the publication for further tests +$node_B->safe_psql('postgres', "ALTER PUBLICATION tap_pub_B ADD TABLE tab"); + +$node_A->safe_psql('postgres', + "ALTER SUBSCRIPTION $subname_AB REFRESH PUBLICATION WITH (copy_data = false)"); + +############################################################################### +# Test that publisher's transactions marked with DELAY_CHKPT_IN_COMMIT prevent +# concurrently deleted tuples on the subscriber from being removed. This test +# also acts as a safeguard to prevent developers from moving the commit +# timestamp acquisition before marking DELAY_CHKPT_IN_COMMIT in +# RecordTransactionCommitPrepared. +############################################################################### + +my $injection_points_supported = $node_B->check_extension('injection_points'); + +# This test depends on an injection point to block the prepared transaction +# commit after marking DELAY_CHKPT_IN_COMMIT flag. +if ($injection_points_supported != 0) +{ + $node_B->append_conf('postgresql.conf', + "shared_preload_libraries = 'injection_points' + max_prepared_transactions = 1"); + $node_B->restart; + + # Disable the subscription on Node B for testing only one-way + # replication. + $node_B->psql('postgres', "ALTER SUBSCRIPTION $subname_BA DISABLE;"); + + # Wait for the apply worker to stop + $node_B->poll_query_until('postgres', + "SELECT count(*) = 0 FROM pg_stat_activity WHERE backend_type = 'logical replication apply worker'" + ); + + # Truncate the table to cleanup existing dead rows in the table. Then insert + # a new row. + $node_B->safe_psql( + 'postgres', qq( + TRUNCATE tab; + INSERT INTO tab VALUES(1, 1); + )); + + $node_B->wait_for_catchup($subname_AB); + + # Create the injection_points extension on the publisher node and attach to the + # commit-after-delay-checkpoint injection point. + $node_B->safe_psql( + 'postgres', + "CREATE EXTENSION injection_points; + SELECT injection_points_attach('commit-after-delay-checkpoint', 'wait');" + ); + + # Start a background session on the publisher node to perform an update and + # pause at the injection point. + my $pub_session = $node_B->background_psql('postgres'); + $pub_session->query_until( + qr/starting_bg_psql/, + q{ + \echo starting_bg_psql + BEGIN; + UPDATE tab SET b = 2 WHERE a = 1; + PREPARE TRANSACTION 'txn_with_later_commit_ts'; + COMMIT PREPARED 'txn_with_later_commit_ts'; + } + ); + + # Confirm the update is suspended + $result = + $node_B->safe_psql('postgres', 'SELECT * FROM tab WHERE a = 1'); + is($result, qq(1|1), 'publisher sees the old row'); + + # Delete the row on the subscriber. The deleted row should be retained due to a + # transaction on the publisher, which is currently marked with the + # DELAY_CHKPT_IN_COMMIT flag. + $node_A->safe_psql('postgres', "DELETE FROM tab WHERE a = 1;"); + + # Get the commit timestamp for the delete + my $sub_ts = $node_A->safe_psql('postgres', + "SELECT timestamp FROM pg_last_committed_xact();"); + + $log_location = -s $node_A->logfile; + + # Confirm that the apply worker keeps requesting publisher status, while + # awaiting the prepared transaction to commit. Thus, the request log should + # appear more than once. + $node_A->wait_for_log( + qr/sending publisher status request message/, + $log_location); + + $log_location = -s $node_A->logfile; + + $node_A->wait_for_log( + qr/sending publisher status request message/, + $log_location); + + # Confirm that the dead tuple cannot be removed + ($cmdret, $stdout, $stderr) = + $node_A->psql('postgres', qq(VACUUM (verbose) public.tab;)); + + ok($stderr =~ qr/1 are dead but not yet removable/, + 'the deleted column is non-removable'); + + $log_location = -s $node_A->logfile; + + # Wakeup and detach the injection point on the publisher node. The prepared + # transaction should now commit. + $node_B->safe_psql( + 'postgres', + "SELECT injection_points_wakeup('commit-after-delay-checkpoint'); + SELECT injection_points_detach('commit-after-delay-checkpoint');" + ); + + # Close the background session on the publisher node + ok($pub_session->quit, "close publisher session"); + + # Confirm that the transaction committed + $result = + $node_B->safe_psql('postgres', 'SELECT * FROM tab WHERE a = 1'); + is($result, qq(1|2), 'publisher sees the new row'); + + # Ensure the UPDATE is replayed on subscriber + $node_B->wait_for_catchup($subname_AB); + + $logfile = slurp_file($node_A->logfile(), $log_location); + ok( $logfile =~ + qr/conflict detected on relation "public.tab": conflict=update_deleted.* +.*DETAIL:.* The row to be updated was deleted locally in transaction [0-9]+ at .* +.*Remote row \(1, 2\); replica identity full \(1, 1\)/, + 'update target row was deleted in tab'); + + # Remember the next transaction ID to be assigned + $next_xid = + $node_A->safe_psql('postgres', "SELECT txid_current() + 1;"); + + # Confirm that the xmin value is advanced to the latest nextXid after the + # prepared transaction on the publisher has been committed. + ok( $node_A->poll_query_until( + 'postgres', + "SELECT xmin = $next_xid from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the xmin value of slot 'pg_conflict_detection' is updated on subscriber" + ); + + # Confirm that the dead tuple can be removed now + ($cmdret, $stdout, $stderr) = + $node_A->psql('postgres', qq(VACUUM (verbose) public.tab;)); + + ok($stderr =~ qr/1 removed, 0 remain, 0 are dead but not yet removable/, + 'the deleted column is removed'); + + # Get the commit timestamp for the publisher's update + my $pub_ts = $node_B->safe_psql('postgres', + "SELECT pg_xact_commit_timestamp(xmin) from tab where a=1;"); + + # Check that the commit timestamp for the update on the publisher is later than + # or equal to the timestamp of the local deletion, as the commit timestamp + # should be assigned after marking the DELAY_CHKPT_IN_COMMIT flag. + $result = $node_B->safe_psql('postgres', + "SELECT '$pub_ts'::timestamp >= '$sub_ts'::timestamp"); + is($result, qq(t), + "pub UPDATE's timestamp is later than that of sub's DELETE"); + + # Re-enable the subscription for further tests + $node_B->psql('postgres', "ALTER SUBSCRIPTION $subname_BA ENABLE;"); +} + ############################################################################### # Check that dead tuple retention stops due to the wait time surpassing # max_retention_duration.