From 38b602b0289fe1dbaf31d5737fba2d42a1e90371 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Wed, 3 Sep 2025 13:57:48 -0500 Subject: [PATCH 01/73] Move dynamically-allocated LWLock tranche names to shared memory. There are two ways for shared libraries to allocate their own LWLock tranches. One way is to call RequestNamedLWLockTranche() in a shmem_request_hook, which requires the library to be loaded via shared_preload_libraries. The other way is to call LWLockNewTrancheId(), which is not subject to the same restrictions. However, LWLockNewTrancheId() does require each backend to store the tranche's name in backend-local memory via LWLockRegisterTranche(). This API is a little cumbersome and leads to things like unhelpful pg_stat_activity.wait_event values in backends that haven't loaded the library. This commit moves these LWLock tranche names to shared memory, thus eliminating the need for each backend to call LWLockRegisterTranche(). Instead, the tranche name must be provided to LWLockNewTrancheId(), which immediately makes the name available to all backends. Since the tranche name array is append-only, lookups can ordinarily avoid locking as long as their local copy of the LWLock counter is greater than the requested tranche ID. One downside of this approach is that we now have a hard limit on both the length of tranche names (NAMEDATALEN-1 bytes) and the number of dynamically-allocated tranches (256). Besides a limit of NAMEDATALEN-1 bytes for tranche names registered via RequestNamedLWLockTranche(), no such limits previously existed. We could avoid these new limits by using dynamic shared memory, but the complexity involved didn't seem worth it. We briefly considered making the tranche limit user-configurable but ultimately decided against that, too. Since there is still a lot of time left in the v19 development cycle, it's possible we will revisit this choice. 
Author: Sami Imseih Reviewed-by: Bertrand Drouvot Reviewed-by: Tom Lane Reviewed-by: Rahila Syed Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CAA5RZ0vvED3naph8My8Szv6DL4AxOVK3eTPS0qXsaKi%3DbVdW2A%40mail.gmail.com --- contrib/pg_prewarm/autoprewarm.c | 3 +- doc/src/sgml/xfunc.sgml | 15 +- src/backend/postmaster/launch_backend.c | 6 +- src/backend/storage/ipc/dsm_registry.c | 12 +- src/backend/storage/lmgr/lwlock.c | 216 +++++++++--------- src/include/storage/lwlock.h | 25 +- src/test/modules/test_dsa/test_dsa.c | 6 +- .../test_dsm_registry/test_dsm_registry.c | 3 +- .../modules/test_radixtree/test_radixtree.c | 9 +- src/test/modules/test_slru/test_slru.c | 6 +- .../modules/test_tidstore/test_tidstore.c | 3 +- 11 files changed, 130 insertions(+), 174 deletions(-) diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index c01b9c7e6a4d6..880e897796a1e 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -864,7 +864,7 @@ apw_init_state(void *ptr) { AutoPrewarmSharedState *state = (AutoPrewarmSharedState *) ptr; - LWLockInitialize(&state->lock, LWLockNewTrancheId()); + LWLockInitialize(&state->lock, LWLockNewTrancheId("autoprewarm")); state->bgworker_pid = InvalidPid; state->pid_using_dumpfile = InvalidPid; } @@ -883,7 +883,6 @@ apw_init_shmem(void) sizeof(AutoPrewarmSharedState), apw_init_state, &found); - LWLockRegisterTranche(apw_state->lock.tranche, "autoprewarm"); return found; } diff --git a/doc/src/sgml/xfunc.sgml b/doc/src/sgml/xfunc.sgml index f116d0648e559..da21ef5689184 100644 --- a/doc/src/sgml/xfunc.sgml +++ b/doc/src/sgml/xfunc.sgml @@ -3759,7 +3759,7 @@ LWLockPadded *GetNamedLWLockTranche(const char *tranche_name) shmem_request_hook. To do so, first allocate a tranche_id by calling: -int LWLockNewTrancheId(void) +int LWLockNewTrancheId(const char *name) Next, initialize each LWLock, passing the new tranche_id as an argument: @@ -3777,17 +3777,8 @@ void LWLockInitialize(LWLock *lock, int tranche_id) - Finally, each backend using the tranche_id should - associate it with a tranche_name by calling: - -void LWLockRegisterTranche(int tranche_id, const char *tranche_name) - - - - - A complete usage example of LWLockNewTrancheId, - LWLockInitialize, and - LWLockRegisterTranche can be found in + A complete usage example of LWLockNewTrancheId and + LWLockInitialize can be found in contrib/pg_prewarm/autoprewarm.c in the PostgreSQL source tree. 
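The documentation hunk above shows the slimmed-down two-call pattern. For illustration, here is a minimal sketch of how an extension's shmem_startup_hook might use the new API, modeled on the autoprewarm code referenced in the docs. The MyExtSharedState struct, the "my_ext" name, and the hook function are hypothetical, and the matching RequestAddinShmemSpace() call in a shmem_request_hook is omitted for brevity:

#include "postgres.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"

/* Hypothetical shared state for an example extension. */
typedef struct MyExtSharedState
{
	LWLock		lock;
	int64		counter;
} MyExtSharedState;

static MyExtSharedState *my_ext_state = NULL;

static void
my_ext_shmem_startup(void)
{
	bool		found;

	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
	my_ext_state = ShmemInitStruct("my_ext", sizeof(MyExtSharedState), &found);
	if (!found)
	{
		/*
		 * LWLockNewTrancheId() now both allocates the tranche ID and
		 * publishes the name in shared memory, so no per-backend
		 * LWLockRegisterTranche() call is needed afterwards.
		 */
		LWLockInitialize(&my_ext_state->lock, LWLockNewTrancheId("my_ext"));
		my_ext_state->counter = 0;
	}
	LWLockRelease(AddinShmemInitLock);
}

Because the name lives in the main shared memory segment, any backend that later waits on this lock reports "my_ext" in pg_stat_activity.wait_event even if it never ran the extension's initialization code itself.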
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index cd9547b03a32b..a38979c50e4bb 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -101,7 +101,7 @@ typedef struct struct InjectionPointsCtl *ActiveInjectionPoints; #endif int NamedLWLockTrancheRequests; - NamedLWLockTranche *NamedLWLockTrancheArray; + char **LWLockTrancheNames; int *LWLockCounter; LWLockPadded *MainLWLockArray; slock_t *ProcStructLock; @@ -761,7 +761,7 @@ save_backend_variables(BackendParameters *param, #endif param->NamedLWLockTrancheRequests = NamedLWLockTrancheRequests; - param->NamedLWLockTrancheArray = NamedLWLockTrancheArray; + param->LWLockTrancheNames = LWLockTrancheNames; param->LWLockCounter = LWLockCounter; param->MainLWLockArray = MainLWLockArray; param->ProcStructLock = ProcStructLock; @@ -1022,7 +1022,7 @@ restore_backend_variables(BackendParameters *param) #endif NamedLWLockTrancheRequests = param->NamedLWLockTrancheRequests; - NamedLWLockTrancheArray = param->NamedLWLockTrancheArray; + LWLockTrancheNames = param->LWLockTrancheNames; LWLockCounter = param->LWLockCounter; MainLWLockArray = param->MainLWLockArray; ProcStructLock = param->ProcStructLock; diff --git a/src/backend/storage/ipc/dsm_registry.c b/src/backend/storage/ipc/dsm_registry.c index ca12815f4a854..971309251062d 100644 --- a/src/backend/storage/ipc/dsm_registry.c +++ b/src/backend/storage/ipc/dsm_registry.c @@ -299,8 +299,7 @@ GetNamedDSA(const char *name, bool *found) entry->type = DSMR_ENTRY_TYPE_DSA; /* Initialize the LWLock tranche for the DSA. */ - state->tranche = LWLockNewTrancheId(); - LWLockRegisterTranche(state->tranche, name); + state->tranche = LWLockNewTrancheId(name); /* Initialize the DSA. */ ret = dsa_create(state->tranche); @@ -321,9 +320,6 @@ GetNamedDSA(const char *name, bool *found) ereport(ERROR, (errmsg("requested DSA already attached to current process"))); - /* Initialize existing LWLock tranche for the DSA. */ - LWLockRegisterTranche(state->tranche, name); - /* Attach to existing DSA. */ ret = dsa_attach(state->handle); dsa_pin_mapping(ret); @@ -378,8 +374,7 @@ GetNamedDSHash(const char *name, const dshash_parameters *params, bool *found) entry->type = DSMR_ENTRY_TYPE_DSH; /* Initialize the LWLock tranche for the hash table. */ - dsh_state->tranche = LWLockNewTrancheId(); - LWLockRegisterTranche(dsh_state->tranche, name); + dsh_state->tranche = LWLockNewTrancheId(name); /* Initialize the DSA for the hash table. */ dsa = dsa_create(dsh_state->tranche); @@ -409,9 +404,6 @@ GetNamedDSHash(const char *name, const dshash_parameters *params, bool *found) ereport(ERROR, (errmsg("requested DSHash already attached to current process"))); - /* Initialize existing LWLock tranche for the hash table. */ - LWLockRegisterTranche(dsh_state->tranche, name); - /* Attach to existing DSA for the hash table. */ dsa = dsa_attach(dsh_state->dsa_handle); dsa_pin_mapping(dsa); diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index a4aecd1fbc34f..258cdebd0f5c9 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -126,8 +126,8 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, * in lwlocklist.h. We absorb the names of these tranches, too. * * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche - * or LWLockRegisterTranche. The names of these that are known in the current - * process appear in LWLockTrancheNames[]. + * or LWLockNewTrancheId. 
These names are stored in shared memory and can be + * accessed via LWLockTrancheNames. * * All these names are user-visible as wait event names, so choose with care * ... and do not forget to update the documentation's list of wait events. @@ -146,11 +146,12 @@ StaticAssertDecl(lengthof(BuiltinTrancheNames) == /* * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and - * stores the names of all dynamically-created tranches known to the current - * process. Any unused entries in the array will contain NULL. + * points to the shared memory locations of the names of all + * dynamically-created tranches. Backends inherit the pointer by fork from the + * postmaster (except in the EXEC_BACKEND case, where we have special measures + * to pass it down). */ -static const char **LWLockTrancheNames = NULL; -static int LWLockTrancheNamesAllocated = 0; +char **LWLockTrancheNames = NULL; /* * This points to the main array of LWLocks in shared memory. Backends inherit @@ -184,20 +185,22 @@ typedef struct NamedLWLockTrancheRequest } NamedLWLockTrancheRequest; static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL; -static int NamedLWLockTrancheRequestsAllocated = 0; /* - * NamedLWLockTrancheRequests is both the valid length of the request array, - * and the length of the shared-memory NamedLWLockTrancheArray later on. - * This variable and NamedLWLockTrancheArray are non-static so that - * postmaster.c can copy them to child processes in EXEC_BACKEND builds. + * NamedLWLockTrancheRequests is the valid length of the request array. This + * variable is non-static so that postmaster.c can copy it to child processes + * in EXEC_BACKEND builds. */ int NamedLWLockTrancheRequests = 0; -/* points to data in shared memory: */ -NamedLWLockTranche *NamedLWLockTrancheArray = NULL; +/* shared memory counter of registered tranches */ int *LWLockCounter = NULL; +/* backend-local counter of registered tranches */ +static int LocalLWLockCounter; + +#define MAX_NAMED_TRANCHES 256 + static void InitializeLWLocks(void); static inline void LWLockReportWaitStart(LWLock *lock); static inline void LWLockReportWaitEnd(void); @@ -392,31 +395,28 @@ Size LWLockShmemSize(void) { Size size; - int i; int numLocks = NUM_FIXED_LWLOCKS; /* Calculate total number of locks needed in the main array. */ numLocks += NumLWLocksForNamedTranches(); - /* Space for dynamic allocation counter, plus room for alignment. */ - size = sizeof(int) + LWLOCK_PADDED_SIZE; + /* Space for dynamic allocation counter. */ + size = MAXALIGN(sizeof(int)); - /* Space for the LWLock array. */ - size = add_size(size, mul_size(numLocks, sizeof(LWLockPadded))); + /* Space for named tranches. */ + size = add_size(size, mul_size(MAX_NAMED_TRANCHES, sizeof(char *))); + size = add_size(size, mul_size(MAX_NAMED_TRANCHES, NAMEDATALEN)); - /* space for named tranches. */ - size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche))); - - /* space for name of each tranche. */ - for (i = 0; i < NamedLWLockTrancheRequests; i++) - size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1); + /* Space for the LWLock array, plus room for cache line alignment. */ + size = add_size(size, LWLOCK_PADDED_SIZE); + size = add_size(size, mul_size(numLocks, sizeof(LWLockPadded))); return size; } /* * Allocate shmem space for the main LWLock array and all tranches and - * initialize it. We also register extension LWLock tranches here. + * initialize it. 
*/ void CreateLWLocks(void) @@ -432,7 +432,16 @@ CreateLWLocks(void) /* Initialize the dynamic-allocation counter for tranches */ LWLockCounter = (int *) ptr; *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED; - ptr += sizeof(int); + ptr += MAXALIGN(sizeof(int)); + + /* Initialize tranche names */ + LWLockTrancheNames = (char **) ptr; + ptr += MAX_NAMED_TRANCHES * sizeof(char *); + for (int i = 0; i < MAX_NAMED_TRANCHES; i++) + { + LWLockTrancheNames[i] = ptr; + ptr += NAMEDATALEN; + } /* Ensure desired alignment of LWLock array */ ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE; @@ -441,11 +450,6 @@ CreateLWLocks(void) /* Initialize all LWLocks */ InitializeLWLocks(); } - - /* Register named extension LWLock tranches in the current process. */ - for (int i = 0; i < NamedLWLockTrancheRequests; i++) - LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId, - NamedLWLockTrancheArray[i].trancheName); } /* @@ -454,7 +458,6 @@ CreateLWLocks(void) static void InitializeLWLocks(void) { - int numNamedLocks = NumLWLocksForNamedTranches(); int id; int i; int j; @@ -485,32 +488,18 @@ InitializeLWLocks(void) */ if (NamedLWLockTrancheRequests > 0) { - char *trancheNames; - - NamedLWLockTrancheArray = (NamedLWLockTranche *) - &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks]; - - trancheNames = (char *) NamedLWLockTrancheArray + - (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche)); lock = &MainLWLockArray[NUM_FIXED_LWLOCKS]; for (i = 0; i < NamedLWLockTrancheRequests; i++) { NamedLWLockTrancheRequest *request; - NamedLWLockTranche *tranche; - char *name; + int tranche; request = &NamedLWLockTrancheRequestArray[i]; - tranche = &NamedLWLockTrancheArray[i]; - - name = trancheNames; - trancheNames += strlen(request->tranche_name) + 1; - strcpy(name, request->tranche_name); - tranche->trancheId = LWLockNewTrancheId(); - tranche->trancheName = name; + tranche = LWLockNewTrancheId(request->tranche_name); for (j = 0; j < request->num_lwlocks; j++, lock++) - LWLockInitialize(&lock->lock, tranche->trancheId); + LWLockInitialize(&lock->lock, tranche); } } } @@ -562,59 +551,47 @@ GetNamedLWLockTranche(const char *tranche_name) } /* - * Allocate a new tranche ID. + * Allocate a new tranche ID with the provided name. */ int -LWLockNewTrancheId(void) +LWLockNewTrancheId(const char *name) { int result; - /* We use the ShmemLock spinlock to protect LWLockCounter */ - SpinLockAcquire(ShmemLock); - result = (*LWLockCounter)++; - SpinLockRelease(ShmemLock); + if (!name) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("tranche name cannot be NULL"))); - return result; -} + if (strlen(name) >= NAMEDATALEN) + ereport(ERROR, + (errcode(ERRCODE_NAME_TOO_LONG), + errmsg("tranche name too long"), + errdetail("LWLock tranche names must be no longer than %d bytes.", + NAMEDATALEN - 1))); -/* - * Register a dynamic tranche name in the lookup table of the current process. - * - * This routine will save a pointer to the tranche name passed as an argument, - * so the name should be allocated in a backend-lifetime context - * (shared memory, TopMemoryContext, static constant, or similar). - * - * The tranche name will be user-visible as a wait event name, so try to - * use a name that fits the style for those. - */ -void -LWLockRegisterTranche(int tranche_id, const char *tranche_name) -{ - /* This should only be called for user-defined tranches. */ - if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED) - return; - - /* Convert to array index. 
*/ - tranche_id -= LWTRANCHE_FIRST_USER_DEFINED; + /* + * We use the ShmemLock spinlock to protect LWLockCounter and + * LWLockTrancheNames. + */ + SpinLockAcquire(ShmemLock); - /* If necessary, create or enlarge array. */ - if (tranche_id >= LWLockTrancheNamesAllocated) + if (*LWLockCounter - LWTRANCHE_FIRST_USER_DEFINED >= MAX_NAMED_TRANCHES) { - int newalloc; + SpinLockRelease(ShmemLock); + ereport(ERROR, + (errmsg("maximum number of tranches already registered"), + errdetail("No more than %d tranches may be registered.", + MAX_NAMED_TRANCHES))); + } - newalloc = pg_nextpower2_32(Max(8, tranche_id + 1)); + result = (*LWLockCounter)++; + LocalLWLockCounter = *LWLockCounter; + strlcpy(LWLockTrancheNames[result - LWTRANCHE_FIRST_USER_DEFINED], name, NAMEDATALEN); - if (LWLockTrancheNames == NULL) - LWLockTrancheNames = (const char **) - MemoryContextAllocZero(TopMemoryContext, - newalloc * sizeof(char *)); - else - LWLockTrancheNames = - repalloc0_array(LWLockTrancheNames, const char *, LWLockTrancheNamesAllocated, newalloc); - LWLockTrancheNamesAllocated = newalloc; - } + SpinLockRelease(ShmemLock); - LWLockTrancheNames[tranche_id] = tranche_name; + return result; } /* @@ -637,27 +614,33 @@ RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks) if (!process_shmem_requests_in_progress) elog(FATAL, "cannot request additional LWLocks outside shmem_request_hook"); + if (!tranche_name) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("tranche name cannot be NULL"))); + + if (strlen(tranche_name) >= NAMEDATALEN) + ereport(ERROR, + (errcode(ERRCODE_NAME_TOO_LONG), + errmsg("tranche name too long"), + errdetail("LWLock tranche names must be no longer than %d bytes.", + NAMEDATALEN - 1))); + if (NamedLWLockTrancheRequestArray == NULL) { - NamedLWLockTrancheRequestsAllocated = 16; NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) MemoryContextAlloc(TopMemoryContext, - NamedLWLockTrancheRequestsAllocated + MAX_NAMED_TRANCHES * sizeof(NamedLWLockTrancheRequest)); } - if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated) - { - int i = pg_nextpower2_32(NamedLWLockTrancheRequests + 1); - - NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) - repalloc(NamedLWLockTrancheRequestArray, - i * sizeof(NamedLWLockTrancheRequest)); - NamedLWLockTrancheRequestsAllocated = i; - } + if (NamedLWLockTrancheRequests >= MAX_NAMED_TRANCHES) + ereport(ERROR, + (errmsg("maximum number of tranches already registered"), + errdetail("No more than %d tranches may be registered.", + MAX_NAMED_TRANCHES))); request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests]; - Assert(strlen(tranche_name) + 1 <= NAMEDATALEN); strlcpy(request->tranche_name, tranche_name, NAMEDATALEN); request->num_lwlocks = num_lwlocks; NamedLWLockTrancheRequests++; @@ -669,6 +652,9 @@ RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks) void LWLockInitialize(LWLock *lock, int tranche_id) { + /* verify the tranche_id is valid */ + (void) GetLWTrancheName(tranche_id); + pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK); #ifdef LOCK_DEBUG pg_atomic_init_u32(&lock->nwaiters, 0); @@ -710,15 +696,27 @@ GetLWTrancheName(uint16 trancheId) return BuiltinTrancheNames[trancheId]; /* - * It's an extension tranche, so look in LWLockTrancheNames[]. However, - * it's possible that the tranche has never been registered in the current - * process, in which case give up and return "extension". 
+ * We only ever add new entries to LWLockTrancheNames, so most lookups can + * avoid taking the spinlock as long as the backend-local counter + * (LocalLWLockCounter) is greater than the requested tranche ID. Else, + * we need to first update the backend-local counter with ShmemLock held + * before attempting the lookup again. In practice, the latter case is + * probably rare. */ - trancheId -= LWTRANCHE_FIRST_USER_DEFINED; + if (trancheId >= LocalLWLockCounter) + { + SpinLockAcquire(ShmemLock); + LocalLWLockCounter = *LWLockCounter; + SpinLockRelease(ShmemLock); + + if (trancheId >= LocalLWLockCounter) + elog(ERROR, "tranche %d is not registered", trancheId); + } - if (trancheId >= LWLockTrancheNamesAllocated || - LWLockTrancheNames[trancheId] == NULL) - return "extension"; + /* + * It's an extension tranche, so look in LWLockTrancheNames. + */ + trancheId -= LWTRANCHE_FIRST_USER_DEFINED; return LWLockTrancheNames[trancheId]; } diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index f9cf57f8d266d..0e9cf81a4c766 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -73,14 +73,7 @@ typedef union LWLockPadded extern PGDLLIMPORT LWLockPadded *MainLWLockArray; -/* struct for storing named tranche information */ -typedef struct NamedLWLockTranche -{ - int trancheId; - char *trancheName; -} NamedLWLockTranche; - -extern PGDLLIMPORT NamedLWLockTranche *NamedLWLockTrancheArray; +extern PGDLLIMPORT char **LWLockTrancheNames; extern PGDLLIMPORT int NamedLWLockTrancheRequests; extern PGDLLIMPORT int *LWLockCounter; @@ -158,19 +151,11 @@ extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name); /* * There is another, more flexible method of obtaining lwlocks. First, call - * LWLockNewTrancheId just once to obtain a tranche ID; this allocates from - * a shared counter. Next, each individual process using the tranche should - * call LWLockRegisterTranche() to associate that tranche ID with a name. - * Finally, LWLockInitialize should be called just once per lwlock, passing - * the tranche ID as an argument. - * - * It may seem strange that each process using the tranche must register it - * separately, but dynamic shared memory segments aren't guaranteed to be - * mapped at the same address in all coordinating backends, so storing the - * registration in the main shared memory segment wouldn't work for that case. + * LWLockNewTrancheId to obtain a tranche ID; this allocates from a shared + * counter. Second, LWLockInitialize should be called just once per lwlock, + * passing the tranche ID as an argument. 
*/ -extern int LWLockNewTrancheId(void); -extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name); +extern int LWLockNewTrancheId(const char *name); extern void LWLockInitialize(LWLock *lock, int tranche_id); /* diff --git a/src/test/modules/test_dsa/test_dsa.c b/src/test/modules/test_dsa/test_dsa.c index cd24d0f48736d..01d5c6fa67f0e 100644 --- a/src/test/modules/test_dsa/test_dsa.c +++ b/src/test/modules/test_dsa/test_dsa.c @@ -29,8 +29,7 @@ test_dsa_basic(PG_FUNCTION_ARGS) dsa_pointer p[100]; /* XXX: this tranche is leaked */ - tranche_id = LWLockNewTrancheId(); - LWLockRegisterTranche(tranche_id, "test_dsa"); + tranche_id = LWLockNewTrancheId("test_dsa"); a = dsa_create(tranche_id); for (int i = 0; i < 100; i++) @@ -70,8 +69,7 @@ test_dsa_resowners(PG_FUNCTION_ARGS) ResourceOwner childowner; /* XXX: this tranche is leaked */ - tranche_id = LWLockNewTrancheId(); - LWLockRegisterTranche(tranche_id, "test_dsa"); + tranche_id = LWLockNewTrancheId("test_dsa"); /* Create DSA in parent resource owner */ a = dsa_create(tranche_id); diff --git a/src/test/modules/test_dsm_registry/test_dsm_registry.c b/src/test/modules/test_dsm_registry/test_dsm_registry.c index 141c8ed1b34e3..4cc2ccdac3f11 100644 --- a/src/test/modules/test_dsm_registry/test_dsm_registry.c +++ b/src/test/modules/test_dsm_registry/test_dsm_registry.c @@ -48,7 +48,7 @@ init_tdr_dsm(void *ptr) { TestDSMRegistryStruct *dsm = (TestDSMRegistryStruct *) ptr; - LWLockInitialize(&dsm->lck, LWLockNewTrancheId()); + LWLockInitialize(&dsm->lck, LWLockNewTrancheId("test_dsm_registry")); dsm->val = 0; } @@ -61,7 +61,6 @@ tdr_attach_shmem(void) sizeof(TestDSMRegistryStruct), init_tdr_dsm, &found); - LWLockRegisterTranche(tdr_dsm->lck.tranche, "test_dsm_registry"); if (tdr_dsa == NULL) tdr_dsa = GetNamedDSA("test_dsm_registry_dsa", &found); diff --git a/src/test/modules/test_radixtree/test_radixtree.c b/src/test/modules/test_radixtree/test_radixtree.c index 80ad029616473..787162c879330 100644 --- a/src/test/modules/test_radixtree/test_radixtree.c +++ b/src/test/modules/test_radixtree/test_radixtree.c @@ -124,10 +124,9 @@ test_empty(void) rt_iter *iter; uint64 key; #ifdef TEST_SHARED_RT - int tranche_id = LWLockNewTrancheId(); + int tranche_id = LWLockNewTrancheId("test_radix_tree"); dsa_area *dsa; - LWLockRegisterTranche(tranche_id, "test_radix_tree"); dsa = dsa_create(tranche_id); radixtree = rt_create(dsa, tranche_id); #else @@ -167,10 +166,9 @@ test_basic(rt_node_class_test_elem *test_info, int shift, bool asc) uint64 *keys; int children = test_info->nkeys; #ifdef TEST_SHARED_RT - int tranche_id = LWLockNewTrancheId(); + int tranche_id = LWLockNewTrancheId("test_radix_tree"); dsa_area *dsa; - LWLockRegisterTranche(tranche_id, "test_radix_tree"); dsa = dsa_create(tranche_id); radixtree = rt_create(dsa, tranche_id); #else @@ -304,10 +302,9 @@ test_random(void) int num_keys = 100000; uint64 *keys; #ifdef TEST_SHARED_RT - int tranche_id = LWLockNewTrancheId(); + int tranche_id = LWLockNewTrancheId("test_radix_tree"); dsa_area *dsa; - LWLockRegisterTranche(tranche_id, "test_radix_tree"); dsa = dsa_create(tranche_id); radixtree = rt_create(dsa, tranche_id); #else diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c index 32750930e433d..8c0367eeee424 100644 --- a/src/test/modules/test_slru/test_slru.c +++ b/src/test/modules/test_slru/test_slru.c @@ -232,11 +232,9 @@ test_slru_shmem_startup(void) (void) MakePGDirectory(slru_dir_name); /* initialize the SLRU facility */ - test_tranche_id = 
LWLockNewTrancheId(); - LWLockRegisterTranche(test_tranche_id, "test_slru_tranche"); + test_tranche_id = LWLockNewTrancheId("test_slru_tranche"); - test_buffer_tranche_id = LWLockNewTrancheId(); - LWLockRegisterTranche(test_tranche_id, "test_buffer_tranche"); + test_buffer_tranche_id = LWLockNewTrancheId("test_buffer_tranche"); TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically; SimpleLruInit(TestSlruCtl, "TestSLRU", diff --git a/src/test/modules/test_tidstore/test_tidstore.c b/src/test/modules/test_tidstore/test_tidstore.c index eb16e0fbfa647..0c8f43867e55e 100644 --- a/src/test/modules/test_tidstore/test_tidstore.c +++ b/src/test/modules/test_tidstore/test_tidstore.c @@ -103,8 +103,7 @@ test_create(PG_FUNCTION_ARGS) { int tranche_id; - tranche_id = LWLockNewTrancheId(); - LWLockRegisterTranche(tranche_id, "test_tidstore"); + tranche_id = LWLockNewTrancheId("test_tidstore"); tidstore = TidStoreCreateShared(tidstore_max_size, tranche_id); From e351e5c4fea463d9f96557913f7f767af3795c32 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 3 Sep 2025 16:07:57 -0400 Subject: [PATCH 02/73] Make libpq_pipeline.c shorter and more uniform via helper functions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are many places in this test program that need to consume a PGresult while checking that its PQresultStatus is as-expected, or related tasks such as checking that PQgetResult has nothing more to return. These tasks were open-coded in a rather inconsistent way, leading to some outright bugs, some memory leakage, and frequent inconsistencies about what would be reported in event of an error. Invent a few helper functions to standardize the behavior and reduce code duplication. Also, rename the one pre-existing helper function from confirm_query_canceled to consume_query_cancel, per Álvaro's suggestion that "confirm" is a poor choice of verb for a function that will discard the PGresult. While at it, clean up assorted other places that were leaking PGresults or even server connections. This is pure neatnik-ism, since the test doesn't run long enough for those leaks to be of any real-world concern. While this fixes some things that are clearly bugs, it's only a test program, and none of the bugs seem serious enough to justify back-patching. Bug: #18960 Reported-by: Dmitry Kovalenko Author: Tom Lane Reviewed-by: Álvaro Herrera Discussion: https://postgr.es/m/18960-09cd4a5100152e58@postgresql.org --- .../modules/libpq_pipeline/libpq_pipeline.c | 549 +++++++----------- 1 file changed, 201 insertions(+), 348 deletions(-) diff --git a/src/test/modules/libpq_pipeline/libpq_pipeline.c b/src/test/modules/libpq_pipeline/libpq_pipeline.c index 9a3c0236325c6..b3af70fa09bf8 100644 --- a/src/test/modules/libpq_pipeline/libpq_pipeline.c +++ b/src/test/modules/libpq_pipeline/libpq_pipeline.c @@ -88,20 +88,67 @@ pg_fatal_impl(int line, const char *fmt,...) } /* - * Check that the query on the given connection got canceled. + * Check that libpq next returns a PGresult with the specified status, + * returning the PGresult so that caller can perform additional checks. 
*/ -#define confirm_query_canceled(conn) confirm_query_canceled_impl(__LINE__, conn) -static void -confirm_query_canceled_impl(int line, PGconn *conn) +#define confirm_result_status(conn, status) confirm_result_status_impl(__LINE__, conn, status) +static PGresult * +confirm_result_status_impl(int line, PGconn *conn, ExecStatusType status) { - PGresult *res = NULL; + PGresult *res; res = PQgetResult(conn); if (res == NULL) - pg_fatal_impl(line, "PQgetResult returned null: %s", + pg_fatal_impl(line, "PQgetResult returned null unexpectedly: %s", PQerrorMessage(conn)); - if (PQresultStatus(res) != PGRES_FATAL_ERROR) - pg_fatal_impl(line, "query did not fail when it was expected"); + if (PQresultStatus(res) != status) + pg_fatal_impl(line, "PQgetResult returned status %s, expected %s: %s", + PQresStatus(PQresultStatus(res)), + PQresStatus(status), + PQerrorMessage(conn)); + return res; +} + +/* + * Check that libpq next returns a PGresult with the specified status, + * then free the PGresult. + */ +#define consume_result_status(conn, status) consume_result_status_impl(__LINE__, conn, status) +static void +consume_result_status_impl(int line, PGconn *conn, ExecStatusType status) +{ + PGresult *res; + + res = confirm_result_status_impl(line, conn, status); + PQclear(res); +} + +/* + * Check that libpq next returns a null PGresult. + */ +#define consume_null_result(conn) consume_null_result_impl(__LINE__, conn) +static void +consume_null_result_impl(int line, PGconn *conn) +{ + PGresult *res; + + res = PQgetResult(conn); + if (res != NULL) + pg_fatal_impl(line, "expected NULL PGresult, got %s: %s", + PQresStatus(PQresultStatus(res)), + PQerrorMessage(conn)); +} + +/* + * Check that the query on the given connection got canceled. + */ +#define consume_query_cancel(conn) consume_query_cancel_impl(__LINE__, conn) +static void +consume_query_cancel_impl(int line, PGconn *conn) +{ + PGresult *res; + + res = confirm_result_status_impl(line, conn, PGRES_FATAL_ERROR); if (strcmp(PQresultErrorField(res, PG_DIAG_SQLSTATE), "57014") != 0) pg_fatal_impl(line, "query failed with a different error than cancellation: %s", PQerrorMessage(conn)); @@ -234,6 +281,10 @@ copy_connection(PGconn *conn) pg_fatal("Connection to database failed: %s", PQerrorMessage(copyConn)); + pfree(keywords); + pfree(vals); + PQconninfoFree(opts); + return copyConn; } @@ -265,13 +316,13 @@ test_cancel(PGconn *conn) cancel = PQgetCancel(conn); if (!PQcancel(cancel, errorbuf, sizeof(errorbuf))) pg_fatal("failed to run PQcancel: %s", errorbuf); - confirm_query_canceled(conn); + consume_query_cancel(conn); /* PGcancel object can be reused for the next query */ send_cancellable_query(conn, monitorConn); if (!PQcancel(cancel, errorbuf, sizeof(errorbuf))) pg_fatal("failed to run PQcancel: %s", errorbuf); - confirm_query_canceled(conn); + consume_query_cancel(conn); PQfreeCancel(cancel); @@ -279,14 +330,14 @@ test_cancel(PGconn *conn) send_cancellable_query(conn, monitorConn); if (!PQrequestCancel(conn)) pg_fatal("failed to run PQrequestCancel: %s", PQerrorMessage(conn)); - confirm_query_canceled(conn); + consume_query_cancel(conn); /* test PQcancelBlocking */ send_cancellable_query(conn, monitorConn); cancelConn = PQcancelCreate(conn); if (!PQcancelBlocking(cancelConn)) pg_fatal("failed to run PQcancelBlocking: %s", PQcancelErrorMessage(cancelConn)); - confirm_query_canceled(conn); + consume_query_cancel(conn); PQcancelFinish(cancelConn); /* test PQcancelCreate and then polling with PQcancelPoll */ @@ -340,7 +391,7 @@ test_cancel(PGconn *conn) 
} if (PQcancelStatus(cancelConn) != CONNECTION_OK) pg_fatal("unexpected cancel connection status: %s", PQcancelErrorMessage(cancelConn)); - confirm_query_canceled(conn); + consume_query_cancel(conn); /* * test PQcancelReset works on the cancel connection and it can be reused @@ -397,9 +448,10 @@ test_cancel(PGconn *conn) } if (PQcancelStatus(cancelConn) != CONNECTION_OK) pg_fatal("unexpected cancel connection status: %s", PQcancelErrorMessage(cancelConn)); - confirm_query_canceled(conn); + consume_query_cancel(conn); PQcancelFinish(cancelConn); + PQfinish(monitorConn); fprintf(stderr, "ok\n"); } @@ -428,6 +480,7 @@ test_disallowed_in_pipeline(PGconn *conn) "synchronous command execution functions are not allowed in pipeline mode\n") != 0) pg_fatal("did not get expected error message; got: \"%s\"", PQerrorMessage(conn)); + PQclear(res); /* PQsendQuery should fail in pipeline mode */ if (PQsendQuery(conn, "SELECT 1") != 0) @@ -460,6 +513,7 @@ test_disallowed_in_pipeline(PGconn *conn) if (PQresultStatus(res) != PGRES_TUPLES_OK) pg_fatal("PQexec should succeed after exiting pipeline mode but failed with: %s", PQerrorMessage(conn)); + PQclear(res); fprintf(stderr, "ok\n"); } @@ -467,7 +521,6 @@ test_disallowed_in_pipeline(PGconn *conn) static void test_multi_pipelines(PGconn *conn) { - PGresult *res = NULL; const char *dummy_params[1] = {"1"}; Oid dummy_param_oids[1] = {INT4OID}; @@ -508,87 +561,31 @@ test_multi_pipelines(PGconn *conn) /* OK, start processing the results */ /* first pipeline */ + consume_result_status(conn, PGRES_TUPLES_OK); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null when there's a pipeline item: %s", - PQerrorMessage(conn)); - - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pg_fatal("Unexpected result code %s from first pipeline item", - PQresStatus(PQresultStatus(res))); - PQclear(res); - res = NULL; - - if (PQgetResult(conn) != NULL) - pg_fatal("PQgetResult returned something extra after first result"); + consume_null_result(conn); if (PQexitPipelineMode(conn) != 0) pg_fatal("exiting pipeline mode after query but before sync succeeded incorrectly"); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null when sync result expected: %s", - PQerrorMessage(conn)); - - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("Unexpected result code %s instead of sync result, error: %s", - PQresStatus(PQresultStatus(res)), PQerrorMessage(conn)); - PQclear(res); + consume_result_status(conn, PGRES_PIPELINE_SYNC); /* second pipeline */ + consume_result_status(conn, PGRES_TUPLES_OK); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null when there's a pipeline item: %s", - PQerrorMessage(conn)); - - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pg_fatal("Unexpected result code %s from second pipeline item", - PQresStatus(PQresultStatus(res))); - PQclear(res); - res = NULL; - - if (PQgetResult(conn) != NULL) - pg_fatal("PQgetResult returned something extra after first result"); + consume_null_result(conn); if (PQexitPipelineMode(conn) != 0) pg_fatal("exiting pipeline mode after query but before sync succeeded incorrectly"); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null when sync result expected: %s", - PQerrorMessage(conn)); - - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("Unexpected result code %s instead of sync result, error: %s", - PQresStatus(PQresultStatus(res)), PQerrorMessage(conn)); - PQclear(res); + 
consume_result_status(conn, PGRES_PIPELINE_SYNC); /* third pipeline */ + consume_result_status(conn, PGRES_TUPLES_OK); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null when there's a pipeline item: %s", - PQerrorMessage(conn)); - - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pg_fatal("Unexpected result code %s from third pipeline item", - PQresStatus(PQresultStatus(res))); - - res = PQgetResult(conn); - if (res != NULL) - pg_fatal("Expected null result, got %s", - PQresStatus(PQresultStatus(res))); + consume_null_result(conn); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null when there's a pipeline item: %s", - PQerrorMessage(conn)); - - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("Unexpected result code %s from second pipeline sync", - PQresStatus(PQresultStatus(res))); + consume_result_status(conn, PGRES_PIPELINE_SYNC); /* We're still in pipeline mode ... */ if (PQpipelineStatus(conn) == PQ_PIPELINE_OFF) @@ -657,36 +654,17 @@ test_nosync(PGconn *conn) /* Now read all results */ for (;;) { - PGresult *res; - - res = PQgetResult(conn); - - /* NULL results are only expected after TUPLES_OK */ - if (res == NULL) - pg_fatal("got unexpected NULL result after %d results", results); - /* We expect exactly one TUPLES_OK result for each query we sent */ - if (PQresultStatus(res) == PGRES_TUPLES_OK) - { - PGresult *res2; - - /* and one NULL result should follow each */ - res2 = PQgetResult(conn); - if (res2 != NULL) - pg_fatal("expected NULL, got %s", - PQresStatus(PQresultStatus(res2))); - PQclear(res); - results++; + consume_result_status(conn, PGRES_TUPLES_OK); - /* if we're done, we're done */ - if (results == numqueries) - break; + /* and one NULL result should follow each */ + consume_null_result(conn); - continue; - } + results++; - /* anything else is unexpected */ - pg_fatal("got unexpected %s\n", PQresStatus(PQresultStatus(res))); + /* if we're done, we're done */ + if (results == numqueries) + break; } fprintf(stderr, "ok\n"); @@ -716,10 +694,12 @@ test_pipeline_abort(PGconn *conn) res = PQexec(conn, drop_table_sql); if (PQresultStatus(res) != PGRES_COMMAND_OK) pg_fatal("dispatching DROP TABLE failed: %s", PQerrorMessage(conn)); + PQclear(res); res = PQexec(conn, create_table_sql); if (PQresultStatus(res) != PGRES_COMMAND_OK) pg_fatal("dispatching CREATE TABLE failed: %s", PQerrorMessage(conn)); + PQclear(res); /* * Queue up a couple of small pipelines and process each without returning @@ -763,33 +743,16 @@ test_pipeline_abort(PGconn *conn) * a pipeline aborted message for the second insert, a pipeline-end, then * a command-ok and a pipeline-ok for the second pipeline operation. 
*/ - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("Unexpected NULL result: %s", PQerrorMessage(conn)); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - pg_fatal("Unexpected result status %s: %s", - PQresStatus(PQresultStatus(res)), - PQresultErrorMessage(res)); - PQclear(res); + consume_result_status(conn, PGRES_COMMAND_OK); /* NULL result to signal end-of-results for this command */ - if ((res = PQgetResult(conn)) != NULL) - pg_fatal("Expected null result, got %s", - PQresStatus(PQresultStatus(res))); + consume_null_result(conn); /* Second query caused error, so we expect an error next */ - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("Unexpected NULL result: %s", PQerrorMessage(conn)); - if (PQresultStatus(res) != PGRES_FATAL_ERROR) - pg_fatal("Unexpected result code -- expected PGRES_FATAL_ERROR, got %s", - PQresStatus(PQresultStatus(res))); - PQclear(res); + consume_result_status(conn, PGRES_FATAL_ERROR); /* NULL result to signal end-of-results for this command */ - if ((res = PQgetResult(conn)) != NULL) - pg_fatal("Expected null result, got %s", - PQresStatus(PQresultStatus(res))); + consume_null_result(conn); /* * pipeline should now be aborted. @@ -802,17 +765,10 @@ test_pipeline_abort(PGconn *conn) pg_fatal("pipeline should be flagged as aborted but isn't"); /* third query in pipeline, the second insert */ - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("Unexpected NULL result: %s", PQerrorMessage(conn)); - if (PQresultStatus(res) != PGRES_PIPELINE_ABORTED) - pg_fatal("Unexpected result code -- expected PGRES_PIPELINE_ABORTED, got %s", - PQresStatus(PQresultStatus(res))); - PQclear(res); + consume_result_status(conn, PGRES_PIPELINE_ABORTED); /* NULL result to signal end-of-results for this command */ - if ((res = PQgetResult(conn)) != NULL) - pg_fatal("Expected null result, got %s", PQresStatus(PQresultStatus(res))); + consume_null_result(conn); if (PQpipelineStatus(conn) != PQ_PIPELINE_ABORTED) pg_fatal("pipeline should be flagged as aborted but isn't"); @@ -827,14 +783,7 @@ test_pipeline_abort(PGconn *conn) * (This is so clients know to start processing results normally again and * can tell the difference between skipped commands and the sync.) 
*/ - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("Unexpected NULL result: %s", PQerrorMessage(conn)); - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("Unexpected result code from first pipeline sync\n" - "Expected PGRES_PIPELINE_SYNC, got %s", - PQresStatus(PQresultStatus(res))); - PQclear(res); + consume_result_status(conn, PGRES_PIPELINE_SYNC); if (PQpipelineStatus(conn) == PQ_PIPELINE_ABORTED) pg_fatal("sync should've cleared the aborted flag but didn't"); @@ -844,30 +793,16 @@ test_pipeline_abort(PGconn *conn) pg_fatal("Fell out of pipeline mode somehow"); /* the insert from the second pipeline */ - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("Unexpected NULL result: %s", PQerrorMessage(conn)); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - pg_fatal("Unexpected result code %s from first item in second pipeline", - PQresStatus(PQresultStatus(res))); - PQclear(res); + consume_result_status(conn, PGRES_COMMAND_OK); /* Read the NULL result at the end of the command */ - if ((res = PQgetResult(conn)) != NULL) - pg_fatal("Expected null result, got %s", PQresStatus(PQresultStatus(res))); + consume_null_result(conn); /* the second pipeline sync */ - if ((res = PQgetResult(conn)) == NULL) - pg_fatal("Unexpected NULL result: %s", PQerrorMessage(conn)); - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("Unexpected result code %s from second pipeline sync", - PQresStatus(PQresultStatus(res))); - PQclear(res); + consume_result_status(conn, PGRES_PIPELINE_SYNC); - if ((res = PQgetResult(conn)) != NULL) - pg_fatal("Expected null result, got %s: %s", - PQresStatus(PQresultStatus(res)), - PQerrorMessage(conn)); + /* Read the NULL result at the end of the command */ + consume_null_result(conn); /* Try to send two queries in one command */ if (PQsendQueryParams(conn, "SELECT 1; SELECT 2", 0, NULL, NULL, NULL, NULL, 0) != 1) @@ -890,15 +825,14 @@ test_pipeline_abort(PGconn *conn) pg_fatal("got unexpected status %s", PQresStatus(PQresultStatus(res))); break; } + PQclear(res); } if (!goterror) pg_fatal("did not get cannot-insert-multiple-commands error"); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("got NULL result"); - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("Unexpected result code %s from pipeline sync", - PQresStatus(PQresultStatus(res))); + + /* the second pipeline sync */ + consume_result_status(conn, PGRES_PIPELINE_SYNC); + fprintf(stderr, "ok\n"); /* Test single-row mode with an error partways */ @@ -935,13 +869,9 @@ test_pipeline_abort(PGconn *conn) pg_fatal("did not get division-by-zero error"); if (gotrows != 3) pg_fatal("did not get three rows"); + /* the third pipeline sync */ - if ((res = PQgetResult(conn)) == NULL) - pg_fatal("Unexpected NULL result: %s", PQerrorMessage(conn)); - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("Unexpected result code %s from third pipeline sync", - PQresStatus(PQresultStatus(res))); - PQclear(res); + consume_result_status(conn, PGRES_PIPELINE_SYNC); /* We're still in pipeline mode... 
*/ if (PQpipelineStatus(conn) == PQ_PIPELINE_OFF) @@ -1274,21 +1204,11 @@ test_prepared(PGconn *conn) if (PQpipelineSync(conn) != 1) pg_fatal("pipeline sync failed: %s", PQerrorMessage(conn)); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null"); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - pg_fatal("expected COMMAND_OK, got %s", PQresStatus(PQresultStatus(res))); - PQclear(res); - res = PQgetResult(conn); - if (res != NULL) - pg_fatal("expected NULL result"); + consume_result_status(conn, PGRES_COMMAND_OK); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned NULL"); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - pg_fatal("expected COMMAND_OK, got %s", PQresStatus(PQresultStatus(res))); + consume_null_result(conn); + + res = confirm_result_status(conn, PGRES_COMMAND_OK); if (PQnfields(res) != lengthof(expected_oids)) pg_fatal("expected %zu columns, got %d", lengthof(expected_oids), PQnfields(res)); @@ -1300,13 +1220,10 @@ test_prepared(PGconn *conn) i, expected_oids[i], typ); } PQclear(res); - res = PQgetResult(conn); - if (res != NULL) - pg_fatal("expected NULL result"); - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("expected PGRES_PIPELINE_SYNC, got %s", PQresStatus(PQresultStatus(res))); + consume_null_result(conn); + + consume_result_status(conn, PGRES_PIPELINE_SYNC); fprintf(stderr, "closing statement.."); if (PQsendClosePrepared(conn, "select_one") != 1) @@ -1314,18 +1231,11 @@ test_prepared(PGconn *conn) if (PQpipelineSync(conn) != 1) pg_fatal("pipeline sync failed: %s", PQerrorMessage(conn)); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("expected non-NULL result"); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - pg_fatal("expected COMMAND_OK, got %s", PQresStatus(PQresultStatus(res))); - PQclear(res); - res = PQgetResult(conn); - if (res != NULL) - pg_fatal("expected NULL result"); - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("expected PGRES_PIPELINE_SYNC, got %s", PQresStatus(PQresultStatus(res))); + consume_result_status(conn, PGRES_COMMAND_OK); + + consume_null_result(conn); + + consume_result_status(conn, PGRES_PIPELINE_SYNC); if (PQexitPipelineMode(conn) != 1) pg_fatal("could not exit pipeline mode: %s", PQerrorMessage(conn)); @@ -1334,6 +1244,7 @@ test_prepared(PGconn *conn) res = PQdescribePrepared(conn, "select_one"); if (PQresultStatus(res) != PGRES_FATAL_ERROR) pg_fatal("expected FATAL_ERROR, got %s", PQresStatus(PQresultStatus(res))); + PQclear(res); /* * Also test the blocking close, this should not fail since closing a @@ -1342,32 +1253,36 @@ test_prepared(PGconn *conn) res = PQclosePrepared(conn, "select_one"); if (PQresultStatus(res) != PGRES_COMMAND_OK) pg_fatal("expected COMMAND_OK, got %s", PQresStatus(PQresultStatus(res))); + PQclear(res); fprintf(stderr, "creating portal... 
"); - PQexec(conn, "BEGIN"); - PQexec(conn, "DECLARE cursor_one CURSOR FOR SELECT 1"); + + res = PQexec(conn, "BEGIN"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + pg_fatal("BEGIN failed: %s", PQerrorMessage(conn)); + PQclear(res); + + res = PQexec(conn, "DECLARE cursor_one CURSOR FOR SELECT 1"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + pg_fatal("DECLARE CURSOR failed: %s", PQerrorMessage(conn)); + PQclear(res); + PQenterPipelineMode(conn); if (PQsendDescribePortal(conn, "cursor_one") != 1) pg_fatal("PQsendDescribePortal failed: %s", PQerrorMessage(conn)); if (PQpipelineSync(conn) != 1) pg_fatal("pipeline sync failed: %s", PQerrorMessage(conn)); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null"); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - pg_fatal("expected COMMAND_OK, got %s", PQresStatus(PQresultStatus(res))); + res = confirm_result_status(conn, PGRES_COMMAND_OK); typ = PQftype(res, 0); if (typ != INT4OID) pg_fatal("portal: expected type %u, got %u", INT4OID, typ); PQclear(res); - res = PQgetResult(conn); - if (res != NULL) - pg_fatal("expected NULL result"); - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("expected PGRES_PIPELINE_SYNC, got %s", PQresStatus(PQresultStatus(res))); + + consume_null_result(conn); + + consume_result_status(conn, PGRES_PIPELINE_SYNC); fprintf(stderr, "closing portal... "); if (PQsendClosePortal(conn, "cursor_one") != 1) @@ -1375,18 +1290,11 @@ test_prepared(PGconn *conn) if (PQpipelineSync(conn) != 1) pg_fatal("pipeline sync failed: %s", PQerrorMessage(conn)); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("expected non-NULL result"); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - pg_fatal("expected COMMAND_OK, got %s", PQresStatus(PQresultStatus(res))); - PQclear(res); - res = PQgetResult(conn); - if (res != NULL) - pg_fatal("expected NULL result"); - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("expected PGRES_PIPELINE_SYNC, got %s", PQresStatus(PQresultStatus(res))); + consume_result_status(conn, PGRES_COMMAND_OK); + + consume_null_result(conn); + + consume_result_status(conn, PGRES_PIPELINE_SYNC); if (PQexitPipelineMode(conn) != 1) pg_fatal("could not exit pipeline mode: %s", PQerrorMessage(conn)); @@ -1395,6 +1303,7 @@ test_prepared(PGconn *conn) res = PQdescribePortal(conn, "cursor_one"); if (PQresultStatus(res) != PGRES_FATAL_ERROR) pg_fatal("expected FATAL_ERROR, got %s", PQresStatus(PQresultStatus(res))); + PQclear(res); /* * Also test the blocking close, this should not fail since closing a @@ -1403,6 +1312,7 @@ test_prepared(PGconn *conn) res = PQclosePortal(conn, "cursor_one"); if (PQresultStatus(res) != PGRES_COMMAND_OK) pg_fatal("expected COMMAND_OK, got %s", PQresStatus(PQresultStatus(res))); + PQclear(res); fprintf(stderr, "ok\n"); } @@ -1509,6 +1419,10 @@ test_protocol_version(PGconn *conn) pg_fatal("expected 30002, got %d", protocol_version); PQfinish(conn); + + pfree(keywords); + pfree(vals); + PQconninfoFree(opts); } /* Notice processor: print notices, and count how many we got */ @@ -1525,7 +1439,6 @@ notice_processor(void *arg, const char *message) static void test_pipeline_idle(PGconn *conn) { - PGresult *res; int n_notices = 0; fprintf(stderr, "\npipeline idle...\n"); @@ -1538,17 +1451,11 @@ test_pipeline_idle(PGconn *conn) if (PQsendQueryParams(conn, "SELECT 1", 0, NULL, NULL, NULL, NULL, 0) != 1) pg_fatal("failed to send query: %s", PQerrorMessage(conn)); PQsendFlushRequest(conn); - res = 
PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null when there's a pipeline item: %s", - PQerrorMessage(conn)); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pg_fatal("unexpected result code %s from first pipeline item", - PQresStatus(PQresultStatus(res))); - PQclear(res); - res = PQgetResult(conn); - if (res != NULL) - pg_fatal("did not receive terminating NULL"); + + consume_result_status(conn, PGRES_TUPLES_OK); + + consume_null_result(conn); + if (PQsendQueryParams(conn, "SELECT 2", 0, NULL, NULL, NULL, NULL, 0) != 1) pg_fatal("failed to send query: %s", PQerrorMessage(conn)); if (PQexitPipelineMode(conn) == 1) @@ -1558,14 +1465,11 @@ test_pipeline_idle(PGconn *conn) pg_fatal("did not get expected error; got: %s", PQerrorMessage(conn)); PQsendFlushRequest(conn); - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pg_fatal("unexpected result code %s from second pipeline item", - PQresStatus(PQresultStatus(res))); - PQclear(res); - res = PQgetResult(conn); - if (res != NULL) - pg_fatal("did not receive terminating NULL"); + + consume_result_status(conn, PGRES_TUPLES_OK); + + consume_null_result(conn); + if (PQexitPipelineMode(conn) != 1) pg_fatal("exiting pipeline failed: %s", PQerrorMessage(conn)); @@ -1579,11 +1483,9 @@ test_pipeline_idle(PGconn *conn) if (PQsendQueryParams(conn, "SELECT pg_catalog.pg_advisory_unlock(1,1)", 0, NULL, NULL, NULL, NULL, 0) != 1) pg_fatal("failed to send query: %s", PQerrorMessage(conn)); PQsendFlushRequest(conn); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("unexpected NULL result received"); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pg_fatal("unexpected result code %s", PQresStatus(PQresultStatus(res))); + + consume_result_status(conn, PGRES_TUPLES_OK); + if (PQexitPipelineMode(conn) != 1) pg_fatal("failed to exit pipeline mode: %s", PQerrorMessage(conn)); fprintf(stderr, "ok - 2\n"); @@ -1592,7 +1494,6 @@ test_pipeline_idle(PGconn *conn) static void test_simple_pipeline(PGconn *conn) { - PGresult *res = NULL; const char *dummy_params[1] = {"1"}; Oid dummy_param_oids[1] = {INT4OID}; @@ -1623,20 +1524,9 @@ test_simple_pipeline(PGconn *conn) if (PQpipelineSync(conn) != 1) pg_fatal("pipeline sync failed: %s", PQerrorMessage(conn)); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null when there's a pipeline item: %s", - PQerrorMessage(conn)); - - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pg_fatal("Unexpected result code %s from first pipeline item", - PQresStatus(PQresultStatus(res))); - - PQclear(res); - res = NULL; + consume_result_status(conn, PGRES_TUPLES_OK); - if (PQgetResult(conn) != NULL) - pg_fatal("PQgetResult returned something extra after first query result."); + consume_null_result(conn); /* * Even though we've processed the result there's still a sync to come and @@ -1645,21 +1535,9 @@ test_simple_pipeline(PGconn *conn) if (PQexitPipelineMode(conn) != 0) pg_fatal("exiting pipeline mode after query but before sync succeeded incorrectly"); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("PQgetResult returned null when sync result PGRES_PIPELINE_SYNC expected: %s", - PQerrorMessage(conn)); - - if (PQresultStatus(res) != PGRES_PIPELINE_SYNC) - pg_fatal("Unexpected result code %s instead of PGRES_PIPELINE_SYNC, error: %s", - PQresStatus(PQresultStatus(res)), PQerrorMessage(conn)); - - PQclear(res); - res = NULL; + consume_result_status(conn, PGRES_PIPELINE_SYNC); - if (PQgetResult(conn) != NULL) - pg_fatal("PQgetResult returned something 
extra after pipeline end: %s", - PQresStatus(PQresultStatus(res))); + consume_null_result(conn); /* We're still in pipeline mode... */ if (PQpipelineStatus(conn) == PQ_PIPELINE_OFF) @@ -1792,20 +1670,12 @@ test_singlerowmode(PGconn *conn) pg_fatal("failed to send flush request"); if (PQsetSingleRowMode(conn) != 1) pg_fatal("PQsetSingleRowMode() failed"); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("unexpected NULL"); - if (PQresultStatus(res) != PGRES_SINGLE_TUPLE) - pg_fatal("Expected PGRES_SINGLE_TUPLE, got %s", - PQresStatus(PQresultStatus(res))); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("unexpected NULL"); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pg_fatal("Expected PGRES_TUPLES_OK, got %s", - PQresStatus(PQresultStatus(res))); - if (PQgetResult(conn) != NULL) - pg_fatal("expected NULL result"); + + consume_result_status(conn, PGRES_SINGLE_TUPLE); + + consume_result_status(conn, PGRES_TUPLES_OK); + + consume_null_result(conn); if (PQsendQueryParams(conn, "SELECT 1", 0, NULL, NULL, NULL, NULL, 0) != 1) @@ -1813,14 +1683,10 @@ test_singlerowmode(PGconn *conn) PQerrorMessage(conn)); if (PQsendFlushRequest(conn) != 1) pg_fatal("failed to send flush request"); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("unexpected NULL"); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pg_fatal("Expected PGRES_TUPLES_OK, got %s", - PQresStatus(PQresultStatus(res))); - if (PQgetResult(conn) != NULL) - pg_fatal("expected NULL result"); + + consume_result_status(conn, PGRES_TUPLES_OK); + + consume_null_result(conn); /* * Try chunked mode as well; make sure that it correctly delivers a @@ -1834,33 +1700,23 @@ test_singlerowmode(PGconn *conn) pg_fatal("failed to send flush request"); if (PQsetChunkedRowsMode(conn, 3) != 1) pg_fatal("PQsetChunkedRowsMode() failed"); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("unexpected NULL"); - if (PQresultStatus(res) != PGRES_TUPLES_CHUNK) - pg_fatal("Expected PGRES_TUPLES_CHUNK, got %s: %s", - PQresStatus(PQresultStatus(res)), - PQerrorMessage(conn)); + + res = confirm_result_status(conn, PGRES_TUPLES_CHUNK); if (PQntuples(res) != 3) pg_fatal("Expected 3 rows, got %d", PQntuples(res)); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("unexpected NULL"); - if (PQresultStatus(res) != PGRES_TUPLES_CHUNK) - pg_fatal("Expected PGRES_TUPLES_CHUNK, got %s", - PQresStatus(PQresultStatus(res))); + PQclear(res); + + res = confirm_result_status(conn, PGRES_TUPLES_CHUNK); if (PQntuples(res) != 2) pg_fatal("Expected 2 rows, got %d", PQntuples(res)); - res = PQgetResult(conn); - if (res == NULL) - pg_fatal("unexpected NULL"); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - pg_fatal("Expected PGRES_TUPLES_OK, got %s", - PQresStatus(PQresultStatus(res))); + PQclear(res); + + res = confirm_result_status(conn, PGRES_TUPLES_OK); if (PQntuples(res) != 0) pg_fatal("Expected 0 rows, got %d", PQntuples(res)); - if (PQgetResult(conn) != NULL) - pg_fatal("expected NULL result"); + PQclear(res); + + consume_null_result(conn); if (PQexitPipelineMode(conn) != 1) pg_fatal("failed to end pipeline mode: %s", PQerrorMessage(conn)); @@ -1995,9 +1851,8 @@ test_transaction(PGconn *conn) if (num_syncs <= 0) break; } - if (PQgetResult(conn) != NULL) - pg_fatal("returned something extra after all the syncs: %s", - PQresStatus(PQresultStatus(res))); + + consume_null_result(conn); if (PQexitPipelineMode(conn) != 1) pg_fatal("failed to end pipeline mode: %s", PQerrorMessage(conn)); @@ -2053,16 +1908,19 @@ test_uniqviol(PGconn *conn) "create table 
ppln_uniqviol(id bigint primary key, idata bigint)"); if (PQresultStatus(res) != PGRES_COMMAND_OK) pg_fatal("failed to create table: %s", PQerrorMessage(conn)); + PQclear(res); res = PQexec(conn, "begin"); if (PQresultStatus(res) != PGRES_COMMAND_OK) pg_fatal("failed to begin transaction: %s", PQerrorMessage(conn)); + PQclear(res); res = PQprepare(conn, "insertion", "insert into ppln_uniqviol values ($1, $2) returning id", 2, paramTypes); - if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK) + if (PQresultStatus(res) != PGRES_COMMAND_OK) pg_fatal("failed to prepare query: %s", PQerrorMessage(conn)); + PQclear(res); if (PQenterPipelineMode(conn) != 1) pg_fatal("failed to enter pipeline mode"); @@ -2191,7 +2049,6 @@ test_uniqviol(PGconn *conn) static bool process_result(PGconn *conn, PGresult *res, int results, int numsent) { - PGresult *res2; bool got_error = false; if (res == NULL) @@ -2203,29 +2060,19 @@ process_result(PGconn *conn, PGresult *res, int results, int numsent) got_error = true; fprintf(stderr, "result %d/%d (error): %s\n", results, numsent, PQerrorMessage(conn)); PQclear(res); - - res2 = PQgetResult(conn); - if (res2 != NULL) - pg_fatal("expected NULL, got %s", - PQresStatus(PQresultStatus(res2))); + consume_null_result(conn); break; case PGRES_TUPLES_OK: fprintf(stderr, "result %d/%d: %s\n", results, numsent, PQgetvalue(res, 0, 0)); PQclear(res); - - res2 = PQgetResult(conn); - if (res2 != NULL) - pg_fatal("expected NULL, got %s", - PQresStatus(PQresultStatus(res2))); + consume_null_result(conn); break; case PGRES_PIPELINE_ABORTED: fprintf(stderr, "result %d/%d: pipeline aborted\n", results, numsent); - res2 = PQgetResult(conn); - if (res2 != NULL) - pg_fatal("expected NULL, got %s", - PQresStatus(PQresultStatus(res2))); + PQclear(res); + consume_null_result(conn); break; default: @@ -2271,7 +2118,7 @@ main(int argc, char **argv) { const char *conninfo = ""; PGconn *conn; - FILE *trace; + FILE *trace = NULL; char *testname; int numrows = 10000; PGresult *res; @@ -2332,9 +2179,11 @@ main(int argc, char **argv) res = PQexec(conn, "SET lc_messages TO \"C\""); if (PQresultStatus(res) != PGRES_COMMAND_OK) pg_fatal("failed to set \"lc_messages\": %s", PQerrorMessage(conn)); + PQclear(res); res = PQexec(conn, "SET debug_parallel_query = off"); if (PQresultStatus(res) != PGRES_COMMAND_OK) pg_fatal("failed to set \"debug_parallel_query\": %s", PQerrorMessage(conn)); + PQclear(res); /* Set the trace file, if requested */ if (tracefile != NULL) @@ -2388,5 +2237,9 @@ main(int argc, char **argv) /* close the connection to the database and cleanup */ PQfinish(conn); + + if (trace && trace != stdout) + fclose(trace); + return 0; } From 09119238a18191dea3deed635a2b2a6ffe904932 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 4 Sep 2025 08:34:51 +0900 Subject: [PATCH 03/73] Fix incorrect comment in pgstat_backend.c The counters saved from pgWalUsage, used for the difference calculations when flushing the backend WAL stats, are updated when calling pgstat_flush_backend() under PGSTAT_BACKEND_FLUSH_WAL, and not pgstat_report_wal(). The comment updated in this commit referenced the latter, but it is perfectly OK to flush the backend stats independently of the WAL stats. Noticed while looking at this area of the code, introduced by 76def4cdd7c2 as a copy-pasto. 
Backpatch-through: 18 --- src/backend/utils/activity/pgstat_backend.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 8714a85e2d936..07a1116671b18 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -41,9 +41,9 @@ static bool backend_has_iostats = false; /* * WAL usage counters saved from pgWalUsage at the previous call to - * pgstat_report_wal(). This is used to calculate how much WAL usage - * happens between pgstat_report_wal() calls, by subtracting the previous - * counters from the current ones. + * pgstat_flush_backend(). This is used to calculate how much WAL usage + * happens between pgstat_flush_backend() calls, by subtracting the + * previous counters from the current ones. */ static WalUsage prevBackendWalUsage; From 5386bfb9c1f566887d084e4ea2e350e7efd188c1 Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Thu, 4 Sep 2025 11:27:53 +0100 Subject: [PATCH 04/73] Fix replica identity check for INSERT ON CONFLICT DO UPDATE. If an INSERT has an ON CONFLICT DO UPDATE clause, the executor must check that the target relation supports UPDATE as well as INSERT. In particular, it must check that the target relation has a REPLICA IDENTITY if it publishes updates. Formerly, it was not doing this check, which could lead to silently breaking replication. Fix by adding such a check to CheckValidResultRel(), which requires adding a new onConflictAction argument. In back-branches, preserve ABI compatibility by introducing a wrapper function with the original signature. Author: Zhijie Hou Reviewed-by: Ashutosh Bapat Reviewed-by: Dean Rasheed Tested-by: Chao Li Discussion: https://postgr.es/m/OS3PR01MB57180C87E43A679A730482DF94B62@OS3PR01MB5718.jpnprd01.prod.outlook.com Backpatch-through: 13 --- src/backend/commands/copyfrom.c | 2 +- src/backend/executor/execMain.c | 12 +++++++++- src/backend/executor/execPartition.c | 9 ++++++-- src/backend/executor/nodeModifyTable.c | 3 ++- src/include/executor/executor.h | 1 + src/test/regress/expected/publication.out | 23 +++++++++++++++++++ src/test/regress/sql/publication.sql | 27 +++++++++++++++++++++++ 7 files changed, 72 insertions(+), 5 deletions(-) diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c index fbbbc09a97b17..12781963b4f95 100644 --- a/src/backend/commands/copyfrom.c +++ b/src/backend/commands/copyfrom.c @@ -919,7 +919,7 @@ CopyFrom(CopyFromState cstate) ExecInitResultRelation(estate, resultRelInfo, 1); /* Verify the named relation is a valid target for INSERT */ - CheckValidResultRel(resultRelInfo, CMD_INSERT, NIL); + CheckValidResultRel(resultRelInfo, CMD_INSERT, ONCONFLICT_NONE, NIL); ExecOpenIndices(resultRelInfo, false); diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index b8b9d2a85f76c..8f56d5e312ec2 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1036,6 +1036,9 @@ InitPlan(QueryDesc *queryDesc, int eflags) * Generally the parser and/or planner should have noticed any such mistake * already, but let's make sure. * + * For INSERT ON CONFLICT, the result relation is required to support the + * onConflictAction, regardless of whether a conflict actually occurs. + * * For MERGE, mergeActions is the list of actions that may be performed. The * result relation is required to support every action, regardless of whether * or not they are all executed. 
@@ -1045,7 +1048,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) */ void CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation, - List *mergeActions) + OnConflictAction onConflictAction, List *mergeActions) { Relation resultRel = resultRelInfo->ri_RelationDesc; FdwRoutine *fdwroutine; @@ -1059,6 +1062,13 @@ CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation, case RELKIND_RELATION: case RELKIND_PARTITIONED_TABLE: CheckCmdReplicaIdentity(resultRel, operation); + + /* + * For INSERT ON CONFLICT DO UPDATE, additionally check that the + * target relation supports UPDATE. + */ + if (onConflictAction == ONCONFLICT_UPDATE) + CheckCmdReplicaIdentity(resultRel, CMD_UPDATE); break; case RELKIND_SEQUENCE: ereport(ERROR, diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 514eae1037dc3..1f2da072632e3 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -360,8 +360,12 @@ ExecFindPartition(ModifyTableState *mtstate, true, false); if (rri) { + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + /* Verify this ResultRelInfo allows INSERTs */ - CheckValidResultRel(rri, CMD_INSERT, NIL); + CheckValidResultRel(rri, CMD_INSERT, + node ? node->onConflictAction : ONCONFLICT_NONE, + NIL); /* * Initialize information needed to insert this and @@ -527,7 +531,8 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, * partition-key becomes a DELETE+INSERT operation, so this check is still * required when the operation is CMD_UPDATE. */ - CheckValidResultRel(leaf_part_rri, CMD_INSERT, NIL); + CheckValidResultRel(leaf_part_rri, CMD_INSERT, + node ? node->onConflictAction : ONCONFLICT_NONE, NIL); /* * Open partition indices. The user may have asked to check for conflicts diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 7c6c2c1f6e42a..b0c4e2c0d32a4 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -4811,7 +4811,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) /* * Verify result relation is a valid target for the current operation */ - CheckValidResultRel(resultRelInfo, operation, mergeActions); + CheckValidResultRel(resultRelInfo, operation, node->onConflictAction, + mergeActions); resultRelInfo++; i++; diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 10dcea037c3d0..31133514e8438 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -244,6 +244,7 @@ extern bool ExecCheckPermissions(List *rangeTable, List *rteperminfos, bool ereport_on_violation); extern bool ExecCheckOneRelPerms(RTEPermissionInfo *perminfo); extern void CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation, + OnConflictAction onConflictAction, List *mergeActions); extern void InitResultRelInfo(ResultRelInfo *resultRelInfo, Relation resultRelationDesc, diff --git a/src/test/regress/expected/publication.out b/src/test/regress/expected/publication.out index 53268059142ee..00f3cb978d774 100644 --- a/src/test/regress/expected/publication.out +++ b/src/test/regress/expected/publication.out @@ -1924,6 +1924,29 @@ DROP PUBLICATION pub1; DROP PUBLICATION pub2; DROP TABLE gencols; RESET client_min_messages; +-- Test that the INSERT ON CONFLICT command correctly checks REPLICA IDENTITY +-- when the target table is published. 
+CREATE TABLE testpub_insert_onconfl_no_ri (a int unique, b int); +CREATE TABLE testpub_insert_onconfl_parted (a int unique, b int) PARTITION by RANGE (a); +CREATE TABLE testpub_insert_onconfl_part_no_ri PARTITION OF testpub_insert_onconfl_parted FOR VALUES FROM (1) TO (10); +SET client_min_messages = 'ERROR'; +CREATE PUBLICATION pub1 FOR ALL TABLES; +RESET client_min_messages; +-- fail - missing REPLICA IDENTITY +INSERT INTO testpub_insert_onconfl_no_ri VALUES (1, 1) ON CONFLICT (a) DO UPDATE SET b = 2; +ERROR: cannot update table "testpub_insert_onconfl_no_ri" because it does not have a replica identity and publishes updates +HINT: To enable updating the table, set REPLICA IDENTITY using ALTER TABLE. +-- ok - no updates +INSERT INTO testpub_insert_onconfl_no_ri VALUES (1, 1) ON CONFLICT DO NOTHING; +-- fail - missing REPLICA IDENTITY in partition testpub_insert_onconfl_no_ri +INSERT INTO testpub_insert_onconfl_parted VALUES (1, 1) ON CONFLICT (a) DO UPDATE SET b = 2; +ERROR: cannot update table "testpub_insert_onconfl_part_no_ri" because it does not have a replica identity and publishes updates +HINT: To enable updating the table, set REPLICA IDENTITY using ALTER TABLE. +-- ok - no updates +INSERT INTO testpub_insert_onconfl_parted VALUES (1, 1) ON CONFLICT DO NOTHING; +DROP PUBLICATION pub1; +DROP TABLE testpub_insert_onconfl_no_ri; +DROP TABLE testpub_insert_onconfl_parted; RESET SESSION AUTHORIZATION; DROP ROLE regress_publication_user, regress_publication_user2; DROP ROLE regress_publication_user_dummy; diff --git a/src/test/regress/sql/publication.sql b/src/test/regress/sql/publication.sql index deddf0da8445f..53422d30320d3 100644 --- a/src/test/regress/sql/publication.sql +++ b/src/test/regress/sql/publication.sql @@ -1223,6 +1223,33 @@ DROP PUBLICATION pub2; DROP TABLE gencols; RESET client_min_messages; + +-- Test that the INSERT ON CONFLICT command correctly checks REPLICA IDENTITY +-- when the target table is published. +CREATE TABLE testpub_insert_onconfl_no_ri (a int unique, b int); +CREATE TABLE testpub_insert_onconfl_parted (a int unique, b int) PARTITION by RANGE (a); +CREATE TABLE testpub_insert_onconfl_part_no_ri PARTITION OF testpub_insert_onconfl_parted FOR VALUES FROM (1) TO (10); + +SET client_min_messages = 'ERROR'; +CREATE PUBLICATION pub1 FOR ALL TABLES; +RESET client_min_messages; + +-- fail - missing REPLICA IDENTITY +INSERT INTO testpub_insert_onconfl_no_ri VALUES (1, 1) ON CONFLICT (a) DO UPDATE SET b = 2; + +-- ok - no updates +INSERT INTO testpub_insert_onconfl_no_ri VALUES (1, 1) ON CONFLICT DO NOTHING; + +-- fail - missing REPLICA IDENTITY in partition testpub_insert_onconfl_no_ri +INSERT INTO testpub_insert_onconfl_parted VALUES (1, 1) ON CONFLICT (a) DO UPDATE SET b = 2; + +-- ok - no updates +INSERT INTO testpub_insert_onconfl_parted VALUES (1, 1) ON CONFLICT DO NOTHING; + +DROP PUBLICATION pub1; +DROP TABLE testpub_insert_onconfl_no_ri; +DROP TABLE testpub_insert_onconfl_parted; + RESET SESSION AUTHORIZATION; DROP ROLE regress_publication_user, regress_publication_user2; DROP ROLE regress_publication_user_dummy; From fc6600fc1cd13b695fbde4b5f3ff0d2e97c36dea Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Thu, 4 Sep 2025 11:45:44 +0100 Subject: [PATCH 05/73] Fix replica identity check for MERGE. When executing a MERGE, check that the target relation supports all actions mentioned in the MERGE command. Specifically, check that it has a REPLICA IDENTITY if it publishes updates or deletes and the MERGE command contains update or delete actions. 
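As a sketch, the per-action check added to CheckValidResultRel() has the following shape; foreach_node() and CheckCmdReplicaIdentity() are the real executor helpers, and the committed hunk appears in the diff below:

    if (operation == CMD_MERGE)
    {
        /* Every action of the MERGE must be supported by the target. */
        foreach_node(MergeAction, action, mergeActions)
            CheckCmdReplicaIdentity(resultRel, action->commandType);
    }
    else
        CheckCmdReplicaIdentity(resultRel, operation);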
Failing to do this can silently break replication. Author: Zhijie Hou Reviewed-by: Ashutosh Bapat Reviewed-by: Dean Rasheed Tested-by: Chao Li Discussion: https://postgr.es/m/OS3PR01MB57180C87E43A679A730482DF94B62@OS3PR01MB5718.jpnprd01.prod.outlook.com Backpatch-through: 15 --- src/backend/executor/execMain.c | 11 +++++++- src/test/regress/expected/publication.out | 28 ++++++++++++++++++++ src/test/regress/sql/publication.sql | 31 +++++++++++++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 8f56d5e312ec2..ff12e2e136438 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1061,7 +1061,16 @@ CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation, { case RELKIND_RELATION: case RELKIND_PARTITIONED_TABLE: - CheckCmdReplicaIdentity(resultRel, operation); + + /* + * For MERGE, check that the target relation supports each action. + * For other operations, just check the operation itself. + */ + if (operation == CMD_MERGE) + foreach_node(MergeAction, action, mergeActions) + CheckCmdReplicaIdentity(resultRel, action->commandType); + else + CheckCmdReplicaIdentity(resultRel, operation); /* * For INSERT ON CONFLICT DO UPDATE, additionally check that the diff --git a/src/test/regress/expected/publication.out b/src/test/regress/expected/publication.out index 00f3cb978d774..895ca87a0dfeb 100644 --- a/src/test/regress/expected/publication.out +++ b/src/test/regress/expected/publication.out @@ -1947,6 +1947,34 @@ INSERT INTO testpub_insert_onconfl_parted VALUES (1, 1) ON CONFLICT DO NOTHING; DROP PUBLICATION pub1; DROP TABLE testpub_insert_onconfl_no_ri; DROP TABLE testpub_insert_onconfl_parted; +-- Test that the MERGE command correctly checks REPLICA IDENTITY when the +-- target table is published. +CREATE TABLE testpub_merge_no_ri (a int, b int); +CREATE TABLE testpub_merge_pk (a int primary key, b int); +SET client_min_messages = 'ERROR'; +CREATE PUBLICATION pub1 FOR ALL TABLES; +RESET client_min_messages; +-- fail - missing REPLICA IDENTITY +MERGE INTO testpub_merge_no_ri USING testpub_merge_pk s ON s.a >= 1 + WHEN MATCHED THEN UPDATE SET b = s.b; +ERROR: cannot update table "testpub_merge_no_ri" because it does not have a replica identity and publishes updates +HINT: To enable updating the table, set REPLICA IDENTITY using ALTER TABLE. +-- fail - missing REPLICA IDENTITY +MERGE INTO testpub_merge_no_ri USING testpub_merge_pk s ON s.a >= 1 + WHEN MATCHED THEN DELETE; +ERROR: cannot delete from table "testpub_merge_no_ri" because it does not have a replica identity and publishes deletes +HINT: To enable deleting from the table, set REPLICA IDENTITY using ALTER TABLE. 
+-- ok - insert and do nothing are not restricted +MERGE INTO testpub_merge_no_ri USING testpub_merge_pk s ON s.a >= 1 + WHEN MATCHED THEN DO NOTHING + WHEN NOT MATCHED THEN INSERT (a, b) VALUES (0, 0); +-- ok - REPLICA IDENTITY is DEFAULT and table has a PK +MERGE INTO testpub_merge_pk USING testpub_merge_no_ri s ON s.a >= 1 + WHEN MATCHED AND s.a > 0 THEN UPDATE SET b = s.b + WHEN MATCHED THEN DELETE; +DROP PUBLICATION pub1; +DROP TABLE testpub_merge_no_ri; +DROP TABLE testpub_merge_pk; RESET SESSION AUTHORIZATION; DROP ROLE regress_publication_user, regress_publication_user2; DROP ROLE regress_publication_user_dummy; diff --git a/src/test/regress/sql/publication.sql b/src/test/regress/sql/publication.sql index 53422d30320d3..3f42306139533 100644 --- a/src/test/regress/sql/publication.sql +++ b/src/test/regress/sql/publication.sql @@ -1250,6 +1250,37 @@ DROP PUBLICATION pub1; DROP TABLE testpub_insert_onconfl_no_ri; DROP TABLE testpub_insert_onconfl_parted; +-- Test that the MERGE command correctly checks REPLICA IDENTITY when the +-- target table is published. +CREATE TABLE testpub_merge_no_ri (a int, b int); +CREATE TABLE testpub_merge_pk (a int primary key, b int); + +SET client_min_messages = 'ERROR'; +CREATE PUBLICATION pub1 FOR ALL TABLES; +RESET client_min_messages; + +-- fail - missing REPLICA IDENTITY +MERGE INTO testpub_merge_no_ri USING testpub_merge_pk s ON s.a >= 1 + WHEN MATCHED THEN UPDATE SET b = s.b; + +-- fail - missing REPLICA IDENTITY +MERGE INTO testpub_merge_no_ri USING testpub_merge_pk s ON s.a >= 1 + WHEN MATCHED THEN DELETE; + +-- ok - insert and do nothing are not restricted +MERGE INTO testpub_merge_no_ri USING testpub_merge_pk s ON s.a >= 1 + WHEN MATCHED THEN DO NOTHING + WHEN NOT MATCHED THEN INSERT (a, b) VALUES (0, 0); + +-- ok - REPLICA IDENTITY is DEFAULT and table has a PK +MERGE INTO testpub_merge_pk USING testpub_merge_no_ri s ON s.a >= 1 + WHEN MATCHED AND s.a > 0 THEN UPDATE SET b = s.b + WHEN MATCHED THEN DELETE; + +DROP PUBLICATION pub1; +DROP TABLE testpub_merge_no_ri; +DROP TABLE testpub_merge_pk; + RESET SESSION AUTHORIZATION; DROP ROLE regress_publication_user, regress_publication_user2; DROP ROLE regress_publication_user_dummy; From 1129d3e4c8883f25642d1f44750b4d36e29ec006 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Thu, 4 Sep 2025 10:18:42 -0500 Subject: [PATCH 06/73] Adjust commentary for WaitEventLWLock in wait_event_names.txt. In addition to changing a couple of references for clarity, this commit combines the two similar comments. --- src/backend/utils/activity/wait_event_names.txt | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 5427da5bc1b19..7553f6eacef7b 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -302,9 +302,12 @@ ABI_compatibility: # This class of wait events has its own set of C structure, so these are # only used for the documentation. # -# NB: Predefined LWLocks (i.e., those declared in lwlocklist.h) must be -# listed in the top section of locks and must be listed in the same order as in -# lwlocklist.h. +# NB: Predefined LWLocks (i.e., those declared with PG_LWLOCK in lwlocklist.h) +# must be listed before the "END OF PREDEFINED LWLOCKS" comment and must be +# listed in the same order as in lwlocklist.h. 
Likewise, the built-in LWLock
+# tranches (i.e., those declared with PG_LWLOCKTRANCHE in lwlocklist.h) must be
+# listed after the "END OF PREDEFINED LWLOCKS" comment and must be listed in
+# the same order as in lwlocklist.h.

 # Section: ClassName - WaitEventLWLock

@@ -356,14 +359,6 @@ AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue."
 #
 # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE)
 #
-# Predefined LWLocks (i.e., those declared at the top of lwlocknames.h) must be
-# listed in the section above and must be listed in the same order as in
-# lwlocknames.h.
-#
-# Likewise, the built-in LWLock tranches (i.e., those declared at the bottom of
-# lwlocknames.h) must be listed in the section below and must be listed in the
-# same order as in lwlocknames.h.
-#
 XactBuffer "Waiting for I/O on a transaction status SLRU buffer."
 CommitTsBuffer "Waiting for I/O on a commit timestamp SLRU buffer."

From f0478149c34dfe02e0f43fc5f832a63a864dc364 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut
Date: Thu, 4 Sep 2025 12:57:03 +0200
Subject: [PATCH 07/73] Clean up newly added guc_tables.inc.c

There was a missing makefile rule to clean up the guc_tables.inc.c
symlink in src/include/.

Oversight in commit 63599896545.

Author: Nathan Bossart
Discussion: https://www.postgresql.org/message-id/flat/dae6fe89-1e0c-4c3f-8d92-19d23374fb10%40eisentraut.org
---
 src/include/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/Makefile b/src/include/Makefile
index 3f94543f3270b..24c5452de98fb 100644
--- a/src/include/Makefile
+++ b/src/include/Makefile
@@ -72,7 +72,7 @@ uninstall:

 clean:
-	rm -f utils/fmgroids.h utils/fmgrprotos.h utils/errcodes.h utils/header-stamp
+	rm -f utils/fmgroids.h utils/fmgrprotos.h utils/guc_tables.inc.c utils/errcodes.h utils/header-stamp
 	rm -f storage/lwlocknames.h utils/probes.h utils/wait_event_types.h
 	rm -f nodes/nodetags.h nodes/header-stamp
 	$(MAKE) -C catalog clean

From d814d7fc3d5257ae258b502229fc7ca97c97270a Mon Sep 17 00:00:00 2001
From: Nathan Bossart
Date: Thu, 4 Sep 2025 15:34:48 -0500
Subject: [PATCH 08/73] Revert recent change to RequestNamedLWLockTranche().

Commit 38b602b028 modified this function to allocate enough space for
MAX_NAMED_TRANCHES (256) requests, which is likely far more than most
clusters need.  This commit reverts that change so that it first
allocates enough space for only 16 requests and resizes the array when
necessary.

While at it, remove the check for too many tranches from this function.
We can now rely on InitializeLWLocks() to do that check via its calls to
LWLockNewTrancheId() for the named tranches.
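The resizing follows the usual grow-on-demand idiom. Condensed, with shortened variable names for illustration (the committed hunk is in the diff below; pg_nextpower2_32() is the real helper that rounds up to the next power of two):

    if (nrequests >= nallocated)
    {
        int     newalloc = pg_nextpower2_32(nrequests + 1);

        array = repalloc(array, newalloc * sizeof(NamedLWLockTrancheRequest));
        nallocated = newalloc;
    }

Growing by powers of two keeps the number of reallocations logarithmic in the number of requests.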
Reviewed-by: Sami Imseih
Discussion: https://postgr.es/m/aLmzwC2dRbqk14y6%40nathan
---
 src/backend/storage/lmgr/lwlock.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 258cdebd0f5c9..fcbac5213a5c0 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -610,6 +610,7 @@ void
 RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks)
 {
 	NamedLWLockTrancheRequest *request;
+	static int	NamedLWLockTrancheRequestsAllocated;

 	if (!process_shmem_requests_in_progress)
 		elog(FATAL, "cannot request additional LWLocks outside shmem_request_hook");
@@ -628,17 +629,22 @@ RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks)

 	if (NamedLWLockTrancheRequestArray == NULL)
 	{
+		NamedLWLockTrancheRequestsAllocated = 16;
 		NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
 			MemoryContextAlloc(TopMemoryContext,
-							   MAX_NAMED_TRANCHES
+							   NamedLWLockTrancheRequestsAllocated
 							   * sizeof(NamedLWLockTrancheRequest));
 	}

-	if (NamedLWLockTrancheRequests >= MAX_NAMED_TRANCHES)
-		ereport(ERROR,
-				(errmsg("maximum number of tranches already registered"),
-				 errdetail("No more than %d tranches may be registered.",
-						   MAX_NAMED_TRANCHES)));
+	if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated)
+	{
+		int			i = pg_nextpower2_32(NamedLWLockTrancheRequests + 1);
+
+		NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+			repalloc(NamedLWLockTrancheRequestArray,
+					 i * sizeof(NamedLWLockTrancheRequest));
+		NamedLWLockTrancheRequestsAllocated = i;
+	}

 	request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests];
 	strlcpy(request->tranche_name, tranche_name, NAMEDATALEN);

From ae453120085f7da8f4082bb912e9668410cdccab Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Fri, 5 Sep 2025 12:59:29 +0900
Subject: [PATCH 09/73] Change pg_lsn_in_internal() to use soft error reporting

pg_lsn includes pg_lsn_in_internal() for the purpose of parsing an LSN
position for the GUC recovery_target_lsn (21f428ebde39).  It relies on
a boolean called "have_error" that is set when the LSN parsing fails,
leaving its callers to handle any errors.

d9f7f5d32f20 added support for soft error reporting.  This commit
removes some boilerplate code and switches the routine to use soft
error reporting directly, giving the callers of pg_lsn_in_internal()
access to the error message generated on failure.

The pg_lsn_in_internal() routine is renamed to pg_lsn_in_safe(), for
consistency with other similar routines that are given an escontext.
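For callers, the new convention looks roughly like this (a sketch based on the check_recovery_target_lsn() hunk below; ErrorSaveContext and its error_occurred field are the real soft-error API):

    ErrorSaveContext escontext = {T_ErrorSaveContext};
    XLogRecPtr  lsn;

    /* Soft reporting: a parse failure is captured in escontext, not thrown. */
    lsn = pg_lsn_in_safe(str, (Node *) &escontext);
    if (escontext.error_occurred)
        return false;

    /* Passing NULL instead raises a hard ERROR on bad input, as before. */
    lsn = pg_lsn_in_safe(str, NULL);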
Author: Amul Sul Reviewed-by: Dean Rasheed Discussion: https://postgr.es/m/CAAJ_b96No5h5tRuR+KhcC44YcYUCw8WAHuLoqqyyop8_k3+JDQ@mail.gmail.com --- src/backend/access/transam/xlogrecovery.c | 6 ++-- src/backend/utils/adt/pg_lsn.c | 35 ++++++++++------------- src/include/utils/pg_lsn.h | 5 +++- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index f23ec8969c27d..346319338a0ee 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -4834,10 +4834,10 @@ check_recovery_target_lsn(char **newval, void **extra, GucSource source) { XLogRecPtr lsn; XLogRecPtr *myextra; - bool have_error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - lsn = pg_lsn_in_internal(*newval, &have_error); - if (have_error) + lsn = pg_lsn_in_safe(*newval, (Node *) &escontext); + if (escontext.error_occurred) return false; myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr)); diff --git a/src/backend/utils/adt/pg_lsn.c b/src/backend/utils/adt/pg_lsn.c index 12de2446f5b69..e1ec5f3bc69cf 100644 --- a/src/backend/utils/adt/pg_lsn.c +++ b/src/backend/utils/adt/pg_lsn.c @@ -25,8 +25,11 @@ * Formatting and conversion routines. *---------------------------------------------------------*/ +/* + * Internal version of pg_lsn_in() with support for soft error reporting. + */ XLogRecPtr -pg_lsn_in_internal(const char *str, bool *have_error) +pg_lsn_in_safe(const char *str, Node *escontext) { int len1, len2; @@ -34,22 +37,14 @@ pg_lsn_in_internal(const char *str, bool *have_error) off; XLogRecPtr result; - Assert(have_error != NULL); - *have_error = false; - /* Sanity check input format. */ len1 = strspn(str, "0123456789abcdefABCDEF"); if (len1 < 1 || len1 > MAXPG_LSNCOMPONENT || str[len1] != '/') - { - *have_error = true; - return InvalidXLogRecPtr; - } + goto syntax_error; + len2 = strspn(str + len1 + 1, "0123456789abcdefABCDEF"); if (len2 < 1 || len2 > MAXPG_LSNCOMPONENT || str[len1 + 1 + len2] != '\0') - { - *have_error = true; - return InvalidXLogRecPtr; - } + goto syntax_error; /* Decode result. 
*/
 	id = (uint32) strtoul(str, NULL, 16);
@@ -57,6 +52,12 @@ pg_lsn_in_internal(const char *str, bool *have_error)
 	result = ((uint64) id << 32) | off;

 	return result;
+
+syntax_error:
+	ereturn(escontext, InvalidXLogRecPtr,
+			(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+			 errmsg("invalid input syntax for type %s: \"%s\"",
+					"pg_lsn", str)));
 }

 Datum
@@ -64,14 +65,8 @@ pg_lsn_in(PG_FUNCTION_ARGS)
 {
 	char	   *str = PG_GETARG_CSTRING(0);
 	XLogRecPtr	result;
-	bool		have_error = false;
-
-	result = pg_lsn_in_internal(str, &have_error);
-	if (have_error)
-		ereturn(fcinfo->context, (Datum) 0,
-				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
-				 errmsg("invalid input syntax for type %s: \"%s\"",
-						"pg_lsn", str)));
+
+	result = pg_lsn_in_safe(str, fcinfo->context);

 	PG_RETURN_LSN(result);
 }

diff --git a/src/include/utils/pg_lsn.h b/src/include/utils/pg_lsn.h
index ae198af745029..461a4fdcba954 100644
--- a/src/include/utils/pg_lsn.h
+++ b/src/include/utils/pg_lsn.h
@@ -18,6 +18,9 @@
 #include "access/xlogdefs.h"
 #include "fmgr.h"

+/* forward declaration to avoid node.h include */
+typedef struct Node Node;
+
 static inline XLogRecPtr
 DatumGetLSN(Datum X)
 {
@@ -33,6 +36,6 @@ LSNGetDatum(XLogRecPtr X)
 #define PG_GETARG_LSN(n) DatumGetLSN(PG_GETARG_DATUM(n))
 #define PG_RETURN_LSN(x) return LSNGetDatum(x)

-extern XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error);
+extern XLogRecPtr pg_lsn_in_safe(const char *str, Node *escontext);

 #endif							/* PG_LSN_H */

From 4246a977bad6e76c4276a0d52def8a3dced154bb Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Fri, 5 Sep 2025 13:53:47 +0900
Subject: [PATCH 10/73] Switch some numeric-related functions to use soft
 error reporting

This commit changes some functions related to the numeric data type to
use soft error reporting rather than a custom boolean flag (called
"have_error") that callers of these functions could rely on to bypass
the generation of ERROR reports, letting the callers do their own error
handling (the timestamp, jsonpath and numeric_to_char() code paths
require this).

This results in the removal of some boilerplate code that was required
to handle both the ereport() path and the "have_error" path that
bypasses ereport(), unifying everything under the soft error reporting
facility.  While at it, some duplicated error messages are removed.

The functions upgraded in this commit were suffixed with "_opt_error";
they are renamed with a "_safe" suffix instead.

This change relies on d9f7f5d32f20, which introduced the soft error
reporting infrastructure.
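Inside each converted function, the former dual code paths collapse into a single ereturn() call, which reports softly when an escontext is supplied and raises a hard ERROR otherwise. An abbreviated sketch of numeric_int4_safe(), omitting the NaN/infinity handling visible in the diff below:

    int32
    numeric_int4_safe(Numeric num, Node *escontext)
    {
        NumericVar  x;
        int32       result;

        /* Convert to variable format, then convert to int4. */
        init_var_from_num(num, &x);
        if (!numericvar_to_int32(&x, &result))
            ereturn(escontext, 0,
                    (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
                     errmsg("integer out of range")));
        return result;
    }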
Author: Amul Sul Reviewed-by: Dean Rasheed Discussion: https://postgr.es/m/CAAJ_b96No5h5tRuR+KhcC44YcYUCw8WAHuLoqqyyop8_k3+JDQ@mail.gmail.com --- src/backend/utils/adt/formatting.c | 6 +- src/backend/utils/adt/jsonpath_exec.c | 62 +++--- src/backend/utils/adt/numeric.c | 266 ++++++++------------------ src/backend/utils/adt/timestamp.c | 46 ++--- src/include/utils/numeric.h | 22 +-- 5 files changed, 152 insertions(+), 250 deletions(-) diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 7ad453314c307..78e19ac39ac17 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -6389,12 +6389,12 @@ numeric_to_char(PG_FUNCTION_ARGS) if (IS_ROMAN(&Num)) { int32 intvalue; - bool err; + ErrorSaveContext escontext = {T_ErrorSaveContext}; /* Round and convert to int */ - intvalue = numeric_int4_opt_error(value, &err); + intvalue = numeric_int4_safe(value, (Node *) &escontext); /* On overflow, just use PG_INT32_MAX; int_to_roman will cope */ - if (err) + if (escontext.error_occurred) intvalue = PG_INT32_MAX; numstr = int_to_roman(intvalue); } diff --git a/src/backend/utils/adt/jsonpath_exec.c b/src/backend/utils/adt/jsonpath_exec.c index 5a56253522357..8156695e97e09 100644 --- a/src/backend/utils/adt/jsonpath_exec.c +++ b/src/backend/utils/adt/jsonpath_exec.c @@ -252,7 +252,8 @@ typedef JsonPathBool (*JsonPathPredicateCallback) (JsonPathItem *jsp, JsonbValue *larg, JsonbValue *rarg, void *param); -typedef Numeric (*BinaryArithmFunc) (Numeric num1, Numeric num2, bool *error); +typedef Numeric (*BinaryArithmFunc) (Numeric num1, Numeric num2, + Node *escontext); static JsonPathExecResult executeJsonPath(JsonPath *path, void *vars, JsonPathGetVarCallback getVar, @@ -808,23 +809,23 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, case jpiAdd: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_add_opt_error, found); + numeric_add_safe, found); case jpiSub: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_sub_opt_error, found); + numeric_sub_safe, found); case jpiMul: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_mul_opt_error, found); + numeric_mul_safe, found); case jpiDiv: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_div_opt_error, found); + numeric_div_safe, found); case jpiMod: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_mod_opt_error, found); + numeric_mod_safe, found); case jpiPlus: return executeUnaryArithmExpr(cxt, jsp, jb, NULL, found); @@ -1269,11 +1270,12 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jb->type == jbvNumeric) { - bool have_error; + ErrorSaveContext escontext = {T_ErrorSaveContext}; int64 val; - val = numeric_int8_opt_error(jb->val.numeric, &have_error); - if (have_error) + val = numeric_int8_safe(jb->val.numeric, + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("argument \"%s\" of jsonpath item method .%s() is invalid for type %s", @@ -1466,7 +1468,6 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, Datum dtypmod; int32 precision; int32 scale = 0; - bool have_error; bool noerr; ArrayType *arrtypmod; Datum datums[2]; @@ -1478,9 +1479,9 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (elem.type != jpiNumeric) elog(ERROR, "invalid jsonpath item type for .decimal() precision"); - precision = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + precision 
= numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("precision of jsonpath item method .%s() is out of range for type integer", @@ -1492,9 +1493,9 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (elem.type != jpiNumeric) elog(ERROR, "invalid jsonpath item type for .decimal() scale"); - scale = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + scale = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("scale of jsonpath item method .%s() is out of range for type integer", @@ -1550,11 +1551,12 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jb->type == jbvNumeric) { - bool have_error; int32 val; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - val = numeric_int4_opt_error(jb->val.numeric, &have_error); - if (have_error) + val = numeric_int4_safe(jb->val.numeric, + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("argument \"%s\" of jsonpath item method .%s() is invalid for type %s", @@ -2149,11 +2151,11 @@ executeBinaryArithmExpr(JsonPathExecContext *cxt, JsonPathItem *jsp, } else { - bool error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - res = func(lval->val.numeric, rval->val.numeric, &error); + res = func(lval->val.numeric, rval->val.numeric, (Node *) &escontext); - if (error) + if (escontext.error_occurred) return jperError; } @@ -2433,7 +2435,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jsp->type != jpiDatetime && jsp->type != jpiDate && jsp->content.arg) { - bool have_error; + ErrorSaveContext escontext = {T_ErrorSaveContext}; jspGetArg(jsp, &elem); @@ -2441,9 +2443,9 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, elog(ERROR, "invalid jsonpath item type for %s argument", jspOperationName(jsp->type)); - time_precision = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + time_precision = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION), errmsg("time precision of jsonpath item method .%s() is out of range for type integer", @@ -3462,7 +3464,7 @@ getArrayIndex(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbValue *jb, JsonValueList found = {0}; JsonPathExecResult res = executeItem(cxt, jsp, jb, &found); Datum numeric_index; - bool have_error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; if (jperIsError(res)) return res; @@ -3477,10 +3479,10 @@ getArrayIndex(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbValue *jb, NumericGetDatum(jbv->val.numeric), Int32GetDatum(0)); - *index = numeric_int4_opt_error(DatumGetNumeric(numeric_index), - &have_error); + *index = numeric_int4_safe(DatumGetNumeric(numeric_index), + (Node *) &escontext); - if (have_error) + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_INVALID_SQL_JSON_SUBSCRIPT), errmsg("jsonpath array subscript is out of integer range")))); diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index b6287f5d97305..76269918593d7 100644 --- a/src/backend/utils/adt/numeric.c +++ 
b/src/backend/utils/adt/numeric.c @@ -517,7 +517,7 @@ static void numericvar_deserialize(StringInfo buf, NumericVar *var); static Numeric duplicate_numeric(Numeric num); static Numeric make_result(const NumericVar *var); -static Numeric make_result_opt_error(const NumericVar *var, bool *have_error); +static Numeric make_result_safe(const NumericVar *var, Node *escontext); static bool apply_typmod(NumericVar *var, int32 typmod, Node *escontext); static bool apply_typmod_special(Numeric num, int32 typmod, Node *escontext); @@ -717,7 +717,6 @@ numeric_in(PG_FUNCTION_ARGS) */ NumericVar value; int base; - bool have_error; init_var(&value); @@ -776,12 +775,7 @@ numeric_in(PG_FUNCTION_ARGS) if (!apply_typmod(&value, typmod, escontext)) PG_RETURN_NULL(); - res = make_result_opt_error(&value, &have_error); - - if (have_error) - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value overflows numeric format"))); + res = make_result_safe(&value, escontext); free_var(&value); } @@ -2874,20 +2868,18 @@ numeric_add(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_add_opt_error(num1, num2, NULL); + res = numeric_add_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_add_opt_error() - + * numeric_add_safe() - * - * Internal version of numeric_add(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_add() with support for soft error reporting. */ Numeric -numeric_add_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_add_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -2931,7 +2923,7 @@ numeric_add_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); add_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -2951,21 +2943,19 @@ numeric_sub(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_sub_opt_error(num1, num2, NULL); + res = numeric_sub_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_sub_opt_error() - + * numeric_sub_safe() - * - * Internal version of numeric_sub(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_sub() with support for soft error reporting. */ Numeric -numeric_sub_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_sub_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3009,7 +2999,7 @@ numeric_sub_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); sub_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -3029,21 +3019,19 @@ numeric_mul(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_mul_opt_error(num1, num2, NULL); + res = numeric_mul_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_mul_opt_error() - + * numeric_mul_safe() - * - * Internal version of numeric_mul(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_mul() with support for soft error reporting. 
*/ Numeric -numeric_mul_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_mul_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3130,7 +3118,7 @@ numeric_mul_opt_error(Numeric num1, Numeric num2, bool *have_error) if (result.dscale > NUMERIC_DSCALE_MAX) round_var(&result, NUMERIC_DSCALE_MAX); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -3150,21 +3138,19 @@ numeric_div(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_div_opt_error(num1, num2, NULL); + res = numeric_div_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_div_opt_error() - + * numeric_div_safe() - * - * Internal version of numeric_div(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_div() with support for soft error reporting. */ Numeric -numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_div_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3172,9 +3158,6 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) Numeric res; int rscale; - if (have_error) - *have_error = false; - /* * Handle NaN and infinities */ @@ -3189,15 +3172,7 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) switch (numeric_sign_internal(num2)) { case 0: - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - break; + goto division_by_zero; case 1: return make_result(&const_pinf); case -1: @@ -3212,15 +3187,7 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) switch (numeric_sign_internal(num2)) { case 0: - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - break; + goto division_by_zero; case 1: return make_result(&const_ninf); case -1: @@ -3251,25 +3218,25 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) */ rscale = select_div_scale(&arg1, &arg2); - /* - * If "have_error" is provided, check for division by zero here - */ - if (have_error && (arg2.ndigits == 0 || arg2.digits[0] == 0)) - { - *have_error = true; - return NULL; - } + /* Check for division by zero */ + if (arg2.ndigits == 0 || arg2.digits[0] == 0) + goto division_by_zero; /* * Do the divide and return the result */ div_var(&arg1, &arg2, &result, rscale, true, true); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); return res; + +division_by_zero: + ereturn(escontext, NULL, + errcode(ERRCODE_DIVISION_BY_ZERO), + errmsg("division by zero")); } @@ -3374,30 +3341,25 @@ numeric_mod(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_mod_opt_error(num1, num2, NULL); + res = numeric_mod_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_mod_opt_error() - + * numeric_mod_safe() - * - * Internal version of numeric_mod(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_mod() with support for soft error reporting. 
*/ Numeric -numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_mod_safe(Numeric num1, Numeric num2, Node *escontext) { Numeric res; NumericVar arg1; NumericVar arg2; NumericVar result; - if (have_error) - *have_error = false; - /* * Handle NaN and infinities. We follow POSIX fmod() on this, except that * POSIX treats x-is-infinite and y-is-zero identically, raising EDOM and @@ -3410,16 +3372,8 @@ numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) if (NUMERIC_IS_INF(num1)) { if (numeric_sign_internal(num2) == 0) - { - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - } + goto division_by_zero; + /* Inf % any nonzero = NaN */ return make_result(&const_nan); } @@ -3432,22 +3386,22 @@ numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); - /* - * If "have_error" is provided, check for division by zero here - */ - if (have_error && (arg2.ndigits == 0 || arg2.digits[0] == 0)) - { - *have_error = true; - return NULL; - } + /* Check for division by zero */ + if (arg2.ndigits == 0 || arg2.digits[0] == 0) + goto division_by_zero; mod_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, NULL); + res = make_result_safe(&result, escontext); free_var(&result); return res; + +division_by_zero: + ereturn(escontext, NULL, + errcode(ERRCODE_DIVISION_BY_ZERO), + errmsg("division by zero")); } @@ -4404,52 +4358,34 @@ int4_numeric(PG_FUNCTION_ARGS) PG_RETURN_NUMERIC(int64_to_numeric(val)); } +/* + * Internal version of int4_numeric() with support for soft error reporting. + */ int32 -numeric_int4_opt_error(Numeric num, bool *have_error) +numeric_int4_safe(Numeric num, Node *escontext) { NumericVar x; int32 result; - if (have_error) - *have_error = false; - if (NUMERIC_IS_SPECIAL(num)) { - if (have_error) - { - *have_error = true; - return 0; - } + if (NUMERIC_IS_NAN(num)) + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert NaN to %s", "integer"))); else - { - if (NUMERIC_IS_NAN(num)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert NaN to %s", "integer"))); - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert infinity to %s", "integer"))); - } + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert infinity to %s", "integer"))); } /* Convert to variable format, then convert to int4 */ init_var_from_num(num, &x); if (!numericvar_to_int32(&x, &result)) - { - if (have_error) - { - *have_error = true; - return 0; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); - } - } + ereturn(escontext, 0, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range"))); return result; } @@ -4459,7 +4395,7 @@ numeric_int4(PG_FUNCTION_ARGS) { Numeric num = PG_GETARG_NUMERIC(0); - PG_RETURN_INT32(numeric_int4_opt_error(num, NULL)); + PG_RETURN_INT32(numeric_int4_safe(num, NULL)); } /* @@ -4492,52 +4428,34 @@ int8_numeric(PG_FUNCTION_ARGS) PG_RETURN_NUMERIC(int64_to_numeric(val)); } +/* + * Internal version of int8_numeric() with support for soft error reporting. 
+ */ int64 -numeric_int8_opt_error(Numeric num, bool *have_error) +numeric_int8_safe(Numeric num, Node *escontext) { NumericVar x; int64 result; - if (have_error) - *have_error = false; - if (NUMERIC_IS_SPECIAL(num)) { - if (have_error) - { - *have_error = true; - return 0; - } + if (NUMERIC_IS_NAN(num)) + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert NaN to %s", "bigint"))); else - { - if (NUMERIC_IS_NAN(num)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert NaN to %s", "bigint"))); - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert infinity to %s", "bigint"))); - } + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert infinity to %s", "bigint"))); } /* Convert to variable format, then convert to int8 */ init_var_from_num(num, &x); if (!numericvar_to_int64(&x, &result)) - { - if (have_error) - { - *have_error = true; - return 0; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("bigint out of range"))); - } - } + ereturn(escontext, 0, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("bigint out of range"))); return result; } @@ -4547,7 +4465,7 @@ numeric_int8(PG_FUNCTION_ARGS) { Numeric num = PG_GETARG_NUMERIC(0); - PG_RETURN_INT64(numeric_int8_opt_error(num, NULL)); + PG_RETURN_INT64(numeric_int8_safe(num, NULL)); } @@ -7583,16 +7501,13 @@ duplicate_numeric(Numeric num) } /* - * make_result_opt_error() - + * make_result_safe() - * * Create the packed db numeric format in palloc()'d memory from * a variable. This will handle NaN and Infinity cases. - * - * If "have_error" isn't NULL, on overflow *have_error is set to true and - * NULL is returned. This is helpful when caller needs to handle errors. */ static Numeric -make_result_opt_error(const NumericVar *var, bool *have_error) +make_result_safe(const NumericVar *var, Node *escontext) { Numeric result; NumericDigit *digits = var->digits; @@ -7601,9 +7516,6 @@ make_result_opt_error(const NumericVar *var, bool *have_error) int n; Size len; - if (have_error) - *have_error = false; - if ((sign & NUMERIC_SIGN_MASK) == NUMERIC_SPECIAL) { /* @@ -7676,19 +7588,9 @@ make_result_opt_error(const NumericVar *var, bool *have_error) /* Check for overflow of int16 fields */ if (NUMERIC_WEIGHT(result) != weight || NUMERIC_DSCALE(result) != var->dscale) - { - if (have_error) - { - *have_error = true; - return NULL; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value overflows numeric format"))); - } - } + ereturn(escontext, NULL, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value overflows numeric format"))); dump_numeric("make_result()", result); return result; @@ -7698,12 +7600,12 @@ make_result_opt_error(const NumericVar *var, bool *have_error) /* * make_result() - * - * An interface to make_result_opt_error() without "have_error" argument. + * An interface to make_result_safe() without "escontext" argument. 
*/ static Numeric make_result(const NumericVar *var) { - return make_result_opt_error(var, NULL); + return make_result_safe(var, NULL); } diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 3e5f9dc1458e1..156a4830ffda6 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -5629,11 +5629,11 @@ timestamp_part_common(PG_FUNCTION_ARGS, bool retnumeric) case DTK_JULIAN: if (retnumeric) - PG_RETURN_NUMERIC(numeric_add_opt_error(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), - numeric_div_opt_error(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), - int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), - NULL), - NULL)); + PG_RETURN_NUMERIC(numeric_add_safe(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), + numeric_div_safe(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), + int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), + NULL), + NULL)); else PG_RETURN_FLOAT8(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) + ((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + @@ -5685,11 +5685,11 @@ timestamp_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(timestamp - epoch, 6); else { - result = numeric_div_opt_error(numeric_sub_opt_error(int64_to_numeric(timestamp), - int64_to_numeric(epoch), - NULL), - int64_to_numeric(1000000), - NULL); + result = numeric_div_safe(numeric_sub_safe(int64_to_numeric(timestamp), + int64_to_numeric(epoch), + NULL), + int64_to_numeric(1000000), + NULL); result = DatumGetNumeric(DirectFunctionCall2(numeric_round, NumericGetDatum(result), Int32GetDatum(6))); @@ -5903,11 +5903,11 @@ timestamptz_part_common(PG_FUNCTION_ARGS, bool retnumeric) case DTK_JULIAN: if (retnumeric) - PG_RETURN_NUMERIC(numeric_add_opt_error(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), - numeric_div_opt_error(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), - int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), - NULL), - NULL)); + PG_RETURN_NUMERIC(numeric_add_safe(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), + numeric_div_safe(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), + int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), + NULL), + NULL)); else PG_RETURN_FLOAT8(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) + ((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + @@ -5956,11 +5956,11 @@ timestamptz_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(timestamp - epoch, 6); else { - result = numeric_div_opt_error(numeric_sub_opt_error(int64_to_numeric(timestamp), - int64_to_numeric(epoch), - NULL), - int64_to_numeric(1000000), - NULL); + result = numeric_div_safe(numeric_sub_safe(int64_to_numeric(timestamp), + int64_to_numeric(epoch), + NULL), + int64_to_numeric(1000000), + NULL); result = DatumGetNumeric(DirectFunctionCall2(numeric_round, NumericGetDatum(result), Int32GetDatum(6))); @@ -6247,9 +6247,9 @@ interval_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(val, 6); else result = - numeric_add_opt_error(int64_div_fast_to_numeric(interval->time, 6), - int64_to_numeric(secs_from_day_month), - NULL); + 
numeric_add_safe(int64_div_fast_to_numeric(interval->time, 6),
+								 int64_to_numeric(secs_from_day_month),
+								 NULL);

 			PG_RETURN_NUMERIC(result);
 		}

diff --git a/src/include/utils/numeric.h b/src/include/utils/numeric.h
index 9e79fc376cbea..215f1ea4f53b4 100644
--- a/src/include/utils/numeric.h
+++ b/src/include/utils/numeric.h
@@ -17,6 +17,9 @@
 #include "common/pg_prng.h"
 #include "fmgr.h"

+/* forward declaration to avoid node.h include */
+typedef struct Node Node;
+
 /*
  * Limits on the precision and scale specifiable in a NUMERIC typmod.  The
  * precision is strictly positive, but the scale may be positive or negative.
@@ -91,18 +94,13 @@ extern char *numeric_normalize(Numeric num);

 extern Numeric int64_to_numeric(int64 val);
 extern Numeric int64_div_fast_to_numeric(int64 val1, int log10val2);

-extern Numeric numeric_add_opt_error(Numeric num1, Numeric num2,
-									 bool *have_error);
-extern Numeric numeric_sub_opt_error(Numeric num1, Numeric num2,
-									 bool *have_error);
-extern Numeric numeric_mul_opt_error(Numeric num1, Numeric num2,
-									 bool *have_error);
-extern Numeric numeric_div_opt_error(Numeric num1, Numeric num2,
-									 bool *have_error);
-extern Numeric numeric_mod_opt_error(Numeric num1, Numeric num2,
-									 bool *have_error);
-extern int32 numeric_int4_opt_error(Numeric num, bool *have_error);
-extern int64 numeric_int8_opt_error(Numeric num, bool *have_error);
+extern Numeric numeric_add_safe(Numeric num1, Numeric num2, Node *escontext);
+extern Numeric numeric_sub_safe(Numeric num1, Numeric num2, Node *escontext);
+extern Numeric numeric_mul_safe(Numeric num1, Numeric num2, Node *escontext);
+extern Numeric numeric_div_safe(Numeric num1, Numeric num2, Node *escontext);
+extern Numeric numeric_mod_safe(Numeric num1, Numeric num2, Node *escontext);
+extern int32 numeric_int4_safe(Numeric num, Node *escontext);
+extern int64 numeric_int8_safe(Numeric num, Node *escontext);

 extern Numeric random_numeric(pg_prng_state *state, Numeric rmin,
 							  Numeric rmax);

From 567d27e8e2b752743626eb259ba75ecdc936eaf3 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Fri, 5 Sep 2025 14:10:08 +0900
Subject: [PATCH 11/73] Fix outdated comments in slru.c

SlruRecentlyUsed() has been an inline function since 53c2a97a9266, not
a macro.  The description of long_segment_names was missing from the
comment at the top of SimpleLruInit(), an omission in 4ed8f0913bfd.

Author: Julien Rouhaud
Discussion: https://postgr.es/m/aLpBLMOYwEQkaleF@jrouhaud
Backpatch-through: 17
---
 src/backend/access/transam/slru.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index d7ebd889aea71..5d3fcd62c9443 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -246,6 +246,7 @@ SimpleLruAutotuneBuffers(int divisor, int max)
 * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks.
 * bank_tranche_id: tranche ID to use for the bank LWLocks.
* sync_handler: which set of functions to use to handle sync requests + * long_segment_names: use short or long segment names */ void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, @@ -644,7 +645,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid) shared->page_number[slotno] == pageno && shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) { - /* See comments for SlruRecentlyUsed macro */ + /* See comments for SlruRecentlyUsed() */ SlruRecentlyUsed(shared, slotno); /* update the stats counter of pages found in the SLRU */ From 6ede13d1b5f515df0a199a7a830e448dab1511c0 Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Fri, 5 Sep 2025 08:18:18 +0100 Subject: [PATCH 12/73] Fix concurrent update issue with MERGE. When executing a MERGE UPDATE action, if there is more than one concurrent update of the target row, the lock-and-retry code would sometimes incorrectly identify the latest version of the target tuple, leading to incorrect results. This was caused by using the ctid field from the TM_FailureData returned by table_tuple_lock() in a case where the result was TM_Ok, which is unsafe because the TM_FailureData struct is not guaranteed to be fully populated in that case. Instead, it should use the tupleid passed to (and updated by) table_tuple_lock(). To reduce the chances of similar errors in the future, improve the commentary for table_tuple_lock() and TM_FailureData to make it clearer that table_tuple_lock() updates the tid passed to it, and most fields of TM_FailureData should not be relied on in non-failure cases. An exception to this is the "traversed" field, which is set in both success and failure cases. Reported-by: Dmitry Author: Yugo Nagata Reviewed-by: Dean Rasheed Reviewed-by: Chao Li Discussion: https://postgr.es/m/1570d30e-2b95-4239-b9c3-f7bf2f2f8556@yandex.ru Backpatch-through: 15 --- src/backend/executor/nodeModifyTable.c | 9 +- src/include/access/tableam.h | 15 +- .../expected/merge-match-recheck.out | 145 ++++++++++++++++++ .../isolation/specs/merge-match-recheck.spec | 18 +++ 4 files changed, 179 insertions(+), 8 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index b0c4e2c0d32a4..4c5647ac38a1c 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3402,7 +3402,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * the tuple moved, and setting our current * resultRelInfo to that. 
*/ - if (ItemPointerIndicatesMovedPartitions(&context->tmfd.ctid)) + if (ItemPointerIndicatesMovedPartitions(tupleid)) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("tuple to be merged was already moved to another partition due to concurrent update"))); @@ -3450,12 +3450,13 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (ItemPointerIsValid(&lockedtid)) UnlockTuple(resultRelInfo->ri_RelationDesc, &lockedtid, InplaceUpdateTupleLock); - LockTuple(resultRelInfo->ri_RelationDesc, &context->tmfd.ctid, + LockTuple(resultRelInfo->ri_RelationDesc, tupleid, InplaceUpdateTupleLock); - lockedtid = context->tmfd.ctid; + lockedtid = *tupleid; } + if (!table_tuple_fetch_row_version(resultRelationDesc, - &context->tmfd.ctid, + tupleid, SnapshotAny, resultRelInfo->ri_oldTupleSlot)) elog(ERROR, "failed to fetch the target tuple"); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 1c9e802a6b128..b2ce35e2a3407 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -121,7 +121,9 @@ typedef enum TU_UpdateIndexes /* * When table_tuple_update, table_tuple_delete, or table_tuple_lock fail * because the target tuple is already outdated, they fill in this struct to - * provide information to the caller about what happened. + * provide information to the caller about what happened. When those functions + * succeed, the contents of this struct should not be relied upon, except for + * `traversed`, which may be set in both success and failure cases. * * ctid is the target's ctid link: it is the same as the target's TID if the * target was deleted, or the location of the replacement tuple if the target @@ -137,6 +139,9 @@ typedef enum TU_UpdateIndexes * tuple); otherwise cmax is zero. (We make this restriction because * HeapTupleHeaderGetCmax doesn't work for tuples outdated in other * transactions.) + * + * traversed indicates if an update chain was followed in order to try to lock + * the target tuple. (This may be set in both success and failure cases.) */ typedef struct TM_FailureData { @@ -1508,7 +1513,7 @@ table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, * * Input parameters: * relation: relation containing tuple (caller must hold suitable lock) - * tid: TID of tuple to lock + * tid: TID of tuple to lock (updated if an update chain was followed) * snapshot: snapshot to use for visibility determinations * cid: current command ID (used for visibility test, and stored into * tuple's cmax if lock is successful) @@ -1533,8 +1538,10 @@ table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, * TM_WouldBlock: lock couldn't be acquired and wait_policy is skip * * In the failure cases other than TM_Invisible and TM_Deleted, the routine - * fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax. See - * comments for struct TM_FailureData for additional info. + * fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax. + * Additionally, in both success and failure cases, tmfd->traversed is set if + * an update chain was followed. See comments for struct TM_FailureData for + * additional info. 
*/ static inline TM_Result table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, diff --git a/src/test/isolation/expected/merge-match-recheck.out b/src/test/isolation/expected/merge-match-recheck.out index 90300f1db5ab3..4250b85af2d3c 100644 --- a/src/test/isolation/expected/merge-match-recheck.out +++ b/src/test/isolation/expected/merge-match-recheck.out @@ -271,6 +271,151 @@ key|balance|status|val step c1: COMMIT; +starting permutation: update1 update6 merge_bal c2 select1 c1 +step update1: UPDATE target t SET balance = balance + 10, val = t.val || ' updated by update1' WHERE t.key = 1; +step update6: UPDATE target t SET balance = balance - 100, val = t.val || ' updated by update6' WHERE t.key = 1; +step merge_bal: + MERGE INTO target t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal: <... completed> +step select1: SELECT * FROM target; +key|balance|status|val +---+-------+------+------------------------------------------------- + 1| 140|s1 |setup updated by update1 updated by update6 when1 +(1 row) + +step c1: COMMIT; + +starting permutation: update1_pa update6_pa merge_bal_pa c2 select1_pa c1 +step update1_pa: UPDATE target_pa t SET balance = balance + 10, val = t.val || ' updated by update1_pa' WHERE t.key = 1; +step update6_pa: UPDATE target_pa t SET balance = balance - 100, val = t.val || ' updated by update6_pa' WHERE t.key = 1; +step merge_bal_pa: + MERGE INTO target_pa t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal_pa: <... 
completed> +step select1_pa: SELECT * FROM target_pa; +key|balance|status|val +---+-------+------+------------------------------------------------------- + 1| 140|s1 |setup updated by update1_pa updated by update6_pa when1 +(1 row) + +step c1: COMMIT; + +starting permutation: update1_tg update6_tg merge_bal_tg c2 select1_tg c1 +s2: NOTICE: Update: (1,160,s1,setup) -> (1,170,s1,"setup updated by update1_tg") +step update1_tg: UPDATE target_tg t SET balance = balance + 10, val = t.val || ' updated by update1_tg' WHERE t.key = 1; +s2: NOTICE: Update: (1,170,s1,"setup updated by update1_tg") -> (1,70,s1,"setup updated by update1_tg updated by update6_tg") +step update6_tg: UPDATE target_tg t SET balance = balance - 100, val = t.val || ' updated by update6_tg' WHERE t.key = 1; +step merge_bal_tg: + WITH t AS ( + MERGE INTO target_tg t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3' + RETURNING t.* + ) + SELECT * FROM t; + +step c2: COMMIT; +s1: NOTICE: Update: (1,70,s1,"setup updated by update1_tg updated by update6_tg") -> (1,140,s1,"setup updated by update1_tg updated by update6_tg when1") +step merge_bal_tg: <... completed> +key|balance|status|val +---+-------+------+------------------------------------------------------- + 1| 140|s1 |setup updated by update1_tg updated by update6_tg when1 +(1 row) + +step select1_tg: SELECT * FROM target_tg; +key|balance|status|val +---+-------+------+------------------------------------------------------- + 1| 140|s1 |setup updated by update1_tg updated by update6_tg when1 +(1 row) + +step c1: COMMIT; + +starting permutation: update7 update6 merge_bal c2 select1 c1 +step update7: UPDATE target t SET balance = 350, val = t.val || ' updated by update7' WHERE t.key = 1; +step update6: UPDATE target t SET balance = balance - 100, val = t.val || ' updated by update6' WHERE t.key = 1; +step merge_bal: + MERGE INTO target t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal: <... completed> +step select1: SELECT * FROM target; +key|balance|status|val +---+-------+------+------------------------------------------------- + 1| 2000|s1 |setup updated by update7 updated by update6 when3 +(1 row) + +step c1: COMMIT; + +starting permutation: update1_pa_move merge_bal_pa c2 c1 +step update1_pa_move: UPDATE target_pa t SET balance = 210, val = t.val || ' updated by update1_pa_move' WHERE t.key = 1; +step merge_bal_pa: + MERGE INTO target_pa t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal_pa: <... 
completed> +ERROR: tuple to be locked was already moved to another partition due to concurrent update +step c1: COMMIT; + +starting permutation: update1_pa update1_pa_move merge_bal_pa c2 c1 +step update1_pa: UPDATE target_pa t SET balance = balance + 10, val = t.val || ' updated by update1_pa' WHERE t.key = 1; +step update1_pa_move: UPDATE target_pa t SET balance = 210, val = t.val || ' updated by update1_pa_move' WHERE t.key = 1; +step merge_bal_pa: + MERGE INTO target_pa t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal_pa: <... completed> +ERROR: tuple to be locked was already moved to another partition due to concurrent update +step c1: COMMIT; + starting permutation: update1 merge_delete c2 select1 c1 step update1: UPDATE target t SET balance = balance + 10, val = t.val || ' updated by update1' WHERE t.key = 1; step merge_delete: diff --git a/src/test/isolation/specs/merge-match-recheck.spec b/src/test/isolation/specs/merge-match-recheck.spec index 15226e40c9efc..6e7a776d17e5a 100644 --- a/src/test/isolation/specs/merge-match-recheck.spec +++ b/src/test/isolation/specs/merge-match-recheck.spec @@ -146,6 +146,8 @@ setup BEGIN ISOLATION LEVEL READ COMMITTED; } step "update1" { UPDATE target t SET balance = balance + 10, val = t.val || ' updated by update1' WHERE t.key = 1; } +step "update1_pa" { UPDATE target_pa t SET balance = balance + 10, val = t.val || ' updated by update1_pa' WHERE t.key = 1; } +step "update1_pa_move" { UPDATE target_pa t SET balance = 210, val = t.val || ' updated by update1_pa_move' WHERE t.key = 1; } step "update1_tg" { UPDATE target_tg t SET balance = balance + 10, val = t.val || ' updated by update1_tg' WHERE t.key = 1; } step "update2" { UPDATE target t SET status = 's2', val = t.val || ' updated by update2' WHERE t.key = 1; } step "update2_tg" { UPDATE target_tg t SET status = 's2', val = t.val || ' updated by update2_tg' WHERE t.key = 1; } @@ -153,6 +155,10 @@ step "update3" { UPDATE target t SET status = 's3', val = t.val || ' updated by step "update3_tg" { UPDATE target_tg t SET status = 's3', val = t.val || ' updated by update3_tg' WHERE t.key = 1; } step "update5" { UPDATE target t SET status = 's5', val = t.val || ' updated by update5' WHERE t.key = 1; } step "update5_tg" { UPDATE target_tg t SET status = 's5', val = t.val || ' updated by update5_tg' WHERE t.key = 1; } +step "update6" { UPDATE target t SET balance = balance - 100, val = t.val || ' updated by update6' WHERE t.key = 1; } +step "update6_pa" { UPDATE target_pa t SET balance = balance - 100, val = t.val || ' updated by update6_pa' WHERE t.key = 1; } +step "update6_tg" { UPDATE target_tg t SET balance = balance - 100, val = t.val || ' updated by update6_tg' WHERE t.key = 1; } +step "update7" { UPDATE target t SET balance = 350, val = t.val || ' updated by update7' WHERE t.key = 1; } step "update_bal1" { UPDATE target t SET balance = 50, val = t.val || ' updated by update_bal1' WHERE t.key = 1; } step "update_bal1_pa" { UPDATE target_pa t SET balance = 50, val = t.val || ' updated by update_bal1_pa' WHERE t.key = 1; } step "update_bal1_tg" { UPDATE target_tg t SET balance = 50, val = t.val || ' updated by update_bal1_tg' WHERE t.key = 1; } @@ -179,6 +185,18 @@ 
permutation "update_bal1" "merge_bal" "c2" "select1" "c1" permutation "update_bal1_pa" "merge_bal_pa" "c2" "select1_pa" "c1" permutation "update_bal1_tg" "merge_bal_tg" "c2" "select1_tg" "c1" +# merge_bal sees row concurrently updated twice and rechecks WHEN conditions, different check passes, so final balance = 140 +permutation "update1" "update6" "merge_bal" "c2" "select1" "c1" +permutation "update1_pa" "update6_pa" "merge_bal_pa" "c2" "select1_pa" "c1" +permutation "update1_tg" "update6_tg" "merge_bal_tg" "c2" "select1_tg" "c1" + +# merge_bal sees row concurrently updated twice, first update would cause all checks to fail, second update causes different check to pass, so final balance = 2000 +permutation "update7" "update6" "merge_bal" "c2" "select1" "c1" + +# merge_bal sees concurrently updated row moved to new partition, so fails +permutation "update1_pa_move" "merge_bal_pa" "c2" "c1" +permutation "update1_pa" "update1_pa_move" "merge_bal_pa" "c2" "c1" + # merge_delete sees concurrently updated row and rechecks WHEN conditions, but recheck passes and row is deleted permutation "update1" "merge_delete" "c2" "select1" "c1" permutation "update1_tg" "merge_delete_tg" "c2" "select1_tg" "c1" From e3d5ddb7ca91e5982e9d4cff9eef210d97e4f47e Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Fri, 5 Sep 2025 09:33:36 -0400 Subject: [PATCH 13/73] Add assert and log message to visibilitymap_set Add an assert to visibilitymap_set() that the provided heap buffer is exclusively locked, which is expected. Also, enhance the debug logging message to specify which VM flags were set. Based on a related suggestion by Kirill Reshke on an in-progress patchset. Author: Melanie Plageman Reviewed-by: Kirill Reshke Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CALdSSPhAU56g1gGVT0%2BwG8RrSWE6qW8TOfNJS1HNAWX6wPgbFA%40mail.gmail.com --- src/backend/access/heap/visibilitymap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 953ad4a484399..7306c16f05cd3 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -255,7 +255,8 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, uint8 status; #ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); + elog(DEBUG1, "vm_set flags 0x%02X for %s %d", + flags, RelationGetRelationName(rel), heapBlk); #endif Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); @@ -269,6 +270,8 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); + Assert(!BufferIsValid(heapBuf) || BufferIsExclusiveLocked(heapBuf)); + /* Check that we have the right VM page pinned */ if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); From 50e4c6ace5e69fbe69868c270a1c76acd4cb12ec Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 5 Sep 2025 12:25:59 -0400 Subject: [PATCH 14/73] bufmgr: Use consistent naming of the clock-sweep algorithm Minor edits to comments only. 
Author: Greg Burd Reviewed-by: Tomas Vondra Reviewed-by: Andres Freund Discussion: https://postgr.es/m/70C6A5B5-2A20-4D0B-BC73-EB09DD62D61C@getmailspring.com --- src/backend/storage/buffer/README | 4 ++-- src/backend/storage/buffer/bufmgr.c | 8 ++++---- src/backend/storage/buffer/freelist.c | 10 +++++----- src/backend/storage/buffer/localbuf.c | 2 +- src/include/storage/buf_internals.h | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index a182fcd660ccb..4b13da5d7add8 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -211,9 +211,9 @@ Buffer Ring Replacement Strategy When running a query that needs to access a large number of pages just once, such as VACUUM or a large sequential scan, a different strategy is used. A page that has been touched only by such a scan is unlikely to be needed -again soon, so instead of running the normal clock sweep algorithm and +again soon, so instead of running the normal clock-sweep algorithm and blowing out the entire buffer cache, a small ring of buffers is allocated -using the normal clock sweep algorithm and those buffers are reused for the +using the normal clock-sweep algorithm and those buffers are reused for the whole scan. This also implies that much of the write traffic caused by such a statement will be done by the backend itself and not pushed off onto other processes. diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 350cc0402aa8f..9fc906a4a4082 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3608,7 +3608,7 @@ BufferSync(int flags) * This is called periodically by the background writer process. * * Returns true if it's appropriate for the bgwriter process to go into - * low-power hibernation mode. (This happens if the strategy clock sweep + * low-power hibernation mode. (This happens if the strategy clock-sweep * has been "lapped" and no buffer allocations have occurred recently, * or if the bgwriter has been effectively disabled by setting * bgwriter_lru_maxpages to 0.) @@ -3658,7 +3658,7 @@ BgBufferSync(WritebackContext *wb_context) uint32 new_recent_alloc; /* - * Find out where the freelist clock sweep currently is, and how many + * Find out where the freelist clock-sweep currently is, and how many * buffer allocations have happened since our last call. */ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); @@ -3679,8 +3679,8 @@ BgBufferSync(WritebackContext *wb_context) /* * Compute strategy_delta = how many buffers have been scanned by the - * clock sweep since last time. If first time through, assume none. Then - * see if we are still ahead of the clock sweep, and if so, how many + * clock-sweep since last time. If first time through, assume none. Then + * see if we are still ahead of the clock-sweep, and if so, how many * buffers we could scan before we'd catch up with it and "lap" it. Note: * weird-looking coding of xxx_passes comparisons are to avoid bogus * behavior when the passes counts wrap around. diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 01909be027258..cd94a7d8a7b39 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -33,7 +33,7 @@ typedef struct slock_t buffer_strategy_lock; /* - * Clock sweep hand: index of next buffer to consider grabbing. 
Note that + * clock-sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to * get an actual buffer, it needs to be used modulo NBuffers. */ @@ -51,7 +51,7 @@ typedef struct * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. */ - uint32 completePasses; /* Complete cycles of the clock sweep */ + uint32 completePasses; /* Complete cycles of the clock-sweep */ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ /* @@ -311,7 +311,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r } } - /* Nothing on the freelist, so run the "clock sweep" algorithm */ + /* Nothing on the freelist, so run the "clock-sweep" algorithm */ trycounter = NBuffers; for (;;) { @@ -511,7 +511,7 @@ StrategyInitialize(bool init) StrategyControl->firstFreeBuffer = 0; StrategyControl->lastFreeBuffer = NBuffers - 1; - /* Initialize the clock sweep pointer */ + /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); /* Clear statistics */ @@ -759,7 +759,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) * * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, * since our own previous usage of the ring element would have left it - * there, but it might've been decremented by clock sweep since then). A + * there, but it might've been decremented by clock-sweep since then). A * higher usage_count indicates someone else has touched the buffer, so we * shouldn't re-use it. */ diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3c0d20f4659d2..04fef13409b02 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -229,7 +229,7 @@ GetLocalVictimBuffer(void) ResourceOwnerEnlarge(CurrentResourceOwner); /* - * Need to get a new buffer. We use a clock sweep algorithm (essentially + * Need to get a new buffer. We use a clock-sweep algorithm (essentially * the same as what freelist.c does now...) */ trycounter = NLocBuffer; diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 52a71b138f736..3a210c710f633 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -80,8 +80,8 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32, * The maximum allowed value of usage_count represents a tradeoff between * accuracy and speed of the clock-sweep buffer management algorithm. A * large value (comparable to NBuffers) would approximate LRU semantics. - * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of - * clock sweeps to find a free buffer, so in practice we don't want the + * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of the + * clock-sweep hand to find a free buffer, so in practice we don't want the * value to be very large. */ #define BM_MAX_USAGE_COUNT 5 From 2c789405275928ce0d2ceb7c4add91d27df92502 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 5 Sep 2025 12:25:59 -0400 Subject: [PATCH 15/73] bufmgr: Remove freelist, always use clock-sweep This set of changes removes the list of available buffers and instead simply uses the clock-sweep algorithm to find and return an available buffer. This also removes the have_free_buffer() function and simply caps the pg_autoprewarm process to at most NBuffers. 
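To make the control flow that remains concrete, here is a self-contained toy
version of clock-sweep victim selection (pin and usage counts only; the
buffer mapping table, header spinlocks, and access-strategy rings are all
omitted, and every name is invented for the sketch rather than taken from the
patch):

    #include <stdio.h>

    #define NBUF 8

    struct toybuf { int pins; int usage; };

    static struct toybuf pool[NBUF];
    static unsigned hand;            /* the clock-sweep hand */

    /* Return the index of a victim buffer, or -1 if everything is pinned. */
    static int
    get_victim(void)
    {
        int spins = NBUF;

        for (;;)
        {
            unsigned idx = hand++ % NBUF;
            struct toybuf *b = &pool[idx];

            if (b->pins == 0)
            {
                if (b->usage == 0)
                    return (int) idx;   /* found a victim; caller would pin it */
                b->usage--;             /* recently used: age it and keep sweeping */
                spins = NBUF;           /* progress was made; reset the give-up counter */
            }
            else if (--spins == 0)
                return -1;              /* a full lap of pinned buffers: give up */
        }
    }

    int
    main(void)
    {
        pool[0].pins = 1;               /* pinned buffers are skipped */
        pool[1].usage = 2;              /* aged on each pass of the hand */
        printf("victim = %d\n", get_victim());   /* prints "victim = 2" */
        return 0;
    }

The spins counter plays the role of the patch's trycounter: a complete lap in
which every buffer is pinned means no victim can be produced.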
While on the surface this appears to be removing an optimization, it is
in fact eliminating code that induces overhead in the form of
synchronization that is problematic for multi-core systems. The main reason
for removing the freelist, however, is not the moderate improvement in
scalability, but that having the freelist would require dedicated complexity
in several upcoming patches. As we have not been able to find a case
benefiting from the freelist...

Author: Greg Burd
Reviewed-by: Tomas Vondra
Reviewed-by: Andres Freund
Discussion: https://postgr.es/m/70C6A5B5-2A20-4D0B-BC73-EB09DD62D61C@getmailspring.com
---
 contrib/pg_prewarm/autoprewarm.c      |  30 +++----
 src/backend/storage/buffer/README     |  40 +++------
 src/backend/storage/buffer/buf_init.c |   9 --
 src/backend/storage/buffer/bufmgr.c   |  29 +------
 src/backend/storage/buffer/freelist.c | 119 +-------------------------
 src/include/storage/buf_internals.h   |  13 +--
 6 files changed, 31 insertions(+), 209 deletions(-)

diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index 880e897796a1e..8b68dafc2611c 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -370,6 +370,15 @@ apw_load_buffers(void)
 	apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
 	apw_state->prewarmed_blocks = 0;
 
+	/* Don't prewarm more than we can fit. */
+	if (num_elements > NBuffers)
+	{
+		num_elements = NBuffers;
+		ereport(LOG,
+				(errmsg("autoprewarm capping prewarmed blocks to %d (shared_buffers size)",
+						NBuffers)));
+	}
+
 	/* Get the info position of the first block of the next database. */
 	while (apw_state->prewarm_start_idx < num_elements)
 	{
@@ -410,10 +419,6 @@ apw_load_buffers(void)
 		apw_state->database = current_db;
 		Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
 
-		/* If we've run out of free buffers, don't launch another worker. */
-		if (!have_free_buffer())
-			break;
-
 		/*
 		 * Likewise, don't launch if we've already been told to shut down.
 		 * (The launch would fail anyway, but we might as well skip it.)
@@ -462,12 +467,6 @@ apw_read_stream_next_block(ReadStream *stream,
 	{
 		BlockInfoRecord blk = p->block_info[p->pos];
 
-		if (!have_free_buffer())
-		{
-			p->pos = apw_state->prewarm_stop_idx;
-			return InvalidBlockNumber;
-		}
-
 		if (blk.tablespace != p->tablespace)
 			return InvalidBlockNumber;
 
@@ -523,10 +522,10 @@ autoprewarm_database_main(Datum main_arg)
 	blk = block_info[i];
 
 	/*
-	 * Loop until we run out of blocks to prewarm or until we run out of free
+	 * Loop until we run out of blocks to prewarm or until we run out of
 	 * buffers.
 	 */
-	while (i < apw_state->prewarm_stop_idx && have_free_buffer())
+	while (i < apw_state->prewarm_stop_idx)
 	{
 		Oid			tablespace = blk.tablespace;
 		RelFileNumber filenumber = blk.filenumber;
@@ -568,14 +567,13 @@ autoprewarm_database_main(Datum main_arg)
 
 		/*
 		 * We have a relation; now let's loop until we find a valid fork of
-		 * the relation or we run out of free buffers. Once we've read from
-		 * all valid forks or run out of options, we'll close the relation and
+		 * the relation or we run out of buffers. Once we've read from all
+		 * valid forks or run out of options, we'll close the relation and
		 * move on.
*/ while (i < apw_state->prewarm_stop_idx && blk.tablespace == tablespace && - blk.filenumber == filenumber && - have_free_buffer()) + blk.filenumber == filenumber) { ForkNumber forknum = blk.forknum; BlockNumber nblocks; diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index 4b13da5d7add8..119f31b5d6584 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -128,11 +128,11 @@ independently. If it is necessary to lock more than one partition at a time, they must be locked in partition-number order to avoid risk of deadlock. * A separate system-wide spinlock, buffer_strategy_lock, provides mutual -exclusion for operations that access the buffer free list or select -buffers for replacement. A spinlock is used here rather than a lightweight -lock for efficiency; no other locks of any sort should be acquired while -buffer_strategy_lock is held. This is essential to allow buffer replacement -to happen in multiple backends with reasonable concurrency. +exclusion for operations that select buffers for replacement. A spinlock is +used here rather than a lightweight lock for efficiency; no other locks of any +sort should be acquired while buffer_strategy_lock is held. This is essential +to allow buffer replacement to happen in multiple backends with reasonable +concurrency. * Each buffer header contains a spinlock that must be taken when examining or changing fields of that buffer header. This allows operations such as @@ -158,18 +158,8 @@ unset by sleeping on the buffer's condition variable. Normal Buffer Replacement Strategy ---------------------------------- -There is a "free list" of buffers that are prime candidates for replacement. -In particular, buffers that are completely free (contain no valid page) are -always in this list. We could also throw buffers into this list if we -consider their pages unlikely to be needed soon; however, the current -algorithm never does that. The list is singly-linked using fields in the -buffer headers; we maintain head and tail pointers in global variables. -(Note: although the list links are in the buffer headers, they are -considered to be protected by the buffer_strategy_lock, not the buffer-header -spinlocks.) To choose a victim buffer to recycle when there are no free -buffers available, we use a simple clock-sweep algorithm, which avoids the -need to take system-wide locks during common operations. It works like -this: +To choose a victim buffer to recycle we use a simple clock-sweep algorithm. It +works like this: Each buffer header contains a usage counter, which is incremented (up to a small limit value) whenever the buffer is pinned. (This requires only the @@ -184,20 +174,14 @@ The algorithm for a process that needs to obtain a victim buffer is: 1. Obtain buffer_strategy_lock. -2. If buffer free list is nonempty, remove its head buffer. Release -buffer_strategy_lock. If the buffer is pinned or has a nonzero usage count, -it cannot be used; ignore it go back to step 1. Otherwise, pin the buffer, -and return it. +2. Select the buffer pointed to by nextVictimBuffer, and circularly advance +nextVictimBuffer for next time. Release buffer_strategy_lock. -3. Otherwise, the buffer free list is empty. Select the buffer pointed to by -nextVictimBuffer, and circularly advance nextVictimBuffer for next time. -Release buffer_strategy_lock. - -4. If the selected buffer is pinned or has a nonzero usage count, it cannot +3. 
If the selected buffer is pinned or has a nonzero usage count, it cannot be used. Decrement its usage count (if nonzero), reacquire buffer_strategy_lock, and return to step 3 to examine the next buffer. -5. Pin the selected buffer, and return. +4. Pin the selected buffer, and return. (Note that if the selected buffer is dirty, we will have to write it out before we can recycle it; if someone else pins the buffer meanwhile we will @@ -234,7 +218,7 @@ the ring strategy effectively degrades to the normal strategy. VACUUM uses a ring like sequential scans, however, the size of this ring is controlled by the vacuum_buffer_usage_limit GUC. Dirty pages are not removed -from the ring. Instead, WAL is flushed if needed to allow reuse of the +from the ring. Instead, the WAL is flushed if needed to allow reuse of the buffers. Before introducing the buffer ring strategy in 8.3, VACUUM's buffers were sent to the freelist, which was effectively a buffer ring of 1 buffer, resulting in excessive WAL flushing. diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index ed1dc488a42b4..6fd3a6bbac5ea 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -128,20 +128,11 @@ BufferManagerShmemInit(void) pgaio_wref_clear(&buf->io_wref); - /* - * Initially link all the buffers together as unused. Subsequent - * management of this list is done by freelist.c. - */ - buf->freeNext = i + 1; - LWLockInitialize(BufferDescriptorGetContentLock(buf), LWTRANCHE_BUFFER_CONTENT); ConditionVariableInit(BufferDescriptorGetIOCV(buf)); } - - /* Correct last entry of linked list */ - GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; } /* Init other shared buffer-management stuff */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9fc906a4a4082..fe470de63f20c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2094,12 +2094,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ UnpinBuffer(victim_buf_hdr); - /* - * The victim buffer we acquired previously is clean and unused, let - * it be found again quickly - */ - StrategyFreeBuffer(victim_buf_hdr); - /* remaining code should match code at top of routine */ existing_buf_hdr = GetBufferDescriptor(existing_buf_id); @@ -2158,8 +2152,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } /* - * InvalidateBuffer -- mark a shared buffer invalid and return it to the - * freelist. + * InvalidateBuffer -- mark a shared buffer invalid. * * The buffer header spinlock must be held at entry. We drop it before * returning. (This is sane because the caller must have locked the @@ -2257,11 +2250,6 @@ InvalidateBuffer(BufferDesc *buf) * Done with mapping lock. */ LWLockRelease(oldPartitionLock); - - /* - * Insert the buffer at the head of the list of free buffers. 
- */ - StrategyFreeBuffer(buf); } /* @@ -2679,11 +2667,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, { BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1); - /* - * The victim buffer we acquired previously is clean and unused, - * let it be found again quickly - */ - StrategyFreeBuffer(buf_hdr); UnpinBuffer(buf_hdr); } @@ -2756,12 +2739,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, valid = PinBuffer(existing_hdr, strategy); LWLockRelease(partition_lock); - - /* - * The victim buffer we acquired previously is clean and unused, - * let it be found again quickly - */ - StrategyFreeBuffer(victim_buf_hdr); UnpinBuffer(victim_buf_hdr); buffers[i] = BufferDescriptorGetBuffer(existing_hdr); @@ -3658,8 +3635,8 @@ BgBufferSync(WritebackContext *wb_context) uint32 new_recent_alloc; /* - * Find out where the freelist clock-sweep currently is, and how many - * buffer allocations have happened since our last call. + * Find out where the clock-sweep currently is, and how many buffer + * allocations have happened since our last call. */ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index cd94a7d8a7b39..7d59a92bd1a88 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -39,14 +39,6 @@ typedef struct */ pg_atomic_uint32 nextVictimBuffer; - int firstFreeBuffer; /* Head of list of unused buffers */ - int lastFreeBuffer; /* Tail of list of unused buffers */ - - /* - * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is, - * when the list is empty) - */ - /* * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. @@ -163,23 +155,6 @@ ClockSweepTick(void) return victim; } -/* - * have_free_buffer -- a lockless check to see if there is a free buffer in - * buffer pool. - * - * If the result is true that will become stale once free buffers are moved out - * by other operations, so the caller who strictly want to use a free buffer - * should not call this. - */ -bool -have_free_buffer(void) -{ - if (StrategyControl->firstFreeBuffer >= 0) - return true; - else - return false; -} - /* * StrategyGetBuffer * @@ -249,69 +224,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r */ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); - /* - * First check, without acquiring the lock, whether there's buffers in the - * freelist. Since we otherwise don't require the spinlock in every - * StrategyGetBuffer() invocation, it'd be sad to acquire it here - - * uselessly in most cases. That obviously leaves a race where a buffer is - * put on the freelist but we don't see the store yet - but that's pretty - * harmless, it'll just get used during the next buffer acquisition. - * - * If there's buffers on the freelist, acquire the spinlock to pop one - * buffer of the freelist. Then check whether that buffer is usable and - * repeat if not. - * - * Note that the freeNext fields are considered to be protected by the - * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to - * manipulate them without holding the spinlock. 
- */ - if (StrategyControl->firstFreeBuffer >= 0) - { - while (true) - { - /* Acquire the spinlock to remove element from the freelist */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - - if (StrategyControl->firstFreeBuffer < 0) - { - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - break; - } - - buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer); - Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); - - /* Unconditionally remove buffer from freelist */ - StrategyControl->firstFreeBuffer = buf->freeNext; - buf->freeNext = FREENEXT_NOT_IN_LIST; - - /* - * Release the lock so someone else can access the freelist while - * we check out this buffer. - */ - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - - /* - * If the buffer is pinned or has a nonzero usage_count, we cannot - * use it; discard it and retry. (This can only happen if VACUUM - * put a valid buffer in the freelist and then someone else used - * it before we got to it. It's probably impossible altogether as - * of 8.3, but we'd better check anyway.) - */ - local_buf_state = LockBufHdr(buf); - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 - && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0) - { - if (strategy != NULL) - AddBufferToRing(strategy, buf); - *buf_state = local_buf_state; - return buf; - } - UnlockBufHdr(buf, local_buf_state); - } - } - - /* Nothing on the freelist, so run the "clock-sweep" algorithm */ + /* Use the "clock sweep" algorithm to find a free buffer */ trycounter = NBuffers; for (;;) { @@ -356,29 +269,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r } } -/* - * StrategyFreeBuffer: put a buffer on the freelist - */ -void -StrategyFreeBuffer(BufferDesc *buf) -{ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - - /* - * It is possible that we are told to put something in the freelist that - * is already in it; don't screw up the list if so. - */ - if (buf->freeNext == FREENEXT_NOT_IN_LIST) - { - buf->freeNext = StrategyControl->firstFreeBuffer; - if (buf->freeNext < 0) - StrategyControl->lastFreeBuffer = buf->buf_id; - StrategyControl->firstFreeBuffer = buf->buf_id; - } - - SpinLockRelease(&StrategyControl->buffer_strategy_lock); -} - /* * StrategySyncStart -- tell BgBufferSync where to start syncing * @@ -504,13 +394,6 @@ StrategyInitialize(bool init) SpinLockInit(&StrategyControl->buffer_strategy_lock); - /* - * Grab the whole linked list of free buffers for our strategy. We - * assume it was previously set up by BufferManagerShmemInit(). - */ - StrategyControl->firstFreeBuffer = 0; - StrategyControl->lastFreeBuffer = NBuffers - 1; - /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 3a210c710f633..dfd614f7ca449 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -217,8 +217,7 @@ BufMappingPartitionLockByIndex(uint32 index) * single atomic variable. This layout allow us to do some operations in a * single atomic operation, without actually acquiring and releasing spinlock; * for instance, increase or decrease refcount. buf_id field never changes - * after initialization, so does not need locking. freeNext is protected by - * the buffer_strategy_lock not buffer header lock. The LWLock can take care + * after initialization, so does not need locking. The LWLock can take care * of itself. 
The buffer header lock is *not* used to control access to the * data in the buffer! * @@ -264,7 +263,6 @@ typedef struct BufferDesc pg_atomic_uint32 state; int wait_backend_pgprocno; /* backend of pin-count waiter */ - int freeNext; /* link in freelist chain */ PgAioWaitRef io_wref; /* set iff AIO is in progress */ LWLock content_lock; /* to lock access to buffer contents */ @@ -360,13 +358,6 @@ BufferDescriptorGetContentLock(const BufferDesc *bdesc) return (LWLock *) (&bdesc->content_lock); } -/* - * The freeNext field is either the index of the next freelist entry, - * or one of these special values: - */ -#define FREENEXT_END_OF_LIST (-1) -#define FREENEXT_NOT_IN_LIST (-2) - /* * Functions for acquiring/releasing a shared buffer header's spinlock. Do * not apply these to local buffers! @@ -444,7 +435,6 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag extern IOContext IOContextForStrategy(BufferAccessStrategy strategy); extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring); -extern void StrategyFreeBuffer(BufferDesc *buf); extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring); @@ -453,7 +443,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); -extern bool have_free_buffer(void); /* buf_table.c */ extern Size BufTableShmemSize(int size); From 06473f5a344df8c9594ead90a609b86f6724cff8 Mon Sep 17 00:00:00 2001 From: Tatsuo Ishii Date: Sat, 6 Sep 2025 07:49:51 +0900 Subject: [PATCH 16/73] Allow to log raw parse tree. This commit allows to log the raw parse tree in the same way we currently log the parse tree, rewritten tree, and plan tree. To avoid unnecessary log noise for users not interested in this detail, a new GUC option, "debug_print_raw_parse", has been added. When starting the PostgreSQL process with "-d N", and N is 3 or higher, debug_print_raw_parse is enabled automatically, alongside debug_print_parse. Author: Chao Li Reviewed-by: Tender Wang Reviewed-by: Tatsuo Ishii Reviewed-by: John Naylor Discussion: https://postgr.es/m/CAEoWx2mcO0Gpo4vd8kPMAFWeJLSp0MeUUnaLdE1x0tSVd-VzUw%40mail.gmail.com --- doc/src/sgml/config.sgml | 12 +++++++++--- doc/src/sgml/rules.sgml | 1 + src/backend/tcop/postgres.c | 7 +++++++ src/backend/utils/misc/guc_parameters.dat | 6 ++++++ src/backend/utils/misc/guc_tables.c | 1 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/utils/guc.h | 1 + 7 files changed, 26 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0a4b3e55ba5ed..2a3685f474a96 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -7383,6 +7383,11 @@ local0.* /var/log/postgresql + debug_print_raw_parse (boolean) + + debug_print_raw_parse configuration parameter + + debug_print_parse (boolean) debug_print_parse configuration parameter @@ -7401,8 +7406,8 @@ local0.* /var/log/postgresql These parameters enable various debugging output to be emitted. - When set, they print the resulting parse tree, the query rewriter - output, or the execution plan for each executed query. + When set, they print the resulting raw parse tree, the parse tree, the query + rewriter output, or the execution plan for each executed query. These messages are emitted at LOG message level, so by default they will appear in the server log but will not be sent to the client. 
You can change that by adjusting @@ -7422,7 +7427,8 @@ local0.* /var/log/postgresql When set, debug_pretty_print indents the messages - produced by debug_print_parse, + produced by debug_print_raw_parse, + debug_print_parse, debug_print_rewritten, or debug_print_plan. This results in more readable but much longer output than the compact format used when diff --git a/doc/src/sgml/rules.sgml b/doc/src/sgml/rules.sgml index 8467d961fd0a0..282dcd722d495 100644 --- a/doc/src/sgml/rules.sgml +++ b/doc/src/sgml/rules.sgml @@ -60,6 +60,7 @@ SQL statement where the single parts that it is built from are stored separately. These query trees can be shown in the server log if you set the configuration parameters + debug_print_raw_parse, debug_print_parse, debug_print_rewritten, or debug_print_plan. The rule actions are also diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 0cecd4649020f..d356830f756be 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -649,6 +649,10 @@ pg_parse_query(const char *query_string) TRACE_POSTGRESQL_QUERY_PARSE_DONE(query_string); + if (Debug_print_raw_parse) + elog_node_display(LOG, "raw parse tree", raw_parsetree_list, + Debug_pretty_print); + return raw_parsetree_list; } @@ -3697,7 +3701,10 @@ set_debug_options(int debug_flag, GucContext context, GucSource source) if (debug_flag >= 2) SetConfigOption("log_statement", "all", context, source); if (debug_flag >= 3) + { + SetConfigOption("debug_print_raw_parse", "true", context, source); SetConfigOption("debug_print_parse", "true", context, source); + } if (debug_flag >= 4) SetConfigOption("debug_print_plan", "true", context, source); if (debug_flag >= 5) diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index a157cec3c4d00..0da01627cfec1 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -414,6 +414,12 @@ ifdef => 'DEBUG_NODE_TESTS_ENABLED', }, +{ name => 'debug_print_raw_parse', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', + short_desc => 'Logs each query\'s raw parse tree.', + variable => 'Debug_print_raw_parse', + boot_val => 'false', +}, + { name => 'debug_print_parse', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', short_desc => 'Logs each query\'s parse tree.', variable => 'Debug_print_parse', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 787933a9e5acd..00c8376cf4ded 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -507,6 +507,7 @@ bool AllowAlterSystem = true; bool log_duration = false; bool Debug_print_plan = false; bool Debug_print_parse = false; +bool Debug_print_raw_parse = false; bool Debug_print_rewritten = false; bool Debug_pretty_print = true; diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index a9d8293474af5..26c0869356485 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -581,6 +581,7 @@ # - What to Log - +#debug_print_raw_parse = off #debug_print_parse = off #debug_print_rewritten = off #debug_print_plan = off diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 72981053e610f..756e80a2c2fcc 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -247,6 +247,7 @@ typedef enum /* GUC vars that are actually defined in guc_tables.c, rather than elsewhere */ extern 
PGDLLIMPORT bool Debug_print_plan; extern PGDLLIMPORT bool Debug_print_parse; +extern PGDLLIMPORT bool Debug_print_raw_parse; extern PGDLLIMPORT bool Debug_print_rewritten; extern PGDLLIMPORT bool Debug_pretty_print; From 43eb2c541941479714c11de9cfb7c67b54f1810d Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 8 Sep 2025 10:07:14 +0900 Subject: [PATCH 17/73] Update parser README to include parse_jsontable.c The README was missing parse_jsontable.c which handles JSON_TABLE. Oversight in de3600452b61. Author: Karthik S Discussion: https://postgr.es/m/CAK4gQD9gdcj+vq_FZGp=Rv-W+41v8_C7cmCUmDeu=cfrOdfXEw@mail.gmail.com Backpatch-through: 17 --- src/backend/parser/README | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/parser/README b/src/backend/parser/README index e0c986a41efea..e26eb437a9f35 100644 --- a/src/backend/parser/README +++ b/src/backend/parser/README @@ -20,6 +20,7 @@ parse_cte.c handle Common Table Expressions (WITH clauses) parse_expr.c handle expressions like col, col + 3, x = 3 or x = 4 parse_enr.c handle ephemeral named rels (trigger transition tables, ...) parse_func.c handle functions, table.column and column identifiers +parse_jsontable.c handle JSON_TABLE parse_merge.c handle MERGE parse_node.c create nodes for various structures parse_oper.c handle operators in expressions From 1f7e9ba3ac4eff13041abcc4c9c517ad835fa449 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Mon, 8 Sep 2025 06:10:15 +0000 Subject: [PATCH 18/73] Post-commit review fixes for 228c370868. This commit fixes three issues: 1) When a disabled subscription is created with retain_dead_tuples set to true, the launcher is not woken up immediately, which may lead to delays in creating the conflict detection slot. Creating the conflict detection slot is essential even when the subscription is not enabled. This ensures that dead tuples are retained, which is necessary for accurately identifying the type of conflict during replication. 2) Conflict-related data was unnecessarily retained when the subscription does not have a table. 3) Conflict-relevant data could be prematurely removed before applying prepared transactions on the publisher that are in the commit critical section. This issue occurred because the backend executing COMMIT PREPARED was not accounted for during the computation of oldestXid in the commit phase on the publisher. As a result, the subscriber could advance the conflict slot's xmin without waiting for such COMMIT PREPARED transactions to complete. We fixed this issue by identifying prepared transactions that are in the commit critical section during computation of oldestXid in commit phase. 
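On the publisher side, the shape of fix 3) is a wraparound-aware minimum of
two candidate transaction IDs. A standalone paraphrase follows; the committed
code (in the walsender hunk below) uses TransactionIdPrecedes(), which also
special-cases permanent XIDs, and all names here are invented for the sketch:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t Xid;
    #define INVALID_XID 0

    /* Circular, modulo-2^32 comparison, as normal XIDs require. */
    static int
    xid_precedes(Xid a, Xid b)
    {
        return (int32_t) (a - b) < 0;
    }

    static Xid
    fold_oldest_xid(Xid active, Xid prepared_in_commit)
    {
        if (prepared_in_commit != INVALID_XID &&
            xid_precedes(prepared_in_commit, active))
            return prepared_in_commit;
        return active;
    }

    int
    main(void)
    {
        /* A prepared transaction in its commit phase holds back the horizon. */
        printf("%u\n", (unsigned) fold_oldest_xid(700, 650));          /* 650 */
        printf("%u\n", (unsigned) fold_oldest_xid(700, INVALID_XID));  /* 700 */
        return 0;
    }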
Author: Zhijie Hou Reviewed-by: shveta malik Reviewed-by: Dilip Kumar Reviewed-by: Nisha Moond Reviewed-by: Amit Kapila Discussion: https://postgr.es/m/OS9PR01MB16913DACB64E5721872AA5C02943BA@OS9PR01MB16913.jpnprd01.prod.outlook.com Discussion: https://postgr.es/m/OS9PR01MB16913F67856B0DA2A909788129400A@OS9PR01MB16913.jpnprd01.prod.outlook.com --- src/backend/access/transam/twophase.c | 55 +++++++++++++++++++++ src/backend/commands/subscriptioncmds.c | 12 ++++- src/backend/replication/logical/tablesync.c | 26 ++++++++++ src/backend/replication/logical/worker.c | 25 ++++++++-- src/backend/replication/walsender.c | 12 +++++ src/include/access/twophase.h | 2 + src/include/replication/worker_internal.h | 1 + src/test/subscription/t/035_conflicts.pl | 29 +++++++++++ 8 files changed, 157 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 7918176fc588e..3e20f4487872e 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -2809,3 +2809,58 @@ LookupGXactBySubid(Oid subid) return found; } + +/* + * TwoPhaseGetXidByLockingProc + * Return the oldest transaction ID from prepared transactions that are + * currently in the commit critical section. + * + * This function only considers transactions in the currently connected + * database. If no matching transactions are found, it returns + * InvalidTransactionId. + */ +TransactionId +TwoPhaseGetOldestXidInCommit(void) +{ + TransactionId oldestRunningXid = InvalidTransactionId; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (int i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGPROC *commitproc; + TransactionId xid; + + if (!gxact->valid) + continue; + + if (gxact->locking_backend == INVALID_PROC_NUMBER) + continue; + + /* + * Get the backend that is handling the transaction. It's safe to + * access this backend while holding TwoPhaseStateLock, as the backend + * can only be destroyed after either removing or unlocking the + * current global transaction, both of which require an exclusive + * TwoPhaseStateLock. + */ + commitproc = GetPGProcByNumber(gxact->locking_backend); + + if (MyDatabaseId != commitproc->databaseId) + continue; + + if ((commitproc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0) + continue; + + xid = XidFromFullTransactionId(gxact->fxid); + + if (!TransactionIdIsValid(oldestRunningXid) || + TransactionIdPrecedes(xid, oldestRunningXid)) + oldestRunningXid = xid; + } + + LWLockRelease(TwoPhaseStateLock); + + return oldestRunningXid; +} diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 82cf65fae737a..750d262fccade 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -854,7 +854,17 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, pgstat_create_subscription(subid); - if (opts.enabled) + /* + * Notify the launcher to start the apply worker if the subscription is + * enabled, or to create the conflict detection slot if retain_dead_tuples + * is enabled. + * + * Creating the conflict detection slot is essential even when the + * subscription is not enabled. This ensures that dead tuples are + * retained, which is necessary for accurately identifying the type of + * conflict during replication. 
+ */ + if (opts.enabled || opts.retaindeadtuples) ApplyLauncherWakeupAtCommit(); ObjectAddressSet(myself, SubscriptionRelationId, subid); diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c index d3356bc84ee0c..e6da4028d392e 100644 --- a/src/backend/replication/logical/tablesync.c +++ b/src/backend/replication/logical/tablesync.c @@ -1788,6 +1788,32 @@ AllTablesyncsReady(void) return has_subrels && (table_states_not_ready == NIL); } +/* + * Return whether the subscription currently has any relations. + * + * Note: Unlike HasSubscriptionRelations(), this function relies on cached + * information for subscription relations. Additionally, it should not be + * invoked outside of apply or tablesync workers, as MySubscription must be + * initialized first. + */ +bool +HasSubscriptionRelationsCached(void) +{ + bool started_tx; + bool has_subrels; + + /* We need up-to-date subscription tables info here */ + has_subrels = FetchTableStates(&started_tx); + + if (started_tx) + { + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + return has_subrels; +} + /* * Update the two_phase state of the specified subscription in pg_subscription. */ diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index f1ebd63e792ee..c0f6bef5c282c 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -4595,11 +4595,28 @@ wait_for_local_flush(RetainDeadTuplesData *rdt_data) * workers is complex and not worth the effort, so we simply return if not * all tables are in the READY state. * - * It is safe to add new tables with initial states to the subscription - * after this check because any changes applied to these tables should - * have a WAL position greater than the rdt_data->remote_lsn. + * Advancing the transaction ID is necessary even when no tables are + * currently subscribed, to avoid retaining dead tuples unnecessarily. + * While it might seem safe to skip all phases and directly assign + * candidate_xid to oldest_nonremovable_xid during the + * RDT_GET_CANDIDATE_XID phase in such cases, this is unsafe. If users + * concurrently add tables to the subscription, the apply worker may not + * process invalidations in time. Consequently, + * HasSubscriptionRelationsCached() might miss the new tables, leading to + * premature advancement of oldest_nonremovable_xid. + * + * Performing the check during RDT_WAIT_FOR_LOCAL_FLUSH is safe, as + * invalidations are guaranteed to be processed before applying changes + * from newly added tables while waiting for the local flush to reach + * remote_lsn. + * + * Additionally, even if we check for subscription tables during + * RDT_GET_CANDIDATE_XID, they might be dropped before reaching + * RDT_WAIT_FOR_LOCAL_FLUSH. Therefore, it's still necessary to verify + * subscription tables at this stage to prevent unnecessary tuple + * retention. 
*/ - if (!AllTablesyncsReady()) + if (HasSubscriptionRelationsCached() && !AllTablesyncsReady()) { TimestampTz now; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index e3dce9dc68d04..59822f22b8d06 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -51,6 +51,7 @@ #include "access/timeline.h" #include "access/transam.h" +#include "access/twophase.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" @@ -2719,6 +2720,7 @@ ProcessStandbyPSRequestMessage(void) { XLogRecPtr lsn = InvalidXLogRecPtr; TransactionId oldestXidInCommit; + TransactionId oldestGXidInCommit; FullTransactionId nextFullXid; FullTransactionId fullOldestXidInCommit; WalSnd *walsnd = MyWalSnd; @@ -2746,6 +2748,16 @@ ProcessStandbyPSRequestMessage(void) * ones replicated. */ oldestXidInCommit = GetOldestActiveTransactionId(true, false); + oldestGXidInCommit = TwoPhaseGetOldestXidInCommit(); + + /* + * Update the oldest xid for standby transmission if an older prepared + * transaction exists and is currently in commit phase. + */ + if (TransactionIdIsValid(oldestGXidInCommit) && + TransactionIdPrecedes(oldestGXidInCommit, oldestXidInCommit)) + oldestXidInCommit = oldestGXidInCommit; + nextFullXid = ReadNextFullTransactionId(); fullOldestXidInCommit = FullTransactionIdFromAllowableAt(nextFullXid, oldestXidInCommit); diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 509bdad9a5d55..64463e9f4afb4 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -68,4 +68,6 @@ extern void TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid_res, int szgid); extern bool LookupGXactBySubid(Oid subid); +extern TransactionId TwoPhaseGetOldestXidInCommit(void); + #endif /* TWOPHASE_H */ diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index 62ea1a0058081..de00380261279 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -272,6 +272,7 @@ extern void ReplicationOriginNameForLogicalRep(Oid suboid, Oid relid, char *originname, Size szoriginname); extern bool AllTablesyncsReady(void); +extern bool HasSubscriptionRelationsCached(void); extern void UpdateTwoPhaseState(Oid suboid, char new_state); extern void process_syncing_tables(XLogRecPtr current_lsn); diff --git a/src/test/subscription/t/035_conflicts.pl b/src/test/subscription/t/035_conflicts.pl index 51b23a39fa935..e06429c288fe6 100644 --- a/src/test/subscription/t/035_conflicts.pl +++ b/src/test/subscription/t/035_conflicts.pl @@ -386,6 +386,35 @@ .*Remote row \(2, 4\); replica identity full \(2, 2\)/, 'update target row was deleted in tab'); +############################################################################### +# Check that the xmin value of the conflict detection slot can be advanced when +# the subscription has no tables. +############################################################################### + +# Remove the table from the publication +$node_B->safe_psql('postgres', "ALTER PUBLICATION tap_pub_B DROP TABLE tab"); + +$node_A->safe_psql('postgres', + "ALTER SUBSCRIPTION $subname_AB REFRESH PUBLICATION"); + +# Remember the next transaction ID to be assigned +$next_xid = $node_A->safe_psql('postgres', "SELECT txid_current() + 1;"); + +# Confirm that the xmin value is advanced to the latest nextXid. 
If no
+# transactions are running, the apply worker selects nextXid as the candidate
+# for the non-removable xid. See GetOldestActiveTransactionId().
+ok( $node_A->poll_query_until(
+	'postgres',
+	"SELECT xmin = $next_xid from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'"
+	),
+	"the xmin value of slot 'pg_conflict_detection' is updated on Node A");
+
+# Re-add the table to the publication for further tests
+$node_B->safe_psql('postgres', "ALTER PUBLICATION tap_pub_B ADD TABLE tab");
+
+$node_A->safe_psql('postgres',
+	"ALTER SUBSCRIPTION $subname_AB REFRESH PUBLICATION WITH (copy_data = false)");
+
 ###############################################################################
 # Check that dead tuple retention stops due to the wait time surpassing
 # max_retention_duration.

From 8191e0c16a0373f851a9f5a8112e3aec105b5276 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Mon, 8 Sep 2025 15:52:23 +0900
Subject: [PATCH 19/73] Fix corruption of pgstats shared hashtable due to OOM
 failures

A new pgstats entry is created as a two-step process:
- The entry is looked up in the shared hashtable of pgstats, and is
  inserted if not found.
- When not found and inserted, its fields are then initialized. This
  part includes a DSA chunk allocation for the stats data of the new
  entry.

As currently coded, if the DSA chunk allocation fails due to an
out-of-memory failure, an ERROR is generated, leaving an inconsistent
entry in the pgstats shared hashtable, as the entry has already been
inserted by the first step. These broken entries can then be found by
other backends, crashing them.

There are only two callers of pgstat_init_entry(), when loading the
pgstats file at startup and when creating a new pgstats entry. This
commit changes pgstat_init_entry() so that it uses
dsa_allocate_extended() with DSA_ALLOC_NO_OOM, making it return NULL on
allocation failure instead of failing. This way, a backend failing an
entry creation can take appropriate cleanup actions in the shared
hashtable before throwing an error. Currently, this means removing the
entry from the shared hashtable before throwing the error for the
allocation failure.

Out-of-memory errors rarely happen in the wild, and we usually do not
bother back-patching fixes for them. However, the problem dealt with
here is a degree worse, as it breaks the shared memory state of
pgstats, impacting other processes that may look at an inconsistent
entry that a different process has failed to create.

Author: Mikhail Kot
Discussion: https://postgr.es/m/CAAi9E7jELo5_-sBENftnc2E8XhW2PKZJWfTC3i2y-GMQd2bcqQ@mail.gmail.com
Backpatch-through: 15
---
 src/backend/utils/activity/pgstat.c       | 11 +++++++++
 src/backend/utils/activity/pgstat_shmem.c | 28 ++++++++++++++++++++++-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index ffb5b8cce3441..f8e91484e36be 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -1975,6 +1975,17 @@ pgstat_read_statsfile(void)
 
 					header = pgstat_init_entry(key.kind, p);
 					dshash_release_lock(pgStatLocal.shared_hash, p);
 
+					if (header == NULL)
+					{
+						/*
+						 * It would be tempting to switch this ERROR to a
+						 * WARNING, but it would mean that all the statistics
+						 * are discarded when the environment fails on OOM.
+				 */
+				elog(ERROR, "could not allocate entry %u/%u/%" PRIu64 " of type %c",
+					 key.kind, key.dboid,
+					 key.objid, t);
+			}
 
 			if (!read_chunk(fpin,
 							pgstat_get_entry_data(key.kind, header),
diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c
index 62de347445365..9dc3212f7dd01 100644
--- a/src/backend/utils/activity/pgstat_shmem.c
+++ b/src/backend/utils/activity/pgstat_shmem.c
@@ -289,6 +289,13 @@ pgstat_detach_shmem(void)
 * ------------------------------------------------------------
 */

+/*
+ * Initialize a newly-created entry.
+ *
+ * Returns NULL on allocation failure, so that callers can take cleanup
+ * actions, since the entry being initialized has already been inserted
+ * into the shared hashtable.
+ */
 PgStatShared_Common *
 pgstat_init_entry(PgStat_Kind kind,
 				  PgStatShared_HashEntry *shhashent)
@@ -311,7 +318,12 @@
 	pg_atomic_init_u32(&shhashent->generation, 0);
 	shhashent->dropped = false;
 
-	chunk = dsa_allocate0(pgStatLocal.dsa, pgstat_get_kind_info(kind)->shared_size);
+	chunk = dsa_allocate_extended(pgStatLocal.dsa,
+								  pgstat_get_kind_info(kind)->shared_size,
+								  DSA_ALLOC_ZERO | DSA_ALLOC_NO_OOM);
+	if (chunk == InvalidDsaPointer)
+		return NULL;
+
 	shheader = dsa_get_address(pgStatLocal.dsa, chunk);
 	shheader->magic = 0xdeadbeef;
@@ -509,6 +521,20 @@ pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, uint64 objid, bool create,
 	if (!shfound)
 	{
 		shheader = pgstat_init_entry(kind, shhashent);
+		if (shheader == NULL)
+		{
+			/*
+			 * The allocation of a new entry failed, so clean up the
+			 * shared hashtable before giving up.
+			 */
+			dshash_delete_entry(pgStatLocal.shared_hash, shhashent);
+
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory"),
+					 errdetail("Failed while allocating entry %u/%u/%" PRIu64 ".",
+							   key.kind, key.dboid, key.objid)));
+		}
 		pgstat_acquire_entry_ref(entry_ref, shhashent, shheader);
 
 		if (created_entry != NULL)

From 6456c6e2c4ad1cf9752e09cce37bfcfe2190c5e0 Mon Sep 17 00:00:00 2001
From: Amit Kapila
Date: Mon, 8 Sep 2025 11:38:02 +0000
Subject: [PATCH 20/73] Add test to prevent premature removal of
 conflict-relevant data.

A test has been added to ensure that conflict-relevant data is not
prematurely removed when a concurrent prepared transaction is being
committed on the publisher.

This test introduces an injection point that simulates the presence of
a prepared transaction in the commit phase, validating that the system
correctly delays conflict slot advancement until the transaction is
fully committed.

Additionally, the test serves as a safeguard for developers, ensuring
that the acquisition of the commit timestamp does not occur before
marking DELAY_CHKPT_IN_COMMIT in RecordTransactionCommitPrepared.
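In outline, the publisher-side sequence the test constructs looks like
this (a condensed sketch of the TAP test below; the real test drives it
from a background psql session so that the blocked COMMIT PREPARED can
be observed from the outside):

    SELECT injection_points_attach('commit-after-delay-checkpoint', 'wait');
    BEGIN;
    UPDATE tab SET b = 2 WHERE a = 1;
    PREPARE TRANSACTION 'txn_with_later_commit_ts';
    COMMIT PREPARED 'txn_with_later_commit_ts';
    -- blocks at the injection point after DELAY_CHKPT_IN_COMMIT is set;
    -- until then the subscriber must keep retaining conflict-relevant data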
Reported-by: Robert Haas Author: Zhijie Hou Reviewed-by: shveta malik Reviewed-by: Amit Kapila Discussion: https://postgr.es/m/OS9PR01MB16913F67856B0DA2A909788129400A@OS9PR01MB16913.jpnprd01.prod.outlook.com --- src/backend/access/transam/twophase.c | 6 + src/test/subscription/Makefile | 4 +- src/test/subscription/meson.build | 5 +- src/test/subscription/t/035_conflicts.pl | 160 +++++++++++++++++++++++ 4 files changed, 173 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 3e20f4487872e..d8e2fce2c99b7 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -103,6 +103,7 @@ #include "storage/proc.h" #include "storage/procarray.h" #include "utils/builtins.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/timestamp.h" @@ -2332,12 +2333,17 @@ RecordTransactionCommitPrepared(TransactionId xid, replorigin = (replorigin_session_origin != InvalidRepOriginId && replorigin_session_origin != DoNotReplicateId); + /* Load the injection point before entering the critical section */ + INJECTION_POINT_LOAD("commit-after-delay-checkpoint"); + START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + INJECTION_POINT_CACHED("commit-after-delay-checkpoint", NULL); + /* * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible before * commit time is written. diff --git a/src/test/subscription/Makefile b/src/test/subscription/Makefile index 50b65d8f6ea21..9d97e7d5c0d6d 100644 --- a/src/test/subscription/Makefile +++ b/src/test/subscription/Makefile @@ -13,9 +13,11 @@ subdir = src/test/subscription top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -EXTRA_INSTALL = contrib/hstore +EXTRA_INSTALL = contrib/hstore \ + src/test/modules/injection_points export with_icu +export enable_injection_points check: $(prove_check) diff --git a/src/test/subscription/meson.build b/src/test/subscription/meson.build index 586ffba434e11..20b4e523d9307 100644 --- a/src/test/subscription/meson.build +++ b/src/test/subscription/meson.build @@ -5,7 +5,10 @@ tests += { 'sd': meson.current_source_dir(), 'bd': meson.current_build_dir(), 'tap': { - 'env': {'with_icu': icu.found() ? 'yes' : 'no'}, + 'env': { + 'with_icu': icu.found() ? 'yes' : 'no', + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, 'tests': [ 't/001_rep_changes.pl', 't/002_types.pl', diff --git a/src/test/subscription/t/035_conflicts.pl b/src/test/subscription/t/035_conflicts.pl index e06429c288fe6..db0d5b464e825 100644 --- a/src/test/subscription/t/035_conflicts.pl +++ b/src/test/subscription/t/035_conflicts.pl @@ -415,6 +415,166 @@ $node_A->safe_psql('postgres', "ALTER SUBSCRIPTION $subname_AB REFRESH PUBLICATION WITH (copy_data = false)"); +############################################################################### +# Test that publisher's transactions marked with DELAY_CHKPT_IN_COMMIT prevent +# concurrently deleted tuples on the subscriber from being removed. This test +# also acts as a safeguard to prevent developers from moving the commit +# timestamp acquisition before marking DELAY_CHKPT_IN_COMMIT in +# RecordTransactionCommitPrepared. 
+###############################################################################
+
+my $injection_points_supported = $node_B->check_extension('injection_points');
+
+# This test depends on an injection point to block the prepared transaction
+# commit after marking the DELAY_CHKPT_IN_COMMIT flag.
+if ($injection_points_supported != 0)
+{
+	$node_B->append_conf('postgresql.conf',
+		"shared_preload_libraries = 'injection_points'
+		max_prepared_transactions = 1");
+	$node_B->restart;
+
+	# Disable the subscription on Node B for testing only one-way
+	# replication.
+	$node_B->psql('postgres', "ALTER SUBSCRIPTION $subname_BA DISABLE;");
+
+	# Wait for the apply worker to stop
+	$node_B->poll_query_until('postgres',
+		"SELECT count(*) = 0 FROM pg_stat_activity WHERE backend_type = 'logical replication apply worker'"
+	);
+
+	# Truncate the table to clean up existing dead rows in the table. Then
+	# insert a new row.
+	$node_B->safe_psql(
+		'postgres', qq(
+		TRUNCATE tab;
+		INSERT INTO tab VALUES(1, 1);
+	));
+
+	$node_B->wait_for_catchup($subname_AB);
+
+	# Create the injection_points extension on the publisher node and attach to the
+	# commit-after-delay-checkpoint injection point.
+	$node_B->safe_psql(
+		'postgres',
+		"CREATE EXTENSION injection_points;
+		SELECT injection_points_attach('commit-after-delay-checkpoint', 'wait');"
+	);
+
+	# Start a background session on the publisher node to perform an update and
+	# pause at the injection point.
+	my $pub_session = $node_B->background_psql('postgres');
+	$pub_session->query_until(
+		qr/starting_bg_psql/,
+		q{
+		\echo starting_bg_psql
+		BEGIN;
+		UPDATE tab SET b = 2 WHERE a = 1;
+		PREPARE TRANSACTION 'txn_with_later_commit_ts';
+		COMMIT PREPARED 'txn_with_later_commit_ts';
+		}
+	);
+
+	# Confirm the update is suspended
+	$result =
+	  $node_B->safe_psql('postgres', 'SELECT * FROM tab WHERE a = 1');
+	is($result, qq(1|1), 'publisher sees the old row');
+
+	# Delete the row on the subscriber. The deleted row should be retained due to a
+	# transaction on the publisher, which is currently marked with the
+	# DELAY_CHKPT_IN_COMMIT flag.
+	$node_A->safe_psql('postgres', "DELETE FROM tab WHERE a = 1;");
+
+	# Get the commit timestamp for the delete
+	my $sub_ts = $node_A->safe_psql('postgres',
+		"SELECT timestamp FROM pg_last_committed_xact();");
+
+	$log_location = -s $node_A->logfile;
+
+	# Confirm that the apply worker keeps requesting publisher status while
+	# waiting for the prepared transaction to commit. Thus, the request log
+	# should appear more than once.
+	$node_A->wait_for_log(
+		qr/sending publisher status request message/,
+		$log_location);
+
+	$log_location = -s $node_A->logfile;
+
+	$node_A->wait_for_log(
+		qr/sending publisher status request message/,
+		$log_location);
+
+	# Confirm that the dead tuple cannot be removed
+	($cmdret, $stdout, $stderr) =
+	  $node_A->psql('postgres', qq(VACUUM (verbose) public.tab;));
+
+	ok($stderr =~ qr/1 are dead but not yet removable/,
+		'the deleted row is non-removable');
+
+	$log_location = -s $node_A->logfile;
+
+	# Wake up and detach the injection point on the publisher node. The prepared
+	# transaction should now commit.
+	$node_B->safe_psql(
+		'postgres',
+		"SELECT injection_points_wakeup('commit-after-delay-checkpoint');
+		SELECT injection_points_detach('commit-after-delay-checkpoint');"
+	);
+
+	# Close the background session on the publisher node
+	ok($pub_session->quit, "close publisher session");
+
+	# Confirm that the transaction committed
+	$result =
+	  $node_B->safe_psql('postgres', 'SELECT * FROM tab WHERE a = 1');
+	is($result, qq(1|2), 'publisher sees the new row');
+
+	# Ensure the UPDATE is replayed on the subscriber
+	$node_B->wait_for_catchup($subname_AB);
+
+	$logfile = slurp_file($node_A->logfile(), $log_location);
+	ok( $logfile =~
+		  qr/conflict detected on relation "public.tab": conflict=update_deleted.*
+.*DETAIL:.* The row to be updated was deleted locally in transaction [0-9]+ at .*
+.*Remote row \(1, 2\); replica identity full \(1, 1\)/,
+		'update target row was deleted in tab');
+
+	# Remember the next transaction ID to be assigned
+	$next_xid =
+	  $node_A->safe_psql('postgres', "SELECT txid_current() + 1;");
+
+	# Confirm that the xmin value is advanced to the latest nextXid after the
+	# prepared transaction on the publisher has been committed.
+	ok( $node_A->poll_query_until(
+			'postgres',
+			"SELECT xmin = $next_xid from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'"
+		),
+		"the xmin value of slot 'pg_conflict_detection' is updated on subscriber"
+	);
+
+	# Confirm that the dead tuple can be removed now
+	($cmdret, $stdout, $stderr) =
+	  $node_A->psql('postgres', qq(VACUUM (verbose) public.tab;));
+
+	ok($stderr =~ qr/1 removed, 0 remain, 0 are dead but not yet removable/,
+		'the deleted row is removed');
+
+	# Get the commit timestamp for the publisher's update
+	my $pub_ts = $node_B->safe_psql('postgres',
+		"SELECT pg_xact_commit_timestamp(xmin) from tab where a=1;");
+
+	# Check that the commit timestamp for the update on the publisher is later than
+	# or equal to the timestamp of the local deletion, as the commit timestamp
+	# should be assigned after marking the DELAY_CHKPT_IN_COMMIT flag.
+	$result = $node_B->safe_psql('postgres',
+		"SELECT '$pub_ts'::timestamp >= '$sub_ts'::timestamp");
+	is($result, qq(t),
+		"pub UPDATE's timestamp is later than that of sub's DELETE");
+
+	# Re-enable the subscription for further tests
+	$node_B->psql('postgres', "ALTER SUBSCRIPTION $subname_BA ENABLE;");
+}
+
 ###############################################################################
 # Check that dead tuple retention stops due to the wait time surpassing
 # max_retention_duration.

From 3399c265543ec3cdbeff2fa2900e03b326705f63 Mon Sep 17 00:00:00 2001
From: Melanie Plageman
Date: Mon, 8 Sep 2025 10:22:42 -0400
Subject: [PATCH 21/73] Remove unneeded VM pin from VM replay

Previously, heap_xlog_visible() called visibilitymap_pin() even after
getting a buffer from XLogReadBufferForRedoExtended() -- which returns
a pinned buffer containing the specified block of the visibility map.
This would just have resulted in visibilitymap_pin() returning early
since the specified page was already present and pinned, but it was
confusing, extraneous code, so remove it. It appears to be an
oversight in 2c03216, though it doesn't seem worth backporting.

While we are at it, remove two VM-related redundant asserts in the
COPY FREEZE code path. visibilitymap_set() already asserts that
PD_ALL_VISIBLE is set on the heap page and checks that the vmbuffer
contains the bits corresponding to the specified heap block, so
callers do not also need to check this.
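For context, the COPY FREEZE code path referred to here is the one
taken by sessions like the following (an illustrative sketch; FREEZE
requires that the table was created or truncated in the same
transaction, so that the tuples can be inserted already frozen):

    BEGIN;
    CREATE TABLE t (a int);           -- created in this transaction
    COPY t FROM STDIN WITH (FREEZE);  -- exercises HEAP_INSERT_FROZEN
    COMMIT;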
Author: Melanie Plageman Reported-by: Melanie Plageman Reported-by: Kirill Reshke Reviewed-by: Kirill Reshke Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CALdSSPhu7WZd%2BEfQDha1nz%3DDC93OtY1%3DUFEdWwSZsASka_2eRQ%40mail.gmail.com --- src/backend/access/heap/heapam.c | 3 --- src/backend/access/heap/heapam_xlog.c | 1 - 2 files changed, 4 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index e3e7307ef5f79..4c5ae205a7a60 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2647,9 +2647,6 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, */ if (all_frozen_set) { - Assert(PageIsAllVisible(page)); - Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer)); - /* * It's fine to use InvalidTransactionId here - this is only used * when HEAP_INSERT_FROZEN is specified, which intentionally diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c index 5d48f071f53a7..cf843277938de 100644 --- a/src/backend/access/heap/heapam_xlog.c +++ b/src/backend/access/heap/heapam_xlog.c @@ -295,7 +295,6 @@ heap_xlog_visible(XLogReaderState *record) LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); reln = CreateFakeRelcacheEntry(rlocator); - visibilitymap_pin(reln, blkno, &vmbuffer); visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, xlrec->snapshotConflictHorizon, vmbits); From 585e31fcb6dfcb1d88cfee2371f565574db24869 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 8 Sep 2025 11:50:33 -0400 Subject: [PATCH 22/73] Don't generate fake "*SELECT*" or "*SELECT* %d" subquery aliases. rte->alias should point only to a user-written alias, but in these cases that principle was violated. Fixing this causes some regression test output changes: wherever rte->alias previously had a value and is now NULL, rte->eref is now set to a generated name rather than to rte->alias; and the scheme used to generate eref names differs from what we were doing for aliases. The upshot is that instead of "*SELECT*" or "*SELECT* %d", EXPLAIN will now emit "unnamed_subquery" or "unnamed_subquery_%d". But that's a reasonable descriptor, and we were already producing that in yet other cases, so this seems not too objectionable. 
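As a condensed illustration (a hypothetical session, consistent with
the regression output changes below), a set operation whose branches
were previously labeled "*SELECT* 1" and "*SELECT* 2" now produces
plans along these lines:

    EXPLAIN (COSTS OFF)
    SELECT q1 FROM int8_tbl UNION SELECT q2 FROM int8_tbl;
    -- ... -> Subquery Scan on unnamed_subquery
    -- ... -> Subquery Scan on unnamed_subquery_1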
Author: Tom Lane Co-authored-by: Robert Haas Discussion: https://postgr.es/m/CA+TgmoYSYmDA2GvanzPMci084n+mVucv0bJ0HPbs6uhmMN6HMg@mail.gmail.com --- contrib/postgres_fdw/expected/postgres_fdw.out | 8 ++++---- src/backend/executor/functions.c | 2 +- src/backend/parser/analyze.c | 7 ++----- src/test/regress/expected/partition_prune.out | 4 ++-- src/test/regress/expected/rangefuncs.out | 8 ++++---- src/test/regress/expected/union.out | 14 +++++++------- 6 files changed, 20 insertions(+), 23 deletions(-) diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 78b8367d28935..18d727d77907a 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -5086,13 +5086,13 @@ SELECT ft1.c1 FROM ft1 JOIN ft2 on ft1.c1 = ft2.c1 WHERE -- =================================================================== EXPLAIN (verbose, costs off) INSERT INTO ft2 (c1,c2,c3) SELECT c1+1000,c2+100, c3 || c3 FROM ft2 LIMIT 20; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ Insert on public.ft2 Remote SQL: INSERT INTO "S 1"."T 1"("C 1", c2, c3, c4, c5, c6, c7, c8) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) Batch Size: 1 - -> Subquery Scan on "*SELECT*" - Output: "*SELECT*"."?column?", "*SELECT*"."?column?_1", NULL::integer, "*SELECT*"."?column?_2", NULL::timestamp with time zone, NULL::timestamp without time zone, NULL::character varying(10), 'ft2 '::character(10), NULL::user_enum + -> Subquery Scan on unnamed_subquery + Output: unnamed_subquery."?column?", unnamed_subquery."?column?_1", NULL::integer, unnamed_subquery."?column?_2", NULL::timestamp with time zone, NULL::timestamp without time zone, NULL::character varying(10), 'ft2 '::character(10), NULL::user_enum -> Foreign Scan on public.ft2 ft2_1 Output: (ft2_1.c1 + 1000), (ft2_1.c2 + 100), (ft2_1.c3 || ft2_1.c3) Remote SQL: SELECT "C 1", c2, c3 FROM "S 1"."T 1" LIMIT 20::bigint diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c index 97455b1ed4a5b..630d708d2a3f0 100644 --- a/src/backend/executor/functions.c +++ b/src/backend/executor/functions.c @@ -2483,7 +2483,7 @@ check_sql_stmt_retval(List *queryTreeList, rte = makeNode(RangeTblEntry); rte->rtekind = RTE_SUBQUERY; rte->subquery = parse; - rte->eref = rte->alias = makeAlias("*SELECT*", colnames); + rte->eref = makeAlias("unnamed_subquery", colnames); rte->lateral = false; rte->inh = false; rte->inFromCl = true; diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 34f7c17f576ef..b9763ea17144c 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -777,7 +777,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) */ nsitem = addRangeTableEntryForSubquery(pstate, selectQuery, - makeAlias("*SELECT*", NIL), + NULL, false, false); addNSItemToQuery(pstate, nsitem, true, false, false); @@ -2100,7 +2100,6 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt, { /* Process leaf SELECT */ Query *selectQuery; - char selectName[32]; 
ParseNamespaceItem *nsitem; RangeTblRef *rtr; ListCell *tl; @@ -2156,11 +2155,9 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt, /* * Make the leaf query be a subquery in the top-level rangetable. */ - snprintf(selectName, sizeof(selectName), "*SELECT* %d", - list_length(pstate->p_rtable) + 1); nsitem = addRangeTableEntryForSubquery(pstate, selectQuery, - makeAlias(selectName, NIL), + NULL, false, false); diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index d1966cd7d829f..68ecd95180920 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -4763,7 +4763,7 @@ select min(a) over (partition by a order by a) from part_abc where a >= stable_o QUERY PLAN ---------------------------------------------------------------------------------------------- Append - -> Subquery Scan on "*SELECT* 1_1" + -> Subquery Scan on unnamed_subquery_2 -> WindowAgg Window: w1 AS (PARTITION BY part_abc.a ORDER BY part_abc.a) -> Append @@ -4780,7 +4780,7 @@ select min(a) over (partition by a order by a) from part_abc where a >= stable_o -> Index Scan using part_abc_3_2_a_idx on part_abc_3_2 part_abc_4 Index Cond: (a >= (stable_one() + 1)) Filter: (d <= stable_one()) - -> Subquery Scan on "*SELECT* 2" + -> Subquery Scan on unnamed_subquery_1 -> WindowAgg Window: w1 AS (PARTITION BY part_abc_5.a ORDER BY part_abc_5.a) -> Append diff --git a/src/test/regress/expected/rangefuncs.out b/src/test/regress/expected/rangefuncs.out index c21be83aa4aaf..30241e22da270 100644 --- a/src/test/regress/expected/rangefuncs.out +++ b/src/test/regress/expected/rangefuncs.out @@ -2130,10 +2130,10 @@ select testrngfunc(); explain (verbose, costs off) select * from testrngfunc(); - QUERY PLAN ----------------------------------------------------------- - Subquery Scan on "*SELECT*" - Output: "*SELECT*"."?column?", "*SELECT*"."?column?_1" + QUERY PLAN +---------------------------------------------------------------------- + Subquery Scan on unnamed_subquery + Output: unnamed_subquery."?column?", unnamed_subquery."?column?_1" -> Unique Output: (1), (2) -> Sort diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out index 96962817ed45a..d3ea433db1577 100644 --- a/src/test/regress/expected/union.out +++ b/src/test/regress/expected/union.out @@ -942,7 +942,7 @@ SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1; ERROR: column "q2" does not exist LINE 1: ... int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1... ^ -DETAIL: There is a column named "q2" in table "*SELECT* 2", but it cannot be referenced from this part of the query. +DETAIL: There is a column named "q2" in table "unnamed_subquery", but it cannot be referenced from this part of the query. 
-- But this should work: SELECT q1 FROM int8_tbl EXCEPT (((SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1))) ORDER BY 1; q1 @@ -1338,14 +1338,14 @@ where q2 = q2; ---------------------------------------------------------- Unique -> Merge Append - Sort Key: "*SELECT* 1".q1 - -> Subquery Scan on "*SELECT* 1" + Sort Key: unnamed_subquery.q1 + -> Subquery Scan on unnamed_subquery -> Unique -> Sort Sort Key: i81.q1, i81.q2 -> Seq Scan on int8_tbl i81 Filter: (q2 IS NOT NULL) - -> Subquery Scan on "*SELECT* 2" + -> Subquery Scan on unnamed_subquery_1 -> Unique -> Sort Sort Key: i82.q1, i82.q2 @@ -1374,14 +1374,14 @@ where -q1 = q2; -------------------------------------------------------- Unique -> Merge Append - Sort Key: "*SELECT* 1".q1 - -> Subquery Scan on "*SELECT* 1" + Sort Key: unnamed_subquery.q1 + -> Subquery Scan on unnamed_subquery -> Unique -> Sort Sort Key: i81.q1, i81.q2 -> Seq Scan on int8_tbl i81 Filter: ((- q1) = q2) - -> Subquery Scan on "*SELECT* 2" + -> Subquery Scan on unnamed_subquery_1 -> Unique -> Sort Sort Key: i82.q1, i82.q2 From 6f79024df3461f794aace8bbc8706d8e5f7da091 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 8 Sep 2025 12:24:02 -0400 Subject: [PATCH 23/73] Don't generate fake "ANY_subquery" aliases, either. This is just like the previous commit, but for a different invented alias name. Author: Robert Haas Reviewed-by: Tom Lane Discussion: https://postgr.es/m/CA+TgmoYSYmDA2GvanzPMci084n+mVucv0bJ0HPbs6uhmMN6HMg@mail.gmail.com --- src/backend/optimizer/plan/subselect.c | 2 +- src/test/regress/expected/memoize.out | 8 ++++---- src/test/regress/expected/subselect.out | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index d71ed958e31b3..fae18548e074e 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1397,7 +1397,7 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, */ nsitem = addRangeTableEntryForSubquery(pstate, subselect, - makeAlias("ANY_subquery", NIL), + NULL, use_lateral, false); rte = nsitem->p_rte; diff --git a/src/test/regress/expected/memoize.out b/src/test/regress/expected/memoize.out index 150dc1b44cf62..fbcaf113266c5 100644 --- a/src/test/regress/expected/memoize.out +++ b/src/test/regress/expected/memoize.out @@ -545,15 +545,15 @@ EXPLAIN (COSTS OFF) SELECT * FROM tab_anti t1 WHERE t1.a IN (SELECT a FROM tab_anti t2 WHERE t2.b IN (SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0)); - QUERY PLAN -------------------------------------------------- + QUERY PLAN +--------------------------------------------------- Nested Loop Semi Join -> Seq Scan on tab_anti t1 -> Nested Loop Semi Join Join Filter: (t1.a = t2.a) -> Seq Scan on tab_anti t2 - -> Subquery Scan on "ANY_subquery" - Filter: (t2.b = "ANY_subquery".b) + -> Subquery Scan on unnamed_subquery + Filter: (t2.b = unnamed_subquery.b) -> Result One-Time Filter: (t2.a > 1) -> Seq Scan on tab_anti t3 diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c16dff05bc12e..7a1c216a0b1b7 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1692,14 +1692,14 @@ select * from int4_tbl o where (f1, f1) in ------------------------------------------------------------------- Nested Loop Semi Join Output: o.f1 - Join Filter: (o.f1 = "ANY_subquery".f1) + Join Filter: (o.f1 = unnamed_subquery.f1) -> Seq Scan on public.int4_tbl o Output: o.f1 
-> Materialize - Output: "ANY_subquery".f1, "ANY_subquery".g - -> Subquery Scan on "ANY_subquery" - Output: "ANY_subquery".f1, "ANY_subquery".g - Filter: ("ANY_subquery".f1 = "ANY_subquery".g) + Output: unnamed_subquery.f1, unnamed_subquery.g + -> Subquery Scan on unnamed_subquery + Output: unnamed_subquery.f1, unnamed_subquery.g + Filter: (unnamed_subquery.f1 = unnamed_subquery.g) -> Result Output: i.f1, ((generate_series(1, 50)) / 10) -> ProjectSet @@ -2867,8 +2867,8 @@ ON B.hundred in (SELECT min(c.hundred) FROM tenk2 C WHERE c.odd = b.odd); -> Memoize Cache Key: b.hundred, b.odd Cache Mode: binary - -> Subquery Scan on "ANY_subquery" - Filter: (b.hundred = "ANY_subquery".min) + -> Subquery Scan on unnamed_subquery + Filter: (b.hundred = unnamed_subquery.min) -> Result InitPlan 1 -> Limit From 5a170e992a4d402ef0e1b8ce7284cd78879ece73 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 8 Sep 2025 12:58:07 -0400 Subject: [PATCH 24/73] Don't generate fake "*TLOCRN*" or "*TROCRN*" aliases, either. This is just like the previous two commits, except that this fix actually doesn't change any regression test outputs. Author: Robert Haas Reviewed-by: Tom Lane Discussion: https://postgr.es/m/CA+TgmoYSYmDA2GvanzPMci084n+mVucv0bJ0HPbs6uhmMN6HMg@mail.gmail.com --- src/backend/rewrite/rewriteSearchCycle.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/rewrite/rewriteSearchCycle.c b/src/backend/rewrite/rewriteSearchCycle.c index 9f95d4dc1b0e8..5202ef43d1068 100644 --- a/src/backend/rewrite/rewriteSearchCycle.c +++ b/src/backend/rewrite/rewriteSearchCycle.c @@ -282,8 +282,8 @@ rewriteSearchAndCycle(CommonTableExpr *cte) newrte = makeNode(RangeTblEntry); newrte->rtekind = RTE_SUBQUERY; - newrte->alias = makeAlias("*TLOCRN*", cte->ctecolnames); - newrte->eref = newrte->alias; + newrte->alias = NULL; + newrte->eref = makeAlias("*TLOCRN*", cte->ctecolnames); newsubquery = copyObject(rte1->subquery); IncrementVarSublevelsUp((Node *) newsubquery, 1, 1); newrte->subquery = newsubquery; @@ -379,8 +379,8 @@ rewriteSearchAndCycle(CommonTableExpr *cte) ewcl = lappend(ewcl, makeString(cte->cycle_clause->cycle_mark_column)); ewcl = lappend(ewcl, makeString(cte->cycle_clause->cycle_path_column)); } - newrte->alias = makeAlias("*TROCRN*", ewcl); - newrte->eref = newrte->alias; + newrte->alias = NULL; + newrte->eref = makeAlias("*TROCRN*", ewcl); /* * Find the reference to the recursive CTE in the right UNION subquery's From 4b5f206de2bb9152a99a5c218caf2580cc5a0e9e Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Mon, 8 Sep 2025 14:25:10 -0400 Subject: [PATCH 25/73] Remove unused xl_heap_prune member, reason f83d709760d8 refactored xl_heap_prune and added an unused member, reason. While PruneReason is used when constructing this WAL record to set the WAL record definition, it doesn't need to be stored in a separate field in the record. Remove it. We won't backport this, since modifying an exposed struct definition to remove an unused field would do more harm than good. 
Author: Melanie Plageman
Reported-by: Andres Freund
Reviewed-by: Robert Haas
Discussion: https://postgr.es/m/tvvtfoxz5ykpsctxjbzxg3nldnzfc7geplrt2z2s54pmgto27y%40hbijsndifu45
---
 src/include/access/heapam_xlog.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index 277df6b3cf0b3..d4c0625b63228 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -284,7 +284,6 @@ typedef struct xl_heap_update
 */
 typedef struct xl_heap_prune
 {
-	uint8		reason;
 	uint8		flags;

 	/*

From 3bcfcd815e1a2d51772ba27e0d034467f0344f15 Mon Sep 17 00:00:00 2001
From: Nathan Bossart
Date: Mon, 8 Sep 2025 14:19:48 -0500
Subject: [PATCH 26/73] pg_upgrade: Transfer pg_largeobject_metadata's files
 when possible.

Commit 161a3e8b68 taught pg_upgrade to use COPY for large object
metadata for upgrades from v12 and newer, which is much faster to
restore than using the regular large object commands. For upgrades
from v16 and newer, we can take this a step further and transfer the
large object metadata files as if they were user tables. We can't
transfer the files from older versions because the aclitem data type
(needed by pg_largeobject_metadata.lomacl) changed its storage format
in v16 (see commit 7b378237aa). Note that this commit is essentially
a revert of commit 12a53c732c.

There are a couple of caveats. First, we still need to COPY the
corresponding pg_shdepend rows for large objects. Second, we need to
COPY anything in pg_largeobject_metadata with a comment or security
label, or else restoring those will fail. This means that an upgrade
in which every large object has a comment or security label won't gain
anything from this commit, but it should at least avoid making those
unusual use-cases any worse.

pg_upgrade must also take care to transfer the relfilenodes of
pg_largeobject_metadata and its index, as was done for pg_largeobject
in commits d498e052b4 and bbe08b8869.

Reviewed-by: Michael Paquier
Discussion: https://postgr.es/m/aJ3_Gih_XW1_O2HF%40nathan
---
 src/backend/commands/tablecmds.c           | 12 ++--
 src/bin/pg_dump/pg_dump.c                  | 80 ++++++++++++++++++----
 src/bin/pg_upgrade/Makefile                |  3 +-
 src/bin/pg_upgrade/info.c                  | 11 ++-
 src/bin/pg_upgrade/pg_upgrade.c            |  6 +-
 src/bin/pg_upgrade/t/006_transfer_modes.pl | 67 ++++++++++++++++++
 6 files changed, 154 insertions(+), 25 deletions(-)

diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 082a3575d621e..3be2e051d32fb 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -42,6 +42,7 @@
 #include "catalog/pg_foreign_table.h"
 #include "catalog/pg_inherits.h"
 #include "catalog/pg_largeobject.h"
+#include "catalog/pg_largeobject_metadata.h"
 #include "catalog/pg_namespace.h"
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_policy.h"
@@ -2389,12 +2390,15 @@ truncate_check_rel(Oid relid, Form_pg_class reltuple)
 	/*
 	 * Most system catalogs can't be truncated at all, or at least not unless
 	 * allow_system_table_mods=on. As an exception, however, we allow
-	 * pg_largeobject to be truncated as part of pg_upgrade, because we need
-	 * to change its relfilenode to match the old cluster, and allowing a
-	 * TRUNCATE command to be executed is the easiest way of doing that.
+	 * pg_largeobject and pg_largeobject_metadata to be truncated as part of
+	 * pg_upgrade, because we need to change their relfilenodes to match the
+	 * old cluster, and allowing a TRUNCATE command to be executed is the
+	 * easiest way of doing that.
*/ if (!allowSystemTableMods && IsSystemClass(relid, reltuple) - && (!IsBinaryUpgrade || relid != LargeObjectRelationId)) + && (!IsBinaryUpgrade || + (relid != LargeObjectRelationId && + relid != LargeObjectMetadataRelationId))) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied: \"%s\" is a system catalog", diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index bea793456f969..b4c45ad803e94 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -1131,6 +1131,23 @@ main(int argc, char **argv) shdepend->dataObj->filtercond = "WHERE classid = 'pg_largeobject'::regclass " "AND dbid = (SELECT oid FROM pg_database " " WHERE datname = current_database())"; + + /* + * If upgrading from v16 or newer, only dump large objects with + * comments/seclabels. For these upgrades, pg_upgrade can copy/link + * pg_largeobject_metadata's files (which is usually faster) but we + * still need to dump LOs with comments/seclabels here so that the + * subsequent COMMENT and SECURITY LABEL commands work. pg_upgrade + * can't copy/link the files from older versions because aclitem + * (needed by pg_largeobject_metadata.lomacl) changed its storage + * format in v16. + */ + if (fout->remoteVersion >= 160000) + lo_metadata->dataObj->filtercond = "WHERE oid IN " + "(SELECT objoid FROM pg_description " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) " " + "UNION SELECT objoid FROM pg_seclabel " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) ")"; } /* @@ -3629,26 +3646,32 @@ dumpDatabase(Archive *fout) /* * pg_largeobject comes from the old system intact, so set its * relfrozenxids, relminmxids and relfilenode. + * + * pg_largeobject_metadata also comes from the old system intact for + * upgrades from v16 and newer, so set its relfrozenxids, relminmxids, and + * relfilenode, too. pg_upgrade can't copy/link the files from older + * versions because aclitem (needed by pg_largeobject_metadata.lomacl) + * changed its storage format in v16. 
*/ if (dopt->binary_upgrade) { PGresult *lo_res; PQExpBuffer loFrozenQry = createPQExpBuffer(); PQExpBuffer loOutQry = createPQExpBuffer(); + PQExpBuffer lomOutQry = createPQExpBuffer(); PQExpBuffer loHorizonQry = createPQExpBuffer(); + PQExpBuffer lomHorizonQry = createPQExpBuffer(); int ii_relfrozenxid, ii_relfilenode, ii_oid, ii_relminmxid; - /* - * pg_largeobject - */ if (fout->remoteVersion >= 90300) appendPQExpBuffer(loFrozenQry, "SELECT relfrozenxid, relminmxid, relfilenode, oid\n" "FROM pg_catalog.pg_class\n" - "WHERE oid IN (%u, %u);\n", - LargeObjectRelationId, LargeObjectLOidPNIndexId); + "WHERE oid IN (%u, %u, %u, %u);\n", + LargeObjectRelationId, LargeObjectLOidPNIndexId, + LargeObjectMetadataRelationId, LargeObjectMetadataOidIndexId); else appendPQExpBuffer(loFrozenQry, "SELECT relfrozenxid, 0 AS relminmxid, relfilenode, oid\n" "FROM pg_catalog.pg_class\n" @@ -3663,35 +3686,57 @@ dumpDatabase(Archive *fout) ii_oid = PQfnumber(lo_res, "oid"); appendPQExpBufferStr(loHorizonQry, "\n-- For binary upgrade, set pg_largeobject relfrozenxid and relminmxid\n"); + appendPQExpBufferStr(lomHorizonQry, "\n-- For binary upgrade, set pg_largeobject_metadata relfrozenxid and relminmxid\n"); appendPQExpBufferStr(loOutQry, "\n-- For binary upgrade, preserve pg_largeobject and index relfilenodes\n"); + appendPQExpBufferStr(lomOutQry, "\n-- For binary upgrade, preserve pg_largeobject_metadata and index relfilenodes\n"); for (int i = 0; i < PQntuples(lo_res); ++i) { Oid oid; RelFileNumber relfilenumber; + PQExpBuffer horizonQry; + PQExpBuffer outQry; + + oid = atooid(PQgetvalue(lo_res, i, ii_oid)); + relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode)); - appendPQExpBuffer(loHorizonQry, "UPDATE pg_catalog.pg_class\n" + if (oid == LargeObjectRelationId || + oid == LargeObjectLOidPNIndexId) + { + horizonQry = loHorizonQry; + outQry = loOutQry; + } + else + { + horizonQry = lomHorizonQry; + outQry = lomOutQry; + } + + appendPQExpBuffer(horizonQry, "UPDATE pg_catalog.pg_class\n" "SET relfrozenxid = '%u', relminmxid = '%u'\n" "WHERE oid = %u;\n", atooid(PQgetvalue(lo_res, i, ii_relfrozenxid)), atooid(PQgetvalue(lo_res, i, ii_relminmxid)), atooid(PQgetvalue(lo_res, i, ii_oid))); - oid = atooid(PQgetvalue(lo_res, i, ii_oid)); - relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode)); - - if (oid == LargeObjectRelationId) - appendPQExpBuffer(loOutQry, + if (oid == LargeObjectRelationId || + oid == LargeObjectMetadataRelationId) + appendPQExpBuffer(outQry, "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n", relfilenumber); - else if (oid == LargeObjectLOidPNIndexId) - appendPQExpBuffer(loOutQry, + else if (oid == LargeObjectLOidPNIndexId || + oid == LargeObjectMetadataOidIndexId) + appendPQExpBuffer(outQry, "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", relfilenumber); } appendPQExpBufferStr(loOutQry, "TRUNCATE pg_catalog.pg_largeobject;\n"); + appendPQExpBufferStr(lomOutQry, + "TRUNCATE pg_catalog.pg_largeobject_metadata;\n"); + appendPQExpBufferStr(loOutQry, loHorizonQry->data); + appendPQExpBufferStr(lomOutQry, lomHorizonQry->data); ArchiveEntry(fout, nilCatalogId, createDumpId(), ARCHIVE_OPTS(.tag = "pg_largeobject", @@ -3699,11 +3744,20 @@ dumpDatabase(Archive *fout) .section = SECTION_PRE_DATA, .createStmt = loOutQry->data)); + if (fout->remoteVersion >= 160000) + ArchiveEntry(fout, nilCatalogId, createDumpId(), + ARCHIVE_OPTS(.tag = "pg_largeobject_metadata", + .description = "pg_largeobject_metadata", + 
.section = SECTION_PRE_DATA, + .createStmt = lomOutQry->data)); + PQclear(lo_res); destroyPQExpBuffer(loFrozenQry); destroyPQExpBuffer(loHorizonQry); + destroyPQExpBuffer(lomHorizonQry); destroyPQExpBuffer(loOutQry); + destroyPQExpBuffer(lomOutQry); } PQclear(res); diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index f83d2b5d30955..69fcf593caec9 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -3,8 +3,7 @@ PGFILEDESC = "pg_upgrade - an in-place binary upgrade utility" PGAPPICON = win32 -# required for 003_upgrade_logical_replication_slots.pl -EXTRA_INSTALL=contrib/test_decoding +EXTRA_INSTALL=contrib/test_decoding src/test/modules/dummy_seclabel subdir = src/bin/pg_upgrade top_builddir = ../../.. diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index c39eb077c2fae..7ce0827016803 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -498,7 +498,10 @@ get_rel_infos_query(void) * * pg_largeobject contains user data that does not appear in pg_dump * output, so we have to copy that system table. It's easiest to do that - * by treating it as a user table. + * by treating it as a user table. We can do the same for + * pg_largeobject_metadata for upgrades from v16 and newer. pg_upgrade + * can't copy/link the files from older versions because aclitem (needed + * by pg_largeobject_metadata.lomacl) changed its storage format in v16. */ appendPQExpBuffer(&query, "WITH regular_heap (reloid, indtable, toastheap) AS ( " @@ -514,10 +517,12 @@ get_rel_infos_query(void) " 'binary_upgrade', 'pg_toast') AND " " c.oid >= %u::pg_catalog.oid) OR " " (n.nspname = 'pg_catalog' AND " - " relname IN ('pg_largeobject') ))), ", + " relname IN ('pg_largeobject'%s) ))), ", (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? ", " CppAsString2(RELKIND_SEQUENCE) : "", - FirstNormalObjectId); + FirstNormalObjectId, + (GET_MAJOR_VERSION(old_cluster.major_version) >= 1600) ? + ", 'pg_largeobject_metadata'" : ""); /* * Add a CTE that collects OIDs of toast tables belonging to the tables diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index d5cd5bf0b3a6b..490e98fa26f2a 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -29,9 +29,9 @@ * We control all assignments of pg_enum.oid because these oids are stored * in user tables as enum values. * - * We control all assignments of pg_authid.oid for historical reasons (the - * oids used to be stored in pg_largeobject_metadata, which is now copied via - * SQL commands), that might change at some point in the future. + * We control all assignments of pg_authid.oid because the oids are stored in + * pg_largeobject_metadata, which is copied via file transfer for upgrades + * from v16 and newer. * * We control all assignments of pg_database.oid because we want the directory * names to match between the old and new cluster. diff --git a/src/bin/pg_upgrade/t/006_transfer_modes.pl b/src/bin/pg_upgrade/t/006_transfer_modes.pl index 348f402146234..2f68f0b56aa61 100644 --- a/src/bin/pg_upgrade/t/006_transfer_modes.pl +++ b/src/bin/pg_upgrade/t/006_transfer_modes.pl @@ -45,6 +45,22 @@ sub test_mode $old->append_conf('postgresql.conf', "allow_in_place_tablespaces = true"); } + # We can only test security labels if both the old and new installations + # have dummy_seclabel. 
+ my $test_seclabel = 1; + $old->start; + if (!$old->check_extension('dummy_seclabel')) + { + $test_seclabel = 0; + } + $old->stop; + $new->start; + if (!$new->check_extension('dummy_seclabel')) + { + $test_seclabel = 0; + } + $new->stop; + # Create a small variety of simple test objects on the old cluster. We'll # check that these reach the new version after upgrading. $old->start; @@ -83,6 +99,29 @@ sub test_mode $old->safe_psql('testdb3', "CREATE TABLE test6 AS SELECT generate_series(607, 711)"); } + + # While we are here, test handling of large objects. + $old->safe_psql('postgres', q| + CREATE ROLE regress_lo_1; + CREATE ROLE regress_lo_2; + + SELECT lo_from_bytea(4532, '\xffffff00'); + COMMENT ON LARGE OBJECT 4532 IS 'test'; + + SELECT lo_from_bytea(4533, '\x0f0f0f0f'); + ALTER LARGE OBJECT 4533 OWNER TO regress_lo_1; + GRANT SELECT ON LARGE OBJECT 4533 TO regress_lo_2; + |); + + if ($test_seclabel) + { + $old->safe_psql('postgres', q| + CREATE EXTENSION dummy_seclabel; + + SELECT lo_from_bytea(4534, '\x00ffffff'); + SECURITY LABEL ON LARGE OBJECT 4534 IS 'classified'; + |); + } $old->stop; my $result = command_ok_or_fails_like( @@ -132,6 +171,34 @@ sub test_mode $result = $new->safe_psql('testdb3', "SELECT COUNT(*) FROM test6"); is($result, '105', "test6 data after pg_upgrade $mode"); } + + # Tests for large objects + $result = $new->safe_psql('postgres', "SELECT lo_get(4532)"); + is($result, '\xffffff00', "LO contents after upgrade"); + $result = $new->safe_psql('postgres', + "SELECT obj_description(4532, 'pg_largeobject')"); + is($result, 'test', "comment on LO after pg_upgrade"); + + $result = $new->safe_psql('postgres', "SELECT lo_get(4533)"); + is($result, '\x0f0f0f0f', "LO contents after upgrade"); + $result = $new->safe_psql('postgres', + "SELECT lomowner::regrole FROM pg_largeobject_metadata WHERE oid = 4533"); + is($result, 'regress_lo_1', "LO owner after upgrade"); + $result = $new->safe_psql('postgres', + "SELECT lomacl FROM pg_largeobject_metadata WHERE oid = 4533"); + is($result, '{regress_lo_1=rw/regress_lo_1,regress_lo_2=r/regress_lo_1}', + "LO ACL after upgrade"); + + if ($test_seclabel) + { + $result = $new->safe_psql('postgres', "SELECT lo_get(4534)"); + is($result, '\x00ffffff', "LO contents after upgrade"); + $result = $new->safe_psql('postgres', q| + SELECT label FROM pg_seclabel WHERE objoid = 4534 + AND classoid = 'pg_largeobject'::regclass + |); + is($result, 'classified', "seclabel on LO after pg_upgrade"); + } $new->stop; } From 9af672bcb245950e58198119ba6eb17043fd3a6d Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 8 Sep 2025 12:29:42 -0700 Subject: [PATCH 27/73] meson: build checksums with extra optimization flags. Use -funroll-loops and -ftree-vectorize when building checksum.c to match what autoconf does. 
Discussion: https://postgr.es/m/a81f2f7ef34afc24a89c613671ea017e3651329c.camel@j-davis.com Reviewed-by: Andres Freund --- src/backend/storage/page/meson.build | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/backend/storage/page/meson.build b/src/backend/storage/page/meson.build index c3e4a805862a9..112f00ff36552 100644 --- a/src/backend/storage/page/meson.build +++ b/src/backend/storage/page/meson.build @@ -1,7 +1,15 @@ # Copyright (c) 2022-2025, PostgreSQL Global Development Group +checksum_backend_lib = static_library('checksum_backend_lib', + 'checksum.c', + dependencies: backend_build_deps, + kwargs: internal_lib_args, + c_args: vectorize_cflags + unroll_loops_cflags, +) + +backend_link_with += checksum_backend_lib + backend_sources += files( 'bufpage.c', - 'checksum.c', 'itemptr.c', ) From 8ec97e78a7713a1ebf4976b55c19f6c9bc2716d9 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Mon, 8 Sep 2025 17:13:31 -0400 Subject: [PATCH 28/73] Add error codes when vacuum discovers VM corruption Commit fd6ec93bf890314a and other previous work established the principle that when an error is potentially reachable in case of on-disk corruption but is not expected to be reached otherwise, ERRCODE_DATA_CORRUPTED should be used. This allows log monitoring software to search for evidence of corruption by filtering on the error code. Enhance the existing log messages emitted when the heap page is found to be inconsistent with the VM by adding this error code. Suggested-by: Andrey Borodin Author: Melanie Plageman Reviewed-by: Robert Haas Discussion: https://postgr.es/m/87DD95AA-274F-4F4F-BAD9-7738E5B1F905%40yandex-team.ru --- src/backend/access/heap/vacuumlazy.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 932701d8420dc..981d9380a925c 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -2121,8 +2121,11 @@ lazy_scan_prune(LVRelState *vacrel, else if (all_visible_according_to_vm && !PageIsAllVisible(page) && visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer) != 0) { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - vacrel->relname, blkno); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + vacrel->relname, blkno))); + visibilitymap_clear(vacrel->rel, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); } @@ -2143,8 +2146,11 @@ lazy_scan_prune(LVRelState *vacrel, */ else if (presult.lpdead_items > 0 && PageIsAllVisible(page)) { - elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", - vacrel->relname, blkno); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", + vacrel->relname, blkno))); + PageClearAllVisible(page); MarkBufferDirty(buf); visibilitymap_clear(vacrel->rel, blkno, vmbuffer, From 5ac3c1ac22cb325844d0bee37f79f2c11931b32e Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Tue, 9 Sep 2025 03:18:22 +0000 Subject: [PATCH 29/73] Fix Coverity issue reported in commit a850be2fe. Address a potential SIGSEGV that may occur when the tablesync worker attempts to locate a deleted row while applying changes. This situation arises during conflict detection for update-deleted scenarios. 
To prevent this crash, ensure that the operation is errored out early
if the leader apply worker is unavailable. Since the leader worker
maintains the necessary conflict detection metadata, proceeding without
it serves no purpose and risks reporting an incorrect conflict type.

In passing, improve a nearby comment.

Reported by Tom Lane as per Coverity
Author: shveta malik
Reviewed-by: Amit Kapila
Discussion: https://postgr.es/m/334468.1757280992@sss.pgh.pa.us
---
 src/backend/replication/logical/worker.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index c0f6bef5c282c..b3cac1023731a 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -3266,12 +3266,18 @@ FindDeletedTupleInLocalRel(Relation localrel, Oid localidxoid,

 	/*
 	 * Obtain the information from the leader apply worker as only the
-	 * leader manages conflict retention (see
+	 * leader manages oldest_nonremovable_xid (see
 	 * maybe_advance_nonremovable_xid() for details).
 	 */
 	LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
 	leader = logicalrep_worker_find(MyLogicalRepWorker->subid, InvalidOid,
 									false);
+	if (!leader)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("could not detect conflict as the leader apply worker has exited")));
+	}

 	SpinLockAcquire(&leader->relmutex);
 	oldestxmin = leader->oldest_nonremovable_xid;

From faf071b553830d39fc583beabcaf56ed65259acc Mon Sep 17 00:00:00 2001
From: Dean Rasheed
Date: Tue, 9 Sep 2025 10:39:30 +0100
Subject: [PATCH 30/73] Add date and timestamp variants of random(min, max).

This adds 3 new variants of the random() function:

    random(min date, max date) returns date
    random(min timestamp, max timestamp) returns timestamp
    random(min timestamptz, max timestamptz) returns timestamptz

Each returns a random value x in the range min <= x <= max.

Author: Damien Clochard
Reviewed-by: Greg Sabino Mullane
Reviewed-by: Dean Rasheed
Reviewed-by: Vik Fearing
Reviewed-by: Chao Li
Discussion: https://postgr.es/m/f524d8cab5914613d9e624d9ce177d3d@dalibo.info
---
 doc/src/sgml/func/func-datetime.sgml      |  30 +++++++
 doc/src/sgml/func/func-math.sgml          |   3 +-
 src/backend/utils/adt/pseudorandomfuncs.c | 104 ++++++++++++++++++++--
 src/include/catalog/catversion.h          |   2 +-
 src/include/catalog/pg_proc.dat           |  12 +++
 src/test/regress/expected/random.out      |  87 ++++++++++++++++++
 src/test/regress/sql/random.sql           |  26 ++++++
 7 files changed, 254 insertions(+), 10 deletions(-)

diff --git a/doc/src/sgml/func/func-datetime.sgml b/doc/src/sgml/func/func-datetime.sgml
index 482fe45f42ebc..98dd60aa9a7ec 100644
--- a/doc/src/sgml/func/func-datetime.sgml
+++ b/doc/src/sgml/func/func-datetime.sgml
@@ -928,6 +928,36 @@
+
+
+
+ random
+
+ random ( min date, max date )
+ date
+
+
+ random ( min timestamp, max timestamp )
+ timestamp
+
+
+ random ( min timestamptz, max timestamptz )
+ timestamptz
+
+
+ Returns a random value in the range
+ min <= x <= max.
+
+
+ random('1979-02-08'::date,'2025-07-03'::date)
+ 1983-04-21
+
+
+ random('2000-01-01'::timestamptz, now())
+ 2015-09-27 09:11:33.732707+00
+
+
+
diff --git a/doc/src/sgml/func/func-math.sgml b/doc/src/sgml/func/func-math.sgml
index 7528dc4cea4b9..fd821c0e70677 100644
--- a/doc/src/sgml/func/func-math.sgml
+++ b/doc/src/sgml/func/func-math.sgml
@@ -1151,7 +1151,8 @@
 The random() and random_normal()
- functions listed in use a
+ functions listed in and
+ use a
It is fast but not suitable for cryptographic applications; see the module for a more diff --git a/src/backend/utils/adt/pseudorandomfuncs.c b/src/backend/utils/adt/pseudorandomfuncs.c index e7b8045f92508..1d2a981491bf5 100644 --- a/src/backend/utils/adt/pseudorandomfuncs.c +++ b/src/backend/utils/adt/pseudorandomfuncs.c @@ -17,6 +17,7 @@ #include "common/pg_prng.h" #include "miscadmin.h" +#include "utils/date.h" #include "utils/fmgrprotos.h" #include "utils/numeric.h" #include "utils/timestamp.h" @@ -25,6 +26,18 @@ static pg_prng_state prng_state; static bool prng_seed_set = false; +/* + * Macro for checking the range bounds of random(min, max) functions. Throws + * an error if they're the wrong way round. + */ +#define CHECK_RANGE_BOUNDS(rmin, rmax) \ + do { \ + if ((rmin) > (rmax)) \ + ereport(ERROR, \ + errcode(ERRCODE_INVALID_PARAMETER_VALUE), \ + errmsg("lower bound must be less than or equal to upper bound")); \ + } while (0) + /* * initialize_prng() - * @@ -129,10 +142,7 @@ int4random(PG_FUNCTION_ARGS) int32 rmax = PG_GETARG_INT32(1); int32 result; - if (rmin > rmax) - ereport(ERROR, - errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("lower bound must be less than or equal to upper bound")); + CHECK_RANGE_BOUNDS(rmin, rmax); initialize_prng(); @@ -153,10 +163,7 @@ int8random(PG_FUNCTION_ARGS) int64 rmax = PG_GETARG_INT64(1); int64 result; - if (rmin > rmax) - ereport(ERROR, - errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("lower bound must be less than or equal to upper bound")); + CHECK_RANGE_BOUNDS(rmin, rmax); initialize_prng(); @@ -177,9 +184,90 @@ numeric_random(PG_FUNCTION_ARGS) Numeric rmax = PG_GETARG_NUMERIC(1); Numeric result; + /* Leave range bound checking to random_numeric() */ + initialize_prng(); result = random_numeric(&prng_state, rmin, rmax); PG_RETURN_NUMERIC(result); } + + +/* + * date_random() - + * + * Returns a random date chosen uniformly in the specified range. + */ +Datum +date_random(PG_FUNCTION_ARGS) +{ + int32 rmin = (int32) PG_GETARG_DATEADT(0); + int32 rmax = (int32) PG_GETARG_DATEADT(1); + DateADT result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (DATE_IS_NOBEGIN(rmin) || DATE_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (DateADT) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_DATEADT(result); +} + +/* + * timestamp_random() - + * + * Returns a random timestamp chosen uniformly in the specified range. + */ +Datum +timestamp_random(PG_FUNCTION_ARGS) +{ + int64 rmin = (int64) PG_GETARG_TIMESTAMP(0); + int64 rmax = (int64) PG_GETARG_TIMESTAMP(1); + Timestamp result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (TIMESTAMP_IS_NOBEGIN(rmin) || TIMESTAMP_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (Timestamp) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_TIMESTAMP(result); +} + +/* + * timestamptz_random() - + * + * Returns a random timestamptz chosen uniformly in the specified range. 
+ */ +Datum +timestamptz_random(PG_FUNCTION_ARGS) +{ + int64 rmin = (int64) PG_GETARG_TIMESTAMPTZ(0); + int64 rmax = (int64) PG_GETARG_TIMESTAMPTZ(1); + TimestampTz result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (TIMESTAMP_IS_NOBEGIN(rmin) || TIMESTAMP_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (TimestampTz) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_TIMESTAMPTZ(result); +} diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 836369f163ef5..ef0d0f92165eb 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202509021 +#define CATALOG_VERSION_NO 202509091 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 118d6da1ace0e..03e82d28c8767 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3503,6 +3503,18 @@ proname => 'random', provolatile => 'v', proparallel => 'r', prorettype => 'numeric', proargtypes => 'numeric numeric', proargnames => '{min,max}', prosrc => 'numeric_random' }, +{ oid => '6431', descr => 'random date in range', + proname => 'random', provolatile => 'v', proparallel => 'r', + prorettype => 'date', proargtypes => 'date date', + proargnames => '{min,max}', prosrc => 'date_random' }, +{ oid => '6432', descr => 'random timestamp in range', + proname => 'random', provolatile => 'v', proparallel => 'r', + prorettype => 'timestamp', proargtypes => 'timestamp timestamp', + proargnames => '{min,max}', prosrc => 'timestamp_random' }, +{ oid => '6433', descr => 'random timestamptz in range', + proname => 'random', provolatile => 'v', proparallel => 'r', + prorettype => 'timestamptz', proargtypes => 'timestamptz timestamptz', + proargnames => '{min,max}', prosrc => 'timestamptz_random' }, { oid => '1599', descr => 'set random seed', proname => 'setseed', provolatile => 'v', proparallel => 'r', prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' }, diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out index 43cf88a36341b..7f17b2a1b12f8 100644 --- a/src/test/regress/expected/random.out +++ b/src/test/regress/expected/random.out @@ -536,3 +536,90 @@ SELECT n, random(0, trim_scale(abs(1 - 10.0^(-n)))) FROM generate_series(-20, 20 20 | 0.60795101234744211935 (41 rows) +-- random dates +SELECT random('1979-02-08'::date,'2025-07-03'::date) AS random_date_multiple_years; + random_date_multiple_years +---------------------------- + 04-09-1986 +(1 row) + +SELECT random('4714-11-24 BC'::date,'5874897-12-31 AD'::date) AS random_date_maximum_range; + random_date_maximum_range +--------------------------- + 10-02-2898131 +(1 row) + +SELECT random('1979-02-08'::date,'1979-02-08'::date) AS random_date_empty_range; + random_date_empty_range +------------------------- + 02-08-1979 +(1 row) + +SELECT random('2024-12-31'::date, '2024-01-01'::date); -- fail +ERROR: lower bound must be less than or equal to upper bound +SELECT random('-infinity'::date, '2024-01-01'::date); -- fail +ERROR: lower and upper bounds must be finite +SELECT random('2024-12-31'::date, 'infinity'::date); -- fail +ERROR: lower and upper bounds must be finite +-- random timestamps +SELECT random('1979-02-08'::timestamp,'2025-07-03'::timestamp) AS random_timestamp_multiple_years; + random_timestamp_multiple_years 
+--------------------------------- + Fri Jan 27 18:52:05.366009 2017 +(1 row) + +SELECT random('4714-11-24 BC'::timestamp,'294276-12-31 23:59:59.999999'::timestamp) AS random_timestamp_maximum_range; + random_timestamp_maximum_range +----------------------------------- + Wed Mar 28 00:45:36.180395 226694 +(1 row) + +SELECT random('2024-07-01 12:00:00.000001'::timestamp, '2024-07-01 12:00:00.999999'::timestamp) AS random_narrow_range; + random_narrow_range +--------------------------------- + Mon Jul 01 12:00:00.999286 2024 +(1 row) + +SELECT random('1979-02-08'::timestamp,'1979-02-08'::timestamp) AS random_timestamp_empty_range; + random_timestamp_empty_range +------------------------------ + Thu Feb 08 00:00:00 1979 +(1 row) + +SELECT random('2024-12-31'::timestamp, '2024-01-01'::timestamp); -- fail +ERROR: lower bound must be less than or equal to upper bound +SELECT random('-infinity'::timestamp, '2024-01-01'::timestamp); -- fail +ERROR: lower and upper bounds must be finite +SELECT random('2024-12-31'::timestamp, 'infinity'::timestamp); -- fail +ERROR: lower and upper bounds must be finite +-- random timestamps with timezone +SELECT random('1979-02-08 +01'::timestamptz,'2025-07-03 +02'::timestamptz) AS random_timestamptz_multiple_years; + random_timestamptz_multiple_years +------------------------------------- + Tue Jun 14 04:41:16.652896 2016 PDT +(1 row) + +SELECT random('4714-11-24 BC +00'::timestamptz,'294276-12-31 23:59:59.999999 +00'::timestamptz) AS random_timestamptz_maximum_range; + random_timestamptz_maximum_range +-------------------------------------- + Wed Mar 26 14:07:16.980265 31603 PDT +(1 row) + +SELECT random('2024-07-01 12:00:00.000001 +04'::timestamptz, '2024-07-01 12:00:00.999999 +04'::timestamptz) AS random_timestamptz_narrow_range; + random_timestamptz_narrow_range +------------------------------------- + Mon Jul 01 01:00:00.835808 2024 PDT +(1 row) + +SELECT random('1979-02-08 +05'::timestamptz,'1979-02-08 +05'::timestamptz) AS random_timestamptz_empty_range; + random_timestamptz_empty_range +-------------------------------- + Wed Feb 07 11:00:00 1979 PST +(1 row) + +SELECT random('2024-01-01 +06'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +ERROR: lower bound must be less than or equal to upper bound +SELECT random('-infinity'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +ERROR: lower and upper bounds must be finite +SELECT random('2024-01-01 +06'::timestamptz, 'infinity'::timestamptz); -- fail +ERROR: lower and upper bounds must be finite diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql index ebfa7539ede25..890f14687ef98 100644 --- a/src/test/regress/sql/random.sql +++ b/src/test/regress/sql/random.sql @@ -277,3 +277,29 @@ SELECT random(-1e30, 1e30) FROM generate_series(1, 10); SELECT random(-0.4, 0.4) FROM generate_series(1, 10); SELECT random(0, 1 - 1e-30) FROM generate_series(1, 10); SELECT n, random(0, trim_scale(abs(1 - 10.0^(-n)))) FROM generate_series(-20, 20) n; + +-- random dates +SELECT random('1979-02-08'::date,'2025-07-03'::date) AS random_date_multiple_years; +SELECT random('4714-11-24 BC'::date,'5874897-12-31 AD'::date) AS random_date_maximum_range; +SELECT random('1979-02-08'::date,'1979-02-08'::date) AS random_date_empty_range; +SELECT random('2024-12-31'::date, '2024-01-01'::date); -- fail +SELECT random('-infinity'::date, '2024-01-01'::date); -- fail +SELECT random('2024-12-31'::date, 'infinity'::date); -- fail + +-- random timestamps +SELECT 
random('1979-02-08'::timestamp,'2025-07-03'::timestamp) AS random_timestamp_multiple_years; +SELECT random('4714-11-24 BC'::timestamp,'294276-12-31 23:59:59.999999'::timestamp) AS random_timestamp_maximum_range; +SELECT random('2024-07-01 12:00:00.000001'::timestamp, '2024-07-01 12:00:00.999999'::timestamp) AS random_narrow_range; +SELECT random('1979-02-08'::timestamp,'1979-02-08'::timestamp) AS random_timestamp_empty_range; +SELECT random('2024-12-31'::timestamp, '2024-01-01'::timestamp); -- fail +SELECT random('-infinity'::timestamp, '2024-01-01'::timestamp); -- fail +SELECT random('2024-12-31'::timestamp, 'infinity'::timestamp); -- fail + +-- random timestamps with timezone +SELECT random('1979-02-08 +01'::timestamptz,'2025-07-03 +02'::timestamptz) AS random_timestamptz_multiple_years; +SELECT random('4714-11-24 BC +00'::timestamptz,'294276-12-31 23:59:59.999999 +00'::timestamptz) AS random_timestamptz_maximum_range; +SELECT random('2024-07-01 12:00:00.000001 +04'::timestamptz, '2024-07-01 12:00:00.999999 +04'::timestamptz) AS random_timestamptz_narrow_range; +SELECT random('1979-02-08 +05'::timestamptz,'1979-02-08 +05'::timestamptz) AS random_timestamptz_empty_range; +SELECT random('2024-01-01 +06'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +SELECT random('-infinity'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +SELECT random('2024-01-01 +06'::timestamptz, 'infinity'::timestamptz); -- fail From 81a61fde84ffc74f7b3c7854ed4193cc4d31f78b Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 9 Sep 2025 15:33:46 +0200 Subject: [PATCH 31/73] Fix typo in comment Author: Alexandra Wang Discussion: https://www.postgresql.org/message-id/CAK98qZ0whQ%3Dc%2BJGXbGSEBxCtLgy6sf-YGYqsKTAGsS-wt0wj%2BA%40mail.gmail.com --- src/backend/utils/adt/jsonbsubs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/jsonbsubs.c b/src/backend/utils/adt/jsonbsubs.c index de64d49851251..e8626d3b4fc6e 100644 --- a/src/backend/utils/adt/jsonbsubs.c +++ b/src/backend/utils/adt/jsonbsubs.c @@ -51,7 +51,7 @@ jsonb_subscript_transform(SubscriptingRef *sbsref, /* * Transform and convert the subscript expressions. Jsonb subscripting - * does not support slices, look only and the upper index. + * does not support slices, look only at the upper index. */ foreach(idx, indirection) { From 530cfa8eb50ca5a2151dfc50a6a5999ec8aff148 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Tue, 9 Sep 2025 14:09:36 -0500 Subject: [PATCH 32/73] test_slru: Fix LWLock tranche allocation in EXEC_BACKEND builds. Currently, test_slru's shmem_startup_hook unconditionally generates new LWLock tranche IDs. This is fine on non-EXEC_BACKEND builds, where only the postmaster executes this hook, but on EXEC_BACKEND builds, every backend executes it, too. To fix, only generate the tranche IDs in the postmaster process by checking the IsUnderPostmaster variable. This is arguably a bug fix and could be back-patched, but since the damage is limited to some extra unused tranche IDs in a test module, I'm not going to bother. 
Reported-by: Sami Imseih Reviewed-by: Sami Imseih Discussion: https://postgr.es/m/CAA5RZ0vaAuonaf12CeDddQJu5xKL%2B6xVyS%2B_q1%2BcH%3D33JXV82w%40mail.gmail.com --- src/test/modules/test_slru/test_slru.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c index 8c0367eeee424..e963466aef1cd 100644 --- a/src/test/modules/test_slru/test_slru.c +++ b/src/test/modules/test_slru/test_slru.c @@ -219,8 +219,8 @@ test_slru_shmem_startup(void) */ const bool long_segment_names = true; const char slru_dir_name[] = "pg_test_slru"; - int test_tranche_id; - int test_buffer_tranche_id; + int test_tranche_id = -1; + int test_buffer_tranche_id = -1; if (prev_shmem_startup_hook) prev_shmem_startup_hook(); @@ -231,10 +231,18 @@ test_slru_shmem_startup(void) */ (void) MakePGDirectory(slru_dir_name); - /* initialize the SLRU facility */ - test_tranche_id = LWLockNewTrancheId("test_slru_tranche"); - - test_buffer_tranche_id = LWLockNewTrancheId("test_buffer_tranche"); + /* + * Initialize the SLRU facility. In EXEC_BACKEND builds, the + * shmem_startup_hook is called in the postmaster and in each backend, but + * we only need to generate the LWLock tranches once. Note that these + * tranche ID variables are not used by SimpleLruInit() when + * IsUnderPostmaster is true. + */ + if (!IsUnderPostmaster) + { + test_tranche_id = LWLockNewTrancheId("test_slru_tranche"); + test_buffer_tranche_id = LWLockNewTrancheId("test_buffer_tranche"); + } TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically; SimpleLruInit(TestSlruCtl, "TestSLRU", From d96c854dfc634212193007ca58f8978bc272d457 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Tue, 9 Sep 2025 14:35:30 -0500 Subject: [PATCH 33/73] Fix documentation for shmem_startup_hook. This section claims that each backend executes the shmem_startup_hook shortly after attaching to shared memory, which is true for EXEC_BACKEND builds, but not for others. This commit adds this important detail. Oversight in commit 964152c476. Reported-by: Sami Imseih Reviewed-by: Sami Imseih Discussion: https://postgr.es/m/CAA5RZ0vEGT1eigGbVt604LkXP6mUPMwPMxQoRCbFny44w%2B9EUQ%40mail.gmail.com Backpatch-through: 17 --- doc/src/sgml/xfunc.sgml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/xfunc.sgml b/doc/src/sgml/xfunc.sgml index da21ef5689184..04bf919b34384 100644 --- a/doc/src/sgml/xfunc.sgml +++ b/doc/src/sgml/xfunc.sgml @@ -3668,11 +3668,14 @@ LWLockRelease(AddinShmemInitLock); shmem_startup_hook provides a convenient place for the initialization code, but it is not strictly required that all such code - be placed in this hook. Each backend will execute the registered - shmem_startup_hook shortly after it attaches to shared - memory. Note that add-ins should still acquire + be placed in this hook. On Windows (and anywhere else where + EXEC_BACKEND is defined), each backend executes the + registered shmem_startup_hook shortly after it + attaches to shared memory, so add-ins should still acquire AddinShmemInitLock within this hook, as shown in the - example above. + example above. On other platforms, only the postmaster process executes + the shmem_startup_hook, and each backend automatically + inherits the pointers to shared memory. 
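The two fixes above point at one pattern for extension authors. Below is a minimal, hedged sketch of a shmem_startup_hook that stays correct whether the hook runs only in the postmaster (non-EXEC_BACKEND) or in every backend (EXEC_BACKEND); MyExtState, my_ext_shmem_startup, and the "my_ext" names are hypothetical, not taken from these patches:

/*
 * Illustrative sketch, not part of the patch series: an extension's
 * shmem_startup_hook whose one-time initialization is guarded, so that
 * repeated executions in EXEC_BACKEND backends do not burn extra
 * LWLock tranche IDs.
 */
#include "postgres.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"

typedef struct MyExtState
{
	LWLock		lock;			/* protects counter */
	int64		counter;
} MyExtState;

static MyExtState *my_ext_state;
static shmem_startup_hook_type prev_shmem_startup_hook;

static void
my_ext_shmem_startup(void)
{
	bool		found;

	if (prev_shmem_startup_hook)
		prev_shmem_startup_hook();

	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
	my_ext_state = (MyExtState *)
		ShmemInitStruct("my_ext", sizeof(MyExtState), &found);
	if (!found)
	{
		/*
		 * Only the first process to attach (the postmaster, even on
		 * EXEC_BACKEND builds) initializes the state, so the tranche
		 * is allocated exactly once.
		 */
		LWLockInitialize(&my_ext_state->lock, LWLockNewTrancheId("my_ext"));
		my_ext_state->counter = 0;
	}
	LWLockRelease(AddinShmemInitLock);
}

In _PG_init(), the extension would save the existing shmem_startup_hook into prev_shmem_startup_hook and install my_ext_shmem_startup in its place.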
From 8c8f7b199d9095dbc2e101a4614043b5ae13bde3 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Wed, 10 Sep 2025 07:23:05 +0900
Subject: [PATCH 34/73] Fix leak with SMgrRelations in startup process

The startup process does not process shared invalidation messages (it
only sends them), and it never calls AtEOXact_SMgr(), which cleans up
any unpinned SMgrRelations. Hence, it is never able to free
SMgrRelations on a periodic basis, bloating its hashtable over time.

This commit takes a conservative approach similar to the checkpointer
and the bgwriter: the startup process now frees SMgrRelations when
replaying a checkpoint record, either online or shutdown, giving it a
way to perform this cleanup periodically.

Issue caused by 21d9c3ee4ef7, so backpatch down to v17.

Author: Jingtang Zhang
Reviewed-by: Yuhang Qiu
Discussion: https://postgr.es/m/28C687D4-F335-417E-B06C-6612A0BD5A10@gmail.com
Backpatch-through: 17
---
 src/backend/access/transam/xlog.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 7ffb217915190..0baf0ac6160af 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8385,6 +8385,14 @@ xlog_redo(XLogReaderState *record)
 						checkPoint.ThisTimeLineID, replayTLI)));

 		RecoveryRestartPoint(&checkPoint, record);
+
+		/*
+		 * After replaying a checkpoint record, free all smgr objects.
+		 * Otherwise we would never do so for dropped relations, as the
+		 * startup does not process shared invalidation messages or call
+		 * AtEOXact_SMgr().
+		 */
+		smgrdestroyall();
 	}
 	else if (info == XLOG_CHECKPOINT_ONLINE)
 	{
@@ -8438,6 +8446,14 @@ xlog_redo(XLogReaderState *record)
 						checkPoint.ThisTimeLineID, replayTLI)));

 		RecoveryRestartPoint(&checkPoint, record);
+
+		/*
+		 * After replaying a checkpoint record, free all smgr objects.
+		 * Otherwise we would never do so for dropped relations, as the
+		 * startup does not process shared invalidation messages or call
+		 * AtEOXact_SMgr().
+		 */
+		smgrdestroyall();
 	}
 	else if (info == XLOG_OVERWRITE_CONTRECORD)
 	{

From b1187266e077265cb061cbedd502e94179dc7b21 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Wed, 10 Sep 2025 11:20:46 +0900
Subject: [PATCH 35/73] Replace callers of dynahash.h's my_log2() with
 equivalents in pg_bitutils.h

All the calls replaced by this commit use 4-byte integers for the
variables passed as input to my_log2(). Hence, the limit against
too-large inputs does not really apply. Thresholds are also applied,
as follows:
- In nodeAgg.c, the number of partitions is limited by
  HASHAGG_MAX_PARTITIONS.
- In nodeHash.c, ExecChooseHashTableSize() caps its maximum number of
  buckets based on HashJoinTuple and the palloc() allocation limit.
- In worker.c, the number of subxacts tracked by ApplySubXactData uses
  uint32, making pg_ceil_log2_64() safe to use directly.

Several approaches were discussed, like an integration with thresholds
in pg_bitutils.h, but that was found confusing. This uses Dean's idea,
which gives a simpler result than what I had come up with to be able
to remove dynahash.h. dynahash.h will be removed in a follow-up
commit, removing some duplication with the ceil log2 routines.
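To make the replacement concrete, here is a standalone sketch (plain C, not PostgreSQL code; __builtin_clz is a GCC/Clang assumption) checking that the loop-based ceil(log2(n)) that my_log2() implements agrees with a count-leading-zeros formulation of the kind pg_ceil_log2_32() uses, along with the next-power-of-2 idiom the callers rely on:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* dynahash.c-style loop: smallest i such that (1 << i) >= num */
static int
ceil_log2_loop(int64_t num)
{
	int			i;
	int64_t		limit;

	for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
		;
	return i;
}

/* count-leading-zeros version, valid for uint32 inputs >= 1 */
static int
ceil_log2_clz(uint32_t num)
{
	if (num < 2)
		return 0;
	return 32 - __builtin_clz(num - 1);
}

int
main(void)
{
	for (uint32_t n = 1; n <= 100000; n++)
		assert(ceil_log2_loop(n) == ceil_log2_clz(n));

	/*
	 * 1 << ceil_log2(n) rounds n up to the next power of two, which is
	 * how subxact_info_read() keeps nsubxacts_max a power of 2.
	 */
	printf("%d\n", 1 << ceil_log2_clz(1000));	/* prints 1024 */
	return 0;
}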
Reviewed-by: Peter Eisentraut Reviewed-by: Dean Rasheed Discussion: https://postgr.es/m/CAEZATCUJPQD_7sC-wErak2CQGNa6bj2hY-mr8wsBki=kX7f2_A@mail.gmail.com --- src/backend/executor/nodeAgg.c | 3 +-- src/backend/executor/nodeHash.c | 7 +++---- src/backend/replication/logical/worker.c | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 377e016d73225..a4f3d30f307cc 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -267,7 +267,6 @@ #include "utils/acl.h" #include "utils/builtins.h" #include "utils/datum.h" -#include "utils/dynahash.h" #include "utils/expandeddatum.h" #include "utils/injection_point.h" #include "utils/logtape.h" @@ -2115,7 +2114,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize, npartitions = (int) dpartitions; /* ceil(log2(npartitions)) */ - partition_bits = my_log2(npartitions); + partition_bits = pg_ceil_log2_32(npartitions); /* make sure that we don't exhaust the hash bits */ if (partition_bits + used_bits >= 32) diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 8d2201ab67fa5..a3415db4e20f5 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -36,7 +36,6 @@ #include "executor/nodeHashjoin.h" #include "miscadmin.h" #include "port/pg_bitutils.h" -#include "utils/dynahash.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/syscache.h" @@ -340,7 +339,7 @@ MultiExecParallelHash(HashState *node) */ hashtable->curbatch = -1; hashtable->nbuckets = pstate->nbuckets; - hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets); hashtable->totalTuples = pstate->total_tuples; /* @@ -480,7 +479,7 @@ ExecHashTableCreate(HashState *state) &nbuckets, &nbatch, &num_skew_mcvs); /* nbuckets must be a power of 2 */ - log2_nbuckets = my_log2(nbuckets); + log2_nbuckets = pg_ceil_log2_32(nbuckets); Assert(nbuckets == (1 << log2_nbuckets)); /* @@ -3499,7 +3498,7 @@ ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable, int batchno) dsa_get_address(hashtable->area, hashtable->batches[batchno].shared->buckets); hashtable->nbuckets = hashtable->parallel_state->nbuckets; - hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets); hashtable->current_chunk = NULL; hashtable->current_chunk_shared = InvalidDsaPointer; hashtable->batches[batchno].at_least_one_chunk = false; diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index b3cac1023731a..ee6ac22329fdc 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -276,7 +276,6 @@ #include "storage/procarray.h" #include "tcop/tcopprot.h" #include "utils/acl.h" -#include "utils/dynahash.h" #include "utils/guc.h" #include "utils/inval.h" #include "utils/lsyscache.h" @@ -5115,7 +5114,7 @@ subxact_info_read(Oid subid, TransactionId xid) len = sizeof(SubXactInfo) * subxact_data.nsubxacts; /* we keep the maximum as a power of 2 */ - subxact_data.nsubxacts_max = 1 << my_log2(subxact_data.nsubxacts); + subxact_data.nsubxacts_max = 1 << pg_ceil_log2_32(subxact_data.nsubxacts); /* * Allocate subxact information in the logical streaming context. 
We need From e6da68a6e1d60a037b63a9c9ed36e5ef0a996769 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 10 Sep 2025 14:11:50 +0900 Subject: [PATCH 36/73] Remove dynahash.h All the callers of my_log2() are now limited inside dynahash.c, so let's remove this header. The same capability is provided by pg_bitutils.h already. Discussion: https://postgr.es/m/CAEZATCUJPQD_7sC-wErak2CQGNa6bj2hY-mr8wsBki=kX7f2_A@mail.gmail.com --- src/backend/utils/hash/dynahash.c | 4 ++-- src/include/utils/dynahash.h | 20 -------------------- 2 files changed, 2 insertions(+), 22 deletions(-) delete mode 100644 src/include/utils/dynahash.h diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 1aeee5be42acd..ac94b9e93c6e3 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -102,7 +102,6 @@ #include "port/pg_bitutils.h" #include "storage/shmem.h" #include "storage/spin.h" -#include "utils/dynahash.h" #include "utils/memutils.h" @@ -281,6 +280,7 @@ static bool init_htab(HTAB *hashp, int64 nelem); pg_noreturn static void hash_corrupted(HTAB *hashp); static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr); +static int my_log2(int64 num); static int64 next_pow2_int64(int64 num); static int next_pow2_int(int64 num); static void register_seq_scan(HTAB *hashp); @@ -1813,7 +1813,7 @@ hash_corrupted(HTAB *hashp) } /* calculate ceil(log base 2) of num */ -int +static int my_log2(int64 num) { /* diff --git a/src/include/utils/dynahash.h b/src/include/utils/dynahash.h deleted file mode 100644 index a4362d3f65e59..0000000000000 --- a/src/include/utils/dynahash.h +++ /dev/null @@ -1,20 +0,0 @@ -/*------------------------------------------------------------------------- - * - * dynahash.h - * POSTGRES dynahash.h file definitions - * - * - * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * IDENTIFICATION - * src/include/utils/dynahash.h - * - *------------------------------------------------------------------------- - */ -#ifndef DYNAHASH_H -#define DYNAHASH_H - -extern int my_log2(int64 num); - -#endif /* DYNAHASH_H */ From 33eec809402bfbf3eb0d01ad5b023d3d05fcb3bc Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 10 Sep 2025 11:49:53 +0200 Subject: [PATCH 37/73] Fix CREATE TABLE LIKE with not-valid check constraint In CREATE TABLE ... LIKE, any check constraints copied from the source table should be set to valid if they are ENFORCED (the default). Bug introduced in commit ca87c415e2f. 
Author: jian he Discussion: https://www.postgresql.org/message-id/CACJufxH%3D%2Bod8Wy0P4L3_GpapNwLUP3oAes5UFRJ7yTxrM_M5kg%40mail.gmail.com --- src/backend/parser/parse_utilcmd.c | 3 +-- src/test/regress/expected/create_table_like.out | 8 ++++++++ src/test/regress/sql/create_table_like.sql | 1 + 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index afcf54169c3b3..e96b38a59d503 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1461,7 +1461,6 @@ expandTableLikeClause(RangeVar *heapRel, TableLikeClause *table_like_clause) char *ccname = constr->check[ccnum].ccname; char *ccbin = constr->check[ccnum].ccbin; bool ccenforced = constr->check[ccnum].ccenforced; - bool ccvalid = constr->check[ccnum].ccvalid; bool ccnoinherit = constr->check[ccnum].ccnoinherit; Node *ccbin_node; bool found_whole_row; @@ -1492,7 +1491,7 @@ expandTableLikeClause(RangeVar *heapRel, TableLikeClause *table_like_clause) n->conname = pstrdup(ccname); n->location = -1; n->is_enforced = ccenforced; - n->initially_valid = ccvalid; + n->initially_valid = ccenforced; /* sic */ n->is_no_inherit = ccnoinherit; n->raw_expr = NULL; n->cooked_expr = nodeToString(ccbin_node); diff --git a/src/test/regress/expected/create_table_like.out b/src/test/regress/expected/create_table_like.out index 29a779c2e9072..d3c35c148475d 100644 --- a/src/test/regress/expected/create_table_like.out +++ b/src/test/regress/expected/create_table_like.out @@ -320,6 +320,7 @@ DROP TABLE inhz; -- including storage and comments CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) ENFORCED PRIMARY KEY, b text CHECK (length(b) > 100) NOT ENFORCED); +ALTER TABLE ctlt1 ADD CONSTRAINT cc CHECK (length(b) > 100) NOT VALID; CREATE INDEX ctlt1_b_key ON ctlt1 (b); CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b)); CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1; @@ -378,6 +379,7 @@ SELECT conname, description FROM pg_description, pg_constraint c WHERE classoid CREATE TABLE ctlt1_inh (LIKE ctlt1 INCLUDING CONSTRAINTS INCLUDING COMMENTS) INHERITS (ctlt1); NOTICE: merging column "a" with inherited definition NOTICE: merging column "b" with inherited definition +NOTICE: merging constraint "cc" with inherited definition NOTICE: merging constraint "ctlt1_a_check" with inherited definition NOTICE: merging constraint "ctlt1_b_check" with inherited definition \d+ ctlt1_inh @@ -387,6 +389,7 @@ NOTICE: merging constraint "ctlt1_b_check" with inherited definition a | text | | not null | | main | | A b | text | | | | extended | | B Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Not-null constraints: @@ -409,6 +412,7 @@ NOTICE: merging multiple inherited definitions of column "a" b | text | | | | extended | | c | text | | | | external | | Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED "ctlt3_a_check" CHECK (length(a) < 5) @@ -430,6 +434,7 @@ NOTICE: merging column "a" with inherited definition Indexes: "ctlt13_like_expr_idx" btree ((a || c)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED "ctlt3_a_check" CHECK (length(a) < 5) @@ -456,6 +461,7 @@ Indexes: "ctlt_all_b_idx" btree (b) "ctlt_all_expr_idx" btree ((a || b)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" 
CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Statistics objects: @@ -499,6 +505,7 @@ Indexes: "pg_attrdef_b_idx" btree (b) "pg_attrdef_expr_idx" btree ((a || b)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Statistics objects: @@ -524,6 +531,7 @@ Indexes: "ctlt1_b_idx" btree (b) "ctlt1_expr_idx" btree ((a || b)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Statistics objects: diff --git a/src/test/regress/sql/create_table_like.sql b/src/test/regress/sql/create_table_like.sql index bf8702116a74b..93389b57dbf95 100644 --- a/src/test/regress/sql/create_table_like.sql +++ b/src/test/regress/sql/create_table_like.sql @@ -130,6 +130,7 @@ DROP TABLE inhz; -- including storage and comments CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) ENFORCED PRIMARY KEY, b text CHECK (length(b) > 100) NOT ENFORCED); +ALTER TABLE ctlt1 ADD CONSTRAINT cc CHECK (length(b) > 100) NOT VALID; CREATE INDEX ctlt1_b_key ON ctlt1 (b); CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b)); CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1; From 9016fa7e3bcde8ae4c3d63c707143af147486a10 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Wed, 10 Sep 2025 11:21:12 -0500 Subject: [PATCH 38/73] meson: Build numeric.c with -ftree-vectorize. autoconf builds have compiled this file with -ftree-vectorize since commit 8870917623, but meson builds seem to have missed the memo. Reviewed-by: Jeff Davis Discussion: https://postgr.es/m/aL85CeasM51-0D1h%40nathan Backpatch-through: 16 --- src/backend/utils/adt/meson.build | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index dac372c3bea3b..12fa0c209127c 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -1,5 +1,15 @@ # Copyright (c) 2022-2025, PostgreSQL Global Development Group +# Some code in numeric.c benefits from auto-vectorization +numeric_backend_lib = static_library('numeric_backend_lib', + 'numeric.c', + dependencies: backend_build_deps, + kwargs: internal_lib_args, + c_args: vectorize_cflags, +) + +backend_link_with += numeric_backend_lib + backend_sources += files( 'acl.c', 'amutils.c', @@ -61,7 +71,6 @@ backend_sources += files( 'network_gist.c', 'network_selfuncs.c', 'network_spgist.c', - 'numeric.c', 'numutils.c', 'oid.c', 'oracle_compat.c', From abdeacdb0920d94dec7500d09f6f29fbb2f6310d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 10 Sep 2025 16:05:03 -0400 Subject: [PATCH 39/73] Fix memory leakage in nodeSubplan.c. If the hash functions used for hashing tuples leaked any memory, we failed to clean that up, resulting in query-lifespan memory leakage in queries using hashed subplans. One way that could happen is if the values being hashed require de-toasting, since most of our hash functions don't trouble to clean up de-toasted inputs. Prior to commit bf6c614a2, this leakage was largely masked because TupleHashTableMatch would reset hashtable->tempcxt (via execTuplesMatch). But it doesn't do that anymore, and that's not really the right place for this anyway: doing it there could reset the tempcxt many times per hash lookup, or not at all. Instead put reset calls into ExecHashSubPlan and buildSubPlanHash. 
Along the way to that, rearrange ExecHashSubPlan so that there's just one place to call MemoryContextReset instead of several. This amounts to accepting the de-facto API spec that the caller of the TupleHashTable routines is responsible for resetting the tempcxt adequately often. Although the other callers seem to get this right, it was not documented anywhere, so add a comment about it. Bug: #19040 Reported-by: Haiyang Li Author: Haiyang Li Reviewed-by: Fei Changhong Reviewed-by: Tom Lane Discussion: https://postgr.es/m/19040-c9b6073ef814f48c@postgresql.org Backpatch-through: 13 --- src/backend/executor/execGrouping.c | 6 +++ src/backend/executor/nodeSubplan.c | 70 +++++++++++------------------ 2 files changed, 33 insertions(+), 43 deletions(-) diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index b540074935386..75087204f0c69 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -156,6 +156,12 @@ execTuplesHashPrepare(int numCols, * * Note that the keyColIdx, hashfunctions, and collations arrays must be * allocated in storage that will live as long as the hashtable does. + * + * LookupTupleHashEntry, FindTupleHashEntry, and related functions may leak + * memory in the tempcxt. It is caller's responsibility to reset that context + * reasonably often, typically once per tuple. (We do it that way, rather + * than managing an extra context within the hashtable, because in many cases + * the caller can specify a tempcxt that it needs to reset per-tuple anyway.) */ TupleHashTable BuildTupleHashTable(PlanState *parent, diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index f7f6fc2da0b95..8e55dcc159b0b 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -102,6 +102,7 @@ ExecHashSubPlan(SubPlanState *node, ExprContext *econtext, bool *isNull) { + bool result = false; SubPlan *subplan = node->subplan; PlanState *planstate = node->planstate; TupleTableSlot *slot; @@ -132,14 +133,6 @@ ExecHashSubPlan(SubPlanState *node, node->projLeft->pi_exprContext = econtext; slot = ExecProject(node->projLeft); - /* - * Note: because we are typically called in a per-tuple context, we have - * to explicitly clear the projected tuple before returning. Otherwise, - * we'll have a double-free situation: the per-tuple context will probably - * be reset before we're called again, and then the tuple slot will think - * it still needs to free the tuple. - */ - /* * If the LHS is all non-null, probe for an exact match in the main hash * table. If we find one, the result is TRUE. Otherwise, scan the @@ -161,19 +154,10 @@ ExecHashSubPlan(SubPlanState *node, slot, node->cur_eq_comp, node->lhs_hash_expr) != NULL) - { - ExecClearTuple(slot); - return BoolGetDatum(true); - } - if (node->havenullrows && - findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + result = true; + else if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } - ExecClearTuple(slot); - return BoolGetDatum(false); } /* @@ -186,34 +170,31 @@ ExecHashSubPlan(SubPlanState *node, * aren't provably unequal to the LHS; if so, the result is UNKNOWN. * Otherwise, the result is FALSE. 
*/ - if (node->hashnulls == NULL) - { - ExecClearTuple(slot); - return BoolGetDatum(false); - } - if (slotAllNulls(slot)) - { - ExecClearTuple(slot); + else if (node->hashnulls == NULL) + /* just return FALSE */ ; + else if (slotAllNulls(slot)) *isNull = true; - return BoolGetDatum(false); - } /* Scan partly-null table first, since more likely to get a match */ - if (node->havenullrows && - findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + else if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } - if (node->havehashrows && - findPartialMatch(node->hashtable, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + else if (node->havehashrows && + findPartialMatch(node->hashtable, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } + + /* + * Note: because we are typically called in a per-tuple context, we have + * to explicitly clear the projected tuple before returning. Otherwise, + * we'll have a double-free situation: the per-tuple context will probably + * be reset before we're called again, and then the tuple slot will think + * it still needs to free the tuple. + */ ExecClearTuple(slot); - return BoolGetDatum(false); + + /* Also must reset the hashtempcxt after each hashtable lookup. */ + MemoryContextReset(node->hashtempcxt); + + return BoolGetDatum(result); } /* @@ -642,6 +623,9 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) * during ExecProject. */ ResetExprContext(innerecontext); + + /* Also must reset the hashtempcxt after each hashtable lookup. */ + MemoryContextReset(node->hashtempcxt); } /* From bdc6cfcd12f5c95799328e05aa4bfa75cfe3e79f Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 10 Sep 2025 16:15:08 -0400 Subject: [PATCH 40/73] Eliminate duplicative hashtempcxt in nodeSubplan.c. Instead of building a separate memory context that's used just for running hash functions, make the hash functions run in the per-tuple context of the node's innerecontext. This saves a little space at runtime, and it avoids needing to reset two contexts instead of one inside buildSubPlanHash's main loop. This largely reverts commit 133924e13. That's safe to do now because bf6c614a2 decoupled the evaluation context used by TupleHashTableMatch from that used for hash function evaluation, so that there's no longer a risk of resetting the innerecontext too soon. Per discussion of bug #19040, although this is not directly a fix for that. Author: Tom Lane Reviewed-by: Haiyang Li Reviewed-by: Fei Changhong Discussion: https://postgr.es/m/19040-c9b6073ef814f48c@postgresql.org --- src/backend/executor/nodeSubplan.c | 19 +++++-------------- src/include/nodes/execnodes.h | 1 - 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index 8e55dcc159b0b..53fb56f7388e8 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -191,8 +191,8 @@ ExecHashSubPlan(SubPlanState *node, */ ExecClearTuple(slot); - /* Also must reset the hashtempcxt after each hashtable lookup. */ - MemoryContextReset(node->hashtempcxt); + /* Also must reset the innerecontext after each hashtable lookup. 
*/ + ResetExprContext(node->innerecontext); return BoolGetDatum(result); } @@ -529,7 +529,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) 0, node->planstate->state->es_query_cxt, node->hashtablecxt, - node->hashtempcxt, + innerecontext->ecxt_per_tuple_memory, false); if (!subplan->unknownEqFalse) @@ -558,7 +558,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) 0, node->planstate->state->es_query_cxt, node->hashtablecxt, - node->hashtempcxt, + innerecontext->ecxt_per_tuple_memory, false); } else @@ -620,12 +620,9 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) /* * Reset innerecontext after each inner tuple to free any memory used - * during ExecProject. + * during ExecProject and hashtable lookup. */ ResetExprContext(innerecontext); - - /* Also must reset the hashtempcxt after each hashtable lookup. */ - MemoryContextReset(node->hashtempcxt); } /* @@ -842,7 +839,6 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) sstate->hashtable = NULL; sstate->hashnulls = NULL; sstate->hashtablecxt = NULL; - sstate->hashtempcxt = NULL; sstate->innerecontext = NULL; sstate->keyColIdx = NULL; sstate->tab_eq_funcoids = NULL; @@ -898,11 +894,6 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) AllocSetContextCreate(CurrentMemoryContext, "Subplan HashTable Context", ALLOCSET_DEFAULT_SIZES); - /* and a small one for the hash tables to use as temp storage */ - sstate->hashtempcxt = - AllocSetContextCreate(CurrentMemoryContext, - "Subplan HashTable Temp Context", - ALLOCSET_SMALL_SIZES); /* and a short-lived exprcontext for function evaluation */ sstate->innerecontext = CreateExprContext(estate); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index de782014b2d41..71857feae4823 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1020,7 +1020,6 @@ typedef struct SubPlanState bool havehashrows; /* true if hashtable is not empty */ bool havenullrows; /* true if hashnulls is not empty */ MemoryContext hashtablecxt; /* memory context containing hash tables */ - MemoryContext hashtempcxt; /* temp memory context for hash tables */ ExprContext *innerecontext; /* econtext for computing inner tuples */ int numCols; /* number of columns being hashed */ /* each of the remaining fields is an array of length numCols: */ From 09036dc71c682b0bf7234ed39c1429ed99fbe442 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 10 Sep 2025 17:51:24 -0400 Subject: [PATCH 41/73] Avoid faulty alignment of Datums in build_sorted_items(). If sizeof(Pointer) is 4 then sizeof(SortItem) will be 12, so that if data->numrows is odd then we placed the values array at a location that's not a multiple of 8. That was fine when sizeof(Datum) was also 4, but in the wake of commit 2a600a93c it makes some alignment-picky machines unhappy. (You need a 32-bit machine that nonetheless expects 8-byte alignment of 8-byte quantities, which is an odd-seeming combination but it does exist outside the Intel universe.) To fix, MAXALIGN the space allocated to the SortItem array. In passing, let's make the "len" variable be Size not int, just for paranoia's sake. This code was arguably not too safe even before 2a600a93c, but at present I don't see a strong argument for back-patching. 
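For illustration, the alignment arithmetic can be checked with a standalone sketch (assumed values: 4-byte pointers so that sizeof(SortItem) == 12, and 8-byte maximum alignment; the macros are simplified copies for this sketch, not the PostgreSQL definitions):

#include <stdint.h>
#include <stdio.h>

#define MAXIMUM_ALIGNOF 8		/* assumption: Datum needs 8-byte alignment */
#define TYPEALIGN(a, len) (((uintptr_t) (len) + ((a) - 1)) & ~((uintptr_t) ((a) - 1)))
#define MAXALIGN(len) TYPEALIGN(MAXIMUM_ALIGNOF, len)

int
main(void)
{
	size_t		sort_item_size = 12;	/* sizeof(SortItem) with 4-byte pointers */
	int			numrows = 7;	/* any odd row count misaligns the tail */
	size_t		unaligned = numrows * sort_item_size;	/* 84: not a multiple of 8 */
	size_t		aligned = MAXALIGN(numrows * sort_item_size);	/* 88: safe for Datum */

	printf("values[] offset: %zu unaligned, %zu after MAXALIGN\n",
		   unaligned, aligned);
	return 0;
}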
Reported-by: Tomas Vondra
Author: Tom Lane
Discussion: https://postgr.es/m/87036018-8d70-40ad-a0ac-192b07bd7b04@vondra.me
---
 src/backend/statistics/extended_stats.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index af0b99243c614..3c3d2d315c6f4 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -986,10 +986,9 @@ build_sorted_items(StatsBuildData *data, int *nitems,
 {
 	int			i,
 				j,
-				len,
 				nrows;
 	int			nvalues = data->numrows * numattrs;
-
+	Size		len;
 	SortItem   *items;
 	Datum	   *values;
 	bool	   *isnull;
@@ -997,14 +996,16 @@ build_sorted_items(StatsBuildData *data, int *nitems,
 	int		   *typlen;

 	/* Compute the total amount of memory we need (both items and values). */
-	len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
+	len = MAXALIGN(data->numrows * sizeof(SortItem)) +
+		nvalues * (sizeof(Datum) + sizeof(bool));

 	/* Allocate the memory and split it into the pieces. */
 	ptr = palloc0(len);

 	/* items to sort */
 	items = (SortItem *) ptr;
-	ptr += data->numrows * sizeof(SortItem);
+	/* MAXALIGN ensures that the following Datums are suitably aligned */
+	ptr += MAXALIGN(data->numrows * sizeof(SortItem));

 	/* values and null flags */
 	values = (Datum *) ptr;

From c88ce73eda2e0a818d730c5b72475ef99cc9c4cf Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Thu, 11 Sep 2025 10:15:33 +0900
Subject: [PATCH 42/73] Fix incorrect file reference in guc.h

GucSource_Names was documented as being in guc.c, but since 0a20ff54f5e6
it is located in guc_tables.c. The reference to the location of
GucSource_Names is important, as GucSource needs to be kept in sync with
GucSource_Names.

Author: David G. Johnston
Discussion: https://postgr.es/m/CAKFQuwYPgAHWPYjPzK7iXzhSZ6MKR8w20_Nz7ZXpOvx=kZbs7A@mail.gmail.com
Backpatch-through: 16
---
 src/include/utils/guc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 756e80a2c2fcc..f21ec37da8933 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -106,7 +106,7 @@ typedef enum
 * will show as "default" in pg_settings. If there is a specific reason not
 * to want that, use source == PGC_S_OVERRIDE.
 *
- * NB: see GucSource_Names in guc.c if you change this.
+ * NB: see GucSource_Names in guc_tables.c if you change this.
 */
 typedef enum
 {

From 26eadf4d2b14a8a8d110b214dbb7ef952fbf8e93 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Thu, 11 Sep 2025 17:17:04 +0900
Subject: [PATCH 43/73] Fix description of WAL record blocks in hash_xlog.h

hash_xlog.h included descriptions for the blocks used in WAL records
that were not completely consistent with how the records are generated,
with one block missing for SQUEEZE_PAGE, and inconsistent descriptions
used for block 0 in VACUUM_ONE_PAGE and MOVE_PAGE_CONTENTS. This
information has been incorrect since c11453ce0aea, as verified by
cross-checking the logic for the record generation.
Author: Kirill Reshke Reviewed-by: Andrey Borodin Discussion: https://postgr.es/m/CALdSSPj1j=a1d1hVA3oabRFz0hSU3KKrYtZPijw4UPUM7LY9zw@mail.gmail.com Backpatch-through: 13 --- src/include/access/hash_xlog.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h index 6fe97de4d66f1..5d4671dc4c128 100644 --- a/src/include/access/hash_xlog.h +++ b/src/include/access/hash_xlog.h @@ -129,7 +129,7 @@ typedef struct xl_hash_split_complete * * This data record is used for XLOG_HASH_MOVE_PAGE_CONTENTS * - * Backup Blk 0: bucket page + * Backup Blk 0: primary bucket page * Backup Blk 1: page containing moved tuples * Backup Blk 2: page from which tuples will be removed */ @@ -149,12 +149,13 @@ typedef struct xl_hash_move_page_contents * * This data record is used for XLOG_HASH_SQUEEZE_PAGE * - * Backup Blk 0: page containing tuples moved from freed overflow page - * Backup Blk 1: freed overflow page - * Backup Blk 2: page previous to the freed overflow page - * Backup Blk 3: page next to the freed overflow page - * Backup Blk 4: bitmap page containing info of freed overflow page - * Backup Blk 5: meta page + * Backup Blk 0: primary bucket page + * Backup Blk 1: page containing tuples moved from freed overflow page + * Backup Blk 2: freed overflow page + * Backup Blk 3: page previous to the freed overflow page + * Backup Blk 4: page next to the freed overflow page + * Backup Blk 5: bitmap page containing info of freed overflow page + * Backup Blk 6: meta page */ typedef struct xl_hash_squeeze_page { @@ -245,7 +246,7 @@ typedef struct xl_hash_init_bitmap_page * * This data record is used for XLOG_HASH_VACUUM_ONE_PAGE * - * Backup Blk 0: bucket page + * Backup Blk 0: primary bucket page * Backup Blk 1: meta page */ typedef struct xl_hash_vacuum_one_page From 9c24111c4dad850bc2625c2113bb3d2dfd592efc Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Thu, 11 Sep 2025 09:25:47 +0100 Subject: [PATCH 44/73] doc: Improve description of new random(min, max) functions. Mention that the new variants of random(min, max) are affected by setseed(), like the original functions. Reported-by: Marcos Pegoraro Discussion: https://postgr.es/m/CAB-JLwb1=drA3Le6uZXDBi_tCpeS1qm6XQU7dKwac_x91Z4qDg@mail.gmail.com --- doc/src/sgml/func/func-datetime.sgml | 6 ++++++ doc/src/sgml/func/func-math.sgml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/func/func-datetime.sgml b/doc/src/sgml/func/func-datetime.sgml index 98dd60aa9a7ec..a25da4b5175ca 100644 --- a/doc/src/sgml/func/func-datetime.sgml +++ b/doc/src/sgml/func/func-datetime.sgml @@ -948,6 +948,12 @@ Returns a random value in the range min <= x <= max. + + Note that these functions use the same pseudo-random number generator + as the functions listed in , + and respond in the same way to calling + setseed(). + random('1979-02-08'::date,'2025-07-03'::date) 1983-04-21 diff --git a/doc/src/sgml/func/func-math.sgml b/doc/src/sgml/func/func-math.sgml index fd821c0e70677..9dcf97e7c9e06 100644 --- a/doc/src/sgml/func/func-math.sgml +++ b/doc/src/sgml/func/func-math.sgml @@ -1130,7 +1130,7 @@ - + setseed setseed ( double precision ) From 2bbbb2eca9303df590cc79be74b13cad259124a5 Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Thu, 11 Sep 2025 09:48:12 +0100 Subject: [PATCH 45/73] doc: Fix indentation in func-datetime.sgml. Incorrect indentation introduced by commit faf071b5538. 
--- doc/src/sgml/func/func-datetime.sgml | 56 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/doc/src/sgml/func/func-datetime.sgml b/doc/src/sgml/func/func-datetime.sgml index a25da4b5175ca..8cd7150b0d313 100644 --- a/doc/src/sgml/func/func-datetime.sgml +++ b/doc/src/sgml/func/func-datetime.sgml @@ -935,34 +935,34 @@ random ( min date, max date ) date - - - random ( min timestamp, max timestamp ) - timestamp - - - random ( min timestamptz, max timestamptz ) - timestamptz - - - Returns a random value in the range - min <= x <= max. - - - Note that these functions use the same pseudo-random number generator - as the functions listed in , - and respond in the same way to calling - setseed(). - - - random('1979-02-08'::date,'2025-07-03'::date) - 1983-04-21 - - - random('2000-01-01'::timestamptz, now()) - 2015-09-27 09:11:33.732707+00 - - + + + random ( min timestamp, max timestamp ) + timestamp + + + random ( min timestamptz, max timestamptz ) + timestamptz + + + Returns a random value in the range + min <= x <= max. + + + Note that these functions use the same pseudo-random number generator + as the functions listed in , + and respond in the same way to calling + setseed(). + + + random('1979-02-08'::date,'2025-07-03'::date) + 1983-04-21 + + + random('2000-01-01'::timestamptz, now()) + 2015-09-27 09:11:33.732707+00 + + From 01d793698f5921547a7b5e1e003722c17f552574 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Thu, 11 Sep 2025 09:33:48 +0000 Subject: [PATCH 46/73] Fix intermittent test failure introduced in 6456c6e2c4. The test assumes that a backend will execute COMMIT PREPARED on the publisher and hit the injection point commit-after-delay-checkpoint within the commit critical section. This should cause the apply worker on the subscriber to wait for the transaction to complete. However, the test does not guarantee that the injection point is actually triggered, creating a race condition where the apply worker may proceed prematurely during COMMIT PREPARED. This commit resolves the issue by explicitly waiting for the injection point to be hit before continuing with the test, ensuring consistent and reliable behavior. Author: Zhijie Hou Reviewed-by: shveta malik Discussion: https://postgr.es/m/TY4PR01MB1690751D1CA8C128B0770EC6F9409A@TY4PR01MB16907.jpnprd01.prod.outlook.com --- src/test/subscription/t/035_conflicts.pl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/test/subscription/t/035_conflicts.pl b/src/test/subscription/t/035_conflicts.pl index db0d5b464e825..880551fc69d74 100644 --- a/src/test/subscription/t/035_conflicts.pl +++ b/src/test/subscription/t/035_conflicts.pl @@ -475,6 +475,9 @@ } ); + # Wait until the backend enters the injection point + $node_B->wait_for_event('client backend', 'commit-after-delay-checkpoint'); + # Confirm the update is suspended $result = $node_B->safe_psql('postgres', 'SELECT * FROM tab WHERE a = 1'); From 368c38dd47c209fa95d2df855d62bcde386f3037 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 11 Sep 2025 11:55:29 +0200 Subject: [PATCH 47/73] Remove stray semicolon at global scope The Sun Studio compiler complains about an empty declaration here. Note for future historians: This does not mean that this compiler is still of current interest for anyone using PostgreSQL. But we can let this small fix be its parting gift. 
Reviewed-by: Andres Freund Reviewed-by: Tom Lane Discussion: https://www.postgresql.org/message-id/flat/a0f817ee-fb86-483a-8a14-b6f7f5991b6e%40eisentraut.org --- src/backend/replication/logical/slotsync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 9d0072a49ed6d..8c061d55bdb51 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -1337,7 +1337,7 @@ reset_syncing_flag() SpinLockRelease(&SlotSyncCtx->mutex); syncing_slots = false; -}; +} /* * The main loop of our worker process. From 4fbe0151455fefbef7abc9d507adb04c978beb0d Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 11 Sep 2025 11:55:29 +0200 Subject: [PATCH 48/73] Remove checks for no longer supported GCC versions Since commit f5e0186f865 (Raise C requirement to C11), we effectively require at least GCC version 4.7, so checks for older versions can be removed. Reviewed-by: Andres Freund Reviewed-by: Tom Lane Discussion: https://www.postgresql.org/message-id/flat/a0f817ee-fb86-483a-8a14-b6f7f5991b6e%40eisentraut.org --- src/include/c.h | 8 ++++---- src/include/port/atomics/generic-gcc.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/include/c.h b/src/include/c.h index 39022f8a9dd75..b580cfa7d3178 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -259,8 +259,8 @@ * choose not to. But, if possible, don't force inlining in unoptimized * debug builds. */ -#if (defined(__GNUC__) && __GNUC__ > 3 && defined(__OPTIMIZE__)) || defined(__SUNPRO_C) -/* GCC > 3 and Sunpro support always_inline via __attribute__ */ +#if (defined(__GNUC__) && defined(__OPTIMIZE__)) || defined(__SUNPRO_C) +/* GCC and Sunpro support always_inline via __attribute__ */ #define pg_attribute_always_inline __attribute__((always_inline)) inline #elif defined(_MSC_VER) /* MSVC has a special keyword for this */ @@ -277,7 +277,7 @@ * above, this should be placed before the function's return type and name. */ /* GCC and Sunpro support noinline via __attribute__ */ -#if (defined(__GNUC__) && __GNUC__ > 2) || defined(__SUNPRO_C) +#if defined(__GNUC__) || defined(__SUNPRO_C) #define pg_noinline __attribute__((noinline)) /* msvc via declspec */ #elif defined(_MSC_VER) @@ -369,7 +369,7 @@ * These should only be used sparingly, in very hot code paths. It's very easy * to mis-estimate likelihoods. */ -#if __GNUC__ >= 3 +#ifdef __GNUC__ #define likely(x) __builtin_expect((x) != 0, 1) #define unlikely(x) __builtin_expect((x) != 0, 0) #else diff --git a/src/include/port/atomics/generic-gcc.h b/src/include/port/atomics/generic-gcc.h index d8f04c89ccac2..e7dfad4f0d5eb 100644 --- a/src/include/port/atomics/generic-gcc.h +++ b/src/include/port/atomics/generic-gcc.h @@ -30,14 +30,14 @@ #define pg_compiler_barrier_impl() __asm__ __volatile__("" ::: "memory") /* - * If we're on GCC 4.1.0 or higher, we should be able to get a memory barrier + * If we're on GCC, we should be able to get a memory barrier * out of this compiler built-in. But we prefer to rely on platform specific * definitions where possible, and use this only as a fallback. 
*/ #if !defined(pg_memory_barrier_impl) # if defined(HAVE_GCC__ATOMIC_INT32_CAS) # define pg_memory_barrier_impl() __atomic_thread_fence(__ATOMIC_SEQ_CST) -# elif (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) +# elif defined(__GNUC__) # define pg_memory_barrier_impl() __sync_synchronize() # endif #endif /* !defined(pg_memory_barrier_impl) */ From a2b4102a21ad730ce46b059acf49d72151e979f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Thu, 11 Sep 2025 18:11:46 +0200 Subject: [PATCH 49/73] Fill testing gap for possible referential integrity violation This commit adds a missing isolation test for (non-PERIOD) foreign keys. With REPEATABLE READ, one transaction can insert a referencing row while another deletes the referenced row, and both see a valid state. But after they have committed, the table violates referential integrity. If the INSERT precedes the DELETE, we use a crosscheck snapshot to see the just-added row, so that the DELETE can raise a foreign key error. You can see the table violate referential integrity if you change ri_restrict to pass false for detectNewRows to ri_PerformCheck. A crosscheck snapshot is not needed when the DELETE comes first, because the INSERT's trigger takes a FOR KEY SHARE lock that sees the row now marked for deletion, waits for that transaction to commit, and raises a serialization error. I (Paul) added a test for that too though. We already have a similar test (in ri-triggers.spec) for SERIALIZABLE snapshot isolation showing that you can implement foreign keys with just pl/pgSQL, but that test does nothing to validate ri_triggers.c. We also have tests (in fk-snapshot.spec) for other concurrency scenarios, but not this one: we test concurrently deleting both the referencing and referenced row, when the constraint activates a cascade/set null action. But those tests don't exercise ri_restrict, and the consequence of omitting a crosscheck comparison is different: a serialization failure, not a referential integrity violation. Author: Paul Jungwirth Reviewed-by: Rustam ALLAKOV Reviewed-by: Dean Rasheed Reviewed-by: Robert Haas Discussion: https://postgr.es/m/CA+renyUp=xja80rBaB6NpY3RRdi750y046x28bo_xg29zKY72Q@mail.gmail.com --- src/test/isolation/expected/fk-snapshot-2.out | 61 +++++++++++++++++++ src/test/isolation/isolation_schedule | 1 + src/test/isolation/specs/fk-snapshot-2.spec | 50 +++++++++++++++ 3 files changed, 112 insertions(+) create mode 100644 src/test/isolation/expected/fk-snapshot-2.out create mode 100644 src/test/isolation/specs/fk-snapshot-2.spec diff --git a/src/test/isolation/expected/fk-snapshot-2.out b/src/test/isolation/expected/fk-snapshot-2.out new file mode 100644 index 0000000000000..0a4c9646fca4e --- /dev/null +++ b/src/test/isolation/expected/fk-snapshot-2.out @@ -0,0 +1,61 @@ +Parsed test spec with 2 sessions + +starting permutation: s1rr s2rr s2ins s1del s2c s1c +step s1rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2ins: INSERT INTO child VALUES (1, 1); +step s1del: DELETE FROM parent WHERE parent_id = 1; +step s2c: COMMIT; +step s1del: <... 
completed> +ERROR: update or delete on table "parent" violates foreign key constraint "child_parent_id_fkey" on table "child" +step s1c: COMMIT; + +starting permutation: s1rr s2rr s1del s2ins s1c s2c +step s1rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s1del: DELETE FROM parent WHERE parent_id = 1; +step s2ins: INSERT INTO child VALUES (1, 1); +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s1rc s2rc s2ins s1del s2c s1c +step s1rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2ins: INSERT INTO child VALUES (1, 1); +step s1del: DELETE FROM parent WHERE parent_id = 1; +step s2c: COMMIT; +step s1del: <... completed> +ERROR: update or delete on table "parent" violates foreign key constraint "child_parent_id_fkey" on table "child" +step s1c: COMMIT; + +starting permutation: s1rc s2rc s1del s2ins s1c s2c +step s1rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s1del: DELETE FROM parent WHERE parent_id = 1; +step s2ins: INSERT INTO child VALUES (1, 1); +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: insert or update on table "child" violates foreign key constraint "child_parent_id_fkey" +step s2c: COMMIT; + +starting permutation: s1ser s2ser s2ins s1del s2c s1c +step s1ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ins: INSERT INTO child VALUES (1, 1); +step s1del: DELETE FROM parent WHERE parent_id = 1; +step s2c: COMMIT; +step s1del: <... completed> +ERROR: update or delete on table "parent" violates foreign key constraint "child_parent_id_fkey" on table "child" +step s1c: COMMIT; + +starting permutation: s1ser s2ser s1del s2ins s1c s2c +step s1ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s1del: DELETE FROM parent WHERE parent_id = 1; +step s2ins: INSERT INTO child VALUES (1, 1); +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 9f1e997d81b00..130525dfd3d68 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -36,6 +36,7 @@ test: fk-deadlock2 test: fk-partitioned-1 test: fk-partitioned-2 test: fk-snapshot +test: fk-snapshot-2 test: subxid-overflow test: eval-plan-qual test: eval-plan-qual-trigger diff --git a/src/test/isolation/specs/fk-snapshot-2.spec b/src/test/isolation/specs/fk-snapshot-2.spec new file mode 100644 index 0000000000000..94cd151aab9d3 --- /dev/null +++ b/src/test/isolation/specs/fk-snapshot-2.spec @@ -0,0 +1,50 @@ +# RI Trigger test +# +# Test C-based referential integrity enforcement. +# Under REPEATABLE READ we need some snapshot trickery in C, +# or we would permit things that violate referential integrity. 
+ +setup +{ + CREATE TABLE parent (parent_id SERIAL NOT NULL PRIMARY KEY); + CREATE TABLE child ( + child_id SERIAL NOT NULL PRIMARY KEY, + parent_id INTEGER REFERENCES parent); + INSERT INTO parent VALUES(1); +} + +teardown { DROP TABLE parent, child; } + +session s1 +step s1rc { BEGIN ISOLATION LEVEL READ COMMITTED; } +step s1rr { BEGIN ISOLATION LEVEL REPEATABLE READ; } +step s1ser { BEGIN ISOLATION LEVEL SERIALIZABLE; } +step s1del { DELETE FROM parent WHERE parent_id = 1; } +step s1c { COMMIT; } + +session s2 +step s2rc { BEGIN ISOLATION LEVEL READ COMMITTED; } +step s2rr { BEGIN ISOLATION LEVEL REPEATABLE READ; } +step s2ser { BEGIN ISOLATION LEVEL SERIALIZABLE; } +step s2ins { INSERT INTO child VALUES (1, 1); } +step s2c { COMMIT; } + +# Violates referential integrity unless we use a crosscheck snapshot, +# which is up-to-date compared with the transaction's snapshot. +permutation s1rr s2rr s2ins s1del s2c s1c + +# Raises a can't-serialize exception +# when the INSERT trigger does SELECT FOR KEY SHARE: +permutation s1rr s2rr s1del s2ins s1c s2c + +# Test the same scenarios in READ COMMITTED: +# A crosscheck snapshot is not required here. +permutation s1rc s2rc s2ins s1del s2c s1c +permutation s1rc s2rc s1del s2ins s1c s2c + +# Test the same scenarios in SERIALIZABLE: +# We should report the FK violation: +permutation s1ser s2ser s2ins s1del s2c s1c +# We raise a concurrent update error +# which is good enough: +permutation s1ser s2ser s1del s2ins s1c s2c From e8cec3d1791c140398454aa561cf51659dd8243d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Thu, 11 Sep 2025 18:13:09 +0200 Subject: [PATCH 50/73] Add test for temporal referential integrity This commit adds an isolation test showing that temporal foreign keys do not permit referential integrity violations under concurrency, like fk-snapshot-2. You can show that the test fails by passing false for detectNewRows to ri_PerformCheck in ri_restrict. Author: Paul Jungwirth Reviewed-by: Rustam ALLAKOV Reviewed-by: Dean Rasheed Reviewed-by: Robert Haas Discussion: https://postgr.es/m/CA+renyUp=xja80rBaB6NpY3RRdi750y046x28bo_xg29zKY72Q@mail.gmail.com --- src/test/isolation/expected/fk-snapshot-3.out | 213 ++++++++++++++++++ src/test/isolation/isolation_schedule | 1 + src/test/isolation/specs/fk-snapshot-3.spec | 82 +++++++ 3 files changed, 296 insertions(+) create mode 100644 src/test/isolation/expected/fk-snapshot-3.out create mode 100644 src/test/isolation/specs/fk-snapshot-3.spec diff --git a/src/test/isolation/expected/fk-snapshot-3.out b/src/test/isolation/expected/fk-snapshot-3.out new file mode 100644 index 0000000000000..f98cb72fdac30 --- /dev/null +++ b/src/test/isolation/expected/fk-snapshot-3.out @@ -0,0 +1,213 @@ +Parsed test spec with 2 sessions + +starting permutation: s1rr s2rr s2ins s1del s2c s1c +step s1rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1del: DELETE FROM parent WHERE id = '[1,2)'; +step s2c: COMMIT; +step s1del: <... 
completed> +ERROR: update or delete on table "parent" violates foreign key constraint "child_parent_id_valid_at_fkey" on table "child" +step s1c: COMMIT; + +starting permutation: s1rr s2rr s1del s2ins s1c s2c +step s1rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s1del: DELETE FROM parent WHERE id = '[1,2)'; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s1rc s2rc s2ins s1del s2c s1c +step s1rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1del: DELETE FROM parent WHERE id = '[1,2)'; +step s2c: COMMIT; +step s1del: <... completed> +ERROR: update or delete on table "parent" violates foreign key constraint "child_parent_id_valid_at_fkey" on table "child" +step s1c: COMMIT; + +starting permutation: s1rc s2rc s1del s2ins s1c s2c +step s1rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s1del: DELETE FROM parent WHERE id = '[1,2)'; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: insert or update on table "child" violates foreign key constraint "child_parent_id_valid_at_fkey" +step s2c: COMMIT; + +starting permutation: s1ser s2ser s2ins s1del s2c s1c +step s1ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1del: DELETE FROM parent WHERE id = '[1,2)'; +step s2c: COMMIT; +step s1del: <... completed> +ERROR: update or delete on table "parent" violates foreign key constraint "child_parent_id_valid_at_fkey" on table "child" +step s1c: COMMIT; + +starting permutation: s1ser s2ser s1del s2ins s1c s2c +step s1ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s1del: DELETE FROM parent WHERE id = '[1,2)'; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s1rc s2rc s2ins s1upok s2c s1c +step s1rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1upok: UPDATE parent SET valid_at = '[2020-01-01,2026-01-01)' WHERE id = '[1,2)'; +step s2c: COMMIT; +step s1upok: <... completed> +step s1c: COMMIT; + +starting permutation: s1rc s2rc s1upok s2ins s1c s2c +step s1rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s1upok: UPDATE parent SET valid_at = '[2020-01-01,2026-01-01)' WHERE id = '[1,2)'; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1c: COMMIT; +step s2ins: <... 
completed> +step s2c: COMMIT; + +starting permutation: s1rr s2rr s2ins s1upok s2c s1c +step s1rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1upok: UPDATE parent SET valid_at = '[2020-01-01,2026-01-01)' WHERE id = '[1,2)'; +step s2c: COMMIT; +step s1upok: <... completed> +step s1c: COMMIT; + +starting permutation: s1rr s2rr s1upok s2ins s1c s2c +step s1rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s1upok: UPDATE parent SET valid_at = '[2020-01-01,2026-01-01)' WHERE id = '[1,2)'; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s1ser s2ser s2ins s1upok s2c s1c +step s1ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1upok: UPDATE parent SET valid_at = '[2020-01-01,2026-01-01)' WHERE id = '[1,2)'; +step s2c: COMMIT; +step s1upok: <... completed> +step s1c: COMMIT; + +starting permutation: s1ser s2ser s1upok s2ins s1c s2c +step s1ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s1upok: UPDATE parent SET valid_at = '[2020-01-01,2026-01-01)' WHERE id = '[1,2)'; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s1rc s2rc s2ins s1upbad s2c s1c +step s1rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1upbad: UPDATE parent SET valid_at = '[2020-01-01,2024-01-01)' WHERE id = '[1,2)'; +step s2c: COMMIT; +step s1upbad: <... completed> +ERROR: update or delete on table "parent" violates foreign key constraint "child_parent_id_valid_at_fkey" on table "child" +step s1c: COMMIT; + +starting permutation: s1rc s2rc s1upbad s2ins s1c s2c +step s1rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s2rc: BEGIN ISOLATION LEVEL READ COMMITTED; +step s1upbad: UPDATE parent SET valid_at = '[2020-01-01,2024-01-01)' WHERE id = '[1,2)'; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: insert or update on table "child" violates foreign key constraint "child_parent_id_valid_at_fkey" +step s2c: COMMIT; + +starting permutation: s1rr s2rr s2ins s1upbad s2c s1c +step s1rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1upbad: UPDATE parent SET valid_at = '[2020-01-01,2024-01-01)' WHERE id = '[1,2)'; +step s2c: COMMIT; +step s1upbad: <... 
completed> +ERROR: update or delete on table "parent" violates foreign key constraint "child_parent_id_valid_at_fkey" on table "child" +step s1c: COMMIT; + +starting permutation: s1rr s2rr s1upbad s2ins s1c s2c +step s1rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s2rr: BEGIN ISOLATION LEVEL REPEATABLE READ; +step s1upbad: UPDATE parent SET valid_at = '[2020-01-01,2024-01-01)' WHERE id = '[1,2)'; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; + +starting permutation: s1ser s2ser s2ins s1upbad s2c s1c +step s1ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1upbad: UPDATE parent SET valid_at = '[2020-01-01,2024-01-01)' WHERE id = '[1,2)'; +step s2c: COMMIT; +step s1upbad: <... completed> +ERROR: update or delete on table "parent" violates foreign key constraint "child_parent_id_valid_at_fkey" on table "child" +step s1c: COMMIT; + +starting permutation: s1ser s2ser s1upbad s2ins s1c s2c +step s1ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s2ser: BEGIN ISOLATION LEVEL SERIALIZABLE; +step s1upbad: UPDATE parent SET valid_at = '[2020-01-01,2024-01-01)' WHERE id = '[1,2)'; +step s2ins: + INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)'); + +step s1c: COMMIT; +step s2ins: <... completed> +ERROR: could not serialize access due to concurrent update +step s2c: COMMIT; diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 130525dfd3d68..5afae33d37036 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -37,6 +37,7 @@ test: fk-partitioned-1 test: fk-partitioned-2 test: fk-snapshot test: fk-snapshot-2 +test: fk-snapshot-3 test: subxid-overflow test: eval-plan-qual test: eval-plan-qual-trigger diff --git a/src/test/isolation/specs/fk-snapshot-3.spec b/src/test/isolation/specs/fk-snapshot-3.spec new file mode 100644 index 0000000000000..90075024f5cc0 --- /dev/null +++ b/src/test/isolation/specs/fk-snapshot-3.spec @@ -0,0 +1,82 @@ +# RI Trigger test +# +# Test C-based temporal referential integrity enforcement. +# Under REPEATABLE READ we need some snapshot trickery in C, +# or we would permit things that violate referential integrity. 
+
+setup
+{
+	CREATE TABLE parent (
+		id int4range NOT NULL,
+		valid_at daterange NOT NULL,
+		PRIMARY KEY (id, valid_at WITHOUT OVERLAPS));
+	CREATE TABLE child (
+		id int4range NOT NULL,
+		valid_at daterange NOT NULL,
+		parent_id int4range,
+		FOREIGN KEY (parent_id, PERIOD valid_at) REFERENCES parent);
+	INSERT INTO parent VALUES ('[1,2)', '[2020-01-01,2030-01-01)');
+}
+
+teardown { DROP TABLE parent, child; }
+
+session s1
+step s1rc	{ BEGIN ISOLATION LEVEL READ COMMITTED; }
+step s1rr	{ BEGIN ISOLATION LEVEL REPEATABLE READ; }
+step s1ser	{ BEGIN ISOLATION LEVEL SERIALIZABLE; }
+step s1del	{ DELETE FROM parent WHERE id = '[1,2)'; }
+step s1upok	{ UPDATE parent SET valid_at = '[2020-01-01,2026-01-01)' WHERE id = '[1,2)'; }
+step s1upbad	{ UPDATE parent SET valid_at = '[2020-01-01,2024-01-01)' WHERE id = '[1,2)'; }
+step s1c	{ COMMIT; }
+
+session s2
+step s2rc	{ BEGIN ISOLATION LEVEL READ COMMITTED; }
+step s2rr	{ BEGIN ISOLATION LEVEL REPEATABLE READ; }
+step s2ser	{ BEGIN ISOLATION LEVEL SERIALIZABLE; }
+step s2ins	{
+	INSERT INTO child VALUES ('[1,2)', '[2020-01-01,2025-01-01)', '[1,2)');
+}
+step s2c	{ COMMIT; }
+
+# Violates referential integrity unless we use an up-to-date crosscheck snapshot:
+permutation s1rr s2rr s2ins s1del s2c s1c
+
+# Raises a can't-serialize exception
+# when the INSERT trigger does SELECT FOR KEY SHARE:
+permutation s1rr s2rr s1del s2ins s1c s2c
+
+# Test the same scenarios in READ COMMITTED:
+# A crosscheck snapshot is not required here.
+permutation s1rc s2rc s2ins s1del s2c s1c
+permutation s1rc s2rc s1del s2ins s1c s2c
+
+# Test the same scenarios in SERIALIZABLE:
+# We should report the FK violation:
+permutation s1ser s2ser s2ins s1del s2c s1c
+# We raise a concurrent update error
+# which is good enough:
+permutation s1ser s2ser s1del s2ins s1c s2c
+
+# Also check updating the valid time (without violating RI):
+
+# ...with READ COMMITTED:
+permutation s1rc s2rc s2ins s1upok s2c s1c
+permutation s1rc s2rc s1upok s2ins s1c s2c
+# ...with REPEATABLE READ:
+permutation s1rr s2rr s2ins s1upok s2c s1c
+permutation s1rr s2rr s1upok s2ins s1c s2c
+# ...with SERIALIZABLE:
+permutation s1ser s2ser s2ins s1upok s2c s1c
+permutation s1ser s2ser s1upok s2ins s1c s2c
+
+# Also check updating the valid time (while violating RI):
+
+# ...with READ COMMITTED:
+permutation s1rc s2rc s2ins s1upbad s2c s1c
+permutation s1rc s2rc s1upbad s2ins s1c s2c
+# ...with REPEATABLE READ:
+permutation s1rr s2rr s2ins s1upbad s2c s1c
+permutation s1rr s2rr s1upbad s2ins s1c s2c
+# ...with SERIALIZABLE:
+permutation s1ser s2ser s2ins s1upbad s2c s1c
+permutation s1ser s2ser s1upbad s2ins s1c s2c
From 1d5800019f68d81139021b8bab159b8578fcaa2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Herrera?=
Date: Thu, 11 Sep 2025 19:49:57 +0200
Subject: [PATCH 51/73] Improve comment about snapshot macros

The comment mistakenly had "the others" for "the other", but this
commit also reorders the comment so it matches the macros below. Now we
describe the levels in increasing strictness. In addition, it seems
easier to follow if we introduce one level at a time, rather than
describing two, followed by "the other" (and then jumping back to one
of the first two). Finally, reword the sentence about the purpose of
the macros, which was slightly off-point.
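
For illustration, a minimal sketch of the usage pattern the macros are
meant for (the error path shown is how a REPEATABLE READ transaction
reports a concurrently updated row, as in the tests above; the
surrounding code is hypothetical, not part of this patch):

    if (IsolationUsesXactSnapshot())
    {
        /*
         * REPEATABLE READ or SERIALIZABLE: the whole transaction runs on
         * one snapshot, so a concurrently updated row cannot simply be
         * re-fetched; report a serialization failure instead.
         */
        ereport(ERROR,
                (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                 errmsg("could not serialize access due to concurrent update")));
    }

    if (IsolationIsSerializable())
    {
        /* SERIALIZABLE additionally takes predicate locks (SSI). */
    }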
Author: Paul Jungwirth Reviewed-by: Rustam ALLAKOV Reviewed-by: Dean Rasheed Reviewed-by: Robert Haas Discussion: https://postgr.es/m/CA+renyUp=xja80rBaB6NpY3RRdi750y046x28bo_xg29zKY72Q@mail.gmail.com --- src/include/access/xact.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/include/access/xact.h b/src/include/access/xact.h index b2bc10ee04196..4528e51829e61 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -43,10 +43,11 @@ extern PGDLLIMPORT int XactIsoLevel; /* * We implement three isolation levels internally. - * The two stronger ones use one snapshot per database transaction; - * the others use one snapshot per statement. - * Serializable uses predicate locks in addition to snapshots. - * These macros should be used to check which isolation level is selected. + * The weakest uses one snapshot per statement; + * the two stronger levels use one snapshot per database transaction. + * Serializable uses predicate locks in addition to the snapshot. + * These macros can be used to determine which implementation to use + * depending on the prevailing serialization level. */ #define IsolationUsesXactSnapshot() (XactIsoLevel >= XACT_REPEATABLE_READ) #define IsolationIsSerializable() (XactIsoLevel == XACT_SERIALIZABLE) From a0b99fc12203fa179d5b4218a21de30e0e91a7b8 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 11 Sep 2025 17:11:54 -0400 Subject: [PATCH 52/73] Report the correct is_temporary flag for column defaults. pg_event_trigger_dropped_objects() would report a column default object with is_temporary = false, even if it belongs to a temporary table. This seems clearly wrong, so adjust it to report the table's temp-ness. While here, refactor EventTriggerSQLDropAddObject to make its handling of namespace objects less messy and avoid duplication of the schema-lookup code. And add some explicit test coverage of dropped-object reports for dependencies of temp tables. Back-patch to v15. The bug exists further back, but the GetAttrDefaultColumnAddress function this patch depends on does not, and it doesn't seem worth adjusting it to cope with the older code. 
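
For illustration, the new rule reduces to roughly the following sketch
(simplified: the real code below routes this through a shared helper
that also skips temp objects belonging to other sessions):

    ObjectAddress colobject = GetAttrDefaultColumnAddress(object->objectId);

    if (OidIsValid(colobject.objectId))
    {
        colobject.objectSubId = 0;  /* column reference -> table reference */

        /* a column default is "temporary" exactly when its table is */
        obj->istemp = isAnyTempNamespace(get_rel_namespace(colobject.objectId));
    }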
Author: Antoine Violin Co-authored-by: Tom Lane Discussion: https://postgr.es/m/CAFjUV9x3-hv0gihf+CtUc-1it0hh7Skp9iYFhMS7FJjtAeAptA@mail.gmail.com Backpatch-through: 15 --- src/backend/commands/event_trigger.c | 111 ++++++++++++++------ src/test/regress/expected/event_trigger.out | 37 +++++++ src/test/regress/sql/event_trigger.sql | 20 ++++ 3 files changed, 136 insertions(+), 32 deletions(-) diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index 631fb0525f1e7..fcdcba009d4e3 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -21,6 +21,7 @@ #include "catalog/dependency.h" #include "catalog/indexing.h" #include "catalog/objectaccess.h" +#include "catalog/pg_attrdef.h" #include "catalog/pg_authid.h" #include "catalog/pg_auth_members.h" #include "catalog/pg_database.h" @@ -109,6 +110,8 @@ static Oid insert_event_trigger_tuple(const char *trigname, const char *eventnam static void validate_ddl_tags(const char *filtervar, List *taglist); static void validate_table_rewrite_tags(const char *filtervar, List *taglist); static void EventTriggerInvoke(List *fn_oid_list, EventTriggerData *trigdata); +static bool obtain_object_name_namespace(const ObjectAddress *object, + SQLDropObject *obj); static const char *stringify_grant_objtype(ObjectType objtype); static const char *stringify_adefprivs_objtype(ObjectType objtype); static void SetDatabaseHasLoginEventTriggers(void); @@ -1280,12 +1283,6 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no Assert(EventTriggerSupportsObject(object)); - /* don't report temp schemas except my own */ - if (object->classId == NamespaceRelationId && - (isAnyTempNamespace(object->objectId) && - !isTempNamespace(object->objectId))) - return; - oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); obj = palloc0(sizeof(SQLDropObject)); @@ -1293,21 +1290,88 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no obj->original = original; obj->normal = normal; + if (object->classId == NamespaceRelationId) + { + /* Special handling is needed for temp namespaces */ + if (isTempNamespace(object->objectId)) + obj->istemp = true; + else if (isAnyTempNamespace(object->objectId)) + { + /* don't report temp schemas except my own */ + pfree(obj); + MemoryContextSwitchTo(oldcxt); + return; + } + } + else if (object->classId == AttrDefaultRelationId) + { + /* We treat a column default as temp if its table is temp */ + ObjectAddress colobject; + + colobject = GetAttrDefaultColumnAddress(object->objectId); + if (OidIsValid(colobject.objectId)) + { + colobject.objectSubId = 0; /* convert to table reference */ + if (!obtain_object_name_namespace(&colobject, obj)) + { + pfree(obj); + MemoryContextSwitchTo(oldcxt); + return; + } + } + } + else + { + /* Generic handling for all other object classes */ + if (!obtain_object_name_namespace(object, obj)) + { + /* don't report temp objects except my own */ + pfree(obj); + MemoryContextSwitchTo(oldcxt); + return; + } + } + + /* object identity, objname and objargs */ + obj->objidentity = + getObjectIdentityParts(&obj->address, &obj->addrnames, &obj->addrargs, + false); + + /* object type */ + obj->objecttype = getObjectTypeDescription(&obj->address, false); + + slist_push_head(&(currentEventTriggerState->SQLDropList), &obj->next); + + MemoryContextSwitchTo(oldcxt); +} + +/* + * Fill obj->objname, obj->schemaname, and obj->istemp based on object. 
+ * + * Returns true if this object should be reported, false if it should + * be ignored because it is a temporary object of another session. + */ +static bool +obtain_object_name_namespace(const ObjectAddress *object, SQLDropObject *obj) +{ /* * Obtain schema names from the object's catalog tuple, if one exists; * this lets us skip objects in temp schemas. We trust that * ObjectProperty contains all object classes that can be * schema-qualified. + * + * Currently, this function does nothing for object classes that are not + * in ObjectProperty, but we might sometime add special cases for that. */ if (is_objectclass_supported(object->classId)) { Relation catalog; HeapTuple tuple; - catalog = table_open(obj->address.classId, AccessShareLock); + catalog = table_open(object->classId, AccessShareLock); tuple = get_catalog_object_by_oid(catalog, get_object_attnum_oid(object->classId), - obj->address.objectId); + object->objectId); if (tuple) { @@ -1315,7 +1379,7 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no Datum datum; bool isnull; - attnum = get_object_attnum_namespace(obj->address.classId); + attnum = get_object_attnum_namespace(object->classId); if (attnum != InvalidAttrNumber) { datum = heap_getattr(tuple, attnum, @@ -1333,10 +1397,9 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no } else if (isAnyTempNamespace(namespaceId)) { - pfree(obj); + /* no need to fill any fields of *obj */ table_close(catalog, AccessShareLock); - MemoryContextSwitchTo(oldcxt); - return; + return false; } else { @@ -1346,10 +1409,10 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no } } - if (get_object_namensp_unique(obj->address.classId) && - obj->address.objectSubId == 0) + if (get_object_namensp_unique(object->classId) && + object->objectSubId == 0) { - attnum = get_object_attnum_name(obj->address.classId); + attnum = get_object_attnum_name(object->classId); if (attnum != InvalidAttrNumber) { datum = heap_getattr(tuple, attnum, @@ -1362,24 +1425,8 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no table_close(catalog, AccessShareLock); } - else - { - if (object->classId == NamespaceRelationId && - isTempNamespace(object->objectId)) - obj->istemp = true; - } - /* object identity, objname and objargs */ - obj->objidentity = - getObjectIdentityParts(&obj->address, &obj->addrnames, &obj->addrargs, - false); - - /* object type */ - obj->objecttype = getObjectTypeDescription(&obj->address, false); - - slist_push_head(&(currentEventTriggerState->SQLDropList), &obj->next); - - MemoryContextSwitchTo(oldcxt); + return true; } /* diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out index 7b2198eac6f20..0e090cbc37500 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -476,6 +476,43 @@ NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_15 NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_15_20 name={evttrig,part_15_20} args={} DROP TABLE a_temp_tbl; NOTICE: NORMAL: orig=t normal=f istemp=t type=table identity=pg_temp.a_temp_tbl name={pg_temp,a_temp_tbl} args={} +-- check unfiltered results, too +CREATE OR REPLACE FUNCTION event_trigger_report_dropped() + RETURNS event_trigger + LANGUAGE plpgsql +AS $$ +DECLARE r record; +BEGIN + FOR r IN SELECT * from pg_event_trigger_dropped_objects() + LOOP + RAISE NOTICE 'DROP: orig=% normal=% 
istemp=% type=% identity=% name=% args=%', + r.original, r.normal, r.is_temporary, r.object_type, + r.object_identity, r.address_names, r.address_args; + END LOOP; +END; $$; +NOTICE: END: command_tag=CREATE FUNCTION type=function identity=public.event_trigger_report_dropped() +CREATE TABLE evtrg_nontemp_table (f1 int primary key, f2 int default 42); +NOTICE: END: command_tag=CREATE TABLE type=table identity=public.evtrg_nontemp_table +NOTICE: END: command_tag=CREATE INDEX type=index identity=public.evtrg_nontemp_table_pkey +DROP TABLE evtrg_nontemp_table; +NOTICE: DROP: orig=t normal=f istemp=f type=table identity=public.evtrg_nontemp_table name={public,evtrg_nontemp_table} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=type identity=public.evtrg_nontemp_table name={public.evtrg_nontemp_table} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=type identity=public.evtrg_nontemp_table[] name={public.evtrg_nontemp_table[]} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=default value identity=for public.evtrg_nontemp_table.f2 name={public,evtrg_nontemp_table,f2} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=table constraint identity=evtrg_nontemp_table_f1_not_null on public.evtrg_nontemp_table name={public,evtrg_nontemp_table,evtrg_nontemp_table_f1_not_null} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=table constraint identity=evtrg_nontemp_table_pkey on public.evtrg_nontemp_table name={public,evtrg_nontemp_table,evtrg_nontemp_table_pkey} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=index identity=public.evtrg_nontemp_table_pkey name={public,evtrg_nontemp_table_pkey} args={} +CREATE TEMP TABLE a_temp_tbl (f1 int primary key, f2 int default 42); +NOTICE: END: command_tag=CREATE TABLE type=table identity=pg_temp.a_temp_tbl +NOTICE: END: command_tag=CREATE INDEX type=index identity=pg_temp.a_temp_tbl_pkey +DROP TABLE a_temp_tbl; +NOTICE: DROP: orig=t normal=f istemp=t type=table identity=pg_temp.a_temp_tbl name={pg_temp,a_temp_tbl} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=type identity=pg_temp.a_temp_tbl name={pg_temp.a_temp_tbl} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=type identity=pg_temp.a_temp_tbl[] name={pg_temp.a_temp_tbl[]} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=default value identity=for pg_temp.a_temp_tbl.f2 name={pg_temp,a_temp_tbl,f2} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=table constraint identity=a_temp_tbl_f1_not_null on pg_temp.a_temp_tbl name={pg_temp,a_temp_tbl,a_temp_tbl_f1_not_null} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=table constraint identity=a_temp_tbl_pkey on pg_temp.a_temp_tbl name={pg_temp,a_temp_tbl,a_temp_tbl_pkey} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=index identity=pg_temp.a_temp_tbl_pkey name={pg_temp,a_temp_tbl_pkey} args={} -- CREATE OPERATOR CLASS without FAMILY clause should report -- both CREATE OPERATOR FAMILY and CREATE OPERATOR CLASS CREATE OPERATOR CLASS evttrigopclass FOR TYPE int USING btree AS STORAGE int; diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql index 013546b83057b..ef5978b9697aa 100644 --- a/src/test/regress/sql/event_trigger.sql +++ b/src/test/regress/sql/event_trigger.sql @@ -337,6 +337,26 @@ DROP INDEX evttrig.one_idx; DROP SCHEMA evttrig CASCADE; DROP TABLE a_temp_tbl; +-- check unfiltered results, too +CREATE OR REPLACE FUNCTION event_trigger_report_dropped() + RETURNS event_trigger + LANGUAGE plpgsql +AS $$ +DECLARE r record; +BEGIN + FOR r IN SELECT * from 
pg_event_trigger_dropped_objects() + LOOP + RAISE NOTICE 'DROP: orig=% normal=% istemp=% type=% identity=% name=% args=%', + r.original, r.normal, r.is_temporary, r.object_type, + r.object_identity, r.address_names, r.address_args; + END LOOP; +END; $$; + +CREATE TABLE evtrg_nontemp_table (f1 int primary key, f2 int default 42); +DROP TABLE evtrg_nontemp_table; +CREATE TEMP TABLE a_temp_tbl (f1 int primary key, f2 int default 42); +DROP TABLE a_temp_tbl; + -- CREATE OPERATOR CLASS without FAMILY clause should report -- both CREATE OPERATOR FAMILY and CREATE OPERATOR CLASS CREATE OPERATOR CLASS evttrigopclass FOR TYPE int USING btree AS STORAGE int; From ed1aad15e09d7d523f4ef413e3c4d410497c8065 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Thu, 11 Sep 2025 16:13:55 -0500 Subject: [PATCH 53/73] Move named LWLock tranche requests to shared memory. In EXEC_BACKEND builds, GetNamedLWLockTranche() can segfault when called outside of the postmaster process, as it might access NamedLWLockTrancheRequestArray, which won't be initialized. Given the lack of reports, this is apparently unusual, presumably because it is usually called from a shmem_startup_hook like this: mystruct = ShmemInitStruct(..., &found); if (!found) { mystruct->locks = GetNamedLWLockTranche(...); ... } This genre of shmem_startup_hook evades the aforementioned segfaults because the struct is initialized in the postmaster, so all other callers skip the !found path. We considered modifying the documentation or requiring GetNamedLWLockTranche() to be called from the postmaster, but ultimately we decided to simply move the request array to shared memory (and add it to the BackendParameters struct), thereby allowing calls outside postmaster on all platforms. Since the main shared memory segment is initialized after accepting LWLock tranche requests, postmaster builds the request array in local memory first and then copies it to shared memory later. Given the lack of reports, back-patching seems unnecessary. 
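
For reference, a fuller (hypothetical) version of such a hook; everything
except ShmemInitStruct, GetNamedLWLockTranche, AddinShmemInitLock, and
the hook mechanism itself is an invented name:

    static void
    mylib_shmem_startup(void)
    {
        bool    found;

        if (prev_shmem_startup_hook)
            prev_shmem_startup_hook();

        LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
        mystate = ShmemInitStruct("mylib state", sizeof(MyLibState), &found);
        if (!found)
        {
            /* with this commit, safe even when run outside the postmaster */
            mystate->locks = GetNamedLWLockTranche("mylib");
        }
        LWLockRelease(AddinShmemInitLock);
    }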
Reported-by: Sami Imseih Reviewed-by: Sami Imseih Discussion: https://postgr.es/m/CAA5RZ0v1_15QPg5Sqd2Qz5rh_qcsyCeHHmRDY89xVHcy2yt5BQ%40mail.gmail.com --- src/backend/postmaster/launch_backend.c | 3 +++ src/backend/storage/lmgr/lwlock.c | 31 +++++++++++++++++++++---- src/include/storage/lwlock.h | 4 ++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index a38979c50e4bb..c5ef14e1eaae8 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -101,6 +101,7 @@ typedef struct struct InjectionPointsCtl *ActiveInjectionPoints; #endif int NamedLWLockTrancheRequests; + NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray; char **LWLockTrancheNames; int *LWLockCounter; LWLockPadded *MainLWLockArray; @@ -761,6 +762,7 @@ save_backend_variables(BackendParameters *param, #endif param->NamedLWLockTrancheRequests = NamedLWLockTrancheRequests; + param->NamedLWLockTrancheRequestArray = NamedLWLockTrancheRequestArray; param->LWLockTrancheNames = LWLockTrancheNames; param->LWLockCounter = LWLockCounter; param->MainLWLockArray = MainLWLockArray; @@ -1022,6 +1024,7 @@ restore_backend_variables(BackendParameters *param) #endif NamedLWLockTrancheRequests = param->NamedLWLockTrancheRequests; + NamedLWLockTrancheRequestArray = param->NamedLWLockTrancheRequestArray; LWLockTrancheNames = param->LWLockTrancheNames; LWLockCounter = param->LWLockCounter; MainLWLockArray = param->MainLWLockArray; diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index fcbac5213a5c0..46c82c63ca537 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -184,14 +184,13 @@ typedef struct NamedLWLockTrancheRequest int num_lwlocks; } NamedLWLockTrancheRequest; -static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL; - /* - * NamedLWLockTrancheRequests is the valid length of the request array. This - * variable is non-static so that postmaster.c can copy them to child processes - * in EXEC_BACKEND builds. + * NamedLWLockTrancheRequests is the valid length of the request array. These + * variables are non-static so that launch_backend.c can copy them to child + * processes in EXEC_BACKEND builds. */ int NamedLWLockTrancheRequests = 0; +NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL; /* shared memory counter of registered tranches */ int *LWLockCounter = NULL; @@ -407,6 +406,14 @@ LWLockShmemSize(void) size = add_size(size, mul_size(MAX_NAMED_TRANCHES, sizeof(char *))); size = add_size(size, mul_size(MAX_NAMED_TRANCHES, NAMEDATALEN)); + /* + * Make space for named tranche requests. This is done for the benefit of + * EXEC_BACKEND builds, which otherwise wouldn't be able to call + * GetNamedLWLockTranche() outside postmaster. + */ + size = add_size(size, mul_size(NamedLWLockTrancheRequests, + sizeof(NamedLWLockTrancheRequest))); + /* Space for the LWLock array, plus room for cache line alignment. */ size = add_size(size, LWLOCK_PADDED_SIZE); size = add_size(size, mul_size(numLocks, sizeof(LWLockPadded))); @@ -443,6 +450,20 @@ CreateLWLocks(void) ptr += NAMEDATALEN; } + /* + * Move named tranche requests to shared memory. This is done for the + * benefit of EXEC_BACKEND builds, which otherwise wouldn't be able to + * call GetNamedLWLockTranche() outside postmaster. 
+	 */
+	if (NamedLWLockTrancheRequests > 0)
+	{
+		memcpy(ptr, NamedLWLockTrancheRequestArray,
+			   NamedLWLockTrancheRequests * sizeof(NamedLWLockTrancheRequest));
+		pfree(NamedLWLockTrancheRequestArray);
+		NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) ptr;
+		ptr += NamedLWLockTrancheRequests * sizeof(NamedLWLockTrancheRequest);
+	}
+
 	/* Ensure desired alignment of LWLock array */
 	ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE;
 	MainLWLockArray = (LWLockPadded *) ptr;
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 0e9cf81a4c766..8e0d0d233b48f 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -73,8 +73,12 @@ typedef union LWLockPadded

 extern PGDLLIMPORT LWLockPadded *MainLWLockArray;

+/* forward declaration of private type for use only by lwlock.c */
+typedef struct NamedLWLockTrancheRequest NamedLWLockTrancheRequest;
+
 extern PGDLLIMPORT char **LWLockTrancheNames;
 extern PGDLLIMPORT int NamedLWLockTrancheRequests;
+extern PGDLLIMPORT NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray;
 extern PGDLLIMPORT int *LWLockCounter;

 /*
From 306dd13079ed616c414c9411c5deadffea273266 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Fri, 12 Sep 2025 09:55:16 +0900
Subject: [PATCH 54/73] Remove whitespace in comment of pg_stat_statements.c

Introduced by 6b4d23feef6e, spotted while reading this area of the
code.
---
 contrib/pg_stat_statements/pg_stat_statements.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 1cb368c8590ba..0bb0f9333998b 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -139,7 +139,6 @@ typedef enum pgssStoreKind
  * If you add a new key to this struct, make sure to teach pgss_store() to
  * zero the padding bytes. Otherwise, things will break, because pgss_hash is
  * created using HASH_BLOBS, and thus tag_hash is used to hash this.
- *
  */
 typedef struct pgssHashKey
 {
From 528dadf691df3023fbb0bd71da5c6087c2d49d6a Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Fri, 12 Sep 2025 10:29:02 +0900
Subject: [PATCH 55/73] Add more information for WAL records of hash index AMs

hashdesc.c was missing a couple of fields in its record descriptions,
as of:
- is_prev_bucket_same_wrt for SQUEEZE_PAGE.
- procid for INIT_META_PAGE.
- old_bucket_flag and new_bucket_flag for SPLIT_ALLOCATE_PAGE.

The author has noted the first hole, and I have spotted the others
while double-checking this area of the code.

Note that the only data missing now are the offsets stored in
VACUUM_ONE_PAGE.  We could perhaps add them, if somebody sees value in
this data, even if it makes the output larger.  These are discarded
here.
Author: Kirill Reshke
Discussion: https://postgr.es/m/CALdSSPjc-OVwtZH0Xrkvg7n=2ZwdbMJzqrm_ed_CfjiAzuKVGg@mail.gmail.com
---
 src/backend/access/rmgrdesc/hashdesc.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/backend/access/rmgrdesc/hashdesc.c b/src/backend/access/rmgrdesc/hashdesc.c
index 75f43a9152071..2ee5332452f39 100644
--- a/src/backend/access/rmgrdesc/hashdesc.c
+++ b/src/backend/access/rmgrdesc/hashdesc.c
@@ -28,8 +28,10 @@ hash_desc(StringInfo buf, XLogReaderState *record)
 			{
 				xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec;

-				appendStringInfo(buf, "num_tuples %g, fillfactor %d",
-								 xlrec->num_tuples, xlrec->ffactor);
+				appendStringInfo(buf, "num_tuples %g, procid %u, fillfactor %d",
+								 xlrec->num_tuples,
+								 xlrec->procid,
+								 xlrec->ffactor);
 				break;
 			}
 		case XLOG_HASH_INIT_BITMAP_PAGE:
@@ -58,8 +60,10 @@ hash_desc(StringInfo buf, XLogReaderState *record)
 			{
 				xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec;

-				appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c",
+				appendStringInfo(buf, "new_bucket %u, old_bucket_flag %u, new_bucket_flag %u, meta_page_masks_updated %c, issplitpoint_changed %c",
 								 xlrec->new_bucket,
+								 xlrec->old_bucket_flag,
+								 xlrec->new_bucket_flag,
 								 (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F',
 								 (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F');
 				break;
 			}
@@ -85,11 +89,12 @@ hash_desc(StringInfo buf, XLogReaderState *record)
 			{
 				xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec;

-				appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c",
+				appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c, is_previous %c",
 								 xlrec->prevblkno,
 								 xlrec->nextblkno,
 								 xlrec->ntups,
-								 xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
+								 xlrec->is_prim_bucket_same_wrt ? 'T' : 'F',
+								 xlrec->is_prev_bucket_same_wrt ? 'T' : 'F');
 				break;
 			}
 		case XLOG_HASH_DELETE:
From 2d756ebbe857e3d395d18350bf232300ebd23981 Mon Sep 17 00:00:00 2001
From: Richard Guo
Date: Fri, 12 Sep 2025 11:12:19 +0900
Subject: [PATCH 56/73] Fix misuse of Relids for storing attribute numbers

The typedef Relids (Bitmapset *) is intended to represent a set of
relation identifiers, but was incorrectly used in several places to
store sets of attribute numbers.  This is my oversight in e2debb643.

Fix that by replacing such usages with Bitmapset * to reflect the
correct semantics.
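
For illustration (variable names invented), both typedefs are the same
bitmapset type as far as the compiler is concerned, which is why the
declared type has to carry the semantics for human readers:

    Relids      relids = NULL;      /* a set of range-table indexes */
    Bitmapset  *attnums = NULL;     /* a set of attribute numbers */

    relids = bms_add_member(relids, 1);    /* relation identifier 1 */
    attnums = bms_add_member(attnums, 3);  /* attribute number 3 */

    if (bms_is_member(3, attnums))
        elog(DEBUG1, "column 3 is known NOT NULL");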
Author: Junwang Zhao Reviewed-by: Tender Wang Reviewed-by: Richard Guo Discussion: https://postgr.es/m/CAEG8a3LJhp_xriXf39iCz0TsK+M-2biuhDhpLC6Baxw8+ZYT3A@mail.gmail.com --- src/backend/optimizer/util/clauses.c | 2 +- src/backend/optimizer/util/plancat.c | 6 +++--- src/include/optimizer/plancat.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 6f0b338d2cdf1..ae0bd073ca917 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -4203,7 +4203,7 @@ simplify_function(Oid funcid, Oid result_type, int32 result_typmod, bool var_is_nonnullable(PlannerInfo *root, Var *var, bool use_rel_info) { - Relids notnullattnums = NULL; + Bitmapset *notnullattnums = NULL; Assert(IsA(var, Var)); diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 4536bdd6cb4d7..572d626b2c4d2 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -62,7 +62,7 @@ get_relation_info_hook_type get_relation_info_hook = NULL; typedef struct NotnullHashEntry { Oid relid; /* OID of the relation */ - Relids notnullattnums; /* attnums of NOT NULL columns */ + Bitmapset *notnullattnums; /* attnums of NOT NULL columns */ } NotnullHashEntry; @@ -683,7 +683,7 @@ get_relation_notnullatts(PlannerInfo *root, Relation relation) Oid relid = RelationGetRelid(relation); NotnullHashEntry *hentry; bool found; - Relids notnullattnums = NULL; + Bitmapset *notnullattnums = NULL; /* bail out if the relation has no not-null constraints */ if (relation->rd_att->constr == NULL || @@ -750,7 +750,7 @@ get_relation_notnullatts(PlannerInfo *root, Relation relation) * Searches the hash table and returns the column not-null constraint * information for a given relation. */ -Relids +Bitmapset * find_relation_notnullatts(PlannerInfo *root, Oid relid) { NotnullHashEntry *hentry; diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index dd8f2cd157f6f..9610707683235 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -30,7 +30,7 @@ extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, extern void get_relation_notnullatts(PlannerInfo *root, Relation relation); -extern Relids find_relation_notnullatts(PlannerInfo *root, Oid relid); +extern Bitmapset *find_relation_notnullatts(PlannerInfo *root, Oid relid); extern List *infer_arbiter_indexes(PlannerInfo *root); From e92677e86333562b8dd4972083c8a1abf985d90d Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 12 Sep 2025 07:27:48 +0200 Subject: [PATCH 57/73] Silence compiler warnings on clang 21 Clang 21 shows some new compiler warnings, for example: warning: variable 'dstsize' is uninitialized when passed as a const pointer argument here [-Wuninitialized-const-pointer] The fix is to initialize the variables when they are defined. This is similar to, for example, the existing situation in gistKeyIsEQ(). 
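
A contrived, self-contained example of the warning and of the style of
fix used here (function and variable names invented):

    #include <stdio.h>

    static void
    report_size(const int *size)    /* const pointer: callee only reads */
    {
        printf("size = %d\n", *size);
    }

    int
    main(void)
    {
        int     dstsize = 0;    /* without "= 0", clang 21 warns:
                                 * -Wuninitialized-const-pointer */

        report_size(&dstsize);
        return 0;
    }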
Discussion: https://www.postgresql.org/message-id/flat/6604ad6e-5934-43ac-8590-15113d6ae4b1%40eisentraut.org --- src/backend/access/common/toast_internals.c | 2 +- src/backend/access/gist/gistutil.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index a1d0eed8953ba..75e908c2e80a7 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -135,7 +135,7 @@ toast_save_datum(Relation rel, Datum value, char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; /* ensure union is aligned well enough: */ int32 align_it; - } chunk_data; + } chunk_data = {0}; /* silence compiler warning */ int32 chunk_size; int32 chunk_seq = 0; char *data_p; diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index c0aa7d0222f39..cdc4ab3151be1 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -157,7 +157,7 @@ gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, { int i; GistEntryVector *evec; - int attrsize; + int attrsize = 0; /* silence compiler warning */ evec = (GistEntryVector *) palloc((len + 2) * sizeof(GISTENTRY) + GEVHDRSZ); @@ -242,7 +242,7 @@ gistMakeUnionKey(GISTSTATE *giststate, int attno, char padding[2 * sizeof(GISTENTRY) + GEVHDRSZ]; } storage; GistEntryVector *evec = &storage.gev; - int dstsize; + int dstsize = 0; /* silence compiler warning */ evec->n = 2; From 25f36066dd2abde74faa12f08e5e498a95128cd0 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 11 Sep 2025 11:55:29 +0200 Subject: [PATCH 58/73] Remove traces of support for Sun Studio compiler Per discussion, this compiler suite is no longer maintained, and it has not been able to compile PostgreSQL since at least PostgreSQL 17. This removes all the remaining support code for this compiler. Note that the Solaris operating system continues to be supported, but using GCC as the compiler. 
Reviewed-by: Andres Freund Reviewed-by: Tom Lane Discussion: https://www.postgresql.org/message-id/flat/a0f817ee-fb86-483a-8a14-b6f7f5991b6e%40eisentraut.org --- config/c-compiler.m4 | 2 +- configure | 53 +--------- configure.ac | 31 +----- doc/src/sgml/dfunc.sgml | 9 +- doc/src/sgml/installation.sgml | 60 +----------- meson.build | 2 +- src/Makefile.global.in | 4 - src/backend/port/Makefile | 12 --- src/backend/port/tas/sunstudio_sparc.s | 53 ---------- src/backend/port/tas/sunstudio_x86.s | 43 -------- src/backend/storage/lmgr/Makefile | 6 +- src/include/c.h | 15 ++- src/include/port/atomics.h | 2 - src/include/port/atomics/arch-x86.h | 2 +- src/include/port/atomics/generic-sunpro.h | 113 ---------------------- src/include/port/solaris.h | 21 ---- src/include/storage/s_lock.h | 24 +---- src/makefiles/meson.build | 4 +- src/template/linux | 23 ----- src/template/solaris | 29 +----- src/tools/pginclude/headerscheck | 1 - 21 files changed, 24 insertions(+), 485 deletions(-) delete mode 100644 src/backend/port/tas/sunstudio_sparc.s delete mode 100644 src/backend/port/tas/sunstudio_x86.s delete mode 100644 src/include/port/atomics/generic-sunpro.h diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index da40bd6a64755..236a59e8536c2 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -83,7 +83,7 @@ if test x"$pgac_cv__128bit_int" = xyes ; then AC_CACHE_CHECK([for __int128 alignment bug], [pgac_cv__128bit_int_bug], [AC_RUN_IFELSE([AC_LANG_PROGRAM([ /* This must match the corresponding code in c.h: */ -#if defined(__GNUC__) || defined(__SUNPRO_C) +#if defined(__GNUC__) #define pg_attribute_aligned(a) __attribute__((aligned(a))) #elif defined(_MSC_VER) #define pg_attribute_aligned(a) __declspec(align(a)) diff --git a/configure b/configure index 39c68161ceced..22cd866147b96 100755 --- a/configure +++ b/configure @@ -739,7 +739,6 @@ PKG_CONFIG_LIBDIR PKG_CONFIG_PATH PKG_CONFIG DLSUFFIX -TAS GCC CPP CFLAGS_SL @@ -760,7 +759,6 @@ CLANG LLVM_CONFIG AWK with_llvm -SUN_STUDIO_CC ac_ct_CXX CXXFLAGS CXX @@ -3059,12 +3057,6 @@ $as_echo "$template" >&6; } PORTNAME=$template -# Initialize default assumption that we do not need separate assembly code -# for TAS (test-and-set). This can be overridden by the template file -# when it's executed. -need_tas=no -tas_file=dummy.s - # Default, works for most platforms, override in template file if needed DLSUFFIX=".so" @@ -4799,30 +4791,6 @@ else fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -# Check if it's Sun Studio compiler. We assume that -# __SUNPRO_C will be defined for Sun Studio compilers -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ -#ifndef __SUNPRO_C -choke me -#endif - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - SUN_STUDIO_CC=yes -else - SUN_STUDIO_CC=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - - - # # LLVM @@ -6748,7 +6716,7 @@ fi # __attribute__((visibility("hidden"))) is supported, if we encounter a # compiler that supports one of the supported variants of -fvisibility=hidden # but uses a different syntax to mark a symbol as exported. -if test "$GCC" = yes -o "$SUN_STUDIO_CC" = yes ; then +if test "$GCC" = yes; then { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -fvisibility=hidden, for CFLAGS_SL_MODULE" >&5 $as_echo_n "checking whether ${CC} supports -fvisibility=hidden, for CFLAGS_SL_MODULE... 
" >&6; } if ${pgac_cv_prog_CC_cflags__fvisibility_hidden+:} false; then : @@ -7731,20 +7699,6 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu -# -# Set up TAS assembly code if needed; the template file has now had its -# chance to request this. -# -ac_config_links="$ac_config_links src/backend/port/tas.s:src/backend/port/tas/${tas_file}" - - -if test "$need_tas" = yes ; then - TAS=tas.o -else - TAS="" -fi - - cat >>confdefs.h <<_ACEOF #define DLSUFFIX "$DLSUFFIX" @@ -17141,7 +17095,7 @@ else /* end confdefs.h. */ /* This must match the corresponding code in c.h: */ -#if defined(__GNUC__) || defined(__SUNPRO_C) +#if defined(__GNUC__) #define pg_attribute_aligned(a) __attribute__((aligned(a))) #elif defined(_MSC_VER) #define pg_attribute_aligned(a) __declspec(align(a)) @@ -19344,8 +19298,6 @@ fi if test x"$GCC" = x"yes" ; then cc_string=`${CC} --version | sed q` case $cc_string in [A-Za-z]*) ;; *) cc_string="GCC $cc_string";; esac -elif test x"$SUN_STUDIO_CC" = x"yes" ; then - cc_string=`${CC} -V 2>&1 | sed q` else cc_string=$CC fi @@ -20142,7 +20094,6 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 for ac_config_target in $ac_config_targets do case $ac_config_target in - "src/backend/port/tas.s") CONFIG_LINKS="$CONFIG_LINKS src/backend/port/tas.s:src/backend/port/tas/${tas_file}" ;; "GNUmakefile") CONFIG_FILES="$CONFIG_FILES GNUmakefile" ;; "src/Makefile.global") CONFIG_FILES="$CONFIG_FILES src/Makefile.global" ;; "src/backend/port/pg_sema.c") CONFIG_LINKS="$CONFIG_LINKS src/backend/port/pg_sema.c:${SEMA_IMPLEMENTATION}" ;; diff --git a/configure.ac b/configure.ac index 066e3976c0aac..e44943aa6fe35 100644 --- a/configure.ac +++ b/configure.ac @@ -95,12 +95,6 @@ AC_MSG_RESULT([$template]) PORTNAME=$template AC_SUBST(PORTNAME) -# Initialize default assumption that we do not need separate assembly code -# for TAS (test-and-set). This can be overridden by the template file -# when it's executed. -need_tas=no -tas_file=dummy.s - # Default, works for most platforms, override in template file if needed DLSUFFIX=".so" @@ -400,14 +394,6 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [@%:@ifndef __INTEL_COMPILER choke me @%:@endif])], [ICC=yes], [ICC=no]) -# Check if it's Sun Studio compiler. We assume that -# __SUNPRO_C will be defined for Sun Studio compilers -AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [@%:@ifndef __SUNPRO_C -choke me -@%:@endif])], [SUN_STUDIO_CC=yes], [SUN_STUDIO_CC=no]) - -AC_SUBST(SUN_STUDIO_CC) - # # LLVM @@ -618,7 +604,7 @@ fi # __attribute__((visibility("hidden"))) is supported, if we encounter a # compiler that supports one of the supported variants of -fvisibility=hidden # but uses a different syntax to mark a symbol as exported. -if test "$GCC" = yes -o "$SUN_STUDIO_CC" = yes ; then +if test "$GCC" = yes; then PGAC_PROG_CC_VAR_OPT(CFLAGS_SL_MODULE, [-fvisibility=hidden]) # For C++ we additionally want -fvisibility-inlines-hidden PGAC_PROG_VARCXX_VARFLAGS_OPT(CXX, CXXFLAGS_SL_MODULE, [-fvisibility=hidden]) @@ -774,19 +760,6 @@ AC_PROG_CPP AC_SUBST(GCC) -# -# Set up TAS assembly code if needed; the template file has now had its -# chance to request this. 
-# -AC_CONFIG_LINKS([src/backend/port/tas.s:src/backend/port/tas/${tas_file}]) - -if test "$need_tas" = yes ; then - TAS=tas.o -else - TAS="" -fi -AC_SUBST(TAS) - AC_SUBST(DLSUFFIX)dnl AC_DEFINE_UNQUOTED([DLSUFFIX], ["$DLSUFFIX"], [Define to the file name extension of dynamically-loadable modules.]) @@ -2478,8 +2451,6 @@ AC_SUBST(LDFLAGS_EX_BE) if test x"$GCC" = x"yes" ; then cc_string=`${CC} --version | sed q` case $cc_string in [[A-Za-z]]*) ;; *) cc_string="GCC $cc_string";; esac -elif test x"$SUN_STUDIO_CC" = x"yes" ; then - cc_string=`${CC} -V 2>&1 | sed q` else cc_string=$CC fi diff --git a/doc/src/sgml/dfunc.sgml b/doc/src/sgml/dfunc.sgml index b94aefcd0ca6c..3778efc83ebfa 100644 --- a/doc/src/sgml/dfunc.sgml +++ b/doc/src/sgml/dfunc.sgml @@ -157,19 +157,12 @@ ld -Bshareable -o foo.so foo.o The compiler flag to create PIC is - with the Sun compiler and with GCC. To link shared libraries, the compiler option is - with either compiler or alternatively with GCC. -cc -KPIC -c foo.c -cc -G -o foo.so foo.o - - or - gcc -fPIC -c foo.c -gcc -G -o foo.so foo.o +gcc -shared -o foo.so foo.o diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml index a4ad80a678211..593202f4fb259 100644 --- a/doc/src/sgml/installation.sgml +++ b/doc/src/sgml/installation.sgml @@ -1676,10 +1676,6 @@ build-postgresql: using the GCC compiler: ./configure CC='gcc -m64' --enable-dtrace DTRACEFLAGS='-64' ... - - Using Sun's compiler: - -./configure CC='/opt/SUNWspro/bin/cc -xtarget=native64' --enable-dtrace DTRACEFLAGS='-64' ... @@ -3713,24 +3709,13 @@ xcrun --show-sdk-path Required Tools - You can build with either GCC or Sun's compiler suite. For - better code optimization, Sun's compiler is strongly recommended - on the SPARC architecture. If - you are using Sun's compiler, be careful not to select - /usr/ucb/cc; - use /opt/SUNWspro/bin/cc. + Only GCC is supported as the compiler. Sun's compiler suite is no longer + supported. - You can download Sun Studio - from . - Many GNU tools are integrated into Solaris 10, or they are - present on the Solaris companion CD. If you need packages for - older versions of Solaris, you can find these tools - at . - If you prefer - sources, look - at . + Many additional dependencies can be installed via the package management + system. @@ -3753,27 +3738,6 @@ configure ... LDFLAGS="-R /usr/sfw/lib:/opt/sfw/lib:/usr/local/lib" - - Compiling for Optimal Performance - - - On the SPARC architecture, Sun Studio is strongly recommended for - compilation. Try using the optimization - flag to generate significantly faster binaries. Do not use any - flags that modify behavior of floating-point operations - and errno processing (e.g., - ). - - - - If you do not have a reason to use 64-bit binaries on SPARC, - prefer the 32-bit version. The 64-bit operations are slower and - 64-bit binaries are slower than the 32-bit variants. On the - other hand, 32-bit code on the AMD64 CPU family is not native, - so 32-bit code is significantly slower on that CPU family. - - - Using DTrace for Tracing PostgreSQL @@ -3781,22 +3745,6 @@ configure ... LDFLAGS="-R /usr/sfw/lib:/opt/sfw/lib:/usr/local/lib" Yes, using DTrace is possible. See for further information. - - - If you see the linking of the postgres executable abort with an - error message like: - -Undefined first referenced - symbol in file -AbortTransaction utils/probes.o -CommitTransaction utils/probes.o -ld: fatal: Symbol referencing errors. 
No output written to postgres -collect2: ld returned 1 exit status -make: *** [postgres] Error 1 - - your DTrace installation is too old to handle probes in static - functions. You need Solaris 10u4 or newer to use DTrace. - diff --git a/meson.build b/meson.build index ab8101d67b26d..d71c7c8267e79 100644 --- a/meson.build +++ b/meson.build @@ -1809,7 +1809,7 @@ if cc.links(''' if not meson.is_cross_build() r = cc.run(''' /* This must match the corresponding code in c.h: */ - #if defined(__GNUC__) || defined(__SUNPRO_C) + #if defined(__GNUC__) #define pg_attribute_aligned(a) __attribute__((aligned(a))) #elif defined(_MSC_VER) #define pg_attribute_aligned(a) __declspec(align(a)) diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8b1b357beaa04..0aa389bc71012 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -267,7 +267,6 @@ endif # not PGXS CC = @CC@ GCC = @GCC@ -SUN_STUDIO_CC = @SUN_STUDIO_CC@ CXX = @CXX@ CFLAGS = @CFLAGS@ CFLAGS_SL = @CFLAGS_SL@ @@ -796,9 +795,6 @@ ifeq ($(PORTNAME),win32) LIBS += -lws2_32 endif -# Not really standard libc functions, used by the backend. -TAS = @TAS@ - ########################################################################## # diff --git a/src/backend/port/Makefile b/src/backend/port/Makefile index 47338d9922957..8613ac01aff6d 100644 --- a/src/backend/port/Makefile +++ b/src/backend/port/Makefile @@ -22,7 +22,6 @@ top_builddir = ../../.. include $(top_builddir)/src/Makefile.global OBJS = \ - $(TAS) \ atomics.o \ pg_sema.o \ pg_shmem.o @@ -33,16 +32,5 @@ endif include $(top_srcdir)/src/backend/common.mk -tas.o: tas.s -ifeq ($(SUN_STUDIO_CC), yes) -# preprocess assembler file with cpp - $(CC) $(CFLAGS) -c -P $< - mv $*.i $*_cpp.s - $(CC) $(CFLAGS) -c $*_cpp.s -o $@ -else - $(CC) $(CFLAGS) -c $< -endif - clean: - rm -f tas_cpp.s $(MAKE) -C win32 clean diff --git a/src/backend/port/tas/sunstudio_sparc.s b/src/backend/port/tas/sunstudio_sparc.s deleted file mode 100644 index 8e0a0965b64ea..0000000000000 --- a/src/backend/port/tas/sunstudio_sparc.s +++ /dev/null @@ -1,53 +0,0 @@ -!------------------------------------------------------------------------- -! -! sunstudio_sparc.s -! compare and swap for Sun Studio on Sparc -! -! Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group -! Portions Copyright (c) 1994, Regents of the University of California -! -! IDENTIFICATION -! src/backend/port/tas/sunstudio_sparc.s -! -!------------------------------------------------------------------------- - -! Fortunately the Sun compiler can process cpp conditionals with -P - -! '/' is the comment for x86, while '!' is the comment for Sparc - -#if defined(__sparcv9) || defined(__sparc) - - .section ".text" - .align 8 - .skip 24 - .align 4 - - .global pg_atomic_cas -pg_atomic_cas: - - ! "cas" only works on sparcv9 and sparcv8plus chips, and - ! requires a compiler targeting these CPUs. It will fail - ! on a compiler targeting sparcv8, and of course will not - ! be understood by a sparcv8 CPU. gcc continues to use - ! "ldstub" because it targets sparcv7. - ! - ! There is actually a trick for embedding "cas" in a - ! sparcv8-targeted compiler, but it can only be run - ! on a sparcv8plus/v9 cpus: - ! - ! http://cvs.opensolaris.org/source/xref/on/usr/src/lib/libc/sparc/threads/sparc.il - ! - ! NB: We're assuming we're running on a TSO system here - solaris - ! userland luckily always has done so. 
- -#if defined(__sparcv9) || defined(__sparcv8plus) - cas [%o0],%o2,%o1 -#else - ldstub [%o0],%o1 -#endif - mov %o1,%o0 - retl - nop - .type pg_atomic_cas,2 - .size pg_atomic_cas,(.-pg_atomic_cas) -#endif diff --git a/src/backend/port/tas/sunstudio_x86.s b/src/backend/port/tas/sunstudio_x86.s deleted file mode 100644 index 0111ffde45c29..0000000000000 --- a/src/backend/port/tas/sunstudio_x86.s +++ /dev/null @@ -1,43 +0,0 @@ -/------------------------------------------------------------------------- -/ -/ sunstudio_x86.s -/ compare and swap for Sun Studio on x86 -/ -/ Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group -/ Portions Copyright (c) 1994, Regents of the University of California -/ -/ IDENTIFICATION -/ src/backend/port/tas/sunstudio_x86.s -/ -/------------------------------------------------------------------------- - -/ Fortunately the Sun compiler can process cpp conditionals with -P - -/ '/' is the comment for x86, while '!' is the comment for Sparc - - .file "tas.s" - -#if defined(__amd64) - .code64 -#endif - - .globl pg_atomic_cas - .type pg_atomic_cas, @function - - .section .text, "ax" - .align 16 - -pg_atomic_cas: -#if defined(__amd64) - movl %edx,%eax - lock - cmpxchgl %esi,(%rdi) -#else - movl 4(%esp), %edx - movl 8(%esp), %ecx - movl 12(%esp), %eax - lock - cmpxchgl %ecx, (%edx) -#endif - ret - .size pg_atomic_cas, . - pg_atomic_cas diff --git a/src/backend/storage/lmgr/Makefile b/src/backend/storage/lmgr/Makefile index 6cbaf23b855f6..a5fbc24ddad6e 100644 --- a/src/backend/storage/lmgr/Makefile +++ b/src/backend/storage/lmgr/Makefile @@ -24,13 +24,9 @@ OBJS = \ include $(top_srcdir)/src/backend/common.mk -ifdef TAS -TASPATH = $(top_builddir)/src/backend/port/tas.o -endif - s_lock_test: s_lock.c $(top_builddir)/src/common/libpgcommon.a $(top_builddir)/src/port/libpgport.a $(CC) $(CPPFLAGS) $(CFLAGS) -DS_LOCK_TEST=1 $(srcdir)/s_lock.c \ - $(TASPATH) -L $(top_builddir)/src/common -lpgcommon \ + -L $(top_builddir)/src/common -lpgcommon \ -L $(top_builddir)/src/port -lpgport -lm -o s_lock_test lwlocknames.h: ../../../include/storage/lwlocklist.h ../../utils/activity/wait_event_names.txt generate-lwlocknames.pl diff --git a/src/include/c.h b/src/include/c.h index b580cfa7d3178..f303ba0605a40 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -114,7 +114,6 @@ * GCC: https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html * GCC: https://gcc.gnu.org/onlinedocs/gcc/Type-Attributes.html * Clang: https://clang.llvm.org/docs/AttributeReference.html - * Sunpro: https://docs.oracle.com/cd/E18659_01/html/821-1384/gjzke.html */ /* @@ -157,7 +156,7 @@ */ #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L #define pg_noreturn _Noreturn -#elif defined(__GNUC__) || defined(__SUNPRO_C) +#elif defined(__GNUC__) #define pg_noreturn __attribute__((noreturn)) #elif defined(_MSC_VER) #define pg_noreturn __declspec(noreturn) @@ -233,8 +232,8 @@ #define pg_attribute_printf(f,a) #endif -/* GCC and Sunpro support aligned and packed */ -#if defined(__GNUC__) || defined(__SUNPRO_C) +/* GCC supports aligned and packed */ +#if defined(__GNUC__) #define pg_attribute_aligned(a) __attribute__((aligned(a))) #define pg_attribute_packed() __attribute__((packed)) #elif defined(_MSC_VER) @@ -259,8 +258,8 @@ * choose not to. But, if possible, don't force inlining in unoptimized * debug builds. 
*/ -#if (defined(__GNUC__) && defined(__OPTIMIZE__)) || defined(__SUNPRO_C) -/* GCC and Sunpro support always_inline via __attribute__ */ +#if defined(__GNUC__) && defined(__OPTIMIZE__) +/* GCC supports always_inline via __attribute__ */ #define pg_attribute_always_inline __attribute__((always_inline)) inline #elif defined(_MSC_VER) /* MSVC has a special keyword for this */ @@ -276,8 +275,8 @@ * for proper cost attribution. Note that unlike the pg_attribute_XXX macros * above, this should be placed before the function's return type and name. */ -/* GCC and Sunpro support noinline via __attribute__ */ -#if defined(__GNUC__) || defined(__SUNPRO_C) +/* GCC supports noinline via __attribute__ */ +#if defined(__GNUC__) #define pg_noinline __attribute__((noinline)) /* msvc via declspec */ #elif defined(_MSC_VER) diff --git a/src/include/port/atomics.h b/src/include/port/atomics.h index 074136fe4c4a8..96f1858da9722 100644 --- a/src/include/port/atomics.h +++ b/src/include/port/atomics.h @@ -88,8 +88,6 @@ #include "port/atomics/generic-gcc.h" #elif defined(_MSC_VER) #include "port/atomics/generic-msvc.h" -#elif defined(__SUNPRO_C) && !defined(__GNUC__) -#include "port/atomics/generic-sunpro.h" #else /* Unknown compiler. */ #endif diff --git a/src/include/port/atomics/arch-x86.h b/src/include/port/atomics/arch-x86.h index 8983dd89d53d7..4ba2ccc05913d 100644 --- a/src/include/port/atomics/arch-x86.h +++ b/src/include/port/atomics/arch-x86.h @@ -241,6 +241,6 @@ pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_) */ #if defined(__i568__) || defined(__i668__) || /* gcc i586+ */ \ (defined(_M_IX86) && _M_IX86 >= 500) || /* msvc i586+ */ \ - defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, sunpro, msvc */ + defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, msvc */ #define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY #endif /* 8 byte single-copy atomicity */ diff --git a/src/include/port/atomics/generic-sunpro.h b/src/include/port/atomics/generic-sunpro.h deleted file mode 100644 index 09bba0be2037d..0000000000000 --- a/src/include/port/atomics/generic-sunpro.h +++ /dev/null @@ -1,113 +0,0 @@ -/*------------------------------------------------------------------------- - * - * generic-sunpro.h - * Atomic operations for solaris' CC - * - * Portions Copyright (c) 2013-2025, PostgreSQL Global Development Group - * - * NOTES: - * - * Documentation: - * * manpage for atomic_cas(3C) - * http://www.unix.com/man-page/opensolaris/3c/atomic_cas/ - * http://docs.oracle.com/cd/E23824_01/html/821-1465/atomic-cas-3c.html - * - * src/include/port/atomics/generic-sunpro.h - * - * ------------------------------------------------------------------------- - */ - -#ifdef HAVE_MBARRIER_H -#include - -#define pg_compiler_barrier_impl() __compiler_barrier() - -#ifndef pg_memory_barrier_impl -/* - * Despite the name this is actually a full barrier. Expanding to mfence/ - * membar #StoreStore | #LoadStore | #StoreLoad | #LoadLoad on x86/sparc - * respectively. - */ -# define pg_memory_barrier_impl() __machine_rw_barrier() -#endif -#ifndef pg_read_barrier_impl -# define pg_read_barrier_impl() __machine_r_barrier() -#endif -#ifndef pg_write_barrier_impl -# define pg_write_barrier_impl() __machine_w_barrier() -#endif - -#endif /* HAVE_MBARRIER_H */ - -/* Older versions of the compiler don't have atomic.h... 
*/ -#ifdef HAVE_ATOMIC_H - -#include - -#define PG_HAVE_ATOMIC_U32_SUPPORT -typedef struct pg_atomic_uint32 -{ - volatile uint32 value; -} pg_atomic_uint32; - -#define PG_HAVE_ATOMIC_U64_SUPPORT -typedef struct pg_atomic_uint64 -{ - /* - * Syntax to enforce variable alignment should be supported by versions - * supporting atomic.h, but it's hard to find accurate documentation. If - * it proves to be a problem, we'll have to add more version checks for 64 - * bit support. - */ - volatile uint64 value pg_attribute_aligned(8); -} pg_atomic_uint64; - -#endif /* HAVE_ATOMIC_H */ - - -#ifdef HAVE_ATOMIC_H - -#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32 -static inline bool -pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr, - uint32 *expected, uint32 newval) -{ - bool ret; - uint32 current; - - current = atomic_cas_32(&ptr->value, *expected, newval); - ret = current == *expected; - *expected = current; - return ret; -} - -#define PG_HAVE_ATOMIC_EXCHANGE_U32 -static inline uint32 -pg_atomic_exchange_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 newval) -{ - return atomic_swap_32(&ptr->value, newval); -} - -#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64 -static inline bool -pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr, - uint64 *expected, uint64 newval) -{ - bool ret; - uint64 current; - - AssertPointerAlignment(expected, 8); - current = atomic_cas_64(&ptr->value, *expected, newval); - ret = current == *expected; - *expected = current; - return ret; -} - -#define PG_HAVE_ATOMIC_EXCHANGE_U64 -static inline uint64 -pg_atomic_exchange_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 newval) -{ - return atomic_swap_64(&ptr->value, newval); -} - -#endif /* HAVE_ATOMIC_H */ diff --git a/src/include/port/solaris.h b/src/include/port/solaris.h index 8ff40007c7f6a..c352361c81d83 100644 --- a/src/include/port/solaris.h +++ b/src/include/port/solaris.h @@ -1,26 +1,5 @@ /* src/include/port/solaris.h */ -/* - * Sort this out for all operating systems some time. The __xxx - * symbols are defined on both GCC and Solaris CC, although GCC - * doesn't document them. The __xxx__ symbols are only on GCC. - */ -#if defined(__i386) && !defined(__i386__) -#define __i386__ -#endif - -#if defined(__amd64) && !defined(__amd64__) -#define __amd64__ -#endif - -#if defined(__x86_64) && !defined(__x86_64__) -#define __x86_64__ -#endif - -#if defined(__sparc) && !defined(__sparc__) -#define __sparc__ -#endif - #if defined(__i386__) #include #endif diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h index 2f73f9fcf57a2..7f8f566bd407f 100644 --- a/src/include/storage/s_lock.h +++ b/src/include/storage/s_lock.h @@ -333,9 +333,9 @@ tas(volatile slock_t *lock) slock_t _res; /* - * See comment in src/backend/port/tas/sunstudio_sparc.s for why this - * uses "ldstub", and that file uses "cas". gcc currently generates - * sparcv7-targeted binaries, so "cas" use isn't possible. + * "cas" would be better than "ldstub", but it is only present on + * sparcv8plus and later, while some platforms still support sparcv7 or + * sparcv8. Also, "cas" requires that the system be running in TSO mode. 
*/ __asm__ __volatile__( " ldstub [%2], %0 \n" @@ -594,24 +594,6 @@ tas(volatile slock_t *lock) #if !defined(HAS_TEST_AND_SET) /* We didn't trigger above, let's try here */ -/* These are in sunstudio_(sparc|x86).s */ - -#if defined(__SUNPRO_C) && (defined(__i386) || defined(__x86_64__) || defined(__sparc__) || defined(__sparc)) -#define HAS_TEST_AND_SET - -#if defined(__i386) || defined(__x86_64__) || defined(__sparcv9) || defined(__sparcv8plus) -typedef unsigned int slock_t; -#else -typedef unsigned char slock_t; -#endif - -extern slock_t pg_atomic_cas(volatile slock_t *lock, slock_t with, - slock_t cmp); - -#define TAS(a) (pg_atomic_cas((a), 1, 0) != 0) -#endif - - #ifdef _MSC_VER typedef LONG slock_t; diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index 54dbc059adac7..0def244c9011d 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -63,8 +63,6 @@ pgxs_kv = { 'DLSUFFIX': dlsuffix, 'EXEEXT': exesuffix, - 'SUN_STUDIO_CC': 'no', # not supported so far - # want the chosen option, rather than the library 'with_ssl' : ssl_library, 'with_uuid': uuidopt, @@ -179,7 +177,7 @@ pgxs_empty = [ 'WANTED_LANGUAGES', # Not needed because we don't build the server / PLs with the generated makefile - 'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS', + 'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_TEST_EXTRA', 'DTRACEFLAGS', # only server has dtrace probes diff --git a/src/template/linux b/src/template/linux index ec3302c4a223f..faefe64254a90 100644 --- a/src/template/linux +++ b/src/template/linux @@ -14,26 +14,3 @@ CFLAGS_SL="-fPIC" # If --enable-profiling is specified, we need -DLINUX_PROFILE PLATFORM_PROFILE_FLAGS="-DLINUX_PROFILE" - -if test "$SUN_STUDIO_CC" = "yes" ; then - CC="$CC -Xa" # relaxed ISO C mode - CFLAGS="-v" # -v is like gcc -Wall - if test "$enable_debug" != yes; then - CFLAGS="$CFLAGS -O" # any optimization breaks debug - fi - - # Pick the right test-and-set (TAS) code for the Sun compiler. - # We would like to use in-line assembler, but the compiler - # requires *.il files to be on every compile line, making - # the build system too fragile. - case $host_cpu in - sparc) - need_tas=yes - tas_file=sunstudio_sparc.s - ;; - i?86|x86_64) - need_tas=yes - tas_file=sunstudio_x86.s - ;; - esac -fi diff --git a/src/template/solaris b/src/template/solaris index f88b1cdad37f8..a4d8d38a8f852 100644 --- a/src/template/solaris +++ b/src/template/solaris @@ -1,31 +1,4 @@ # src/template/solaris # Extra CFLAGS for code that will go into a shared library -if test "$GCC" = yes ; then - CFLAGS_SL="-fPIC" -else - CFLAGS_SL="-KPIC" -fi - -if test "$SUN_STUDIO_CC" = yes ; then - CC="$CC -Xa" # relaxed ISO C mode - CFLAGS="-v" # -v is like gcc -Wall - if test "$enable_debug" != yes; then - CFLAGS="$CFLAGS -O" # any optimization breaks debug - fi - - # Pick the right test-and-set (TAS) code for the Sun compiler. - # We would like to use in-line assembler, but the compiler - # requires *.il files to be on every compile line, making - # the build system too fragile. 
- case $host_cpu in - sparc) - need_tas=yes - tas_file=sunstudio_sparc.s - ;; - i?86|x86_64) - need_tas=yes - tas_file=sunstudio_x86.s - ;; - esac -fi +CFLAGS_SL="-fPIC" diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck index 17138a7569e4f..d017490a5386a 100755 --- a/src/tools/pginclude/headerscheck +++ b/src/tools/pginclude/headerscheck @@ -114,7 +114,6 @@ do test "$f" = src/include/port/atomics/generic.h && continue test "$f" = src/include/port/atomics/generic-gcc.h && continue test "$f" = src/include/port/atomics/generic-msvc.h && continue - test "$f" = src/include/port/atomics/generic-sunpro.h && continue # sepgsql.h depends on headers that aren't there on most platforms. test "$f" = contrib/sepgsql/sepgsql.h && continue From 2aac62be8cbb870ccf8c5b3fbb8a4e4aa8a14a73 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 12 Sep 2025 07:57:06 +0200 Subject: [PATCH 59/73] Default to log_lock_waits=on If someone is stuck behind a lock for more than a second, that is almost always a problem that is worth a log entry. Author: Laurenz Albe Reviewed-By: Michael Banck Reviewed-By: Robert Haas Reviewed-By: Christoph Berg Reviewed-By: Stephen Frost Discussion: https://postgr.es/m/b8b8502915e50f44deb111bc0b43a99e2733e117.camel%40cybertec.at --- doc/src/sgml/config.sgml | 2 +- src/backend/storage/lmgr/proc.c | 2 +- src/backend/utils/misc/guc_parameters.dat | 2 +- src/backend/utils/misc/postgresql.conf.sample | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 2a3685f474a96..3c33d5d0fbcae 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -7929,7 +7929,7 @@ log_line_prefix = '%m [%p] %q%u@%d/%a ' Controls whether a log message is produced when a session waits longer than to acquire a lock. This is useful in determining if lock waits are causing - poor performance. The default is off. + poor performance. The default is on. Only superusers and users with the appropriate SET privilege can change this setting. diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e9ef0fbfe32cb..96f29aafc391e 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -60,7 +60,7 @@ int LockTimeout = 0; int IdleInTransactionSessionTimeout = 0; int TransactionTimeout = 0; int IdleSessionTimeout = 0; -bool log_lock_waits = false; +bool log_lock_waits = true; /* Pointer to this process's PGPROC struct, if any */ PGPROC *MyProc = NULL; diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 0da01627cfec1..6bc6be13d2ad2 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -566,7 +566,7 @@ { name => 'log_lock_waits', type => 'bool', context => 'PGC_SUSET', group => 'LOGGING_WHAT', short_desc => 'Logs long lock waits.', variable => 'log_lock_waits', - boot_val => 'false', + boot_val => 'true', }, { name => 'log_lock_failures', type => 'bool', context => 'PGC_SUSET', group => 'LOGGING_WHAT', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 26c0869356485..c36fcb9ab6105 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -624,7 +624,7 @@ # processes # %% = '%' # e.g. 
'<%u%%%d> ' -#log_lock_waits = off # log lock waits >= deadlock_timeout +#log_lock_waits = on # log lock waits >= deadlock_timeout #log_lock_failures = off # log lock failures #log_recovery_conflict_waits = off # log standby recovery conflict waits # >= deadlock_timeout From 675ddc4d704f4cde0bc72244263a9efbb0d32cb8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 12 Sep 2025 08:13:05 +0200 Subject: [PATCH 60/73] Improve pgbench definition of yyscan_t It was defining yyscan_t as a macro while the rest of the code uses a typedef with #ifdef guards around it. The latter is also what the flex generated code uses. So it seems best to make it look like those other places for consistency. The old way also had a potential for conflict if some code included multiple headers providing yyscan_t. exprscan.l includes #include "fe_utils/psqlscan_int.h" #include "pgbench.h" and fe_utils/psqlscan_int.h contains #ifndef YY_TYPEDEF_YY_SCANNER_T #define YY_TYPEDEF_YY_SCANNER_T typedef void *yyscan_t; #endif which was then followed by pgbench.h #define yyscan_t void * and then the generated code in exprscan.c #ifndef YY_TYPEDEF_YY_SCANNER_T #define YY_TYPEDEF_YY_SCANNER_T typedef void* yyscan_t; #endif This works, but if the #ifdef guard in psqlscan_int.h is removed, this fails. We want to move toward allowing repeat typedefs, per C11, but for that we need to make sure they are all the same. Reviewed-by: Chao Li Discussion: https://www.postgresql.org/message-id/flat/10d32190-f31b-40a5-b177-11db55597355@eisentraut.org --- src/bin/pgbench/pgbench.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/bin/pgbench/pgbench.h b/src/bin/pgbench/pgbench.h index e053c9e2eb63d..f8b7b497d1ee0 100644 --- a/src/bin/pgbench/pgbench.h +++ b/src/bin/pgbench/pgbench.h @@ -16,11 +16,14 @@ /* * This file is included outside exprscan.l, in places where we can't see * flex's definition of typedef yyscan_t. Fortunately, it's documented as - * being "void *", so we can use a macro to keep the function declarations + * being "void *", so we can use typedef to keep the function declarations * here looking like the definitions in exprscan.l. exprparse.y and * pgbench.c also use this to be able to declare things as "yyscan_t". */ -#define yyscan_t void * +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void *yyscan_t; +#endif /* * Likewise, we can't see exprparse.y's definition of union YYSTYPE here, From ae0e1be9f2a20f6b64072dcee5b8dd7b9027a8fa Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Fri, 12 Sep 2025 08:13:05 +0200 Subject: [PATCH 61/73] Allow redeclaration of typedef yyscan_t This is allowed in C11, so we don't need the workaround guards against it anymore. This effectively reverts commit 382092a0cd2 that put these guards in place. 
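
As a standalone illustration of the C11 rule this relies on (a sketch,
not code from the tree): C11 6.7p3 permits redeclaring a typedef name
as long as it denotes the same type, so the repeated declarations no
longer need #ifdef guards:

    typedef void *yyscan_t;
    typedef void *yyscan_t;    /* accepted by C11; a redefinition
                                * error under C99 */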
Reviewed-by: Chao Li Discussion: https://www.postgresql.org/message-id/flat/10d32190-f31b-40a5-b177-11db55597355@eisentraut.org --- contrib/cube/cubedata.h | 3 --- contrib/seg/segdata.h | 3 --- src/backend/utils/adt/jsonpath_internal.h | 3 --- src/bin/pgbench/pgbench.h | 3 --- src/include/bootstrap/bootstrap.h | 3 --- src/include/fe_utils/psqlscan_int.h | 6 ------ src/include/replication/syncrep.h | 3 --- src/include/replication/walsender_private.h | 3 --- src/pl/plpgsql/src/plpgsql.h | 3 --- 9 files changed, 30 deletions(-) diff --git a/contrib/cube/cubedata.h b/contrib/cube/cubedata.h index ad1e2bd699810..8bfcc6e99a27d 100644 --- a/contrib/cube/cubedata.h +++ b/contrib/cube/cubedata.h @@ -62,10 +62,7 @@ typedef struct NDBOX /* for cubescan.l and cubeparse.y */ /* All grammar constructs return strings */ #define YYSTYPE char * -#ifndef YY_TYPEDEF_YY_SCANNER_T -#define YY_TYPEDEF_YY_SCANNER_T typedef void *yyscan_t; -#endif /* in cubescan.l */ extern int cube_yylex(YYSTYPE *yylval_param, yyscan_t yyscanner); diff --git a/contrib/seg/segdata.h b/contrib/seg/segdata.h index 4347c31c28e94..7bc7c83dca309 100644 --- a/contrib/seg/segdata.h +++ b/contrib/seg/segdata.h @@ -16,10 +16,7 @@ extern int significant_digits(const char *s); /* for segscan.l and segparse.y */ union YYSTYPE; -#ifndef YY_TYPEDEF_YY_SCANNER_T -#define YY_TYPEDEF_YY_SCANNER_T typedef void *yyscan_t; -#endif /* in segscan.l */ extern int seg_yylex(union YYSTYPE *yylval_param, yyscan_t yyscanner); diff --git a/src/backend/utils/adt/jsonpath_internal.h b/src/backend/utils/adt/jsonpath_internal.h index f78069857d02b..19567aca6f775 100644 --- a/src/backend/utils/adt/jsonpath_internal.h +++ b/src/backend/utils/adt/jsonpath_internal.h @@ -22,10 +22,7 @@ typedef struct JsonPathString int total; } JsonPathString; -#ifndef YY_TYPEDEF_YY_SCANNER_T -#define YY_TYPEDEF_YY_SCANNER_T typedef void *yyscan_t; -#endif #include "utils/jsonpath.h" #include "jsonpath_gram.h" diff --git a/src/bin/pgbench/pgbench.h b/src/bin/pgbench/pgbench.h index f8b7b497d1ee0..d55d30e0ef954 100644 --- a/src/bin/pgbench/pgbench.h +++ b/src/bin/pgbench/pgbench.h @@ -20,10 +20,7 @@ * here looking like the definitions in exprscan.l. exprparse.y and * pgbench.c also use this to be able to declare things as "yyscan_t". */ -#ifndef YY_TYPEDEF_YY_SCANNER_T -#define YY_TYPEDEF_YY_SCANNER_T typedef void *yyscan_t; -#endif /* * Likewise, we can't see exprparse.y's definition of union YYSTYPE here, diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h index befc4fa1b3d87..5ad347ec290fa 100644 --- a/src/include/bootstrap/bootstrap.h +++ b/src/include/bootstrap/bootstrap.h @@ -56,10 +56,7 @@ extern void boot_get_type_io_data(Oid typid, Oid *typoutput); union YYSTYPE; -#ifndef YY_TYPEDEF_YY_SCANNER_T -#define YY_TYPEDEF_YY_SCANNER_T typedef void *yyscan_t; -#endif extern int boot_yyparse(yyscan_t yyscanner); extern int boot_yylex_init(yyscan_t *yyscannerp); diff --git a/src/include/fe_utils/psqlscan_int.h b/src/include/fe_utils/psqlscan_int.h index 2a3a9d7c82aaa..a1ebf226cf499 100644 --- a/src/include/fe_utils/psqlscan_int.h +++ b/src/include/fe_utils/psqlscan_int.h @@ -51,14 +51,8 @@ * validity checking; in actual use, this file should always be included * from the body of a flex file, where these symbols are already defined. 
 */
-#ifndef YY_TYPEDEF_YY_BUFFER_STATE
-#define YY_TYPEDEF_YY_BUFFER_STATE
 typedef struct yy_buffer_state *YY_BUFFER_STATE;
-#endif
-#ifndef YY_TYPEDEF_YY_SCANNER_T
-#define YY_TYPEDEF_YY_SCANNER_T
 typedef void *yyscan_t;
-#endif
 
 /*
  * We use a stack of flex buffers to handle substitution of psql variables.
diff --git a/src/include/replication/syncrep.h b/src/include/replication/syncrep.h
index 675669a79f7d3..dc2b118b16629 100644
--- a/src/include/replication/syncrep.h
+++ b/src/include/replication/syncrep.h
@@ -97,10 +97,7 @@ extern void SyncRepUpdateSyncStandbysDefined(void);
  * in syncrep_gram.y and syncrep_scanner.l
  */
 union YYSTYPE;
-#ifndef YY_TYPEDEF_YY_SCANNER_T
-#define YY_TYPEDEF_YY_SCANNER_T
 typedef void *yyscan_t;
-#endif
 
 extern int	syncrep_yyparse(SyncRepConfigData **syncrep_parse_result_p, char **syncrep_parse_error_msg_p, yyscan_t yyscanner);
 extern int	syncrep_yylex(union YYSTYPE *yylval_param, char **syncrep_parse_error_msg_p, yyscan_t yyscanner);
 extern void syncrep_yyerror(SyncRepConfigData **syncrep_parse_result_p, char **syncrep_parse_error_msg_p, yyscan_t yyscanner, const char *str);
diff --git a/src/include/replication/walsender_private.h b/src/include/replication/walsender_private.h
index e98701038f506..384b8a78b9462 100644
--- a/src/include/replication/walsender_private.h
+++ b/src/include/replication/walsender_private.h
@@ -141,10 +141,7 @@ extern void WalSndSetState(WalSndState state);
  * repl_scanner.l
  */
 union YYSTYPE;
-#ifndef YY_TYPEDEF_YY_SCANNER_T
-#define YY_TYPEDEF_YY_SCANNER_T
 typedef void *yyscan_t;
-#endif
 
 extern int	replication_yyparse(Node **replication_parse_result_p, yyscan_t yyscanner);
 extern int	replication_yylex(union YYSTYPE *yylval_param, yyscan_t yyscanner);
 pg_noreturn extern void replication_yyerror(Node **replication_parse_result_p, yyscan_t yyscanner, const char *message);
diff --git a/src/pl/plpgsql/src/plpgsql.h b/src/pl/plpgsql/src/plpgsql.h
index 41e52b8ce7183..5f193a3718399 100644
--- a/src/pl/plpgsql/src/plpgsql.h
+++ b/src/pl/plpgsql/src/plpgsql.h
@@ -1307,10 +1307,7 @@ extern void plpgsql_dumptree(PLpgSQL_function *func);
  */
 union YYSTYPE;
 #define YYLTYPE int
-#ifndef YY_TYPEDEF_YY_SCANNER_T
-#define YY_TYPEDEF_YY_SCANNER_T
 typedef void *yyscan_t;
-#endif
 
 extern int	plpgsql_yylex(union YYSTYPE *yylvalp, YYLTYPE *yyllocp, yyscan_t yyscanner);
 extern int	plpgsql_token_length(yyscan_t yyscanner);
 extern void plpgsql_push_back_token(int token, union YYSTYPE *yylvalp, YYLTYPE *yyllocp, yyscan_t yyscanner);

From 20d541a200e9dfed8affef9e798ff35ca0f30b8e Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Fri, 12 Sep 2025 10:18:31 -0400
Subject: [PATCH 62/73] ci: openbsd: Increase RAM disk's size

Its size was ~3.8 GB before, which sometimes was not enough.  OpenBSD CI
tasks were often failing due to no space left on device.

Increase the RAM disk size to ~4.6 GB.
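
(For reference, and assuming mount_mfs interprets -s as a count of
512-byte sectors: 8000000 * 512 bytes is about 3.8 GB, and
10000000 * 512 bytes is about 4.8 GB raw, which lands near the quoted
~4.6 GB once filesystem overhead is subtracted.)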
Author: Nazir Bilal Yavuz
Discussion: https://postgr.es/m/CAN55FZ2XVVPJRJmGB2DsL3gOrOinWh=HWvj6GO1cHzJ=6LwTag@mail.gmail.com
Backpatch-through: 18, where openbsd was added to CI
---
 src/tools/ci/gcp_ram_disk.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/tools/ci/gcp_ram_disk.sh b/src/tools/ci/gcp_ram_disk.sh
index d48634512ac28..18dbb2037f5dc 100755
--- a/src/tools/ci/gcp_ram_disk.sh
+++ b/src/tools/ci/gcp_ram_disk.sh
@@ -15,7 +15,12 @@ case "`uname`" in
     umount /dev/sd0j # unused /usr/obj partition
     printf "m j\n\n\nswap\nw\nq\n" | disklabel -E sd0
     swapon /dev/sd0j
-    mount -t mfs -o rw,noatime,nodev,-s=8000000 swap $CIRRUS_WORKING_DIR
+    # Remove the per-process data segment limit so that mount_mfs can allocate
+    # large memory filesystems.  Without this, mount_mfs mmap() may fail with
+    # "Cannot allocate memory" if the requested size exceeds the current
+    # datasize limit.
+    ulimit -d unlimited
+    mount -t mfs -o rw,noatime,nodev,-s=10000000 swap $CIRRUS_WORKING_DIR
     ;;
 esac

From 7dcea51c2a4dcf7c512bbd4f618d1d3620f9d3d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Herrera?=
Date: Fri, 12 Sep 2025 18:47:25 +0200
Subject: [PATCH 63/73] Avoid unexpected changes of CurrentResourceOwner and
 CurrentMemoryContext

Users of logical decoding can encounter an unexpected change of
CurrentResourceOwner and CurrentMemoryContext.  The problem is that,
unlike other call sites of RollbackAndReleaseCurrentSubTransaction(),
in reorderbuffer.c we fail to restore the original values of these
global variables after they are clobbered by subtransaction abort.
This patch saves the values prior to the call and restores them
afterwards.

In addition, logical.c and logicalfuncs.c had a hack to restore the
resource owner, presumably because of the lack of this restore.  Remove
that.  Instead, because the test coverage here is not very consistent,
add an Assert() to ensure that the resowner is kept identical; this
would make it easy to detect other cases of bugs where we fail to
restore the resowner properly.  This could be removed later.

This is arguably an old bug, but there appears to be no reason to
backpatch it and it's risky to do so, so refrain for now.

Author: Antonin Houska
Reported-by: Mihail Nikalayeu
Reviewed-by: Euler Taveira
Discussion: https://postgr.es/m/119497.1756892972@localhost
---
 src/backend/replication/logical/logical.c     | 19 +++++++++++--------
 .../replication/logical/logicalfuncs.c        | 19 +++++++++++--------
 .../replication/logical/reorderbuffer.c       | 15 +++++++++++++++
 3 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
index 7e363a7c05b4f..c68c0481f427a 100644
--- a/src/backend/replication/logical/logical.c
+++ b/src/backend/replication/logical/logical.c
@@ -2082,7 +2082,7 @@ LogicalSlotAdvanceAndCheckSnapState(XLogRecPtr moveto,
                                     bool *found_consistent_snapshot)
 {
     LogicalDecodingContext *ctx;
-    ResourceOwner old_resowner = CurrentResourceOwner;
+    ResourceOwner old_resowner PG_USED_FOR_ASSERTS_ONLY = CurrentResourceOwner;
     XLogRecPtr  retlsn;
 
     Assert(moveto != InvalidXLogRecPtr);
@@ -2141,21 +2141,24 @@ LogicalSlotAdvanceAndCheckSnapState(XLogRecPtr moveto,
          * might still have critical updates to do.
          */
         if (record)
+        {
             LogicalDecodingProcessRecord(ctx, ctx->reader);
 
+            /*
+             * We used to have bugs where logical decoding would fail to
+             * preserve the resource owner.  That's important here, so
+             * verify that that doesn't happen anymore.
XXX this could be + * removed once it's been battle-tested. + */ + Assert(CurrentResourceOwner == old_resowner); + } + CHECK_FOR_INTERRUPTS(); } if (found_consistent_snapshot && DecodingContextReady(ctx)) *found_consistent_snapshot = true; - /* - * Logical decoding could have clobbered CurrentResourceOwner during - * transaction management, so restore the executor's value. (This is - * a kluge, but it's not worth cleaning up right now.) - */ - CurrentResourceOwner = old_resowner; - if (ctx->reader->EndRecPtr != InvalidXLogRecPtr) { LogicalConfirmReceivedLocation(moveto); diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c index ca53caac2f2f5..25f890ddeedac 100644 --- a/src/backend/replication/logical/logicalfuncs.c +++ b/src/backend/replication/logical/logicalfuncs.c @@ -107,7 +107,7 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin XLogRecPtr end_of_wal; XLogRecPtr wait_for_wal_lsn; LogicalDecodingContext *ctx; - ResourceOwner old_resowner = CurrentResourceOwner; + ResourceOwner old_resowner PG_USED_FOR_ASSERTS_ONLY = CurrentResourceOwner; ArrayType *arr; Size ndim; List *options = NIL; @@ -263,8 +263,18 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin * store the description into our tuplestore. */ if (record != NULL) + { LogicalDecodingProcessRecord(ctx, ctx->reader); + /* + * We used to have bugs where logical decoding would fail to + * preserve the resource owner. Verify that that doesn't + * happen anymore. XXX this could be removed once it's been + * battle-tested. + */ + Assert(CurrentResourceOwner == old_resowner); + } + /* check limits */ if (upto_lsn != InvalidXLogRecPtr && upto_lsn <= ctx->reader->EndRecPtr) @@ -275,13 +285,6 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin CHECK_FOR_INTERRUPTS(); } - /* - * Logical decoding could have clobbered CurrentResourceOwner during - * transaction management, so restore the executor's value. (This is - * a kluge, but it's not worth cleaning up right now.) - */ - CurrentResourceOwner = old_resowner; - /* * Next time, start where we left off. (Hunting things, the family * business..) diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 34cf05668ae84..4736f993c3743 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -2215,6 +2215,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, { bool using_subtxn; MemoryContext ccxt = CurrentMemoryContext; + ResourceOwner cowner = CurrentResourceOwner; ReorderBufferIterTXNState *volatile iterstate = NULL; volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr; ReorderBufferChange *volatile specinsert = NULL; @@ -2692,7 +2693,11 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, } if (using_subtxn) + { RollbackAndReleaseCurrentSubTransaction(); + MemoryContextSwitchTo(ccxt); + CurrentResourceOwner = cowner; + } /* * We are here due to one of the four reasons: 1. 
Decoding an
@@ -2751,7 +2756,11 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
         }
 
         if (using_subtxn)
+        {
             RollbackAndReleaseCurrentSubTransaction();
+            MemoryContextSwitchTo(ccxt);
+            CurrentResourceOwner = cowner;
+        }
 
         /*
          * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
@@ -3244,6 +3253,8 @@ ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
                                    SharedInvalidationMessage *invalidations)
 {
     bool        use_subtxn = IsTransactionOrTransactionBlock();
+    MemoryContext ccxt = CurrentMemoryContext;
+    ResourceOwner cowner = CurrentResourceOwner;
     int         i;
 
     if (use_subtxn)
@@ -3262,7 +3273,11 @@ ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
         LocalExecuteInvalidationMessage(&invalidations[i]);
 
     if (use_subtxn)
+    {
         RollbackAndReleaseCurrentSubTransaction();
+        MemoryContextSwitchTo(ccxt);
+        CurrentResourceOwner = cowner;
+    }
 }
 
 /*

From 796962922e6938fdad4dbf810fb2a5dfcfc5f45a Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Fri, 12 Sep 2025 13:23:00 -0400
Subject: [PATCH 64/73] Always commute strategy when preprocessing DESC keys.

A recently added nbtree preprocessing step failed to account for the
fact that DESC columns already had their B-Tree strategy number
commuted at this point in preprocessing.  As a result, preprocessing
could output a set of scan keys where one or more keys had the correct
strategy number, but used the wrong comparison routine.

To fix, make the faulty code path that looks up a more restrictive
replacement operator/comparison routine commute its requested
inequality strategy (while outputting the transformed strategy number
as before).  This makes the final transformed scan key comport with
the approach preprocessing has always used to deal with DESC columns
(which is described by comments above _bt_fix_scankey_strategy).

Oversight in commit b3f1a13f, which made nbtree preprocessing perform
transformations on skip array inequalities that can reduce the total
number of index searches.
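
As an illustrative sketch (a hypothetical helper, not the committed
code), the invariant can be stated in terms of the existing
BTCommuteStrategyNumber() macro: whatever inequality strategy
preprocessing wants to look up in the opfamily must be commuted once
more when the scan key is on a DESC column:

    static int16
    lookup_strategy(int16 wanted, ScanKey key)
    {
        /*
         * DESC keys are stored with commuted strategies, so commute
         * the requested strategy the same way before the opfamily
         * lookup; e.g. <= becomes >=
         */
        if (key->sk_flags & SK_BT_DESC)
            return BTCommuteStrategyNumber(wanted);
        return wanted;
    }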
Author: Peter Geoghegan Reported-By: Natalya Aksman Discussion: https://postgr.es/m/19049-b7df801e71de41b2@postgresql.org Backpatch-through: 18 --- src/backend/access/nbtree/nbtpreprocesskeys.c | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 936b93f157a8b..07a3ff11a0b87 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -1412,6 +1412,7 @@ _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk, Datum orig_sk_argument = high_compare->sk_argument, new_sk_argument; bool uflow; + int16 lookupstrat; Assert(high_compare->sk_strategy == BTLessStrategyNumber); @@ -1433,9 +1434,14 @@ _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk, return; } - /* Look up <= operator (might fail) */ - leop = get_opfamily_member(opfamily, opcintype, opcintype, - BTLessEqualStrategyNumber); + /* + * Look up <= operator (might fail), accounting for the fact that a + * high_compare on a DESC column already had its strategy commuted + */ + lookupstrat = BTLessEqualStrategyNumber; + if (high_compare->sk_flags & SK_BT_DESC) + lookupstrat = BTGreaterEqualStrategyNumber; /* commute this too */ + leop = get_opfamily_member(opfamily, opcintype, opcintype, lookupstrat); if (!OidIsValid(leop)) return; cmp_proc = get_opcode(leop); @@ -1464,6 +1470,7 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, Datum orig_sk_argument = low_compare->sk_argument, new_sk_argument; bool oflow; + int16 lookupstrat; Assert(low_compare->sk_strategy == BTGreaterStrategyNumber); @@ -1485,9 +1492,14 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, return; } - /* Look up >= operator (might fail) */ - geop = get_opfamily_member(opfamily, opcintype, opcintype, - BTGreaterEqualStrategyNumber); + /* + * Look up >= operator (might fail), accounting for the fact that a + * low_compare on a DESC column already had its strategy commuted + */ + lookupstrat = BTGreaterEqualStrategyNumber; + if (low_compare->sk_flags & SK_BT_DESC) + lookupstrat = BTLessEqualStrategyNumber; /* commute this too */ + geop = get_opfamily_member(opfamily, opcintype, opcintype, lookupstrat); if (!OidIsValid(geop)) return; cmp_proc = get_opcode(geop); From 4adb0380b9bff5ec6424a9e87f76f8974b025367 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Fri, 12 Sep 2025 12:44:14 -0700 Subject: [PATCH 65/73] Replace tests of ALTER DATABASE RESET TABLESPACE. This unblocks rejection of that syntax. One copy was a misspelling of "SET TABLESPACE pg_default" that instead made no persistent changes. The other copy just needed to populate a DATABASEOID syscache entry. This slightly raises database.sql test coverage of catcache.c, while dbcommands.c coverage remains the same. 
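
For illustration, the replacement relies on the fact that any command
that must look up the database's pg_database row populates the catcache
entry as a side effect, while an aborted transaction leaves no
persistent change behind (a sketch mirroring the new test):

    BEGIN;
    -- loads the DATABASEOID syscache entry, then rolls back cleanly
    ALTER DATABASE regression_utf8
      RENAME TO regression_rename_rolled_back;
    ROLLBACK;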
Reported-by: Tom Lane Reviewed-by: Tom Lane Discussion: https://postgr.es/m/1802710.1757608564@sss.pgh.pa.us --- src/test/regress/expected/database.out | 4 ++-- src/test/regress/sql/database.sql | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out index 4cbdbdf84d0c5..6b879b0f62a75 100644 --- a/src/test/regress/expected/database.out +++ b/src/test/regress/expected/database.out @@ -2,7 +2,7 @@ CREATE DATABASE regression_tbd ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; ALTER DATABASE regression_tbd RENAME TO regression_utf8; ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; -ALTER DATABASE regression_utf8 RESET TABLESPACE; +ALTER DATABASE regression_utf8 SET TABLESPACE pg_default; ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. BEGIN; @@ -10,7 +10,7 @@ UPDATE pg_database SET datacl = array_fill(makeaclitem(10, 10, 'USAGE', false), ARRAY[5e5::int]) WHERE datname = 'regression_utf8'; -- load catcache entry, if nothing else does -ALTER DATABASE regression_utf8 RESET TABLESPACE; +ALTER DATABASE regression_utf8 RENAME TO regression_rename_rolled_back; ROLLBACK; CREATE ROLE regress_datdba_before; CREATE ROLE regress_datdba_after; diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql index 46ad2634781ea..4ef361272911e 100644 --- a/src/test/regress/sql/database.sql +++ b/src/test/regress/sql/database.sql @@ -2,7 +2,7 @@ CREATE DATABASE regression_tbd ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; ALTER DATABASE regression_tbd RENAME TO regression_utf8; ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; -ALTER DATABASE regression_utf8 RESET TABLESPACE; +ALTER DATABASE regression_utf8 SET TABLESPACE pg_default; ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. @@ -11,7 +11,7 @@ UPDATE pg_database SET datacl = array_fill(makeaclitem(10, 10, 'USAGE', false), ARRAY[5e5::int]) WHERE datname = 'regression_utf8'; -- load catcache entry, if nothing else does -ALTER DATABASE regression_utf8 RESET TABLESPACE; +ALTER DATABASE regression_utf8 RENAME TO regression_rename_rolled_back; ROLLBACK; CREATE ROLE regress_datdba_before; From f14ea34d6e563374cd71b4e7b91cf8d2f60aabb3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 12 Sep 2025 17:43:15 -0400 Subject: [PATCH 66/73] Fix oversights in pg_event_trigger_dropped_objects() fixes. Commit a0b99fc12 caused pg_event_trigger_dropped_objects() to not fill the object_name field for schemas, which it should have; and caused it to fill the object_name field for default values, which it should not have. In addition, triggers and RLS policies really should behave the same way as we're making column defaults do; that is, they should have is_temporary = true if they belong to a temporary table. Fix those things, and upgrade event_trigger.sql's woefully inadequate test coverage of these secondary output columns. As before, back-patch only to v15. 
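
A minimal illustration of the is_temporary change (a sketch with
invented names; the updated regression test covers this in full):
dropping a temporary table now reports its triggers and policies as
temporary too:

    CREATE TEMP TABLE tmp_evt (a int);
    CREATE POLICY tmp_pol ON tmp_evt USING (a > 0);
    DROP TABLE tmp_evt;
    -- a sql_drop event trigger now sees is_temporary = t for tmp_pol
    -- in pg_event_trigger_dropped_objects()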
Reported-by: Sergey Shinderuk Author: Tom Lane Discussion: https://postgr.es/m/bd7b4651-1c26-4d30-832b-f942fabcb145@postgrespro.ru Backpatch-through: 15 --- src/backend/commands/event_trigger.c | 87 ++++++++- src/test/regress/expected/event_trigger.out | 188 ++++++++++++-------- src/test/regress/sql/event_trigger.sql | 55 ++++-- 3 files changed, 244 insertions(+), 86 deletions(-) diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index fcdcba009d4e3..f34868da5ab94 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -30,6 +30,7 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_parameter_acl.h" +#include "catalog/pg_policy.h" #include "catalog/pg_proc.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_trigger.h" @@ -1302,6 +1303,7 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no MemoryContextSwitchTo(oldcxt); return; } + obj->objname = get_namespace_name(object->objectId); } else if (object->classId == AttrDefaultRelationId) { @@ -1311,7 +1313,6 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no colobject = GetAttrDefaultColumnAddress(object->objectId); if (OidIsValid(colobject.objectId)) { - colobject.objectSubId = 0; /* convert to table reference */ if (!obtain_object_name_namespace(&colobject, obj)) { pfree(obj); @@ -1320,6 +1321,90 @@ EventTriggerSQLDropAddObject(const ObjectAddress *object, bool original, bool no } } } + else if (object->classId == TriggerRelationId) + { + /* Similarly, a trigger is temp if its table is temp */ + /* Sadly, there's no lsyscache.c support for trigger objects */ + Relation pg_trigger_rel; + ScanKeyData skey[1]; + SysScanDesc sscan; + HeapTuple tuple; + Oid relid; + + /* Fetch the trigger's table OID the hard way */ + pg_trigger_rel = table_open(TriggerRelationId, AccessShareLock); + ScanKeyInit(&skey[0], + Anum_pg_trigger_oid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(object->objectId)); + sscan = systable_beginscan(pg_trigger_rel, TriggerOidIndexId, true, + NULL, 1, skey); + tuple = systable_getnext(sscan); + if (HeapTupleIsValid(tuple)) + relid = ((Form_pg_trigger) GETSTRUCT(tuple))->tgrelid; + else + relid = InvalidOid; /* shouldn't happen */ + systable_endscan(sscan); + table_close(pg_trigger_rel, AccessShareLock); + /* Do nothing if we didn't find the trigger */ + if (OidIsValid(relid)) + { + ObjectAddress relobject; + + relobject.classId = RelationRelationId; + relobject.objectId = relid; + /* Arbitrarily set objectSubId nonzero so as not to fill objname */ + relobject.objectSubId = 1; + if (!obtain_object_name_namespace(&relobject, obj)) + { + pfree(obj); + MemoryContextSwitchTo(oldcxt); + return; + } + } + } + else if (object->classId == PolicyRelationId) + { + /* Similarly, a policy is temp if its table is temp */ + /* Sadly, there's no lsyscache.c support for policy objects */ + Relation pg_policy_rel; + ScanKeyData skey[1]; + SysScanDesc sscan; + HeapTuple tuple; + Oid relid; + + /* Fetch the policy's table OID the hard way */ + pg_policy_rel = table_open(PolicyRelationId, AccessShareLock); + ScanKeyInit(&skey[0], + Anum_pg_policy_oid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(object->objectId)); + sscan = systable_beginscan(pg_policy_rel, PolicyOidIndexId, true, + NULL, 1, skey); + tuple = systable_getnext(sscan); + if (HeapTupleIsValid(tuple)) + relid = ((Form_pg_policy) GETSTRUCT(tuple))->polrelid; + else + relid = InvalidOid; 
/* shouldn't happen */ + systable_endscan(sscan); + table_close(pg_policy_rel, AccessShareLock); + /* Do nothing if we didn't find the policy */ + if (OidIsValid(relid)) + { + ObjectAddress relobject; + + relobject.classId = RelationRelationId; + relobject.objectId = relid; + /* Arbitrarily set objectSubId nonzero so as not to fill objname */ + relobject.objectSubId = 1; + if (!obtain_object_name_namespace(&relobject, obj)) + { + pfree(obj); + MemoryContextSwitchTo(oldcxt); + return; + } + } + } else { /* Generic handling for all other object classes */ diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out index 0e090cbc37500..16e4530708cc9 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -228,9 +228,15 @@ INSERT INTO undroppable_objs VALUES ('table', 'schema_one.table_three'), ('table', 'audit_tbls.schema_two_table_three'); CREATE TABLE dropped_objects ( - type text, - schema text, - object text + object_type text, + schema_name text, + object_name text, + object_identity text, + address_names text[], + address_args text[], + is_temporary bool, + original bool, + normal bool ); -- This tests errors raised within event triggers; the one in audit_tbls -- uses 2nd-level recursive invocation via test_evtrig_dropped_objects(). @@ -268,8 +274,12 @@ BEGIN END IF; INSERT INTO dropped_objects - (type, schema, object) VALUES - (obj.object_type, obj.schema_name, obj.object_identity); + (object_type, schema_name, object_name, + object_identity, address_names, address_args, + is_temporary, original, normal) VALUES + (obj.object_type, obj.schema_name, obj.object_name, + obj.object_identity, obj.address_names, obj.address_args, + obj.is_temporary, obj.original, obj.normal); END LOOP; END $$; @@ -325,42 +335,44 @@ NOTICE: table "audit_tbls_schema_two_table_three" does not exist, skipping NOTICE: table "schema_one_table_one" does not exist, skipping NOTICE: table "schema_one_table two" does not exist, skipping NOTICE: table "schema_one_table_three" does not exist, skipping -SELECT * FROM dropped_objects WHERE schema IS NULL OR schema <> 'pg_toast'; - type | schema | object ---------------+------------+------------------------------------- - table column | schema_one | schema_one.table_one.a - schema | | schema_two - table | schema_two | schema_two.table_two - type | schema_two | schema_two.table_two - type | schema_two | schema_two.table_two[] - table | audit_tbls | audit_tbls.schema_two_table_three - type | audit_tbls | audit_tbls.schema_two_table_three - type | audit_tbls | audit_tbls.schema_two_table_three[] - table | schema_two | schema_two.table_three - type | schema_two | schema_two.table_three - type | schema_two | schema_two.table_three[] - function | schema_two | schema_two.add(integer,integer) - aggregate | schema_two | schema_two.newton(integer) - schema | | schema_one - table | schema_one | schema_one.table_one - type | schema_one | schema_one.table_one - type | schema_one | schema_one.table_one[] - table | schema_one | schema_one."table two" - type | schema_one | schema_one."table two" - type | schema_one | schema_one."table two"[] - table | schema_one | schema_one.table_three - type | schema_one | schema_one.table_three - type | schema_one | schema_one.table_three[] +-- exclude TOAST objects because they have unstable names +SELECT * FROM dropped_objects + WHERE schema_name IS NULL OR schema_name <> 'pg_toast'; + object_type | schema_name | object_name | object_identity | 
address_names | address_args | is_temporary | original | normal +--------------+-------------+-------------------------+-------------------------------------+---------------------------------------+-------------------+--------------+----------+-------- + table column | schema_one | | schema_one.table_one.a | {schema_one,table_one,a} | {} | f | t | f + schema | | schema_two | schema_two | {schema_two} | {} | f | t | f + table | schema_two | table_two | schema_two.table_two | {schema_two,table_two} | {} | f | f | t + type | schema_two | table_two | schema_two.table_two | {schema_two.table_two} | {} | f | f | f + type | schema_two | _table_two | schema_two.table_two[] | {schema_two.table_two[]} | {} | f | f | f + table | audit_tbls | schema_two_table_three | audit_tbls.schema_two_table_three | {audit_tbls,schema_two_table_three} | {} | f | t | f + type | audit_tbls | schema_two_table_three | audit_tbls.schema_two_table_three | {audit_tbls.schema_two_table_three} | {} | f | f | f + type | audit_tbls | _schema_two_table_three | audit_tbls.schema_two_table_three[] | {audit_tbls.schema_two_table_three[]} | {} | f | f | f + table | schema_two | table_three | schema_two.table_three | {schema_two,table_three} | {} | f | f | t + type | schema_two | table_three | schema_two.table_three | {schema_two.table_three} | {} | f | f | f + type | schema_two | _table_three | schema_two.table_three[] | {schema_two.table_three[]} | {} | f | f | f + function | schema_two | | schema_two.add(integer,integer) | {schema_two,add} | {integer,integer} | f | f | t + aggregate | schema_two | | schema_two.newton(integer) | {schema_two,newton} | {integer} | f | f | t + schema | | schema_one | schema_one | {schema_one} | {} | f | t | f + table | schema_one | table_one | schema_one.table_one | {schema_one,table_one} | {} | f | f | t + type | schema_one | table_one | schema_one.table_one | {schema_one.table_one} | {} | f | f | f + type | schema_one | _table_one | schema_one.table_one[] | {schema_one.table_one[]} | {} | f | f | f + table | schema_one | table two | schema_one."table two" | {schema_one,"table two"} | {} | f | f | t + type | schema_one | table two | schema_one."table two" | {"schema_one.\"table two\""} | {} | f | f | f + type | schema_one | _table two | schema_one."table two"[] | {"schema_one.\"table two\"[]"} | {} | f | f | f + table | schema_one | table_three | schema_one.table_three | {schema_one,table_three} | {} | f | f | t + type | schema_one | table_three | schema_one.table_three | {schema_one.table_three} | {} | f | f | f + type | schema_one | _table_three | schema_one.table_three[] | {schema_one.table_three[]} | {} | f | f | f (23 rows) DROP OWNED BY regress_evt_user; NOTICE: schema "audit_tbls" does not exist, skipping -SELECT * FROM dropped_objects WHERE type = 'schema'; - type | schema | object ---------+--------+------------ - schema | | schema_two - schema | | schema_one - schema | | audit_tbls +SELECT * FROM dropped_objects WHERE object_type = 'schema'; + object_type | schema_name | object_name | object_identity | address_names | address_args | is_temporary | original | normal +-------------+-------------+-------------+-----------------+---------------+--------------+--------------+----------+-------- + schema | | schema_two | schema_two | {schema_two} | {} | f | t | f + schema | | schema_one | schema_one | {schema_one} | {} | f | t | f + schema | | audit_tbls | audit_tbls | {audit_tbls} | {} | f | t | f (3 rows) DROP ROLE regress_evt_user; @@ -378,9 +390,10 @@ BEGIN IF NOT r.normal AND NOT 
r.original THEN CONTINUE; END IF; - RAISE NOTICE 'NORMAL: orig=% normal=% istemp=% type=% identity=% name=% args=%', + RAISE NOTICE 'NORMAL: orig=% normal=% istemp=% type=% identity=% schema=% name=% addr=% args=%', r.original, r.normal, r.is_temporary, r.object_type, - r.object_identity, r.address_names, r.address_args; + r.object_identity, r.schema_name, r.object_name, + r.address_names, r.address_args; END LOOP; END; $$; CREATE EVENT TRIGGER regress_event_trigger_report_dropped ON sql_drop @@ -436,18 +449,18 @@ CREATE TABLE evttrig.part_15_20 PARTITION OF evttrig.part_10_20 (id) FOR VALUES FROM (15) TO (20); NOTICE: END: command_tag=CREATE TABLE type=table identity=evttrig.part_15_20 ALTER TABLE evttrig.two DROP COLUMN col_c; -NOTICE: NORMAL: orig=t normal=f istemp=f type=table column identity=evttrig.two.col_c name={evttrig,two,col_c} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table constraint identity=two_col_c_check on evttrig.two name={evttrig,two,two_col_c_check} args={} +NOTICE: NORMAL: orig=t normal=f istemp=f type=table column identity=evttrig.two.col_c schema=evttrig name= addr={evttrig,two,col_c} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table constraint identity=two_col_c_check on evttrig.two schema=evttrig name= addr={evttrig,two,two_col_c_check} args={} NOTICE: END: command_tag=ALTER TABLE type=table identity=evttrig.two ALTER TABLE evttrig.one ALTER COLUMN col_b DROP DEFAULT; -NOTICE: NORMAL: orig=t normal=f istemp=f type=default value identity=for evttrig.one.col_b name={evttrig,one,col_b} args={} +NOTICE: NORMAL: orig=t normal=f istemp=f type=default value identity=for evttrig.one.col_b schema=evttrig name= addr={evttrig,one,col_b} args={} NOTICE: END: command_tag=ALTER TABLE type=table identity=evttrig.one ALTER TABLE evttrig.one DROP CONSTRAINT one_pkey; -NOTICE: NORMAL: orig=t normal=f istemp=f type=table constraint identity=one_pkey on evttrig.one name={evttrig,one,one_pkey} args={} +NOTICE: NORMAL: orig=t normal=f istemp=f type=table constraint identity=one_pkey on evttrig.one schema=evttrig name= addr={evttrig,one,one_pkey} args={} NOTICE: END: command_tag=ALTER TABLE type=table identity=evttrig.one ALTER TABLE evttrig.one DROP COLUMN col_c; -NOTICE: NORMAL: orig=t normal=f istemp=f type=table column identity=evttrig.one.col_c name={evttrig,one,col_c} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=default value identity=for evttrig.one.col_c name={evttrig,one,col_c} args={} +NOTICE: NORMAL: orig=t normal=f istemp=f type=table column identity=evttrig.one.col_c schema=evttrig name= addr={evttrig,one,col_c} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=default value identity=for evttrig.one.col_c schema=evttrig name= addr={evttrig,one,col_c} args={} NOTICE: END: command_tag=ALTER TABLE type=table identity=evttrig.one ALTER TABLE evttrig.id ALTER COLUMN col_d SET DATA TYPE bigint; NOTICE: END: command_tag=ALTER SEQUENCE type=sequence identity=evttrig.id_col_d_seq @@ -456,26 +469,26 @@ ALTER TABLE evttrig.id ALTER COLUMN col_d DROP IDENTITY, ALTER COLUMN col_d SET DATA TYPE int; NOTICE: END: command_tag=ALTER TABLE type=table identity=evttrig.id DROP INDEX evttrig.one_idx; -NOTICE: NORMAL: orig=t normal=f istemp=f type=index identity=evttrig.one_idx name={evttrig,one_idx} args={} +NOTICE: NORMAL: orig=t normal=f istemp=f type=index identity=evttrig.one_idx schema=evttrig name=one_idx addr={evttrig,one_idx} args={} DROP SCHEMA evttrig CASCADE; NOTICE: drop cascades to 4 other objects DETAIL: drop cascades to table evttrig.one 
drop cascades to table evttrig.two drop cascades to table evttrig.id drop cascades to table evttrig.parted -NOTICE: NORMAL: orig=t normal=f istemp=f type=schema identity=evttrig name={evttrig} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.one name={evttrig,one} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=sequence identity=evttrig.one_col_a_seq name={evttrig,one_col_a_seq} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=default value identity=for evttrig.one.col_a name={evttrig,one,col_a} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.two name={evttrig,two} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.id name={evttrig,id} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.parted name={evttrig,parted} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_1_10 name={evttrig,part_1_10} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_20 name={evttrig,part_10_20} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_15 name={evttrig,part_10_15} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_15_20 name={evttrig,part_15_20} args={} +NOTICE: NORMAL: orig=t normal=f istemp=f type=schema identity=evttrig schema= name=evttrig addr={evttrig} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.one schema=evttrig name=one addr={evttrig,one} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=sequence identity=evttrig.one_col_a_seq schema=evttrig name=one_col_a_seq addr={evttrig,one_col_a_seq} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=default value identity=for evttrig.one.col_a schema=evttrig name= addr={evttrig,one,col_a} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.two schema=evttrig name=two addr={evttrig,two} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.id schema=evttrig name=id addr={evttrig,id} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.parted schema=evttrig name=parted addr={evttrig,parted} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_1_10 schema=evttrig name=part_1_10 addr={evttrig,part_1_10} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_20 schema=evttrig name=part_10_20 addr={evttrig,part_10_20} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_15 schema=evttrig name=part_10_15 addr={evttrig,part_10_15} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_15_20 schema=evttrig name=part_15_20 addr={evttrig,part_15_20} args={} DROP TABLE a_temp_tbl; -NOTICE: NORMAL: orig=t normal=f istemp=t type=table identity=pg_temp.a_temp_tbl name={pg_temp,a_temp_tbl} args={} +NOTICE: NORMAL: orig=t normal=f istemp=t type=table identity=pg_temp.a_temp_tbl schema=pg_temp name=a_temp_tbl addr={pg_temp,a_temp_tbl} args={} -- check unfiltered results, too CREATE OR REPLACE FUNCTION event_trigger_report_dropped() RETURNS event_trigger @@ -485,34 +498,61 @@ DECLARE r record; BEGIN FOR r IN SELECT * from pg_event_trigger_dropped_objects() LOOP - RAISE NOTICE 'DROP: orig=% normal=% istemp=% type=% identity=% name=% args=%', + RAISE NOTICE 'DROP: orig=% normal=% istemp=% type=% identity=% schema=% name=% addr=% args=%', r.original, r.normal, r.is_temporary, 
r.object_type, - r.object_identity, r.address_names, r.address_args; + r.object_identity, r.schema_name, r.object_name, + r.address_names, r.address_args; END LOOP; END; $$; NOTICE: END: command_tag=CREATE FUNCTION type=function identity=public.event_trigger_report_dropped() +CREATE FUNCTION event_trigger_dummy_trigger() + RETURNS trigger + LANGUAGE plpgsql +AS $$ +BEGIN + RETURN new; +END; $$; +NOTICE: END: command_tag=CREATE FUNCTION type=function identity=public.event_trigger_dummy_trigger() CREATE TABLE evtrg_nontemp_table (f1 int primary key, f2 int default 42); NOTICE: END: command_tag=CREATE TABLE type=table identity=public.evtrg_nontemp_table NOTICE: END: command_tag=CREATE INDEX type=index identity=public.evtrg_nontemp_table_pkey +CREATE TRIGGER evtrg_nontemp_trig + BEFORE INSERT ON evtrg_nontemp_table + EXECUTE FUNCTION event_trigger_dummy_trigger(); +NOTICE: END: command_tag=CREATE TRIGGER type=trigger identity=evtrg_nontemp_trig on public.evtrg_nontemp_table +CREATE POLICY evtrg_nontemp_pol ON evtrg_nontemp_table USING (f2 > 0); +NOTICE: END: command_tag=CREATE POLICY type=policy identity=evtrg_nontemp_pol on public.evtrg_nontemp_table DROP TABLE evtrg_nontemp_table; -NOTICE: DROP: orig=t normal=f istemp=f type=table identity=public.evtrg_nontemp_table name={public,evtrg_nontemp_table} args={} -NOTICE: DROP: orig=f normal=f istemp=f type=type identity=public.evtrg_nontemp_table name={public.evtrg_nontemp_table} args={} -NOTICE: DROP: orig=f normal=f istemp=f type=type identity=public.evtrg_nontemp_table[] name={public.evtrg_nontemp_table[]} args={} -NOTICE: DROP: orig=f normal=f istemp=f type=default value identity=for public.evtrg_nontemp_table.f2 name={public,evtrg_nontemp_table,f2} args={} -NOTICE: DROP: orig=f normal=f istemp=f type=table constraint identity=evtrg_nontemp_table_f1_not_null on public.evtrg_nontemp_table name={public,evtrg_nontemp_table,evtrg_nontemp_table_f1_not_null} args={} -NOTICE: DROP: orig=f normal=f istemp=f type=table constraint identity=evtrg_nontemp_table_pkey on public.evtrg_nontemp_table name={public,evtrg_nontemp_table,evtrg_nontemp_table_pkey} args={} -NOTICE: DROP: orig=f normal=f istemp=f type=index identity=public.evtrg_nontemp_table_pkey name={public,evtrg_nontemp_table_pkey} args={} +NOTICE: DROP: orig=t normal=f istemp=f type=table identity=public.evtrg_nontemp_table schema=public name=evtrg_nontemp_table addr={public,evtrg_nontemp_table} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=type identity=public.evtrg_nontemp_table schema=public name=evtrg_nontemp_table addr={public.evtrg_nontemp_table} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=type identity=public.evtrg_nontemp_table[] schema=public name=_evtrg_nontemp_table addr={public.evtrg_nontemp_table[]} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=default value identity=for public.evtrg_nontemp_table.f2 schema=public name= addr={public,evtrg_nontemp_table,f2} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=table constraint identity=evtrg_nontemp_table_f1_not_null on public.evtrg_nontemp_table schema=public name= addr={public,evtrg_nontemp_table,evtrg_nontemp_table_f1_not_null} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=table constraint identity=evtrg_nontemp_table_pkey on public.evtrg_nontemp_table schema=public name= addr={public,evtrg_nontemp_table,evtrg_nontemp_table_pkey} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=index identity=public.evtrg_nontemp_table_pkey schema=public name=evtrg_nontemp_table_pkey 
addr={public,evtrg_nontemp_table_pkey} args={} +NOTICE: DROP: orig=f normal=f istemp=f type=trigger identity=evtrg_nontemp_trig on public.evtrg_nontemp_table schema=public name= addr={public,evtrg_nontemp_table,evtrg_nontemp_trig} args={} +NOTICE: DROP: orig=f normal=t istemp=f type=policy identity=evtrg_nontemp_pol on public.evtrg_nontemp_table schema=public name= addr={public,evtrg_nontemp_table,evtrg_nontemp_pol} args={} CREATE TEMP TABLE a_temp_tbl (f1 int primary key, f2 int default 42); NOTICE: END: command_tag=CREATE TABLE type=table identity=pg_temp.a_temp_tbl NOTICE: END: command_tag=CREATE INDEX type=index identity=pg_temp.a_temp_tbl_pkey +CREATE TRIGGER a_temp_trig + BEFORE INSERT ON a_temp_tbl + EXECUTE FUNCTION event_trigger_dummy_trigger(); +NOTICE: END: command_tag=CREATE TRIGGER type=trigger identity=a_temp_trig on pg_temp.a_temp_tbl +CREATE POLICY a_temp_pol ON a_temp_tbl USING (f2 > 0); +NOTICE: END: command_tag=CREATE POLICY type=policy identity=a_temp_pol on pg_temp.a_temp_tbl DROP TABLE a_temp_tbl; -NOTICE: DROP: orig=t normal=f istemp=t type=table identity=pg_temp.a_temp_tbl name={pg_temp,a_temp_tbl} args={} -NOTICE: DROP: orig=f normal=f istemp=t type=type identity=pg_temp.a_temp_tbl name={pg_temp.a_temp_tbl} args={} -NOTICE: DROP: orig=f normal=f istemp=t type=type identity=pg_temp.a_temp_tbl[] name={pg_temp.a_temp_tbl[]} args={} -NOTICE: DROP: orig=f normal=f istemp=t type=default value identity=for pg_temp.a_temp_tbl.f2 name={pg_temp,a_temp_tbl,f2} args={} -NOTICE: DROP: orig=f normal=f istemp=t type=table constraint identity=a_temp_tbl_f1_not_null on pg_temp.a_temp_tbl name={pg_temp,a_temp_tbl,a_temp_tbl_f1_not_null} args={} -NOTICE: DROP: orig=f normal=f istemp=t type=table constraint identity=a_temp_tbl_pkey on pg_temp.a_temp_tbl name={pg_temp,a_temp_tbl,a_temp_tbl_pkey} args={} -NOTICE: DROP: orig=f normal=f istemp=t type=index identity=pg_temp.a_temp_tbl_pkey name={pg_temp,a_temp_tbl_pkey} args={} +NOTICE: DROP: orig=t normal=f istemp=t type=table identity=pg_temp.a_temp_tbl schema=pg_temp name=a_temp_tbl addr={pg_temp,a_temp_tbl} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=type identity=pg_temp.a_temp_tbl schema=pg_temp name=a_temp_tbl addr={pg_temp.a_temp_tbl} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=type identity=pg_temp.a_temp_tbl[] schema=pg_temp name=_a_temp_tbl addr={pg_temp.a_temp_tbl[]} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=default value identity=for pg_temp.a_temp_tbl.f2 schema=pg_temp name= addr={pg_temp,a_temp_tbl,f2} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=table constraint identity=a_temp_tbl_f1_not_null on pg_temp.a_temp_tbl schema=pg_temp name= addr={pg_temp,a_temp_tbl,a_temp_tbl_f1_not_null} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=table constraint identity=a_temp_tbl_pkey on pg_temp.a_temp_tbl schema=pg_temp name= addr={pg_temp,a_temp_tbl,a_temp_tbl_pkey} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=index identity=pg_temp.a_temp_tbl_pkey schema=pg_temp name=a_temp_tbl_pkey addr={pg_temp,a_temp_tbl_pkey} args={} +NOTICE: DROP: orig=f normal=f istemp=t type=trigger identity=a_temp_trig on pg_temp.a_temp_tbl schema=pg_temp name= addr={pg_temp,a_temp_tbl,a_temp_trig} args={} +NOTICE: DROP: orig=f normal=t istemp=t type=policy identity=a_temp_pol on pg_temp.a_temp_tbl schema=pg_temp name= addr={pg_temp,a_temp_tbl,a_temp_pol} args={} +DROP FUNCTION event_trigger_dummy_trigger(); +NOTICE: DROP: orig=t normal=f istemp=f type=function identity=public.event_trigger_dummy_trigger() 
schema=public name= addr={public,event_trigger_dummy_trigger} args={} -- CREATE OPERATOR CLASS without FAMILY clause should report -- both CREATE OPERATOR FAMILY and CREATE OPERATOR CLASS CREATE OPERATOR CLASS evttrigopclass FOR TYPE int USING btree AS STORAGE int; diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql index ef5978b9697aa..c613c0cfd439b 100644 --- a/src/test/regress/sql/event_trigger.sql +++ b/src/test/regress/sql/event_trigger.sql @@ -202,9 +202,15 @@ INSERT INTO undroppable_objs VALUES ('table', 'audit_tbls.schema_two_table_three'); CREATE TABLE dropped_objects ( - type text, - schema text, - object text + object_type text, + schema_name text, + object_name text, + object_identity text, + address_names text[], + address_args text[], + is_temporary bool, + original bool, + normal bool ); -- This tests errors raised within event triggers; the one in audit_tbls @@ -245,8 +251,12 @@ BEGIN END IF; INSERT INTO dropped_objects - (type, schema, object) VALUES - (obj.object_type, obj.schema_name, obj.object_identity); + (object_type, schema_name, object_name, + object_identity, address_names, address_args, + is_temporary, original, normal) VALUES + (obj.object_type, obj.schema_name, obj.object_name, + obj.object_identity, obj.address_names, obj.address_args, + obj.is_temporary, obj.original, obj.normal); END LOOP; END $$; @@ -263,10 +273,12 @@ DROP SCHEMA schema_one, schema_two CASCADE; DELETE FROM undroppable_objs WHERE object_identity = 'schema_one.table_three'; DROP SCHEMA schema_one, schema_two CASCADE; -SELECT * FROM dropped_objects WHERE schema IS NULL OR schema <> 'pg_toast'; +-- exclude TOAST objects because they have unstable names +SELECT * FROM dropped_objects + WHERE schema_name IS NULL OR schema_name <> 'pg_toast'; DROP OWNED BY regress_evt_user; -SELECT * FROM dropped_objects WHERE type = 'schema'; +SELECT * FROM dropped_objects WHERE object_type = 'schema'; DROP ROLE regress_evt_user; @@ -285,9 +297,10 @@ BEGIN IF NOT r.normal AND NOT r.original THEN CONTINUE; END IF; - RAISE NOTICE 'NORMAL: orig=% normal=% istemp=% type=% identity=% name=% args=%', + RAISE NOTICE 'NORMAL: orig=% normal=% istemp=% type=% identity=% schema=% name=% addr=% args=%', r.original, r.normal, r.is_temporary, r.object_type, - r.object_identity, r.address_names, r.address_args; + r.object_identity, r.schema_name, r.object_name, + r.address_names, r.address_args; END LOOP; END; $$; CREATE EVENT TRIGGER regress_event_trigger_report_dropped ON sql_drop @@ -346,17 +359,37 @@ DECLARE r record; BEGIN FOR r IN SELECT * from pg_event_trigger_dropped_objects() LOOP - RAISE NOTICE 'DROP: orig=% normal=% istemp=% type=% identity=% name=% args=%', + RAISE NOTICE 'DROP: orig=% normal=% istemp=% type=% identity=% schema=% name=% addr=% args=%', r.original, r.normal, r.is_temporary, r.object_type, - r.object_identity, r.address_names, r.address_args; + r.object_identity, r.schema_name, r.object_name, + r.address_names, r.address_args; END LOOP; END; $$; +CREATE FUNCTION event_trigger_dummy_trigger() + RETURNS trigger + LANGUAGE plpgsql +AS $$ +BEGIN + RETURN new; +END; $$; + CREATE TABLE evtrg_nontemp_table (f1 int primary key, f2 int default 42); +CREATE TRIGGER evtrg_nontemp_trig + BEFORE INSERT ON evtrg_nontemp_table + EXECUTE FUNCTION event_trigger_dummy_trigger(); +CREATE POLICY evtrg_nontemp_pol ON evtrg_nontemp_table USING (f2 > 0); DROP TABLE evtrg_nontemp_table; + CREATE TEMP TABLE a_temp_tbl (f1 int primary key, f2 int default 42); +CREATE TRIGGER a_temp_trig 
+ BEFORE INSERT ON a_temp_tbl + EXECUTE FUNCTION event_trigger_dummy_trigger(); +CREATE POLICY a_temp_pol ON a_temp_tbl USING (f2 > 0); DROP TABLE a_temp_tbl; +DROP FUNCTION event_trigger_dummy_trigger(); + -- CREATE OPERATOR CLASS without FAMILY clause should report -- both CREATE OPERATOR FAMILY and CREATE OPERATOR CLASS CREATE OPERATOR CLASS evttrigopclass FOR TYPE int USING btree AS STORAGE int; From 9a71989a8f61d7ee003c443a979a1bd43a08ff63 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 12 Sep 2025 18:10:11 -0400 Subject: [PATCH 67/73] Reject "ALTER DATABASE/USER ... RESET foo" with invalid GUC name. If the database or user had no entry in pg_db_role_setting, RESET silently did nothing --- including not checking the validity of the given GUC name. This is quite inconsistent and surprising, because you *would* get such an error if there were any pg_db_role_setting entry, even though it contains values for unrelated GUCs. While this is clearly a bug, changing it in stable branches seems unwise. The effect will be that some ALTER commands that formerly were no-ops will now be errors, and people don't like that sort of thing in minor releases. Author: Vitaly Davydov Reviewed-by: Tom Lane Discussion: https://postgr.es/m/30783e-68c28a00-9-41004480@130449754 --- src/backend/catalog/pg_db_role_setting.c | 9 +++++++++ .../modules/unsafe_tests/expected/setconfig.out | 13 +++++++++++++ src/test/modules/unsafe_tests/sql/setconfig.sql | 9 +++++++++ 3 files changed, 31 insertions(+) diff --git a/src/backend/catalog/pg_db_role_setting.c b/src/backend/catalog/pg_db_role_setting.c index 090fc07c28acb..832e49a34bea5 100644 --- a/src/backend/catalog/pg_db_role_setting.c +++ b/src/backend/catalog/pg_db_role_setting.c @@ -151,6 +151,15 @@ AlterSetting(Oid databaseid, Oid roleid, VariableSetStmt *setstmt) CatalogTupleInsert(rel, newtuple); } + else + { + /* + * RESET doesn't need to change any state if there's no pre-existing + * pg_db_role_setting entry, but for consistency we should still check + * that the option is valid and we're allowed to set it. 
+ */ + (void) GUCArrayDelete(NULL, setstmt->name); + } InvokeObjectPostAlterHookArg(DbRoleSettingRelationId, databaseid, 0, roleid, false); diff --git a/src/test/modules/unsafe_tests/expected/setconfig.out b/src/test/modules/unsafe_tests/expected/setconfig.out index 5f42443e144b9..37e33709012c2 100644 --- a/src/test/modules/unsafe_tests/expected/setconfig.out +++ b/src/test/modules/unsafe_tests/expected/setconfig.out @@ -62,6 +62,19 @@ SELECT current_user, session_user; SET ROLE NONE; DO $$BEGIN EXECUTE format( 'ALTER DATABASE %I RESET role', current_catalog); END$$; +-- Test some error cases +DO $$BEGIN EXECUTE format( + 'ALTER DATABASE %I SET bogus = 0', current_catalog); END$$; +ERROR: unrecognized configuration parameter "bogus" +CONTEXT: SQL statement "ALTER DATABASE contrib_regression SET bogus = 0" +PL/pgSQL function inline_code_block line 1 at EXECUTE +DO $$BEGIN EXECUTE format( + 'ALTER DATABASE %I RESET bogus', current_catalog); END$$; +ERROR: unrecognized configuration parameter "bogus" +CONTEXT: SQL statement "ALTER DATABASE contrib_regression RESET bogus" +PL/pgSQL function inline_code_block line 1 at EXECUTE +ALTER USER regress_authenticated_user_db_ssa RESET bogus; +ERROR: unrecognized configuration parameter "bogus" -- Test connection string options \c -reuse-previous=on "user=regress_authenticated_user_db_sr options=-crole=regress_current_user" SELECT current_user, session_user; diff --git a/src/test/modules/unsafe_tests/sql/setconfig.sql b/src/test/modules/unsafe_tests/sql/setconfig.sql index 81296d1091b47..d9e1fc908a125 100644 --- a/src/test/modules/unsafe_tests/sql/setconfig.sql +++ b/src/test/modules/unsafe_tests/sql/setconfig.sql @@ -50,6 +50,15 @@ DO $$BEGIN EXECUTE format( 'ALTER DATABASE %I RESET role', current_catalog); END$$; +-- Test some error cases + +DO $$BEGIN EXECUTE format( + 'ALTER DATABASE %I SET bogus = 0', current_catalog); END$$; +DO $$BEGIN EXECUTE format( + 'ALTER DATABASE %I RESET bogus', current_catalog); END$$; +ALTER USER regress_authenticated_user_db_ssa RESET bogus; + + -- Test connection string options \c -reuse-previous=on "user=regress_authenticated_user_db_sr options=-crole=regress_current_user" From 88824e68611a88a4ef7218c093810a94f86e12e0 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 12 Sep 2025 18:45:06 -0400 Subject: [PATCH 68/73] Avoid context dependency in test case added by 9a71989a8. It's not quite clear to me why this didn't show up in my local check-world testing, but some of the buildfarm evidently runs this test with a different database name. Adjust the test so that that doesn't affect the reported error messages. --- src/test/modules/unsafe_tests/expected/setconfig.out | 8 ++++---- src/test/modules/unsafe_tests/sql/setconfig.sql | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/test/modules/unsafe_tests/expected/setconfig.out b/src/test/modules/unsafe_tests/expected/setconfig.out index 37e33709012c2..5318f075d1e8c 100644 --- a/src/test/modules/unsafe_tests/expected/setconfig.out +++ b/src/test/modules/unsafe_tests/expected/setconfig.out @@ -63,18 +63,18 @@ SET ROLE NONE; DO $$BEGIN EXECUTE format( 'ALTER DATABASE %I RESET role', current_catalog); END$$; -- Test some error cases +-- We have to use terse mode so that the database name doesn't +-- appear in the error output. 
+\set VERBOSITY terse DO $$BEGIN EXECUTE format( 'ALTER DATABASE %I SET bogus = 0', current_catalog); END$$; ERROR: unrecognized configuration parameter "bogus" -CONTEXT: SQL statement "ALTER DATABASE contrib_regression SET bogus = 0" -PL/pgSQL function inline_code_block line 1 at EXECUTE DO $$BEGIN EXECUTE format( 'ALTER DATABASE %I RESET bogus', current_catalog); END$$; ERROR: unrecognized configuration parameter "bogus" -CONTEXT: SQL statement "ALTER DATABASE contrib_regression RESET bogus" -PL/pgSQL function inline_code_block line 1 at EXECUTE ALTER USER regress_authenticated_user_db_ssa RESET bogus; ERROR: unrecognized configuration parameter "bogus" +\set VERBOSITY default -- Test connection string options \c -reuse-previous=on "user=regress_authenticated_user_db_sr options=-crole=regress_current_user" SELECT current_user, session_user; diff --git a/src/test/modules/unsafe_tests/sql/setconfig.sql b/src/test/modules/unsafe_tests/sql/setconfig.sql index d9e1fc908a125..4349490f94117 100644 --- a/src/test/modules/unsafe_tests/sql/setconfig.sql +++ b/src/test/modules/unsafe_tests/sql/setconfig.sql @@ -51,12 +51,16 @@ DO $$BEGIN EXECUTE format( -- Test some error cases +-- We have to use terse mode so that the database name doesn't +-- appear in the error output. +\set VERBOSITY terse DO $$BEGIN EXECUTE format( 'ALTER DATABASE %I SET bogus = 0', current_catalog); END$$; DO $$BEGIN EXECUTE format( 'ALTER DATABASE %I RESET bogus', current_catalog); END$$; ALTER USER regress_authenticated_user_db_ssa RESET bogus; +\set VERBOSITY default -- Test connection string options From f6edf403a99923b98f8b8b3398c7ef32e1ae9a3e Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sat, 13 Sep 2025 20:38:52 +0300 Subject: [PATCH 69/73] Specify locale provider for pg_regress --no-locale pg_regress has a --no-locale option that forces the temporary database to have C locale. However, currently, locale C only exists in the 'builtin' locale provider. This makes 'pg_regress --no-locale' fail when the default locale provider is not 'builtin'. This commit makes 'pg_regress --no-locale' specify both LOCALE='C' and LOCALE_PROVIDER='builtin'. Discussion: https://postgr.es/m/b54921f95e23b4391b1613e9053a3d58%40postgrespro.ru Author: Oleg Tselebrovskiy Reviewed-by: Alexander Korotkov --- src/test/regress/pg_regress.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index 5d85dcc62f0a5..61c035a39834a 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -1968,10 +1968,10 @@ create_database(const char *dbname) */ if (encoding) psql_add_command(buf, "CREATE DATABASE \"%s\" TEMPLATE=template0 ENCODING='%s'%s", dbname, encoding, - (nolocale) ? " LOCALE='C'" : ""); + (nolocale) ? " LOCALE='C' LOCALE_PROVIDER='builtin'" : ""); else psql_add_command(buf, "CREATE DATABASE \"%s\" TEMPLATE=template0%s", dbname, - (nolocale) ? " LOCALE='C'" : ""); + (nolocale) ? " LOCALE='C' LOCALE_PROVIDER='builtin'" : ""); psql_add_command(buf, "ALTER DATABASE \"%s\" SET lc_messages TO 'C';" "ALTER DATABASE \"%s\" SET lc_monetary TO 'C';" From 7e9c216b5236cc61f677787b35e8c8f28f5f6959 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Sat, 13 Sep 2025 14:50:02 -0500 Subject: [PATCH 70/73] Re-pgindent nbtpreprocesskeys.c after commit 796962922e. 
Backpatch-through: 18 --- src/backend/access/nbtree/nbtpreprocesskeys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 07a3ff11a0b87..71ddd68fd548c 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -1498,7 +1498,7 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, */ lookupstrat = BTGreaterEqualStrategyNumber; if (low_compare->sk_flags & SK_BT_DESC) - lookupstrat = BTLessEqualStrategyNumber; /* commute this too */ + lookupstrat = BTLessEqualStrategyNumber; /* commute this too */ geop = get_opfamily_member(opfamily, opcintype, opcintype, lookupstrat); if (!OidIsValid(geop)) return; From 95bdc672282722fb52656a81fefe18296015708e Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Sat, 13 Sep 2025 14:55:38 -0500 Subject: [PATCH 71/73] Add commit 7e9c216b52 to .git-blame-ignore-revs. --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index f83e2fc658664..65aecdffaca6e 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -14,6 +14,9 @@ # # $ git log --pretty=format:"%H # %cd%n# %s" $PGINDENTGITHASH -1 --date=iso +7e9c216b5236cc61f677787b35e8c8f28f5f6959 # 2025-09-13 14:50:02 -0500 +# Re-pgindent nbtpreprocesskeys.c after commit 796962922e. + 1d1612aec7688139e1a5506df1366b4b6a69605d # 2025-07-29 09:10:41 -0400 # Run pgindent. From cdf7feb96562071f15ceb070272d7e84246d943d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 13 Sep 2025 16:55:51 -0400 Subject: [PATCH 72/73] Amend recent fix for SIMILAR TO regex conversion. Commit e3ffc3e91 fixed the translation of character classes in SIMILAR TO regular expressions. Unfortunately the fix broke a corner case: if there is an escape character right after the opening bracket (for example in "[\q]"), a closing bracket right after the escape sequence would not be seen as closing the character class. There were two more oversights: a backslash or a nested opening bracket right at the beginning of a character class should remove the special meaning from any following caret or closing bracket. This bug suggests that this code needs to be more readable, so also rename the variables "charclass_depth" and "charclass_start" to something more meaningful, rewrite an "if" cascade to be more consistent, and improve the commentary. 
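As a concrete illustration (a sketch based on the description above, using
the default backslash escape character; this example is constructed for
illustration and is not one of the committed test cases):

    SELECT 'qx' SIMILAR TO '[\q]%';
    -- The ']' right after the escape sequence '\q' now closes the class,
    -- so the '%' is translated to the regex '.*' and this returns true.
    -- Before this fix the translator still believed it was inside the
    -- bracket expression and copied the '%' through literally.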
Reported-by: Dominique Devienne Reported-by: Stephan Springl Author: Laurenz Albe Reviewed-by: Tom Lane Discussion: https://postgr.es/m/CAFCRh-8NwJd0jq6P=R3qhHyqU7hw0BTor3W0SvUcii24et+zAw@mail.gmail.com Backpatch-through: 13 --- src/backend/utils/adt/regexp.c | 97 +++++++++++++++++++-------- src/test/regress/expected/strings.out | 9 +++ src/test/regress/sql/strings.sql | 3 + 3 files changed, 82 insertions(+), 27 deletions(-) diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 6e2864cbbda8c..b0cdef9b19fa2 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -774,10 +774,8 @@ similar_escape_internal(text *pat_text, text *esc_text) elen; bool afterescape = false; int nquotes = 0; - int charclass_depth = 0; /* Nesting level of character classes, - * encompassed by square brackets */ - int charclass_start = 0; /* State of the character class start, - * for carets */ + int bracket_depth = 0; /* square bracket nesting level */ + int charclass_pos = 0; /* position inside a character class */ p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); @@ -836,6 +834,17 @@ similar_escape_internal(text *pat_text, text *esc_text) * the relevant part separators in the above expansion. If the result * of this function is used in a plain regexp match (SIMILAR TO), the * escape-double-quotes have no effect on the match behavior. + * + * While we don't fully validate character classes (bracket expressions), + * we do need to parse them well enough to know where they end. + * "charclass_pos" tracks where we are in a character class. + * Its value is uninteresting when bracket_depth is 0. + * But when bracket_depth > 0, it will be + * 1: right after the opening '[' (a following '^' will negate + * the class, while ']' is a literal character) + * 2: right after a '^' after the opening '[' (']' is still a literal + * character) + * 3 or more: further inside the character class (']' ends the class) *---------- */ @@ -907,7 +916,7 @@ similar_escape_internal(text *pat_text, text *esc_text) /* fast path */ if (afterescape) { - if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */ + if (pchar == '"' && bracket_depth < 1) /* escape-double-quote? */ { /* emit appropriate part separator, per notes above */ if (nquotes == 0) @@ -948,6 +957,12 @@ similar_escape_internal(text *pat_text, text *esc_text) */ *r++ = '\\'; *r++ = pchar; + + /* + * If we encounter an escaped character in a character class, + * we are no longer at the beginning. + */ + charclass_pos = 3; } afterescape = false; } @@ -956,41 +971,69 @@ similar_escape_internal(text *pat_text, text *esc_text) /* SQL escape character; do not send to output */ afterescape = true; } - else if (charclass_depth > 0) + else if (bracket_depth > 0) { + /* inside a character class */ if (pchar == '\\') + { + /* + * If we're here, backslash is not the SQL escape character, + * so treat it as a literal class element, which requires + * doubling it. (This matches our behavior for backslashes + * outside character classes.) + */ *r++ = '\\'; + } *r++ = pchar; - /* - * Ignore a closing bracket at the start of a character class. - * Such a bracket is taken literally rather than closing the - * class. "charclass_start" is 1 right at the beginning of a - * class and 2 after an initial caret. 
-			 */
-			if (pchar == ']' && charclass_start > 2)
-				charclass_depth--;
+			/* parse the character class well enough to identify ending ']' */
+			if (pchar == ']' && charclass_pos > 2)
+			{
+				/* found the real end of a bracket pair */
+				bracket_depth--;
+				/* don't reset charclass_pos, this may be an inner bracket */
+			}
 			else if (pchar == '[')
-				charclass_depth++;
+			{
+				/* start of a nested bracket pair */
+				bracket_depth++;
 
-			/*
-			 * If there is a caret right after the opening bracket, it negates
-			 * the character class, but a following closing bracket should
-			 * still be treated as a normal character. That holds only for
-			 * the first caret, so only the values 1 and 2 mean that closing
-			 * brackets should be taken literally.
-			 */
-			if (pchar == '^')
-				charclass_start++;
+				/*
+				 * We are no longer at the beginning of a character class.
+				 * (The nested bracket pair is a collating element, not a
+				 * character class in its own right.)
+				 */
+				charclass_pos = 3;
+			}
+			else if (pchar == '^')
+			{
+				/*
+				 * A caret right after the opening bracket negates the
+				 * character class. In that case, the following will
+				 * increment charclass_pos from 1 to 2, so that a following
+				 * ']' is still a literal character and does not end the
+				 * character class. If we are further inside a character
+				 * class, charclass_pos might get incremented past 3, which is
+				 * fine.
+				 */
+				charclass_pos++;
+			}
 			else
-				charclass_start = 3;	/* definitely past the start */
+			{
+				/*
+				 * Anything else (including a backslash or leading ']') is an
+				 * element of the character class, so we are no longer at the
+				 * beginning of the class.
+				 */
+				charclass_pos = 3;
+			}
 		}
 		else if (pchar == '[')
 		{
 			/* start of a character class */
 			*r++ = pchar;
-			charclass_depth++;
-			charclass_start = 1;
+			bracket_depth = 1;
+			charclass_pos = 1;
 		}
 		else if (pchar == '%')
 		{
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index ba302da51e7b2..2d6cb02ad6085 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -693,6 +693,15 @@ EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
    Filter: (f1 ~ '^(?:[^^]\^)$'::text)
 (2 rows)
 
+-- Closing square bracket after an escape sequence at the beginning of
+-- a character class closes the character class
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[|a]%' ESCAPE '|';
+              QUERY PLAN
+---------------------------------------
+ Seq Scan on text_tbl
+   Filter: (f1 ~ '^(?:[\a].*)$'::text)
+(2 rows)
+
 -- Test backslash escapes in regexp_replace's replacement string
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
  regexp_replace
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index b94004cc08ce6..5ed421d62059d 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -218,6 +218,9 @@ EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
 -- Closing square bracket effective after two carets at the beginning
 -- of character class.
 EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
+-- Closing square bracket after an escape sequence at the beginning of
+-- a character class closes the character class
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[|a]%' ESCAPE '|';
 
 -- Test backslash escapes in regexp_replace's replacement string
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');

From 454c046094ab3431c2ce0c540c46e623bc05bd1a Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Sat, 13 Sep 2025 21:01:33 -0400
Subject: [PATCH 73/73] nbtree: Always set skipScan flag on rescan.

The TimescaleDB extension expects to be able to change an nbtree scan's
keys across rescans. The issue arises in the extension's implementation
of loose index scan. This is arguably a misuse of the index AM API,
though apparently it worked until recently. It stopped working when the
skipScan flag was added to BTScanOpaqueData by commit 8a510275, though.
The flag wouldn't reliably track whether the scan (actually, the current
rescan) has any skip arrays, leading to confusion in _bt_set_startikey.

nbtree preprocessing will now defensively initialize the scan's skipScan
flag in all cases, including the case where _bt_preprocess_array_keys
returns early due to the (re)scan not using arrays. While nbtree isn't
obligated to support this use case (at least not according to my reading
of the index AM API), it still seems like a good idea to be consistent
here, on general robustness grounds.

Author: Peter Geoghegan
Reported-By: Natalya Aksman
Discussion: https://postgr.es/m/CAJumhcirfMojbk20+W0YimbNDkwdECvJprQGQ-XqK--ph09nQw@mail.gmail.com
Backpatch-through: 18
---
 src/backend/access/nbtree/nbtpreprocesskeys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c
index 71ddd68fd548c..7b7d7860d8f53 100644
--- a/src/backend/access/nbtree/nbtpreprocesskeys.c
+++ b/src/backend/access/nbtree/nbtpreprocesskeys.c
@@ -1854,6 +1854,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys)
 	 * (also checks if we should add extra skip arrays based on input keys)
 	 */
 	numArrayKeys = _bt_num_array_keys(scan, skip_eq_ops, &numSkipArrayKeys);
+	so->skipScan = (numSkipArrayKeys > 0);
 
 	/* Quit if nothing to do. */
 	if (numArrayKeys == 0)
@@ -1883,7 +1884,6 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys)
 	arrayKeyData = (ScanKey) palloc(numArrayKeyData * sizeof(ScanKeyData));
 
 	/* Allocate space for per-array data in the workspace context */
-	so->skipScan = (numSkipArrayKeys > 0);
 	so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo));
 
 	/* Allocate space for ORDER procs used to help _bt_checkkeys */