From ae453120085f7da8f4082bb912e9668410cdccab Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 5 Sep 2025 12:59:29 +0900 Subject: [PATCH 01/34] Change pg_lsn_in_internal() to use soft error reporting pg_lsn includes pg_lsn_in_internal() for the purpose of parsing a LSN position for the GUC recovery_target_lsn (21f428ebde39). It relies on a boolean called "have_error" that would be set when the LSN parsing fails, then let its callers handle any errors. d9f7f5d32f20 has added support for soft error reporting. This commit removes some boilerplate code and switches the routine to use soft error reporting directly, giving to the callers of pg_lsn_in_internal() the possibility to be fed the error message generated on failure. pg_lsn_in_internal() routine is renamed to pg_lsn_in_safe(), for consistency with other similar routines that are given an escontext. Author: Amul Sul Reviewed-by: Dean Rasheed Discussion: https://postgr.es/m/CAAJ_b96No5h5tRuR+KhcC44YcYUCw8WAHuLoqqyyop8_k3+JDQ@mail.gmail.com --- src/backend/access/transam/xlogrecovery.c | 6 ++-- src/backend/utils/adt/pg_lsn.c | 35 ++++++++++------------- src/include/utils/pg_lsn.h | 5 +++- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index f23ec8969c27d..346319338a0ee 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -4834,10 +4834,10 @@ check_recovery_target_lsn(char **newval, void **extra, GucSource source) { XLogRecPtr lsn; XLogRecPtr *myextra; - bool have_error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - lsn = pg_lsn_in_internal(*newval, &have_error); - if (have_error) + lsn = pg_lsn_in_safe(*newval, (Node *) &escontext); + if (escontext.error_occurred) return false; myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr)); diff --git a/src/backend/utils/adt/pg_lsn.c b/src/backend/utils/adt/pg_lsn.c index 12de2446f5b69..e1ec5f3bc69cf 100644 --- a/src/backend/utils/adt/pg_lsn.c +++ b/src/backend/utils/adt/pg_lsn.c @@ -25,8 +25,11 @@ * Formatting and conversion routines. *---------------------------------------------------------*/ +/* + * Internal version of pg_lsn_in() with support for soft error reporting. + */ XLogRecPtr -pg_lsn_in_internal(const char *str, bool *have_error) +pg_lsn_in_safe(const char *str, Node *escontext) { int len1, len2; @@ -34,22 +37,14 @@ pg_lsn_in_internal(const char *str, bool *have_error) off; XLogRecPtr result; - Assert(have_error != NULL); - *have_error = false; - /* Sanity check input format. */ len1 = strspn(str, "0123456789abcdefABCDEF"); if (len1 < 1 || len1 > MAXPG_LSNCOMPONENT || str[len1] != '/') - { - *have_error = true; - return InvalidXLogRecPtr; - } + goto syntax_error; + len2 = strspn(str + len1 + 1, "0123456789abcdefABCDEF"); if (len2 < 1 || len2 > MAXPG_LSNCOMPONENT || str[len1 + 1 + len2] != '\0') - { - *have_error = true; - return InvalidXLogRecPtr; - } + goto syntax_error; /* Decode result. */ id = (uint32) strtoul(str, NULL, 16); @@ -57,6 +52,12 @@ pg_lsn_in_internal(const char *str, bool *have_error) result = ((uint64) id << 32) | off; return result; + +syntax_error: + ereturn(escontext, InvalidXLogRecPtr, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s: \"%s\"", + "pg_lsn", str))); } Datum @@ -64,14 +65,8 @@ pg_lsn_in(PG_FUNCTION_ARGS) { char *str = PG_GETARG_CSTRING(0); XLogRecPtr result; - bool have_error = false; - - result = pg_lsn_in_internal(str, &have_error); - if (have_error) - ereturn(fcinfo->context, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s: \"%s\"", - "pg_lsn", str))); + + result = pg_lsn_in_safe(str, fcinfo->context); PG_RETURN_LSN(result); } diff --git a/src/include/utils/pg_lsn.h b/src/include/utils/pg_lsn.h index ae198af745029..461a4fdcba954 100644 --- a/src/include/utils/pg_lsn.h +++ b/src/include/utils/pg_lsn.h @@ -18,6 +18,9 @@ #include "access/xlogdefs.h" #include "fmgr.h" +/* forward declaration to avoid node.h include */ +typedef struct Node Node; + static inline XLogRecPtr DatumGetLSN(Datum X) { @@ -33,6 +36,6 @@ LSNGetDatum(XLogRecPtr X) #define PG_GETARG_LSN(n) DatumGetLSN(PG_GETARG_DATUM(n)) #define PG_RETURN_LSN(x) return LSNGetDatum(x) -extern XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error); +extern XLogRecPtr pg_lsn_in_safe(const char *str, Node *escontext); #endif /* PG_LSN_H */ From 4246a977bad6e76c4276a0d52def8a3dced154bb Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 5 Sep 2025 13:53:47 +0900 Subject: [PATCH 02/34] Switch some numeric-related functions to use soft error reporting This commit changes some functions related to the data type numeric to use the soft error reporting rather than a custom boolean flag (called "have_error") that callers of these functions could rely on to bypass the generation of ERROR reports, letting the callers do their own error handling (timestamp, jsonpath and numeric_to_char() require them). This results in the removal of some boilerplate code that was required to handle both the ereport() and the "have_error" code paths bypassing ereport(), unifying everything under the soft error reporting facility. While on it, some duplicated error messages are removed. The function upgraded in this commit were suffixed with "_opt_error" in their names. They are renamed to "_safe" instead. This change relies on d9f7f5d32f20, that has introduced the soft error reporting infrastructure. Author: Amul Sul Reviewed-by: Dean Rasheed Discussion: https://postgr.es/m/CAAJ_b96No5h5tRuR+KhcC44YcYUCw8WAHuLoqqyyop8_k3+JDQ@mail.gmail.com --- src/backend/utils/adt/formatting.c | 6 +- src/backend/utils/adt/jsonpath_exec.c | 62 +++--- src/backend/utils/adt/numeric.c | 266 ++++++++------------------ src/backend/utils/adt/timestamp.c | 46 ++--- src/include/utils/numeric.h | 22 +-- 5 files changed, 152 insertions(+), 250 deletions(-) diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 7ad453314c307..78e19ac39ac17 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -6389,12 +6389,12 @@ numeric_to_char(PG_FUNCTION_ARGS) if (IS_ROMAN(&Num)) { int32 intvalue; - bool err; + ErrorSaveContext escontext = {T_ErrorSaveContext}; /* Round and convert to int */ - intvalue = numeric_int4_opt_error(value, &err); + intvalue = numeric_int4_safe(value, (Node *) &escontext); /* On overflow, just use PG_INT32_MAX; int_to_roman will cope */ - if (err) + if (escontext.error_occurred) intvalue = PG_INT32_MAX; numstr = int_to_roman(intvalue); } diff --git a/src/backend/utils/adt/jsonpath_exec.c b/src/backend/utils/adt/jsonpath_exec.c index 5a56253522357..8156695e97e09 100644 --- a/src/backend/utils/adt/jsonpath_exec.c +++ b/src/backend/utils/adt/jsonpath_exec.c @@ -252,7 +252,8 @@ typedef JsonPathBool (*JsonPathPredicateCallback) (JsonPathItem *jsp, JsonbValue *larg, JsonbValue *rarg, void *param); -typedef Numeric (*BinaryArithmFunc) (Numeric num1, Numeric num2, bool *error); +typedef Numeric (*BinaryArithmFunc) (Numeric num1, Numeric num2, + Node *escontext); static JsonPathExecResult executeJsonPath(JsonPath *path, void *vars, JsonPathGetVarCallback getVar, @@ -808,23 +809,23 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, case jpiAdd: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_add_opt_error, found); + numeric_add_safe, found); case jpiSub: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_sub_opt_error, found); + numeric_sub_safe, found); case jpiMul: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_mul_opt_error, found); + numeric_mul_safe, found); case jpiDiv: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_div_opt_error, found); + numeric_div_safe, found); case jpiMod: return executeBinaryArithmExpr(cxt, jsp, jb, - numeric_mod_opt_error, found); + numeric_mod_safe, found); case jpiPlus: return executeUnaryArithmExpr(cxt, jsp, jb, NULL, found); @@ -1269,11 +1270,12 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jb->type == jbvNumeric) { - bool have_error; + ErrorSaveContext escontext = {T_ErrorSaveContext}; int64 val; - val = numeric_int8_opt_error(jb->val.numeric, &have_error); - if (have_error) + val = numeric_int8_safe(jb->val.numeric, + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("argument \"%s\" of jsonpath item method .%s() is invalid for type %s", @@ -1466,7 +1468,6 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, Datum dtypmod; int32 precision; int32 scale = 0; - bool have_error; bool noerr; ArrayType *arrtypmod; Datum datums[2]; @@ -1478,9 +1479,9 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (elem.type != jpiNumeric) elog(ERROR, "invalid jsonpath item type for .decimal() precision"); - precision = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + precision = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("precision of jsonpath item method .%s() is out of range for type integer", @@ -1492,9 +1493,9 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (elem.type != jpiNumeric) elog(ERROR, "invalid jsonpath item type for .decimal() scale"); - scale = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + scale = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("scale of jsonpath item method .%s() is out of range for type integer", @@ -1550,11 +1551,12 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jb->type == jbvNumeric) { - bool have_error; int32 val; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - val = numeric_int4_opt_error(jb->val.numeric, &have_error); - if (have_error) + val = numeric_int4_safe(jb->val.numeric, + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_NON_NUMERIC_SQL_JSON_ITEM), errmsg("argument \"%s\" of jsonpath item method .%s() is invalid for type %s", @@ -2149,11 +2151,11 @@ executeBinaryArithmExpr(JsonPathExecContext *cxt, JsonPathItem *jsp, } else { - bool error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; - res = func(lval->val.numeric, rval->val.numeric, &error); + res = func(lval->val.numeric, rval->val.numeric, (Node *) &escontext); - if (error) + if (escontext.error_occurred) return jperError; } @@ -2433,7 +2435,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, if (jsp->type != jpiDatetime && jsp->type != jpiDate && jsp->content.arg) { - bool have_error; + ErrorSaveContext escontext = {T_ErrorSaveContext}; jspGetArg(jsp, &elem); @@ -2441,9 +2443,9 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, elog(ERROR, "invalid jsonpath item type for %s argument", jspOperationName(jsp->type)); - time_precision = numeric_int4_opt_error(jspGetNumeric(&elem), - &have_error); - if (have_error) + time_precision = numeric_int4_safe(jspGetNumeric(&elem), + (Node *) &escontext); + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION), errmsg("time precision of jsonpath item method .%s() is out of range for type integer", @@ -3462,7 +3464,7 @@ getArrayIndex(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbValue *jb, JsonValueList found = {0}; JsonPathExecResult res = executeItem(cxt, jsp, jb, &found); Datum numeric_index; - bool have_error = false; + ErrorSaveContext escontext = {T_ErrorSaveContext}; if (jperIsError(res)) return res; @@ -3477,10 +3479,10 @@ getArrayIndex(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbValue *jb, NumericGetDatum(jbv->val.numeric), Int32GetDatum(0)); - *index = numeric_int4_opt_error(DatumGetNumeric(numeric_index), - &have_error); + *index = numeric_int4_safe(DatumGetNumeric(numeric_index), + (Node *) &escontext); - if (have_error) + if (escontext.error_occurred) RETURN_ERROR(ereport(ERROR, (errcode(ERRCODE_INVALID_SQL_JSON_SUBSCRIPT), errmsg("jsonpath array subscript is out of integer range")))); diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index b6287f5d97305..76269918593d7 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -517,7 +517,7 @@ static void numericvar_deserialize(StringInfo buf, NumericVar *var); static Numeric duplicate_numeric(Numeric num); static Numeric make_result(const NumericVar *var); -static Numeric make_result_opt_error(const NumericVar *var, bool *have_error); +static Numeric make_result_safe(const NumericVar *var, Node *escontext); static bool apply_typmod(NumericVar *var, int32 typmod, Node *escontext); static bool apply_typmod_special(Numeric num, int32 typmod, Node *escontext); @@ -717,7 +717,6 @@ numeric_in(PG_FUNCTION_ARGS) */ NumericVar value; int base; - bool have_error; init_var(&value); @@ -776,12 +775,7 @@ numeric_in(PG_FUNCTION_ARGS) if (!apply_typmod(&value, typmod, escontext)) PG_RETURN_NULL(); - res = make_result_opt_error(&value, &have_error); - - if (have_error) - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value overflows numeric format"))); + res = make_result_safe(&value, escontext); free_var(&value); } @@ -2874,20 +2868,18 @@ numeric_add(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_add_opt_error(num1, num2, NULL); + res = numeric_add_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_add_opt_error() - + * numeric_add_safe() - * - * Internal version of numeric_add(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_add() with support for soft error reporting. */ Numeric -numeric_add_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_add_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -2931,7 +2923,7 @@ numeric_add_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); add_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -2951,21 +2943,19 @@ numeric_sub(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_sub_opt_error(num1, num2, NULL); + res = numeric_sub_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_sub_opt_error() - + * numeric_sub_safe() - * - * Internal version of numeric_sub(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_sub() with support for soft error reporting. */ Numeric -numeric_sub_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_sub_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3009,7 +2999,7 @@ numeric_sub_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); sub_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -3029,21 +3019,19 @@ numeric_mul(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_mul_opt_error(num1, num2, NULL); + res = numeric_mul_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_mul_opt_error() - + * numeric_mul_safe() - * - * Internal version of numeric_mul(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_mul() with support for soft error reporting. */ Numeric -numeric_mul_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_mul_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3130,7 +3118,7 @@ numeric_mul_opt_error(Numeric num1, Numeric num2, bool *have_error) if (result.dscale > NUMERIC_DSCALE_MAX) round_var(&result, NUMERIC_DSCALE_MAX); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); @@ -3150,21 +3138,19 @@ numeric_div(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_div_opt_error(num1, num2, NULL); + res = numeric_div_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_div_opt_error() - + * numeric_div_safe() - * - * Internal version of numeric_div(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_div() with support for soft error reporting. */ Numeric -numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_div_safe(Numeric num1, Numeric num2, Node *escontext) { NumericVar arg1; NumericVar arg2; @@ -3172,9 +3158,6 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) Numeric res; int rscale; - if (have_error) - *have_error = false; - /* * Handle NaN and infinities */ @@ -3189,15 +3172,7 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) switch (numeric_sign_internal(num2)) { case 0: - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - break; + goto division_by_zero; case 1: return make_result(&const_pinf); case -1: @@ -3212,15 +3187,7 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) switch (numeric_sign_internal(num2)) { case 0: - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - break; + goto division_by_zero; case 1: return make_result(&const_ninf); case -1: @@ -3251,25 +3218,25 @@ numeric_div_opt_error(Numeric num1, Numeric num2, bool *have_error) */ rscale = select_div_scale(&arg1, &arg2); - /* - * If "have_error" is provided, check for division by zero here - */ - if (have_error && (arg2.ndigits == 0 || arg2.digits[0] == 0)) - { - *have_error = true; - return NULL; - } + /* Check for division by zero */ + if (arg2.ndigits == 0 || arg2.digits[0] == 0) + goto division_by_zero; /* * Do the divide and return the result */ div_var(&arg1, &arg2, &result, rscale, true, true); - res = make_result_opt_error(&result, have_error); + res = make_result_safe(&result, escontext); free_var(&result); return res; + +division_by_zero: + ereturn(escontext, NULL, + errcode(ERRCODE_DIVISION_BY_ZERO), + errmsg("division by zero")); } @@ -3374,30 +3341,25 @@ numeric_mod(PG_FUNCTION_ARGS) Numeric num2 = PG_GETARG_NUMERIC(1); Numeric res; - res = numeric_mod_opt_error(num1, num2, NULL); + res = numeric_mod_safe(num1, num2, NULL); PG_RETURN_NUMERIC(res); } /* - * numeric_mod_opt_error() - + * numeric_mod_safe() - * - * Internal version of numeric_mod(). If "*have_error" flag is provided, - * on error it's set to true, NULL returned. This is helpful when caller - * need to handle errors by itself. + * Internal version of numeric_mod() with support for soft error reporting. */ Numeric -numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) +numeric_mod_safe(Numeric num1, Numeric num2, Node *escontext) { Numeric res; NumericVar arg1; NumericVar arg2; NumericVar result; - if (have_error) - *have_error = false; - /* * Handle NaN and infinities. We follow POSIX fmod() on this, except that * POSIX treats x-is-infinite and y-is-zero identically, raising EDOM and @@ -3410,16 +3372,8 @@ numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) if (NUMERIC_IS_INF(num1)) { if (numeric_sign_internal(num2) == 0) - { - if (have_error) - { - *have_error = true; - return NULL; - } - ereport(ERROR, - (errcode(ERRCODE_DIVISION_BY_ZERO), - errmsg("division by zero"))); - } + goto division_by_zero; + /* Inf % any nonzero = NaN */ return make_result(&const_nan); } @@ -3432,22 +3386,22 @@ numeric_mod_opt_error(Numeric num1, Numeric num2, bool *have_error) init_var(&result); - /* - * If "have_error" is provided, check for division by zero here - */ - if (have_error && (arg2.ndigits == 0 || arg2.digits[0] == 0)) - { - *have_error = true; - return NULL; - } + /* Check for division by zero */ + if (arg2.ndigits == 0 || arg2.digits[0] == 0) + goto division_by_zero; mod_var(&arg1, &arg2, &result); - res = make_result_opt_error(&result, NULL); + res = make_result_safe(&result, escontext); free_var(&result); return res; + +division_by_zero: + ereturn(escontext, NULL, + errcode(ERRCODE_DIVISION_BY_ZERO), + errmsg("division by zero")); } @@ -4404,52 +4358,34 @@ int4_numeric(PG_FUNCTION_ARGS) PG_RETURN_NUMERIC(int64_to_numeric(val)); } +/* + * Internal version of int4_numeric() with support for soft error reporting. + */ int32 -numeric_int4_opt_error(Numeric num, bool *have_error) +numeric_int4_safe(Numeric num, Node *escontext) { NumericVar x; int32 result; - if (have_error) - *have_error = false; - if (NUMERIC_IS_SPECIAL(num)) { - if (have_error) - { - *have_error = true; - return 0; - } + if (NUMERIC_IS_NAN(num)) + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert NaN to %s", "integer"))); else - { - if (NUMERIC_IS_NAN(num)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert NaN to %s", "integer"))); - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert infinity to %s", "integer"))); - } + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert infinity to %s", "integer"))); } /* Convert to variable format, then convert to int4 */ init_var_from_num(num, &x); if (!numericvar_to_int32(&x, &result)) - { - if (have_error) - { - *have_error = true; - return 0; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); - } - } + ereturn(escontext, 0, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range"))); return result; } @@ -4459,7 +4395,7 @@ numeric_int4(PG_FUNCTION_ARGS) { Numeric num = PG_GETARG_NUMERIC(0); - PG_RETURN_INT32(numeric_int4_opt_error(num, NULL)); + PG_RETURN_INT32(numeric_int4_safe(num, NULL)); } /* @@ -4492,52 +4428,34 @@ int8_numeric(PG_FUNCTION_ARGS) PG_RETURN_NUMERIC(int64_to_numeric(val)); } +/* + * Internal version of int8_numeric() with support for soft error reporting. + */ int64 -numeric_int8_opt_error(Numeric num, bool *have_error) +numeric_int8_safe(Numeric num, Node *escontext) { NumericVar x; int64 result; - if (have_error) - *have_error = false; - if (NUMERIC_IS_SPECIAL(num)) { - if (have_error) - { - *have_error = true; - return 0; - } + if (NUMERIC_IS_NAN(num)) + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert NaN to %s", "bigint"))); else - { - if (NUMERIC_IS_NAN(num)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert NaN to %s", "bigint"))); - else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot convert infinity to %s", "bigint"))); - } + ereturn(escontext, 0, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot convert infinity to %s", "bigint"))); } /* Convert to variable format, then convert to int8 */ init_var_from_num(num, &x); if (!numericvar_to_int64(&x, &result)) - { - if (have_error) - { - *have_error = true; - return 0; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("bigint out of range"))); - } - } + ereturn(escontext, 0, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("bigint out of range"))); return result; } @@ -4547,7 +4465,7 @@ numeric_int8(PG_FUNCTION_ARGS) { Numeric num = PG_GETARG_NUMERIC(0); - PG_RETURN_INT64(numeric_int8_opt_error(num, NULL)); + PG_RETURN_INT64(numeric_int8_safe(num, NULL)); } @@ -7583,16 +7501,13 @@ duplicate_numeric(Numeric num) } /* - * make_result_opt_error() - + * make_result_safe() - * * Create the packed db numeric format in palloc()'d memory from * a variable. This will handle NaN and Infinity cases. - * - * If "have_error" isn't NULL, on overflow *have_error is set to true and - * NULL is returned. This is helpful when caller needs to handle errors. */ static Numeric -make_result_opt_error(const NumericVar *var, bool *have_error) +make_result_safe(const NumericVar *var, Node *escontext) { Numeric result; NumericDigit *digits = var->digits; @@ -7601,9 +7516,6 @@ make_result_opt_error(const NumericVar *var, bool *have_error) int n; Size len; - if (have_error) - *have_error = false; - if ((sign & NUMERIC_SIGN_MASK) == NUMERIC_SPECIAL) { /* @@ -7676,19 +7588,9 @@ make_result_opt_error(const NumericVar *var, bool *have_error) /* Check for overflow of int16 fields */ if (NUMERIC_WEIGHT(result) != weight || NUMERIC_DSCALE(result) != var->dscale) - { - if (have_error) - { - *have_error = true; - return NULL; - } - else - { - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value overflows numeric format"))); - } - } + ereturn(escontext, NULL, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value overflows numeric format"))); dump_numeric("make_result()", result); return result; @@ -7698,12 +7600,12 @@ make_result_opt_error(const NumericVar *var, bool *have_error) /* * make_result() - * - * An interface to make_result_opt_error() without "have_error" argument. + * An interface to make_result_safe() without "escontext" argument. */ static Numeric make_result(const NumericVar *var) { - return make_result_opt_error(var, NULL); + return make_result_safe(var, NULL); } diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 3e5f9dc1458e1..156a4830ffda6 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -5629,11 +5629,11 @@ timestamp_part_common(PG_FUNCTION_ARGS, bool retnumeric) case DTK_JULIAN: if (retnumeric) - PG_RETURN_NUMERIC(numeric_add_opt_error(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), - numeric_div_opt_error(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), - int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), - NULL), - NULL)); + PG_RETURN_NUMERIC(numeric_add_safe(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), + numeric_div_safe(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), + int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), + NULL), + NULL)); else PG_RETURN_FLOAT8(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) + ((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + @@ -5685,11 +5685,11 @@ timestamp_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(timestamp - epoch, 6); else { - result = numeric_div_opt_error(numeric_sub_opt_error(int64_to_numeric(timestamp), - int64_to_numeric(epoch), - NULL), - int64_to_numeric(1000000), - NULL); + result = numeric_div_safe(numeric_sub_safe(int64_to_numeric(timestamp), + int64_to_numeric(epoch), + NULL), + int64_to_numeric(1000000), + NULL); result = DatumGetNumeric(DirectFunctionCall2(numeric_round, NumericGetDatum(result), Int32GetDatum(6))); @@ -5903,11 +5903,11 @@ timestamptz_part_common(PG_FUNCTION_ARGS, bool retnumeric) case DTK_JULIAN: if (retnumeric) - PG_RETURN_NUMERIC(numeric_add_opt_error(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), - numeric_div_opt_error(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), - int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), - NULL), - NULL)); + PG_RETURN_NUMERIC(numeric_add_safe(int64_to_numeric(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday)), + numeric_div_safe(int64_to_numeric(((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + tm->tm_sec) * INT64CONST(1000000) + fsec), + int64_to_numeric(SECS_PER_DAY * INT64CONST(1000000)), + NULL), + NULL)); else PG_RETURN_FLOAT8(date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) + ((((tm->tm_hour * MINS_PER_HOUR) + tm->tm_min) * SECS_PER_MINUTE) + @@ -5956,11 +5956,11 @@ timestamptz_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(timestamp - epoch, 6); else { - result = numeric_div_opt_error(numeric_sub_opt_error(int64_to_numeric(timestamp), - int64_to_numeric(epoch), - NULL), - int64_to_numeric(1000000), - NULL); + result = numeric_div_safe(numeric_sub_safe(int64_to_numeric(timestamp), + int64_to_numeric(epoch), + NULL), + int64_to_numeric(1000000), + NULL); result = DatumGetNumeric(DirectFunctionCall2(numeric_round, NumericGetDatum(result), Int32GetDatum(6))); @@ -6247,9 +6247,9 @@ interval_part_common(PG_FUNCTION_ARGS, bool retnumeric) result = int64_div_fast_to_numeric(val, 6); else result = - numeric_add_opt_error(int64_div_fast_to_numeric(interval->time, 6), - int64_to_numeric(secs_from_day_month), - NULL); + numeric_add_safe(int64_div_fast_to_numeric(interval->time, 6), + int64_to_numeric(secs_from_day_month), + NULL); PG_RETURN_NUMERIC(result); } diff --git a/src/include/utils/numeric.h b/src/include/utils/numeric.h index 9e79fc376cbea..215f1ea4f53b4 100644 --- a/src/include/utils/numeric.h +++ b/src/include/utils/numeric.h @@ -17,6 +17,9 @@ #include "common/pg_prng.h" #include "fmgr.h" +/* forward declaration to avoid node.h include */ +typedef struct Node Node; + /* * Limits on the precision and scale specifiable in a NUMERIC typmod. The * precision is strictly positive, but the scale may be positive or negative. @@ -91,18 +94,13 @@ extern char *numeric_normalize(Numeric num); extern Numeric int64_to_numeric(int64 val); extern Numeric int64_div_fast_to_numeric(int64 val1, int log10val2); -extern Numeric numeric_add_opt_error(Numeric num1, Numeric num2, - bool *have_error); -extern Numeric numeric_sub_opt_error(Numeric num1, Numeric num2, - bool *have_error); -extern Numeric numeric_mul_opt_error(Numeric num1, Numeric num2, - bool *have_error); -extern Numeric numeric_div_opt_error(Numeric num1, Numeric num2, - bool *have_error); -extern Numeric numeric_mod_opt_error(Numeric num1, Numeric num2, - bool *have_error); -extern int32 numeric_int4_opt_error(Numeric num, bool *have_error); -extern int64 numeric_int8_opt_error(Numeric num, bool *have_error); +extern Numeric numeric_add_safe(Numeric num1, Numeric num2, Node *escontext); +extern Numeric numeric_sub_safe(Numeric num1, Numeric num2, Node *escontext); +extern Numeric numeric_mul_safe(Numeric num1, Numeric num2, Node *escontext); +extern Numeric numeric_div_safe(Numeric num1, Numeric num2, Node *escontext); +extern Numeric numeric_mod_safe(Numeric num1, Numeric num2, Node *escontext); +extern int32 numeric_int4_safe(Numeric num, Node *escontext); +extern int64 numeric_int8_safe(Numeric num, Node *escontext); extern Numeric random_numeric(pg_prng_state *state, Numeric rmin, Numeric rmax); From 567d27e8e2b752743626eb259ba75ecdc936eaf3 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 5 Sep 2025 14:10:08 +0900 Subject: [PATCH 03/34] Fix outdated comments in slru.c SlruRecentlyUsed() is an inline function since 53c2a97a9266, not a macro. The description of long_segment_names was missing at the top of SimpleLruInit(), part forgotten in 4ed8f0913bfd. Author: Julien Rouhaud Discussion: https://postgr.es/m/aLpBLMOYwEQkaleF@jrouhaud Backpatch-through: 17 --- src/backend/access/transam/slru.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index d7ebd889aea71..5d3fcd62c9443 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -246,6 +246,7 @@ SimpleLruAutotuneBuffers(int divisor, int max) * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks. * bank_tranche_id: tranche ID to use for the bank LWLocks. * sync_handler: which set of functions to use to handle sync requests + * long_segment_names: use short or long segment names */ void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, @@ -644,7 +645,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid) shared->page_number[slotno] == pageno && shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) { - /* See comments for SlruRecentlyUsed macro */ + /* See comments for SlruRecentlyUsed() */ SlruRecentlyUsed(shared, slotno); /* update the stats counter of pages found in the SLRU */ From 6ede13d1b5f515df0a199a7a830e448dab1511c0 Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Fri, 5 Sep 2025 08:18:18 +0100 Subject: [PATCH 04/34] Fix concurrent update issue with MERGE. When executing a MERGE UPDATE action, if there is more than one concurrent update of the target row, the lock-and-retry code would sometimes incorrectly identify the latest version of the target tuple, leading to incorrect results. This was caused by using the ctid field from the TM_FailureData returned by table_tuple_lock() in a case where the result was TM_Ok, which is unsafe because the TM_FailureData struct is not guaranteed to be fully populated in that case. Instead, it should use the tupleid passed to (and updated by) table_tuple_lock(). To reduce the chances of similar errors in the future, improve the commentary for table_tuple_lock() and TM_FailureData to make it clearer that table_tuple_lock() updates the tid passed to it, and most fields of TM_FailureData should not be relied on in non-failure cases. An exception to this is the "traversed" field, which is set in both success and failure cases. Reported-by: Dmitry Author: Yugo Nagata Reviewed-by: Dean Rasheed Reviewed-by: Chao Li Discussion: https://postgr.es/m/1570d30e-2b95-4239-b9c3-f7bf2f2f8556@yandex.ru Backpatch-through: 15 --- src/backend/executor/nodeModifyTable.c | 9 +- src/include/access/tableam.h | 15 +- .../expected/merge-match-recheck.out | 145 ++++++++++++++++++ .../isolation/specs/merge-match-recheck.spec | 18 +++ 4 files changed, 179 insertions(+), 8 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index b0c4e2c0d32a4..4c5647ac38a1c 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3402,7 +3402,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * the tuple moved, and setting our current * resultRelInfo to that. */ - if (ItemPointerIndicatesMovedPartitions(&context->tmfd.ctid)) + if (ItemPointerIndicatesMovedPartitions(tupleid)) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("tuple to be merged was already moved to another partition due to concurrent update"))); @@ -3450,12 +3450,13 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (ItemPointerIsValid(&lockedtid)) UnlockTuple(resultRelInfo->ri_RelationDesc, &lockedtid, InplaceUpdateTupleLock); - LockTuple(resultRelInfo->ri_RelationDesc, &context->tmfd.ctid, + LockTuple(resultRelInfo->ri_RelationDesc, tupleid, InplaceUpdateTupleLock); - lockedtid = context->tmfd.ctid; + lockedtid = *tupleid; } + if (!table_tuple_fetch_row_version(resultRelationDesc, - &context->tmfd.ctid, + tupleid, SnapshotAny, resultRelInfo->ri_oldTupleSlot)) elog(ERROR, "failed to fetch the target tuple"); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 1c9e802a6b128..b2ce35e2a3407 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -121,7 +121,9 @@ typedef enum TU_UpdateIndexes /* * When table_tuple_update, table_tuple_delete, or table_tuple_lock fail * because the target tuple is already outdated, they fill in this struct to - * provide information to the caller about what happened. + * provide information to the caller about what happened. When those functions + * succeed, the contents of this struct should not be relied upon, except for + * `traversed`, which may be set in both success and failure cases. * * ctid is the target's ctid link: it is the same as the target's TID if the * target was deleted, or the location of the replacement tuple if the target @@ -137,6 +139,9 @@ typedef enum TU_UpdateIndexes * tuple); otherwise cmax is zero. (We make this restriction because * HeapTupleHeaderGetCmax doesn't work for tuples outdated in other * transactions.) + * + * traversed indicates if an update chain was followed in order to try to lock + * the target tuple. (This may be set in both success and failure cases.) */ typedef struct TM_FailureData { @@ -1508,7 +1513,7 @@ table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, * * Input parameters: * relation: relation containing tuple (caller must hold suitable lock) - * tid: TID of tuple to lock + * tid: TID of tuple to lock (updated if an update chain was followed) * snapshot: snapshot to use for visibility determinations * cid: current command ID (used for visibility test, and stored into * tuple's cmax if lock is successful) @@ -1533,8 +1538,10 @@ table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, * TM_WouldBlock: lock couldn't be acquired and wait_policy is skip * * In the failure cases other than TM_Invisible and TM_Deleted, the routine - * fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax. See - * comments for struct TM_FailureData for additional info. + * fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax. + * Additionally, in both success and failure cases, tmfd->traversed is set if + * an update chain was followed. See comments for struct TM_FailureData for + * additional info. */ static inline TM_Result table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, diff --git a/src/test/isolation/expected/merge-match-recheck.out b/src/test/isolation/expected/merge-match-recheck.out index 90300f1db5ab3..4250b85af2d3c 100644 --- a/src/test/isolation/expected/merge-match-recheck.out +++ b/src/test/isolation/expected/merge-match-recheck.out @@ -271,6 +271,151 @@ key|balance|status|val step c1: COMMIT; +starting permutation: update1 update6 merge_bal c2 select1 c1 +step update1: UPDATE target t SET balance = balance + 10, val = t.val || ' updated by update1' WHERE t.key = 1; +step update6: UPDATE target t SET balance = balance - 100, val = t.val || ' updated by update6' WHERE t.key = 1; +step merge_bal: + MERGE INTO target t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal: <... completed> +step select1: SELECT * FROM target; +key|balance|status|val +---+-------+------+------------------------------------------------- + 1| 140|s1 |setup updated by update1 updated by update6 when1 +(1 row) + +step c1: COMMIT; + +starting permutation: update1_pa update6_pa merge_bal_pa c2 select1_pa c1 +step update1_pa: UPDATE target_pa t SET balance = balance + 10, val = t.val || ' updated by update1_pa' WHERE t.key = 1; +step update6_pa: UPDATE target_pa t SET balance = balance - 100, val = t.val || ' updated by update6_pa' WHERE t.key = 1; +step merge_bal_pa: + MERGE INTO target_pa t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal_pa: <... completed> +step select1_pa: SELECT * FROM target_pa; +key|balance|status|val +---+-------+------+------------------------------------------------------- + 1| 140|s1 |setup updated by update1_pa updated by update6_pa when1 +(1 row) + +step c1: COMMIT; + +starting permutation: update1_tg update6_tg merge_bal_tg c2 select1_tg c1 +s2: NOTICE: Update: (1,160,s1,setup) -> (1,170,s1,"setup updated by update1_tg") +step update1_tg: UPDATE target_tg t SET balance = balance + 10, val = t.val || ' updated by update1_tg' WHERE t.key = 1; +s2: NOTICE: Update: (1,170,s1,"setup updated by update1_tg") -> (1,70,s1,"setup updated by update1_tg updated by update6_tg") +step update6_tg: UPDATE target_tg t SET balance = balance - 100, val = t.val || ' updated by update6_tg' WHERE t.key = 1; +step merge_bal_tg: + WITH t AS ( + MERGE INTO target_tg t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3' + RETURNING t.* + ) + SELECT * FROM t; + +step c2: COMMIT; +s1: NOTICE: Update: (1,70,s1,"setup updated by update1_tg updated by update6_tg") -> (1,140,s1,"setup updated by update1_tg updated by update6_tg when1") +step merge_bal_tg: <... completed> +key|balance|status|val +---+-------+------+------------------------------------------------------- + 1| 140|s1 |setup updated by update1_tg updated by update6_tg when1 +(1 row) + +step select1_tg: SELECT * FROM target_tg; +key|balance|status|val +---+-------+------+------------------------------------------------------- + 1| 140|s1 |setup updated by update1_tg updated by update6_tg when1 +(1 row) + +step c1: COMMIT; + +starting permutation: update7 update6 merge_bal c2 select1 c1 +step update7: UPDATE target t SET balance = 350, val = t.val || ' updated by update7' WHERE t.key = 1; +step update6: UPDATE target t SET balance = balance - 100, val = t.val || ' updated by update6' WHERE t.key = 1; +step merge_bal: + MERGE INTO target t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal: <... completed> +step select1: SELECT * FROM target; +key|balance|status|val +---+-------+------+------------------------------------------------- + 1| 2000|s1 |setup updated by update7 updated by update6 when3 +(1 row) + +step c1: COMMIT; + +starting permutation: update1_pa_move merge_bal_pa c2 c1 +step update1_pa_move: UPDATE target_pa t SET balance = 210, val = t.val || ' updated by update1_pa_move' WHERE t.key = 1; +step merge_bal_pa: + MERGE INTO target_pa t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal_pa: <... completed> +ERROR: tuple to be locked was already moved to another partition due to concurrent update +step c1: COMMIT; + +starting permutation: update1_pa update1_pa_move merge_bal_pa c2 c1 +step update1_pa: UPDATE target_pa t SET balance = balance + 10, val = t.val || ' updated by update1_pa' WHERE t.key = 1; +step update1_pa_move: UPDATE target_pa t SET balance = 210, val = t.val || ' updated by update1_pa_move' WHERE t.key = 1; +step merge_bal_pa: + MERGE INTO target_pa t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3'; + +step c2: COMMIT; +step merge_bal_pa: <... completed> +ERROR: tuple to be locked was already moved to another partition due to concurrent update +step c1: COMMIT; + starting permutation: update1 merge_delete c2 select1 c1 step update1: UPDATE target t SET balance = balance + 10, val = t.val || ' updated by update1' WHERE t.key = 1; step merge_delete: diff --git a/src/test/isolation/specs/merge-match-recheck.spec b/src/test/isolation/specs/merge-match-recheck.spec index 15226e40c9efc..6e7a776d17e5a 100644 --- a/src/test/isolation/specs/merge-match-recheck.spec +++ b/src/test/isolation/specs/merge-match-recheck.spec @@ -146,6 +146,8 @@ setup BEGIN ISOLATION LEVEL READ COMMITTED; } step "update1" { UPDATE target t SET balance = balance + 10, val = t.val || ' updated by update1' WHERE t.key = 1; } +step "update1_pa" { UPDATE target_pa t SET balance = balance + 10, val = t.val || ' updated by update1_pa' WHERE t.key = 1; } +step "update1_pa_move" { UPDATE target_pa t SET balance = 210, val = t.val || ' updated by update1_pa_move' WHERE t.key = 1; } step "update1_tg" { UPDATE target_tg t SET balance = balance + 10, val = t.val || ' updated by update1_tg' WHERE t.key = 1; } step "update2" { UPDATE target t SET status = 's2', val = t.val || ' updated by update2' WHERE t.key = 1; } step "update2_tg" { UPDATE target_tg t SET status = 's2', val = t.val || ' updated by update2_tg' WHERE t.key = 1; } @@ -153,6 +155,10 @@ step "update3" { UPDATE target t SET status = 's3', val = t.val || ' updated by step "update3_tg" { UPDATE target_tg t SET status = 's3', val = t.val || ' updated by update3_tg' WHERE t.key = 1; } step "update5" { UPDATE target t SET status = 's5', val = t.val || ' updated by update5' WHERE t.key = 1; } step "update5_tg" { UPDATE target_tg t SET status = 's5', val = t.val || ' updated by update5_tg' WHERE t.key = 1; } +step "update6" { UPDATE target t SET balance = balance - 100, val = t.val || ' updated by update6' WHERE t.key = 1; } +step "update6_pa" { UPDATE target_pa t SET balance = balance - 100, val = t.val || ' updated by update6_pa' WHERE t.key = 1; } +step "update6_tg" { UPDATE target_tg t SET balance = balance - 100, val = t.val || ' updated by update6_tg' WHERE t.key = 1; } +step "update7" { UPDATE target t SET balance = 350, val = t.val || ' updated by update7' WHERE t.key = 1; } step "update_bal1" { UPDATE target t SET balance = 50, val = t.val || ' updated by update_bal1' WHERE t.key = 1; } step "update_bal1_pa" { UPDATE target_pa t SET balance = 50, val = t.val || ' updated by update_bal1_pa' WHERE t.key = 1; } step "update_bal1_tg" { UPDATE target_tg t SET balance = 50, val = t.val || ' updated by update_bal1_tg' WHERE t.key = 1; } @@ -179,6 +185,18 @@ permutation "update_bal1" "merge_bal" "c2" "select1" "c1" permutation "update_bal1_pa" "merge_bal_pa" "c2" "select1_pa" "c1" permutation "update_bal1_tg" "merge_bal_tg" "c2" "select1_tg" "c1" +# merge_bal sees row concurrently updated twice and rechecks WHEN conditions, different check passes, so final balance = 140 +permutation "update1" "update6" "merge_bal" "c2" "select1" "c1" +permutation "update1_pa" "update6_pa" "merge_bal_pa" "c2" "select1_pa" "c1" +permutation "update1_tg" "update6_tg" "merge_bal_tg" "c2" "select1_tg" "c1" + +# merge_bal sees row concurrently updated twice, first update would cause all checks to fail, second update causes different check to pass, so final balance = 2000 +permutation "update7" "update6" "merge_bal" "c2" "select1" "c1" + +# merge_bal sees concurrently updated row moved to new partition, so fails +permutation "update1_pa_move" "merge_bal_pa" "c2" "c1" +permutation "update1_pa" "update1_pa_move" "merge_bal_pa" "c2" "c1" + # merge_delete sees concurrently updated row and rechecks WHEN conditions, but recheck passes and row is deleted permutation "update1" "merge_delete" "c2" "select1" "c1" permutation "update1_tg" "merge_delete_tg" "c2" "select1_tg" "c1" From e3d5ddb7ca91e5982e9d4cff9eef210d97e4f47e Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Fri, 5 Sep 2025 09:33:36 -0400 Subject: [PATCH 05/34] Add assert and log message to visibilitymap_set Add an assert to visibilitymap_set() that the provided heap buffer is exclusively locked, which is expected. Also, enhance the debug logging message to specify which VM flags were set. Based on a related suggestion by Kirill Reshke on an in-progress patchset. Author: Melanie Plageman Reviewed-by: Kirill Reshke Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CALdSSPhAU56g1gGVT0%2BwG8RrSWE6qW8TOfNJS1HNAWX6wPgbFA%40mail.gmail.com --- src/backend/access/heap/visibilitymap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 953ad4a484399..7306c16f05cd3 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -255,7 +255,8 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, uint8 status; #ifdef TRACE_VISIBILITYMAP - elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); + elog(DEBUG1, "vm_set flags 0x%02X for %s %d", + flags, RelationGetRelationName(rel), heapBlk); #endif Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); @@ -269,6 +270,8 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk) elog(ERROR, "wrong heap buffer passed to visibilitymap_set"); + Assert(!BufferIsValid(heapBuf) || BufferIsExclusiveLocked(heapBuf)); + /* Check that we have the right VM page pinned */ if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock) elog(ERROR, "wrong VM buffer passed to visibilitymap_set"); From 50e4c6ace5e69fbe69868c270a1c76acd4cb12ec Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 5 Sep 2025 12:25:59 -0400 Subject: [PATCH 06/34] bufmgr: Use consistent naming of the clock-sweep algorithm Minor edits to comments only. Author: Greg Burd Reviewed-by: Tomas Vondra Reviewed-by: Andres Freund Discussion: https://postgr.es/m/70C6A5B5-2A20-4D0B-BC73-EB09DD62D61C@getmailspring.com --- src/backend/storage/buffer/README | 4 ++-- src/backend/storage/buffer/bufmgr.c | 8 ++++---- src/backend/storage/buffer/freelist.c | 10 +++++----- src/backend/storage/buffer/localbuf.c | 2 +- src/include/storage/buf_internals.h | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index a182fcd660ccb..4b13da5d7add8 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -211,9 +211,9 @@ Buffer Ring Replacement Strategy When running a query that needs to access a large number of pages just once, such as VACUUM or a large sequential scan, a different strategy is used. A page that has been touched only by such a scan is unlikely to be needed -again soon, so instead of running the normal clock sweep algorithm and +again soon, so instead of running the normal clock-sweep algorithm and blowing out the entire buffer cache, a small ring of buffers is allocated -using the normal clock sweep algorithm and those buffers are reused for the +using the normal clock-sweep algorithm and those buffers are reused for the whole scan. This also implies that much of the write traffic caused by such a statement will be done by the backend itself and not pushed off onto other processes. diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 350cc0402aa8f..9fc906a4a4082 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3608,7 +3608,7 @@ BufferSync(int flags) * This is called periodically by the background writer process. * * Returns true if it's appropriate for the bgwriter process to go into - * low-power hibernation mode. (This happens if the strategy clock sweep + * low-power hibernation mode. (This happens if the strategy clock-sweep * has been "lapped" and no buffer allocations have occurred recently, * or if the bgwriter has been effectively disabled by setting * bgwriter_lru_maxpages to 0.) @@ -3658,7 +3658,7 @@ BgBufferSync(WritebackContext *wb_context) uint32 new_recent_alloc; /* - * Find out where the freelist clock sweep currently is, and how many + * Find out where the freelist clock-sweep currently is, and how many * buffer allocations have happened since our last call. */ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); @@ -3679,8 +3679,8 @@ BgBufferSync(WritebackContext *wb_context) /* * Compute strategy_delta = how many buffers have been scanned by the - * clock sweep since last time. If first time through, assume none. Then - * see if we are still ahead of the clock sweep, and if so, how many + * clock-sweep since last time. If first time through, assume none. Then + * see if we are still ahead of the clock-sweep, and if so, how many * buffers we could scan before we'd catch up with it and "lap" it. Note: * weird-looking coding of xxx_passes comparisons are to avoid bogus * behavior when the passes counts wrap around. diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 01909be027258..cd94a7d8a7b39 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -33,7 +33,7 @@ typedef struct slock_t buffer_strategy_lock; /* - * Clock sweep hand: index of next buffer to consider grabbing. Note that + * clock-sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to * get an actual buffer, it needs to be used modulo NBuffers. */ @@ -51,7 +51,7 @@ typedef struct * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. */ - uint32 completePasses; /* Complete cycles of the clock sweep */ + uint32 completePasses; /* Complete cycles of the clock-sweep */ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ /* @@ -311,7 +311,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r } } - /* Nothing on the freelist, so run the "clock sweep" algorithm */ + /* Nothing on the freelist, so run the "clock-sweep" algorithm */ trycounter = NBuffers; for (;;) { @@ -511,7 +511,7 @@ StrategyInitialize(bool init) StrategyControl->firstFreeBuffer = 0; StrategyControl->lastFreeBuffer = NBuffers - 1; - /* Initialize the clock sweep pointer */ + /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); /* Clear statistics */ @@ -759,7 +759,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) * * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, * since our own previous usage of the ring element would have left it - * there, but it might've been decremented by clock sweep since then). A + * there, but it might've been decremented by clock-sweep since then). A * higher usage_count indicates someone else has touched the buffer, so we * shouldn't re-use it. */ diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3c0d20f4659d2..04fef13409b02 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -229,7 +229,7 @@ GetLocalVictimBuffer(void) ResourceOwnerEnlarge(CurrentResourceOwner); /* - * Need to get a new buffer. We use a clock sweep algorithm (essentially + * Need to get a new buffer. We use a clock-sweep algorithm (essentially * the same as what freelist.c does now...) */ trycounter = NLocBuffer; diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 52a71b138f736..3a210c710f633 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -80,8 +80,8 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32, * The maximum allowed value of usage_count represents a tradeoff between * accuracy and speed of the clock-sweep buffer management algorithm. A * large value (comparable to NBuffers) would approximate LRU semantics. - * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of - * clock sweeps to find a free buffer, so in practice we don't want the + * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of the + * clock-sweep hand to find a free buffer, so in practice we don't want the * value to be very large. */ #define BM_MAX_USAGE_COUNT 5 From 2c789405275928ce0d2ceb7c4add91d27df92502 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 5 Sep 2025 12:25:59 -0400 Subject: [PATCH 07/34] bufmgr: Remove freelist, always use clock-sweep This set of changes removes the list of available buffers and instead simply uses the clock-sweep algorithm to find and return an available buffer. This also removes the have_free_buffer() function and simply caps the pg_autoprewarm process to at most NBuffers. While on the surface this appears to be removing an optimization it is in fact eliminating code that induces overhead in the form of synchronization that is problematic for multi-core systems. The main reason for removing the freelist, however, is not the moderate improvement in scalability, but that having the freelist would require dedicated complexity in several upcoming patches. As we have not been able to find a case benefiting from the freelist... Author: Greg Burd Reviewed-by: Tomas Vondra Reviewed-by: Andres Freund Discussion: https://postgr.es/m/70C6A5B5-2A20-4D0B-BC73-EB09DD62D61C@getmailspring.com --- contrib/pg_prewarm/autoprewarm.c | 30 +++---- src/backend/storage/buffer/README | 40 +++------ src/backend/storage/buffer/buf_init.c | 9 -- src/backend/storage/buffer/bufmgr.c | 29 +------ src/backend/storage/buffer/freelist.c | 119 +------------------------- src/include/storage/buf_internals.h | 13 +-- 6 files changed, 31 insertions(+), 209 deletions(-) diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 880e897796a1e..8b68dafc2611c 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -370,6 +370,15 @@ apw_load_buffers(void) apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0; apw_state->prewarmed_blocks = 0; + /* Don't prewarm more than we can fit. */ + if (num_elements > NBuffers) + { + num_elements = NBuffers; + ereport(LOG, + (errmsg("autoprewarm capping prewarmed blocks to %d (shared_buffers size)", + NBuffers))); + } + /* Get the info position of the first block of the next database. */ while (apw_state->prewarm_start_idx < num_elements) { @@ -410,10 +419,6 @@ apw_load_buffers(void) apw_state->database = current_db; Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx); - /* If we've run out of free buffers, don't launch another worker. */ - if (!have_free_buffer()) - break; - /* * Likewise, don't launch if we've already been told to shut down. * (The launch would fail anyway, but we might as well skip it.) @@ -462,12 +467,6 @@ apw_read_stream_next_block(ReadStream *stream, { BlockInfoRecord blk = p->block_info[p->pos]; - if (!have_free_buffer()) - { - p->pos = apw_state->prewarm_stop_idx; - return InvalidBlockNumber; - } - if (blk.tablespace != p->tablespace) return InvalidBlockNumber; @@ -523,10 +522,10 @@ autoprewarm_database_main(Datum main_arg) blk = block_info[i]; /* - * Loop until we run out of blocks to prewarm or until we run out of free + * Loop until we run out of blocks to prewarm or until we run out of * buffers. */ - while (i < apw_state->prewarm_stop_idx && have_free_buffer()) + while (i < apw_state->prewarm_stop_idx) { Oid tablespace = blk.tablespace; RelFileNumber filenumber = blk.filenumber; @@ -568,14 +567,13 @@ autoprewarm_database_main(Datum main_arg) /* * We have a relation; now let's loop until we find a valid fork of - * the relation or we run out of free buffers. Once we've read from - * all valid forks or run out of options, we'll close the relation and + * the relation or we run out of buffers. Once we've read from all + * valid forks or run out of options, we'll close the relation and * move on. */ while (i < apw_state->prewarm_stop_idx && blk.tablespace == tablespace && - blk.filenumber == filenumber && - have_free_buffer()) + blk.filenumber == filenumber) { ForkNumber forknum = blk.forknum; BlockNumber nblocks; diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index 4b13da5d7add8..119f31b5d6584 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -128,11 +128,11 @@ independently. If it is necessary to lock more than one partition at a time, they must be locked in partition-number order to avoid risk of deadlock. * A separate system-wide spinlock, buffer_strategy_lock, provides mutual -exclusion for operations that access the buffer free list or select -buffers for replacement. A spinlock is used here rather than a lightweight -lock for efficiency; no other locks of any sort should be acquired while -buffer_strategy_lock is held. This is essential to allow buffer replacement -to happen in multiple backends with reasonable concurrency. +exclusion for operations that select buffers for replacement. A spinlock is +used here rather than a lightweight lock for efficiency; no other locks of any +sort should be acquired while buffer_strategy_lock is held. This is essential +to allow buffer replacement to happen in multiple backends with reasonable +concurrency. * Each buffer header contains a spinlock that must be taken when examining or changing fields of that buffer header. This allows operations such as @@ -158,18 +158,8 @@ unset by sleeping on the buffer's condition variable. Normal Buffer Replacement Strategy ---------------------------------- -There is a "free list" of buffers that are prime candidates for replacement. -In particular, buffers that are completely free (contain no valid page) are -always in this list. We could also throw buffers into this list if we -consider their pages unlikely to be needed soon; however, the current -algorithm never does that. The list is singly-linked using fields in the -buffer headers; we maintain head and tail pointers in global variables. -(Note: although the list links are in the buffer headers, they are -considered to be protected by the buffer_strategy_lock, not the buffer-header -spinlocks.) To choose a victim buffer to recycle when there are no free -buffers available, we use a simple clock-sweep algorithm, which avoids the -need to take system-wide locks during common operations. It works like -this: +To choose a victim buffer to recycle we use a simple clock-sweep algorithm. It +works like this: Each buffer header contains a usage counter, which is incremented (up to a small limit value) whenever the buffer is pinned. (This requires only the @@ -184,20 +174,14 @@ The algorithm for a process that needs to obtain a victim buffer is: 1. Obtain buffer_strategy_lock. -2. If buffer free list is nonempty, remove its head buffer. Release -buffer_strategy_lock. If the buffer is pinned or has a nonzero usage count, -it cannot be used; ignore it go back to step 1. Otherwise, pin the buffer, -and return it. +2. Select the buffer pointed to by nextVictimBuffer, and circularly advance +nextVictimBuffer for next time. Release buffer_strategy_lock. -3. Otherwise, the buffer free list is empty. Select the buffer pointed to by -nextVictimBuffer, and circularly advance nextVictimBuffer for next time. -Release buffer_strategy_lock. - -4. If the selected buffer is pinned or has a nonzero usage count, it cannot +3. If the selected buffer is pinned or has a nonzero usage count, it cannot be used. Decrement its usage count (if nonzero), reacquire buffer_strategy_lock, and return to step 3 to examine the next buffer. -5. Pin the selected buffer, and return. +4. Pin the selected buffer, and return. (Note that if the selected buffer is dirty, we will have to write it out before we can recycle it; if someone else pins the buffer meanwhile we will @@ -234,7 +218,7 @@ the ring strategy effectively degrades to the normal strategy. VACUUM uses a ring like sequential scans, however, the size of this ring is controlled by the vacuum_buffer_usage_limit GUC. Dirty pages are not removed -from the ring. Instead, WAL is flushed if needed to allow reuse of the +from the ring. Instead, the WAL is flushed if needed to allow reuse of the buffers. Before introducing the buffer ring strategy in 8.3, VACUUM's buffers were sent to the freelist, which was effectively a buffer ring of 1 buffer, resulting in excessive WAL flushing. diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index ed1dc488a42b4..6fd3a6bbac5ea 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -128,20 +128,11 @@ BufferManagerShmemInit(void) pgaio_wref_clear(&buf->io_wref); - /* - * Initially link all the buffers together as unused. Subsequent - * management of this list is done by freelist.c. - */ - buf->freeNext = i + 1; - LWLockInitialize(BufferDescriptorGetContentLock(buf), LWTRANCHE_BUFFER_CONTENT); ConditionVariableInit(BufferDescriptorGetIOCV(buf)); } - - /* Correct last entry of linked list */ - GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; } /* Init other shared buffer-management stuff */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9fc906a4a4082..fe470de63f20c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2094,12 +2094,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ UnpinBuffer(victim_buf_hdr); - /* - * The victim buffer we acquired previously is clean and unused, let - * it be found again quickly - */ - StrategyFreeBuffer(victim_buf_hdr); - /* remaining code should match code at top of routine */ existing_buf_hdr = GetBufferDescriptor(existing_buf_id); @@ -2158,8 +2152,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } /* - * InvalidateBuffer -- mark a shared buffer invalid and return it to the - * freelist. + * InvalidateBuffer -- mark a shared buffer invalid. * * The buffer header spinlock must be held at entry. We drop it before * returning. (This is sane because the caller must have locked the @@ -2257,11 +2250,6 @@ InvalidateBuffer(BufferDesc *buf) * Done with mapping lock. */ LWLockRelease(oldPartitionLock); - - /* - * Insert the buffer at the head of the list of free buffers. - */ - StrategyFreeBuffer(buf); } /* @@ -2679,11 +2667,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, { BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1); - /* - * The victim buffer we acquired previously is clean and unused, - * let it be found again quickly - */ - StrategyFreeBuffer(buf_hdr); UnpinBuffer(buf_hdr); } @@ -2756,12 +2739,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, valid = PinBuffer(existing_hdr, strategy); LWLockRelease(partition_lock); - - /* - * The victim buffer we acquired previously is clean and unused, - * let it be found again quickly - */ - StrategyFreeBuffer(victim_buf_hdr); UnpinBuffer(victim_buf_hdr); buffers[i] = BufferDescriptorGetBuffer(existing_hdr); @@ -3658,8 +3635,8 @@ BgBufferSync(WritebackContext *wb_context) uint32 new_recent_alloc; /* - * Find out where the freelist clock-sweep currently is, and how many - * buffer allocations have happened since our last call. + * Find out where the clock-sweep currently is, and how many buffer + * allocations have happened since our last call. */ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index cd94a7d8a7b39..7d59a92bd1a88 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -39,14 +39,6 @@ typedef struct */ pg_atomic_uint32 nextVictimBuffer; - int firstFreeBuffer; /* Head of list of unused buffers */ - int lastFreeBuffer; /* Tail of list of unused buffers */ - - /* - * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is, - * when the list is empty) - */ - /* * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. @@ -163,23 +155,6 @@ ClockSweepTick(void) return victim; } -/* - * have_free_buffer -- a lockless check to see if there is a free buffer in - * buffer pool. - * - * If the result is true that will become stale once free buffers are moved out - * by other operations, so the caller who strictly want to use a free buffer - * should not call this. - */ -bool -have_free_buffer(void) -{ - if (StrategyControl->firstFreeBuffer >= 0) - return true; - else - return false; -} - /* * StrategyGetBuffer * @@ -249,69 +224,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r */ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); - /* - * First check, without acquiring the lock, whether there's buffers in the - * freelist. Since we otherwise don't require the spinlock in every - * StrategyGetBuffer() invocation, it'd be sad to acquire it here - - * uselessly in most cases. That obviously leaves a race where a buffer is - * put on the freelist but we don't see the store yet - but that's pretty - * harmless, it'll just get used during the next buffer acquisition. - * - * If there's buffers on the freelist, acquire the spinlock to pop one - * buffer of the freelist. Then check whether that buffer is usable and - * repeat if not. - * - * Note that the freeNext fields are considered to be protected by the - * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to - * manipulate them without holding the spinlock. - */ - if (StrategyControl->firstFreeBuffer >= 0) - { - while (true) - { - /* Acquire the spinlock to remove element from the freelist */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - - if (StrategyControl->firstFreeBuffer < 0) - { - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - break; - } - - buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer); - Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); - - /* Unconditionally remove buffer from freelist */ - StrategyControl->firstFreeBuffer = buf->freeNext; - buf->freeNext = FREENEXT_NOT_IN_LIST; - - /* - * Release the lock so someone else can access the freelist while - * we check out this buffer. - */ - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - - /* - * If the buffer is pinned or has a nonzero usage_count, we cannot - * use it; discard it and retry. (This can only happen if VACUUM - * put a valid buffer in the freelist and then someone else used - * it before we got to it. It's probably impossible altogether as - * of 8.3, but we'd better check anyway.) - */ - local_buf_state = LockBufHdr(buf); - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 - && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0) - { - if (strategy != NULL) - AddBufferToRing(strategy, buf); - *buf_state = local_buf_state; - return buf; - } - UnlockBufHdr(buf, local_buf_state); - } - } - - /* Nothing on the freelist, so run the "clock-sweep" algorithm */ + /* Use the "clock sweep" algorithm to find a free buffer */ trycounter = NBuffers; for (;;) { @@ -356,29 +269,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r } } -/* - * StrategyFreeBuffer: put a buffer on the freelist - */ -void -StrategyFreeBuffer(BufferDesc *buf) -{ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - - /* - * It is possible that we are told to put something in the freelist that - * is already in it; don't screw up the list if so. - */ - if (buf->freeNext == FREENEXT_NOT_IN_LIST) - { - buf->freeNext = StrategyControl->firstFreeBuffer; - if (buf->freeNext < 0) - StrategyControl->lastFreeBuffer = buf->buf_id; - StrategyControl->firstFreeBuffer = buf->buf_id; - } - - SpinLockRelease(&StrategyControl->buffer_strategy_lock); -} - /* * StrategySyncStart -- tell BgBufferSync where to start syncing * @@ -504,13 +394,6 @@ StrategyInitialize(bool init) SpinLockInit(&StrategyControl->buffer_strategy_lock); - /* - * Grab the whole linked list of free buffers for our strategy. We - * assume it was previously set up by BufferManagerShmemInit(). - */ - StrategyControl->firstFreeBuffer = 0; - StrategyControl->lastFreeBuffer = NBuffers - 1; - /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 3a210c710f633..dfd614f7ca449 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -217,8 +217,7 @@ BufMappingPartitionLockByIndex(uint32 index) * single atomic variable. This layout allow us to do some operations in a * single atomic operation, without actually acquiring and releasing spinlock; * for instance, increase or decrease refcount. buf_id field never changes - * after initialization, so does not need locking. freeNext is protected by - * the buffer_strategy_lock not buffer header lock. The LWLock can take care + * after initialization, so does not need locking. The LWLock can take care * of itself. The buffer header lock is *not* used to control access to the * data in the buffer! * @@ -264,7 +263,6 @@ typedef struct BufferDesc pg_atomic_uint32 state; int wait_backend_pgprocno; /* backend of pin-count waiter */ - int freeNext; /* link in freelist chain */ PgAioWaitRef io_wref; /* set iff AIO is in progress */ LWLock content_lock; /* to lock access to buffer contents */ @@ -360,13 +358,6 @@ BufferDescriptorGetContentLock(const BufferDesc *bdesc) return (LWLock *) (&bdesc->content_lock); } -/* - * The freeNext field is either the index of the next freelist entry, - * or one of these special values: - */ -#define FREENEXT_END_OF_LIST (-1) -#define FREENEXT_NOT_IN_LIST (-2) - /* * Functions for acquiring/releasing a shared buffer header's spinlock. Do * not apply these to local buffers! @@ -444,7 +435,6 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag extern IOContext IOContextForStrategy(BufferAccessStrategy strategy); extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring); -extern void StrategyFreeBuffer(BufferDesc *buf); extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring); @@ -453,7 +443,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); -extern bool have_free_buffer(void); /* buf_table.c */ extern Size BufTableShmemSize(int size); From 06473f5a344df8c9594ead90a609b86f6724cff8 Mon Sep 17 00:00:00 2001 From: Tatsuo Ishii Date: Sat, 6 Sep 2025 07:49:51 +0900 Subject: [PATCH 08/34] Allow to log raw parse tree. This commit allows to log the raw parse tree in the same way we currently log the parse tree, rewritten tree, and plan tree. To avoid unnecessary log noise for users not interested in this detail, a new GUC option, "debug_print_raw_parse", has been added. When starting the PostgreSQL process with "-d N", and N is 3 or higher, debug_print_raw_parse is enabled automatically, alongside debug_print_parse. Author: Chao Li Reviewed-by: Tender Wang Reviewed-by: Tatsuo Ishii Reviewed-by: John Naylor Discussion: https://postgr.es/m/CAEoWx2mcO0Gpo4vd8kPMAFWeJLSp0MeUUnaLdE1x0tSVd-VzUw%40mail.gmail.com --- doc/src/sgml/config.sgml | 12 +++++++++--- doc/src/sgml/rules.sgml | 1 + src/backend/tcop/postgres.c | 7 +++++++ src/backend/utils/misc/guc_parameters.dat | 6 ++++++ src/backend/utils/misc/guc_tables.c | 1 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/utils/guc.h | 1 + 7 files changed, 26 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0a4b3e55ba5ed..2a3685f474a96 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -7383,6 +7383,11 @@ local0.* /var/log/postgresql + debug_print_raw_parse (boolean) + + debug_print_raw_parse configuration parameter + + debug_print_parse (boolean) debug_print_parse configuration parameter @@ -7401,8 +7406,8 @@ local0.* /var/log/postgresql These parameters enable various debugging output to be emitted. - When set, they print the resulting parse tree, the query rewriter - output, or the execution plan for each executed query. + When set, they print the resulting raw parse tree, the parse tree, the query + rewriter output, or the execution plan for each executed query. These messages are emitted at LOG message level, so by default they will appear in the server log but will not be sent to the client. You can change that by adjusting @@ -7422,7 +7427,8 @@ local0.* /var/log/postgresql When set, debug_pretty_print indents the messages - produced by debug_print_parse, + produced by debug_print_raw_parse, + debug_print_parse, debug_print_rewritten, or debug_print_plan. This results in more readable but much longer output than the compact format used when diff --git a/doc/src/sgml/rules.sgml b/doc/src/sgml/rules.sgml index 8467d961fd0a0..282dcd722d495 100644 --- a/doc/src/sgml/rules.sgml +++ b/doc/src/sgml/rules.sgml @@ -60,6 +60,7 @@ SQL statement where the single parts that it is built from are stored separately. These query trees can be shown in the server log if you set the configuration parameters + debug_print_raw_parse, debug_print_parse, debug_print_rewritten, or debug_print_plan. The rule actions are also diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 0cecd4649020f..d356830f756be 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -649,6 +649,10 @@ pg_parse_query(const char *query_string) TRACE_POSTGRESQL_QUERY_PARSE_DONE(query_string); + if (Debug_print_raw_parse) + elog_node_display(LOG, "raw parse tree", raw_parsetree_list, + Debug_pretty_print); + return raw_parsetree_list; } @@ -3697,7 +3701,10 @@ set_debug_options(int debug_flag, GucContext context, GucSource source) if (debug_flag >= 2) SetConfigOption("log_statement", "all", context, source); if (debug_flag >= 3) + { + SetConfigOption("debug_print_raw_parse", "true", context, source); SetConfigOption("debug_print_parse", "true", context, source); + } if (debug_flag >= 4) SetConfigOption("debug_print_plan", "true", context, source); if (debug_flag >= 5) diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index a157cec3c4d00..0da01627cfec1 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -414,6 +414,12 @@ ifdef => 'DEBUG_NODE_TESTS_ENABLED', }, +{ name => 'debug_print_raw_parse', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', + short_desc => 'Logs each query\'s raw parse tree.', + variable => 'Debug_print_raw_parse', + boot_val => 'false', +}, + { name => 'debug_print_parse', type => 'bool', context => 'PGC_USERSET', group => 'LOGGING_WHAT', short_desc => 'Logs each query\'s parse tree.', variable => 'Debug_print_parse', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 787933a9e5acd..00c8376cf4ded 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -507,6 +507,7 @@ bool AllowAlterSystem = true; bool log_duration = false; bool Debug_print_plan = false; bool Debug_print_parse = false; +bool Debug_print_raw_parse = false; bool Debug_print_rewritten = false; bool Debug_pretty_print = true; diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index a9d8293474af5..26c0869356485 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -581,6 +581,7 @@ # - What to Log - +#debug_print_raw_parse = off #debug_print_parse = off #debug_print_rewritten = off #debug_print_plan = off diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 72981053e610f..756e80a2c2fcc 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -247,6 +247,7 @@ typedef enum /* GUC vars that are actually defined in guc_tables.c, rather than elsewhere */ extern PGDLLIMPORT bool Debug_print_plan; extern PGDLLIMPORT bool Debug_print_parse; +extern PGDLLIMPORT bool Debug_print_raw_parse; extern PGDLLIMPORT bool Debug_print_rewritten; extern PGDLLIMPORT bool Debug_pretty_print; From 43eb2c541941479714c11de9cfb7c67b54f1810d Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 8 Sep 2025 10:07:14 +0900 Subject: [PATCH 09/34] Update parser README to include parse_jsontable.c The README was missing parse_jsontable.c which handles JSON_TABLE. Oversight in de3600452b61. Author: Karthik S Discussion: https://postgr.es/m/CAK4gQD9gdcj+vq_FZGp=Rv-W+41v8_C7cmCUmDeu=cfrOdfXEw@mail.gmail.com Backpatch-through: 17 --- src/backend/parser/README | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/parser/README b/src/backend/parser/README index e0c986a41efea..e26eb437a9f35 100644 --- a/src/backend/parser/README +++ b/src/backend/parser/README @@ -20,6 +20,7 @@ parse_cte.c handle Common Table Expressions (WITH clauses) parse_expr.c handle expressions like col, col + 3, x = 3 or x = 4 parse_enr.c handle ephemeral named rels (trigger transition tables, ...) parse_func.c handle functions, table.column and column identifiers +parse_jsontable.c handle JSON_TABLE parse_merge.c handle MERGE parse_node.c create nodes for various structures parse_oper.c handle operators in expressions From 1f7e9ba3ac4eff13041abcc4c9c517ad835fa449 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Mon, 8 Sep 2025 06:10:15 +0000 Subject: [PATCH 10/34] Post-commit review fixes for 228c370868. This commit fixes three issues: 1) When a disabled subscription is created with retain_dead_tuples set to true, the launcher is not woken up immediately, which may lead to delays in creating the conflict detection slot. Creating the conflict detection slot is essential even when the subscription is not enabled. This ensures that dead tuples are retained, which is necessary for accurately identifying the type of conflict during replication. 2) Conflict-related data was unnecessarily retained when the subscription does not have a table. 3) Conflict-relevant data could be prematurely removed before applying prepared transactions on the publisher that are in the commit critical section. This issue occurred because the backend executing COMMIT PREPARED was not accounted for during the computation of oldestXid in the commit phase on the publisher. As a result, the subscriber could advance the conflict slot's xmin without waiting for such COMMIT PREPARED transactions to complete. We fixed this issue by identifying prepared transactions that are in the commit critical section during computation of oldestXid in commit phase. Author: Zhijie Hou Reviewed-by: shveta malik Reviewed-by: Dilip Kumar Reviewed-by: Nisha Moond Reviewed-by: Amit Kapila Discussion: https://postgr.es/m/OS9PR01MB16913DACB64E5721872AA5C02943BA@OS9PR01MB16913.jpnprd01.prod.outlook.com Discussion: https://postgr.es/m/OS9PR01MB16913F67856B0DA2A909788129400A@OS9PR01MB16913.jpnprd01.prod.outlook.com --- src/backend/access/transam/twophase.c | 55 +++++++++++++++++++++ src/backend/commands/subscriptioncmds.c | 12 ++++- src/backend/replication/logical/tablesync.c | 26 ++++++++++ src/backend/replication/logical/worker.c | 25 ++++++++-- src/backend/replication/walsender.c | 12 +++++ src/include/access/twophase.h | 2 + src/include/replication/worker_internal.h | 1 + src/test/subscription/t/035_conflicts.pl | 29 +++++++++++ 8 files changed, 157 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 7918176fc588e..3e20f4487872e 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -2809,3 +2809,58 @@ LookupGXactBySubid(Oid subid) return found; } + +/* + * TwoPhaseGetXidByLockingProc + * Return the oldest transaction ID from prepared transactions that are + * currently in the commit critical section. + * + * This function only considers transactions in the currently connected + * database. If no matching transactions are found, it returns + * InvalidTransactionId. + */ +TransactionId +TwoPhaseGetOldestXidInCommit(void) +{ + TransactionId oldestRunningXid = InvalidTransactionId; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (int i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGPROC *commitproc; + TransactionId xid; + + if (!gxact->valid) + continue; + + if (gxact->locking_backend == INVALID_PROC_NUMBER) + continue; + + /* + * Get the backend that is handling the transaction. It's safe to + * access this backend while holding TwoPhaseStateLock, as the backend + * can only be destroyed after either removing or unlocking the + * current global transaction, both of which require an exclusive + * TwoPhaseStateLock. + */ + commitproc = GetPGProcByNumber(gxact->locking_backend); + + if (MyDatabaseId != commitproc->databaseId) + continue; + + if ((commitproc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0) + continue; + + xid = XidFromFullTransactionId(gxact->fxid); + + if (!TransactionIdIsValid(oldestRunningXid) || + TransactionIdPrecedes(xid, oldestRunningXid)) + oldestRunningXid = xid; + } + + LWLockRelease(TwoPhaseStateLock); + + return oldestRunningXid; +} diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 82cf65fae737a..750d262fccade 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -854,7 +854,17 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, pgstat_create_subscription(subid); - if (opts.enabled) + /* + * Notify the launcher to start the apply worker if the subscription is + * enabled, or to create the conflict detection slot if retain_dead_tuples + * is enabled. + * + * Creating the conflict detection slot is essential even when the + * subscription is not enabled. This ensures that dead tuples are + * retained, which is necessary for accurately identifying the type of + * conflict during replication. + */ + if (opts.enabled || opts.retaindeadtuples) ApplyLauncherWakeupAtCommit(); ObjectAddressSet(myself, SubscriptionRelationId, subid); diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c index d3356bc84ee0c..e6da4028d392e 100644 --- a/src/backend/replication/logical/tablesync.c +++ b/src/backend/replication/logical/tablesync.c @@ -1788,6 +1788,32 @@ AllTablesyncsReady(void) return has_subrels && (table_states_not_ready == NIL); } +/* + * Return whether the subscription currently has any relations. + * + * Note: Unlike HasSubscriptionRelations(), this function relies on cached + * information for subscription relations. Additionally, it should not be + * invoked outside of apply or tablesync workers, as MySubscription must be + * initialized first. + */ +bool +HasSubscriptionRelationsCached(void) +{ + bool started_tx; + bool has_subrels; + + /* We need up-to-date subscription tables info here */ + has_subrels = FetchTableStates(&started_tx); + + if (started_tx) + { + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + return has_subrels; +} + /* * Update the two_phase state of the specified subscription in pg_subscription. */ diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index f1ebd63e792ee..c0f6bef5c282c 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -4595,11 +4595,28 @@ wait_for_local_flush(RetainDeadTuplesData *rdt_data) * workers is complex and not worth the effort, so we simply return if not * all tables are in the READY state. * - * It is safe to add new tables with initial states to the subscription - * after this check because any changes applied to these tables should - * have a WAL position greater than the rdt_data->remote_lsn. + * Advancing the transaction ID is necessary even when no tables are + * currently subscribed, to avoid retaining dead tuples unnecessarily. + * While it might seem safe to skip all phases and directly assign + * candidate_xid to oldest_nonremovable_xid during the + * RDT_GET_CANDIDATE_XID phase in such cases, this is unsafe. If users + * concurrently add tables to the subscription, the apply worker may not + * process invalidations in time. Consequently, + * HasSubscriptionRelationsCached() might miss the new tables, leading to + * premature advancement of oldest_nonremovable_xid. + * + * Performing the check during RDT_WAIT_FOR_LOCAL_FLUSH is safe, as + * invalidations are guaranteed to be processed before applying changes + * from newly added tables while waiting for the local flush to reach + * remote_lsn. + * + * Additionally, even if we check for subscription tables during + * RDT_GET_CANDIDATE_XID, they might be dropped before reaching + * RDT_WAIT_FOR_LOCAL_FLUSH. Therefore, it's still necessary to verify + * subscription tables at this stage to prevent unnecessary tuple + * retention. */ - if (!AllTablesyncsReady()) + if (HasSubscriptionRelationsCached() && !AllTablesyncsReady()) { TimestampTz now; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index e3dce9dc68d04..59822f22b8d06 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -51,6 +51,7 @@ #include "access/timeline.h" #include "access/transam.h" +#include "access/twophase.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" @@ -2719,6 +2720,7 @@ ProcessStandbyPSRequestMessage(void) { XLogRecPtr lsn = InvalidXLogRecPtr; TransactionId oldestXidInCommit; + TransactionId oldestGXidInCommit; FullTransactionId nextFullXid; FullTransactionId fullOldestXidInCommit; WalSnd *walsnd = MyWalSnd; @@ -2746,6 +2748,16 @@ ProcessStandbyPSRequestMessage(void) * ones replicated. */ oldestXidInCommit = GetOldestActiveTransactionId(true, false); + oldestGXidInCommit = TwoPhaseGetOldestXidInCommit(); + + /* + * Update the oldest xid for standby transmission if an older prepared + * transaction exists and is currently in commit phase. + */ + if (TransactionIdIsValid(oldestGXidInCommit) && + TransactionIdPrecedes(oldestGXidInCommit, oldestXidInCommit)) + oldestXidInCommit = oldestGXidInCommit; + nextFullXid = ReadNextFullTransactionId(); fullOldestXidInCommit = FullTransactionIdFromAllowableAt(nextFullXid, oldestXidInCommit); diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 509bdad9a5d55..64463e9f4afb4 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -68,4 +68,6 @@ extern void TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid_res, int szgid); extern bool LookupGXactBySubid(Oid subid); +extern TransactionId TwoPhaseGetOldestXidInCommit(void); + #endif /* TWOPHASE_H */ diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index 62ea1a0058081..de00380261279 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -272,6 +272,7 @@ extern void ReplicationOriginNameForLogicalRep(Oid suboid, Oid relid, char *originname, Size szoriginname); extern bool AllTablesyncsReady(void); +extern bool HasSubscriptionRelationsCached(void); extern void UpdateTwoPhaseState(Oid suboid, char new_state); extern void process_syncing_tables(XLogRecPtr current_lsn); diff --git a/src/test/subscription/t/035_conflicts.pl b/src/test/subscription/t/035_conflicts.pl index 51b23a39fa935..e06429c288fe6 100644 --- a/src/test/subscription/t/035_conflicts.pl +++ b/src/test/subscription/t/035_conflicts.pl @@ -386,6 +386,35 @@ .*Remote row \(2, 4\); replica identity full \(2, 2\)/, 'update target row was deleted in tab'); +############################################################################### +# Check that the xmin value of the conflict detection slot can be advanced when +# the subscription has no tables. +############################################################################### + +# Remove the table from the publication +$node_B->safe_psql('postgres', "ALTER PUBLICATION tap_pub_B DROP TABLE tab"); + +$node_A->safe_psql('postgres', + "ALTER SUBSCRIPTION $subname_AB REFRESH PUBLICATION"); + +# Remember the next transaction ID to be assigned +$next_xid = $node_A->safe_psql('postgres', "SELECT txid_current() + 1;"); + +# Confirm that the xmin value is advanced to the latest nextXid. If no +# transactions are running, the apply worker selects nextXid as the candidate +# for the non-removable xid. See GetOldestActiveTransactionId(). +ok( $node_A->poll_query_until( + 'postgres', + "SELECT xmin = $next_xid from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the xmin value of slot 'pg_conflict_detection' is updated on Node A"); + +# Re-add the table to the publication for further tests +$node_B->safe_psql('postgres', "ALTER PUBLICATION tap_pub_B ADD TABLE tab"); + +$node_A->safe_psql('postgres', + "ALTER SUBSCRIPTION $subname_AB REFRESH PUBLICATION WITH (copy_data = false)"); + ############################################################################### # Check that dead tuple retention stops due to the wait time surpassing # max_retention_duration. From 8191e0c16a0373f851a9f5a8112e3aec105b5276 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 8 Sep 2025 15:52:23 +0900 Subject: [PATCH 11/34] Fix corruption of pgstats shared hashtable due to OOM failures A new pgstats entry is created as a two-step process: - The entry is looked at in the shared hashtable of pgstats, and is inserted if not found. - When not found and inserted, its fields are then initialized. This part include a DSA chunk allocation for the stats data of the new entry. As currently coded, if the DSA chunk allocation fails due to an out-of-memory failure, an ERROR is generated, leaving in the pgstats shared hashtable an inconsistent entry due to the first step, as the entry has already been inserted in the hashtable. These broken entries can then be found by other backends, crashing them. There are only two callers of pgstat_init_entry(), when loading the pgstats file at startup and when creating a new pgstats entry. This commit changes pgstat_init_entry() so as we use dsa_allocate_extended() with DSA_ALLOC_NO_OOM, making it return NULL on allocation failure instead of failing. This way, a backend failing an entry creation can take appropriate cleanup actions in the shared hashtable before throwing an error. Currently, this means removing the entry from the shared hashtable before throwing the error for the allocation failure. Out-of-memory errors unlikely happen in the wild, and we do not bother with back-patches when these are fixed, usually. However, the problem dealt with here is a degree worse as it breaks the shared memory state of pgstats, impacting other processes that may look at an inconsistent entry that a different process has failed to create. Author: Mikhail Kot Discussion: https://postgr.es/m/CAAi9E7jELo5_-sBENftnc2E8XhW2PKZJWfTC3i2y-GMQd2bcqQ@mail.gmail.com Backpatch-through: 15 --- src/backend/utils/activity/pgstat.c | 11 +++++++++ src/backend/utils/activity/pgstat_shmem.c | 28 ++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index ffb5b8cce3441..f8e91484e36be 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -1975,6 +1975,17 @@ pgstat_read_statsfile(void) header = pgstat_init_entry(key.kind, p); dshash_release_lock(pgStatLocal.shared_hash, p); + if (header == NULL) + { + /* + * It would be tempting to switch this ERROR to a + * WARNING, but it would mean that all the statistics + * are discarded when the environment fails on OOM. + */ + elog(ERROR, "could not allocate entry %u/%u/%" PRIu64 " of type %c", + key.kind, key.dboid, + key.objid, t); + } if (!read_chunk(fpin, pgstat_get_entry_data(key.kind, header), diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c index 62de347445365..9dc3212f7dd01 100644 --- a/src/backend/utils/activity/pgstat_shmem.c +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -289,6 +289,13 @@ pgstat_detach_shmem(void) * ------------------------------------------------------------ */ +/* + * Initialize entry newly-created. + * + * Returns NULL in the event of an allocation failure, so as callers can + * take cleanup actions as the entry initialized is already inserted in the + * shared hashtable. + */ PgStatShared_Common * pgstat_init_entry(PgStat_Kind kind, PgStatShared_HashEntry *shhashent) @@ -311,7 +318,12 @@ pgstat_init_entry(PgStat_Kind kind, pg_atomic_init_u32(&shhashent->generation, 0); shhashent->dropped = false; - chunk = dsa_allocate0(pgStatLocal.dsa, pgstat_get_kind_info(kind)->shared_size); + chunk = dsa_allocate_extended(pgStatLocal.dsa, + pgstat_get_kind_info(kind)->shared_size, + DSA_ALLOC_ZERO | DSA_ALLOC_NO_OOM); + if (chunk == InvalidDsaPointer) + return NULL; + shheader = dsa_get_address(pgStatLocal.dsa, chunk); shheader->magic = 0xdeadbeef; @@ -509,6 +521,20 @@ pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, uint64 objid, bool create, if (!shfound) { shheader = pgstat_init_entry(kind, shhashent); + if (shheader == NULL) + { + /* + * Failed the allocation of a new entry, so clean up the + * shared hashtable before giving up. + */ + dshash_delete_entry(pgStatLocal.shared_hash, shhashent); + + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating entry %u/%u/%" PRIu64 ".", + key.kind, key.dboid, key.objid))); + } pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); if (created_entry != NULL) From 6456c6e2c4ad1cf9752e09cce37bfcfe2190c5e0 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Mon, 8 Sep 2025 11:38:02 +0000 Subject: [PATCH 12/34] Add test to prevent premature removal of conflict-relevant data. A test has been added to ensure that conflict-relevant data is not prematurely removed when a concurrent prepared transaction is being committed on the publisher. This test introduces an injection point that simulates the presence of a prepared transaction in the commit phase, validating that the system correctly delays conflict slot advancement until the transaction is fully committed. Additionally, the test serves as a safeguard for developers, ensuring that the acquisition of the commit timestamp does not occur before marking DELAY_CHKPT_IN_COMMIT in RecordTransactionCommitPrepared. Reported-by: Robert Haas Author: Zhijie Hou Reviewed-by: shveta malik Reviewed-by: Amit Kapila Discussion: https://postgr.es/m/OS9PR01MB16913F67856B0DA2A909788129400A@OS9PR01MB16913.jpnprd01.prod.outlook.com --- src/backend/access/transam/twophase.c | 6 + src/test/subscription/Makefile | 4 +- src/test/subscription/meson.build | 5 +- src/test/subscription/t/035_conflicts.pl | 160 +++++++++++++++++++++++ 4 files changed, 173 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 3e20f4487872e..d8e2fce2c99b7 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -103,6 +103,7 @@ #include "storage/proc.h" #include "storage/procarray.h" #include "utils/builtins.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/timestamp.h" @@ -2332,12 +2333,17 @@ RecordTransactionCommitPrepared(TransactionId xid, replorigin = (replorigin_session_origin != InvalidRepOriginId && replorigin_session_origin != DoNotReplicateId); + /* Load the injection point before entering the critical section */ + INJECTION_POINT_LOAD("commit-after-delay-checkpoint"); + START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + INJECTION_POINT_CACHED("commit-after-delay-checkpoint", NULL); + /* * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible before * commit time is written. diff --git a/src/test/subscription/Makefile b/src/test/subscription/Makefile index 50b65d8f6ea21..9d97e7d5c0d6d 100644 --- a/src/test/subscription/Makefile +++ b/src/test/subscription/Makefile @@ -13,9 +13,11 @@ subdir = src/test/subscription top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -EXTRA_INSTALL = contrib/hstore +EXTRA_INSTALL = contrib/hstore \ + src/test/modules/injection_points export with_icu +export enable_injection_points check: $(prove_check) diff --git a/src/test/subscription/meson.build b/src/test/subscription/meson.build index 586ffba434e11..20b4e523d9307 100644 --- a/src/test/subscription/meson.build +++ b/src/test/subscription/meson.build @@ -5,7 +5,10 @@ tests += { 'sd': meson.current_source_dir(), 'bd': meson.current_build_dir(), 'tap': { - 'env': {'with_icu': icu.found() ? 'yes' : 'no'}, + 'env': { + 'with_icu': icu.found() ? 'yes' : 'no', + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, 'tests': [ 't/001_rep_changes.pl', 't/002_types.pl', diff --git a/src/test/subscription/t/035_conflicts.pl b/src/test/subscription/t/035_conflicts.pl index e06429c288fe6..db0d5b464e825 100644 --- a/src/test/subscription/t/035_conflicts.pl +++ b/src/test/subscription/t/035_conflicts.pl @@ -415,6 +415,166 @@ $node_A->safe_psql('postgres', "ALTER SUBSCRIPTION $subname_AB REFRESH PUBLICATION WITH (copy_data = false)"); +############################################################################### +# Test that publisher's transactions marked with DELAY_CHKPT_IN_COMMIT prevent +# concurrently deleted tuples on the subscriber from being removed. This test +# also acts as a safeguard to prevent developers from moving the commit +# timestamp acquisition before marking DELAY_CHKPT_IN_COMMIT in +# RecordTransactionCommitPrepared. +############################################################################### + +my $injection_points_supported = $node_B->check_extension('injection_points'); + +# This test depends on an injection point to block the prepared transaction +# commit after marking DELAY_CHKPT_IN_COMMIT flag. +if ($injection_points_supported != 0) +{ + $node_B->append_conf('postgresql.conf', + "shared_preload_libraries = 'injection_points' + max_prepared_transactions = 1"); + $node_B->restart; + + # Disable the subscription on Node B for testing only one-way + # replication. + $node_B->psql('postgres', "ALTER SUBSCRIPTION $subname_BA DISABLE;"); + + # Wait for the apply worker to stop + $node_B->poll_query_until('postgres', + "SELECT count(*) = 0 FROM pg_stat_activity WHERE backend_type = 'logical replication apply worker'" + ); + + # Truncate the table to cleanup existing dead rows in the table. Then insert + # a new row. + $node_B->safe_psql( + 'postgres', qq( + TRUNCATE tab; + INSERT INTO tab VALUES(1, 1); + )); + + $node_B->wait_for_catchup($subname_AB); + + # Create the injection_points extension on the publisher node and attach to the + # commit-after-delay-checkpoint injection point. + $node_B->safe_psql( + 'postgres', + "CREATE EXTENSION injection_points; + SELECT injection_points_attach('commit-after-delay-checkpoint', 'wait');" + ); + + # Start a background session on the publisher node to perform an update and + # pause at the injection point. + my $pub_session = $node_B->background_psql('postgres'); + $pub_session->query_until( + qr/starting_bg_psql/, + q{ + \echo starting_bg_psql + BEGIN; + UPDATE tab SET b = 2 WHERE a = 1; + PREPARE TRANSACTION 'txn_with_later_commit_ts'; + COMMIT PREPARED 'txn_with_later_commit_ts'; + } + ); + + # Confirm the update is suspended + $result = + $node_B->safe_psql('postgres', 'SELECT * FROM tab WHERE a = 1'); + is($result, qq(1|1), 'publisher sees the old row'); + + # Delete the row on the subscriber. The deleted row should be retained due to a + # transaction on the publisher, which is currently marked with the + # DELAY_CHKPT_IN_COMMIT flag. + $node_A->safe_psql('postgres', "DELETE FROM tab WHERE a = 1;"); + + # Get the commit timestamp for the delete + my $sub_ts = $node_A->safe_psql('postgres', + "SELECT timestamp FROM pg_last_committed_xact();"); + + $log_location = -s $node_A->logfile; + + # Confirm that the apply worker keeps requesting publisher status, while + # awaiting the prepared transaction to commit. Thus, the request log should + # appear more than once. + $node_A->wait_for_log( + qr/sending publisher status request message/, + $log_location); + + $log_location = -s $node_A->logfile; + + $node_A->wait_for_log( + qr/sending publisher status request message/, + $log_location); + + # Confirm that the dead tuple cannot be removed + ($cmdret, $stdout, $stderr) = + $node_A->psql('postgres', qq(VACUUM (verbose) public.tab;)); + + ok($stderr =~ qr/1 are dead but not yet removable/, + 'the deleted column is non-removable'); + + $log_location = -s $node_A->logfile; + + # Wakeup and detach the injection point on the publisher node. The prepared + # transaction should now commit. + $node_B->safe_psql( + 'postgres', + "SELECT injection_points_wakeup('commit-after-delay-checkpoint'); + SELECT injection_points_detach('commit-after-delay-checkpoint');" + ); + + # Close the background session on the publisher node + ok($pub_session->quit, "close publisher session"); + + # Confirm that the transaction committed + $result = + $node_B->safe_psql('postgres', 'SELECT * FROM tab WHERE a = 1'); + is($result, qq(1|2), 'publisher sees the new row'); + + # Ensure the UPDATE is replayed on subscriber + $node_B->wait_for_catchup($subname_AB); + + $logfile = slurp_file($node_A->logfile(), $log_location); + ok( $logfile =~ + qr/conflict detected on relation "public.tab": conflict=update_deleted.* +.*DETAIL:.* The row to be updated was deleted locally in transaction [0-9]+ at .* +.*Remote row \(1, 2\); replica identity full \(1, 1\)/, + 'update target row was deleted in tab'); + + # Remember the next transaction ID to be assigned + $next_xid = + $node_A->safe_psql('postgres', "SELECT txid_current() + 1;"); + + # Confirm that the xmin value is advanced to the latest nextXid after the + # prepared transaction on the publisher has been committed. + ok( $node_A->poll_query_until( + 'postgres', + "SELECT xmin = $next_xid from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the xmin value of slot 'pg_conflict_detection' is updated on subscriber" + ); + + # Confirm that the dead tuple can be removed now + ($cmdret, $stdout, $stderr) = + $node_A->psql('postgres', qq(VACUUM (verbose) public.tab;)); + + ok($stderr =~ qr/1 removed, 0 remain, 0 are dead but not yet removable/, + 'the deleted column is removed'); + + # Get the commit timestamp for the publisher's update + my $pub_ts = $node_B->safe_psql('postgres', + "SELECT pg_xact_commit_timestamp(xmin) from tab where a=1;"); + + # Check that the commit timestamp for the update on the publisher is later than + # or equal to the timestamp of the local deletion, as the commit timestamp + # should be assigned after marking the DELAY_CHKPT_IN_COMMIT flag. + $result = $node_B->safe_psql('postgres', + "SELECT '$pub_ts'::timestamp >= '$sub_ts'::timestamp"); + is($result, qq(t), + "pub UPDATE's timestamp is later than that of sub's DELETE"); + + # Re-enable the subscription for further tests + $node_B->psql('postgres', "ALTER SUBSCRIPTION $subname_BA ENABLE;"); +} + ############################################################################### # Check that dead tuple retention stops due to the wait time surpassing # max_retention_duration. From 3399c265543ec3cdbeff2fa2900e03b326705f63 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Mon, 8 Sep 2025 10:22:42 -0400 Subject: [PATCH 13/34] Remove unneeded VM pin from VM replay Previously, heap_xlog_visible() called visibilitymap_pin() even after getting a buffer from XLogReadBufferForRedoExtended() -- which returns a pinned buffer containing the specified block of the visibility map. This would just have resulted in visibilitymap_pin() returning early since the specified page was already present and pinned, but it was confusing extraneous code, so remove it. It doesn't seem worth backporting, though. It appears to be an oversight in 2c03216. While we are at it, remove two VM-related redundant asserts in the COPY FREEZE code path. visibilitymap_set() already asserts that PD_ALL_VISIBLE is set on the heap page and checks that the vmbuffer contains the bits corresponding to the specified heap block, so callers do not also need to check this. Author: Melanie Plageman Reported-by: Melanie Plageman Reported-by: Kirill Reshke Reviewed-by: Kirill Reshke Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CALdSSPhu7WZd%2BEfQDha1nz%3DDC93OtY1%3DUFEdWwSZsASka_2eRQ%40mail.gmail.com --- src/backend/access/heap/heapam.c | 3 --- src/backend/access/heap/heapam_xlog.c | 1 - 2 files changed, 4 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index e3e7307ef5f79..4c5ae205a7a60 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2647,9 +2647,6 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, */ if (all_frozen_set) { - Assert(PageIsAllVisible(page)); - Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer)); - /* * It's fine to use InvalidTransactionId here - this is only used * when HEAP_INSERT_FROZEN is specified, which intentionally diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c index 5d48f071f53a7..cf843277938de 100644 --- a/src/backend/access/heap/heapam_xlog.c +++ b/src/backend/access/heap/heapam_xlog.c @@ -295,7 +295,6 @@ heap_xlog_visible(XLogReaderState *record) LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); reln = CreateFakeRelcacheEntry(rlocator); - visibilitymap_pin(reln, blkno, &vmbuffer); visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer, xlrec->snapshotConflictHorizon, vmbits); From 585e31fcb6dfcb1d88cfee2371f565574db24869 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 8 Sep 2025 11:50:33 -0400 Subject: [PATCH 14/34] Don't generate fake "*SELECT*" or "*SELECT* %d" subquery aliases. rte->alias should point only to a user-written alias, but in these cases that principle was violated. Fixing this causes some regression test output changes: wherever rte->alias previously had a value and is now NULL, rte->eref is now set to a generated name rather than to rte->alias; and the scheme used to generate eref names differs from what we were doing for aliases. The upshot is that instead of "*SELECT*" or "*SELECT* %d", EXPLAIN will now emit "unnamed_subquery" or "unnamed_subquery_%d". But that's a reasonable descriptor, and we were already producing that in yet other cases, so this seems not too objectionable. Author: Tom Lane Co-authored-by: Robert Haas Discussion: https://postgr.es/m/CA+TgmoYSYmDA2GvanzPMci084n+mVucv0bJ0HPbs6uhmMN6HMg@mail.gmail.com --- contrib/postgres_fdw/expected/postgres_fdw.out | 8 ++++---- src/backend/executor/functions.c | 2 +- src/backend/parser/analyze.c | 7 ++----- src/test/regress/expected/partition_prune.out | 4 ++-- src/test/regress/expected/rangefuncs.out | 8 ++++---- src/test/regress/expected/union.out | 14 +++++++------- 6 files changed, 20 insertions(+), 23 deletions(-) diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 78b8367d28935..18d727d77907a 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -5086,13 +5086,13 @@ SELECT ft1.c1 FROM ft1 JOIN ft2 on ft1.c1 = ft2.c1 WHERE -- =================================================================== EXPLAIN (verbose, costs off) INSERT INTO ft2 (c1,c2,c3) SELECT c1+1000,c2+100, c3 || c3 FROM ft2 LIMIT 20; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ Insert on public.ft2 Remote SQL: INSERT INTO "S 1"."T 1"("C 1", c2, c3, c4, c5, c6, c7, c8) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) Batch Size: 1 - -> Subquery Scan on "*SELECT*" - Output: "*SELECT*"."?column?", "*SELECT*"."?column?_1", NULL::integer, "*SELECT*"."?column?_2", NULL::timestamp with time zone, NULL::timestamp without time zone, NULL::character varying(10), 'ft2 '::character(10), NULL::user_enum + -> Subquery Scan on unnamed_subquery + Output: unnamed_subquery."?column?", unnamed_subquery."?column?_1", NULL::integer, unnamed_subquery."?column?_2", NULL::timestamp with time zone, NULL::timestamp without time zone, NULL::character varying(10), 'ft2 '::character(10), NULL::user_enum -> Foreign Scan on public.ft2 ft2_1 Output: (ft2_1.c1 + 1000), (ft2_1.c2 + 100), (ft2_1.c3 || ft2_1.c3) Remote SQL: SELECT "C 1", c2, c3 FROM "S 1"."T 1" LIMIT 20::bigint diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c index 97455b1ed4a5b..630d708d2a3f0 100644 --- a/src/backend/executor/functions.c +++ b/src/backend/executor/functions.c @@ -2483,7 +2483,7 @@ check_sql_stmt_retval(List *queryTreeList, rte = makeNode(RangeTblEntry); rte->rtekind = RTE_SUBQUERY; rte->subquery = parse; - rte->eref = rte->alias = makeAlias("*SELECT*", colnames); + rte->eref = makeAlias("unnamed_subquery", colnames); rte->lateral = false; rte->inh = false; rte->inFromCl = true; diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 34f7c17f576ef..b9763ea17144c 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -777,7 +777,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) */ nsitem = addRangeTableEntryForSubquery(pstate, selectQuery, - makeAlias("*SELECT*", NIL), + NULL, false, false); addNSItemToQuery(pstate, nsitem, true, false, false); @@ -2100,7 +2100,6 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt, { /* Process leaf SELECT */ Query *selectQuery; - char selectName[32]; ParseNamespaceItem *nsitem; RangeTblRef *rtr; ListCell *tl; @@ -2156,11 +2155,9 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt, /* * Make the leaf query be a subquery in the top-level rangetable. */ - snprintf(selectName, sizeof(selectName), "*SELECT* %d", - list_length(pstate->p_rtable) + 1); nsitem = addRangeTableEntryForSubquery(pstate, selectQuery, - makeAlias(selectName, NIL), + NULL, false, false); diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index d1966cd7d829f..68ecd95180920 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -4763,7 +4763,7 @@ select min(a) over (partition by a order by a) from part_abc where a >= stable_o QUERY PLAN ---------------------------------------------------------------------------------------------- Append - -> Subquery Scan on "*SELECT* 1_1" + -> Subquery Scan on unnamed_subquery_2 -> WindowAgg Window: w1 AS (PARTITION BY part_abc.a ORDER BY part_abc.a) -> Append @@ -4780,7 +4780,7 @@ select min(a) over (partition by a order by a) from part_abc where a >= stable_o -> Index Scan using part_abc_3_2_a_idx on part_abc_3_2 part_abc_4 Index Cond: (a >= (stable_one() + 1)) Filter: (d <= stable_one()) - -> Subquery Scan on "*SELECT* 2" + -> Subquery Scan on unnamed_subquery_1 -> WindowAgg Window: w1 AS (PARTITION BY part_abc_5.a ORDER BY part_abc_5.a) -> Append diff --git a/src/test/regress/expected/rangefuncs.out b/src/test/regress/expected/rangefuncs.out index c21be83aa4aaf..30241e22da270 100644 --- a/src/test/regress/expected/rangefuncs.out +++ b/src/test/regress/expected/rangefuncs.out @@ -2130,10 +2130,10 @@ select testrngfunc(); explain (verbose, costs off) select * from testrngfunc(); - QUERY PLAN ----------------------------------------------------------- - Subquery Scan on "*SELECT*" - Output: "*SELECT*"."?column?", "*SELECT*"."?column?_1" + QUERY PLAN +---------------------------------------------------------------------- + Subquery Scan on unnamed_subquery + Output: unnamed_subquery."?column?", unnamed_subquery."?column?_1" -> Unique Output: (1), (2) -> Sort diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out index 96962817ed45a..d3ea433db1577 100644 --- a/src/test/regress/expected/union.out +++ b/src/test/regress/expected/union.out @@ -942,7 +942,7 @@ SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1; ERROR: column "q2" does not exist LINE 1: ... int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1... ^ -DETAIL: There is a column named "q2" in table "*SELECT* 2", but it cannot be referenced from this part of the query. +DETAIL: There is a column named "q2" in table "unnamed_subquery", but it cannot be referenced from this part of the query. -- But this should work: SELECT q1 FROM int8_tbl EXCEPT (((SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1))) ORDER BY 1; q1 @@ -1338,14 +1338,14 @@ where q2 = q2; ---------------------------------------------------------- Unique -> Merge Append - Sort Key: "*SELECT* 1".q1 - -> Subquery Scan on "*SELECT* 1" + Sort Key: unnamed_subquery.q1 + -> Subquery Scan on unnamed_subquery -> Unique -> Sort Sort Key: i81.q1, i81.q2 -> Seq Scan on int8_tbl i81 Filter: (q2 IS NOT NULL) - -> Subquery Scan on "*SELECT* 2" + -> Subquery Scan on unnamed_subquery_1 -> Unique -> Sort Sort Key: i82.q1, i82.q2 @@ -1374,14 +1374,14 @@ where -q1 = q2; -------------------------------------------------------- Unique -> Merge Append - Sort Key: "*SELECT* 1".q1 - -> Subquery Scan on "*SELECT* 1" + Sort Key: unnamed_subquery.q1 + -> Subquery Scan on unnamed_subquery -> Unique -> Sort Sort Key: i81.q1, i81.q2 -> Seq Scan on int8_tbl i81 Filter: ((- q1) = q2) - -> Subquery Scan on "*SELECT* 2" + -> Subquery Scan on unnamed_subquery_1 -> Unique -> Sort Sort Key: i82.q1, i82.q2 From 6f79024df3461f794aace8bbc8706d8e5f7da091 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 8 Sep 2025 12:24:02 -0400 Subject: [PATCH 15/34] Don't generate fake "ANY_subquery" aliases, either. This is just like the previous commit, but for a different invented alias name. Author: Robert Haas Reviewed-by: Tom Lane Discussion: https://postgr.es/m/CA+TgmoYSYmDA2GvanzPMci084n+mVucv0bJ0HPbs6uhmMN6HMg@mail.gmail.com --- src/backend/optimizer/plan/subselect.c | 2 +- src/test/regress/expected/memoize.out | 8 ++++---- src/test/regress/expected/subselect.out | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index d71ed958e31b3..fae18548e074e 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1397,7 +1397,7 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, */ nsitem = addRangeTableEntryForSubquery(pstate, subselect, - makeAlias("ANY_subquery", NIL), + NULL, use_lateral, false); rte = nsitem->p_rte; diff --git a/src/test/regress/expected/memoize.out b/src/test/regress/expected/memoize.out index 150dc1b44cf62..fbcaf113266c5 100644 --- a/src/test/regress/expected/memoize.out +++ b/src/test/regress/expected/memoize.out @@ -545,15 +545,15 @@ EXPLAIN (COSTS OFF) SELECT * FROM tab_anti t1 WHERE t1.a IN (SELECT a FROM tab_anti t2 WHERE t2.b IN (SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0)); - QUERY PLAN -------------------------------------------------- + QUERY PLAN +--------------------------------------------------- Nested Loop Semi Join -> Seq Scan on tab_anti t1 -> Nested Loop Semi Join Join Filter: (t1.a = t2.a) -> Seq Scan on tab_anti t2 - -> Subquery Scan on "ANY_subquery" - Filter: (t2.b = "ANY_subquery".b) + -> Subquery Scan on unnamed_subquery + Filter: (t2.b = unnamed_subquery.b) -> Result One-Time Filter: (t2.a > 1) -> Seq Scan on tab_anti t3 diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c16dff05bc12e..7a1c216a0b1b7 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1692,14 +1692,14 @@ select * from int4_tbl o where (f1, f1) in ------------------------------------------------------------------- Nested Loop Semi Join Output: o.f1 - Join Filter: (o.f1 = "ANY_subquery".f1) + Join Filter: (o.f1 = unnamed_subquery.f1) -> Seq Scan on public.int4_tbl o Output: o.f1 -> Materialize - Output: "ANY_subquery".f1, "ANY_subquery".g - -> Subquery Scan on "ANY_subquery" - Output: "ANY_subquery".f1, "ANY_subquery".g - Filter: ("ANY_subquery".f1 = "ANY_subquery".g) + Output: unnamed_subquery.f1, unnamed_subquery.g + -> Subquery Scan on unnamed_subquery + Output: unnamed_subquery.f1, unnamed_subquery.g + Filter: (unnamed_subquery.f1 = unnamed_subquery.g) -> Result Output: i.f1, ((generate_series(1, 50)) / 10) -> ProjectSet @@ -2867,8 +2867,8 @@ ON B.hundred in (SELECT min(c.hundred) FROM tenk2 C WHERE c.odd = b.odd); -> Memoize Cache Key: b.hundred, b.odd Cache Mode: binary - -> Subquery Scan on "ANY_subquery" - Filter: (b.hundred = "ANY_subquery".min) + -> Subquery Scan on unnamed_subquery + Filter: (b.hundred = unnamed_subquery.min) -> Result InitPlan 1 -> Limit From 5a170e992a4d402ef0e1b8ce7284cd78879ece73 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 8 Sep 2025 12:58:07 -0400 Subject: [PATCH 16/34] Don't generate fake "*TLOCRN*" or "*TROCRN*" aliases, either. This is just like the previous two commits, except that this fix actually doesn't change any regression test outputs. Author: Robert Haas Reviewed-by: Tom Lane Discussion: https://postgr.es/m/CA+TgmoYSYmDA2GvanzPMci084n+mVucv0bJ0HPbs6uhmMN6HMg@mail.gmail.com --- src/backend/rewrite/rewriteSearchCycle.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/rewrite/rewriteSearchCycle.c b/src/backend/rewrite/rewriteSearchCycle.c index 9f95d4dc1b0e8..5202ef43d1068 100644 --- a/src/backend/rewrite/rewriteSearchCycle.c +++ b/src/backend/rewrite/rewriteSearchCycle.c @@ -282,8 +282,8 @@ rewriteSearchAndCycle(CommonTableExpr *cte) newrte = makeNode(RangeTblEntry); newrte->rtekind = RTE_SUBQUERY; - newrte->alias = makeAlias("*TLOCRN*", cte->ctecolnames); - newrte->eref = newrte->alias; + newrte->alias = NULL; + newrte->eref = makeAlias("*TLOCRN*", cte->ctecolnames); newsubquery = copyObject(rte1->subquery); IncrementVarSublevelsUp((Node *) newsubquery, 1, 1); newrte->subquery = newsubquery; @@ -379,8 +379,8 @@ rewriteSearchAndCycle(CommonTableExpr *cte) ewcl = lappend(ewcl, makeString(cte->cycle_clause->cycle_mark_column)); ewcl = lappend(ewcl, makeString(cte->cycle_clause->cycle_path_column)); } - newrte->alias = makeAlias("*TROCRN*", ewcl); - newrte->eref = newrte->alias; + newrte->alias = NULL; + newrte->eref = makeAlias("*TROCRN*", ewcl); /* * Find the reference to the recursive CTE in the right UNION subquery's From 4b5f206de2bb9152a99a5c218caf2580cc5a0e9e Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Mon, 8 Sep 2025 14:25:10 -0400 Subject: [PATCH 17/34] Remove unused xl_heap_prune member, reason f83d709760d8 refactored xl_heap_prune and added an unused member, reason. While PruneReason is used when constructing this WAL record to set the WAL record definition, it doesn't need to be stored in a separate field in the record. Remove it. We won't backport this, since modifying an exposed struct definition to remove an unused field would do more harm than good. Author: Melanie Plageman Reported-by: Andres Freund Reviewed-by: Robert Haas Discussion: https://postgr.es/m/tvvtfoxz5ykpsctxjbzxg3nldnzfc7geplrt2z2s54pmgto27y%40hbijsndifu45 --- src/include/access/heapam_xlog.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 277df6b3cf0b3..d4c0625b63228 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -284,7 +284,6 @@ typedef struct xl_heap_update */ typedef struct xl_heap_prune { - uint8 reason; uint8 flags; /* From 3bcfcd815e1a2d51772ba27e0d034467f0344f15 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Mon, 8 Sep 2025 14:19:48 -0500 Subject: [PATCH 18/34] pg_upgrade: Transfer pg_largeobject_metadata's files when possible. Commit 161a3e8b68 taught pg_upgrade to use COPY for large object metadata for upgrades from v12 and newer, which is much faster to restore than the proper large object commands. For upgrades from v16 and newer, we can take this a step further and transfer the large object metadata files as if they were user tables. We can't transfer the files from older versions because the aclitem data type (needed by pg_largeobject_metadata.lomacl) changed its storage format in v16 (see commit 7b378237aa). Note that this commit is essentially a revert of commit 12a53c732c. There are a couple of caveats. First, we still need to COPY the corresponding pg_shdepend rows for large objects. Second, we need to COPY anything in pg_largeobject_metadata with a comment or security label, else restoring those will fail. This means that an upgrade in which every large object has a comment or security label won't gain anything from this commit, but it should at least avoid making those unusual use-cases any worse. pg_upgrade must also take care to transfer the relfilenodes of pg_largeobject_metadata and its index, as was done for pg_largeobject in commits d498e052b4 and bbe08b8869. Reviewed-by: Michael Paquier Discussion: https://postgr.es/m/aJ3_Gih_XW1_O2HF%40nathan --- src/backend/commands/tablecmds.c | 12 ++-- src/bin/pg_dump/pg_dump.c | 80 ++++++++++++++++++---- src/bin/pg_upgrade/Makefile | 3 +- src/bin/pg_upgrade/info.c | 11 ++- src/bin/pg_upgrade/pg_upgrade.c | 6 +- src/bin/pg_upgrade/t/006_transfer_modes.pl | 67 ++++++++++++++++++ 6 files changed, 154 insertions(+), 25 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 082a3575d621e..3be2e051d32fb 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -42,6 +42,7 @@ #include "catalog/pg_foreign_table.h" #include "catalog/pg_inherits.h" #include "catalog/pg_largeobject.h" +#include "catalog/pg_largeobject_metadata.h" #include "catalog/pg_namespace.h" #include "catalog/pg_opclass.h" #include "catalog/pg_policy.h" @@ -2389,12 +2390,15 @@ truncate_check_rel(Oid relid, Form_pg_class reltuple) /* * Most system catalogs can't be truncated at all, or at least not unless * allow_system_table_mods=on. As an exception, however, we allow - * pg_largeobject to be truncated as part of pg_upgrade, because we need - * to change its relfilenode to match the old cluster, and allowing a - * TRUNCATE command to be executed is the easiest way of doing that. + * pg_largeobject and pg_largeobject_metadata to be truncated as part of + * pg_upgrade, because we need to change its relfilenode to match the old + * cluster, and allowing a TRUNCATE command to be executed is the easiest + * way of doing that. */ if (!allowSystemTableMods && IsSystemClass(relid, reltuple) - && (!IsBinaryUpgrade || relid != LargeObjectRelationId)) + && (!IsBinaryUpgrade || + (relid != LargeObjectRelationId && + relid != LargeObjectMetadataRelationId))) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied: \"%s\" is a system catalog", diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index bea793456f969..b4c45ad803e94 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -1131,6 +1131,23 @@ main(int argc, char **argv) shdepend->dataObj->filtercond = "WHERE classid = 'pg_largeobject'::regclass " "AND dbid = (SELECT oid FROM pg_database " " WHERE datname = current_database())"; + + /* + * If upgrading from v16 or newer, only dump large objects with + * comments/seclabels. For these upgrades, pg_upgrade can copy/link + * pg_largeobject_metadata's files (which is usually faster) but we + * still need to dump LOs with comments/seclabels here so that the + * subsequent COMMENT and SECURITY LABEL commands work. pg_upgrade + * can't copy/link the files from older versions because aclitem + * (needed by pg_largeobject_metadata.lomacl) changed its storage + * format in v16. + */ + if (fout->remoteVersion >= 160000) + lo_metadata->dataObj->filtercond = "WHERE oid IN " + "(SELECT objoid FROM pg_description " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) " " + "UNION SELECT objoid FROM pg_seclabel " + "WHERE classoid = " CppAsString2(LargeObjectRelationId) ")"; } /* @@ -3629,26 +3646,32 @@ dumpDatabase(Archive *fout) /* * pg_largeobject comes from the old system intact, so set its * relfrozenxids, relminmxids and relfilenode. + * + * pg_largeobject_metadata also comes from the old system intact for + * upgrades from v16 and newer, so set its relfrozenxids, relminmxids, and + * relfilenode, too. pg_upgrade can't copy/link the files from older + * versions because aclitem (needed by pg_largeobject_metadata.lomacl) + * changed its storage format in v16. */ if (dopt->binary_upgrade) { PGresult *lo_res; PQExpBuffer loFrozenQry = createPQExpBuffer(); PQExpBuffer loOutQry = createPQExpBuffer(); + PQExpBuffer lomOutQry = createPQExpBuffer(); PQExpBuffer loHorizonQry = createPQExpBuffer(); + PQExpBuffer lomHorizonQry = createPQExpBuffer(); int ii_relfrozenxid, ii_relfilenode, ii_oid, ii_relminmxid; - /* - * pg_largeobject - */ if (fout->remoteVersion >= 90300) appendPQExpBuffer(loFrozenQry, "SELECT relfrozenxid, relminmxid, relfilenode, oid\n" "FROM pg_catalog.pg_class\n" - "WHERE oid IN (%u, %u);\n", - LargeObjectRelationId, LargeObjectLOidPNIndexId); + "WHERE oid IN (%u, %u, %u, %u);\n", + LargeObjectRelationId, LargeObjectLOidPNIndexId, + LargeObjectMetadataRelationId, LargeObjectMetadataOidIndexId); else appendPQExpBuffer(loFrozenQry, "SELECT relfrozenxid, 0 AS relminmxid, relfilenode, oid\n" "FROM pg_catalog.pg_class\n" @@ -3663,35 +3686,57 @@ dumpDatabase(Archive *fout) ii_oid = PQfnumber(lo_res, "oid"); appendPQExpBufferStr(loHorizonQry, "\n-- For binary upgrade, set pg_largeobject relfrozenxid and relminmxid\n"); + appendPQExpBufferStr(lomHorizonQry, "\n-- For binary upgrade, set pg_largeobject_metadata relfrozenxid and relminmxid\n"); appendPQExpBufferStr(loOutQry, "\n-- For binary upgrade, preserve pg_largeobject and index relfilenodes\n"); + appendPQExpBufferStr(lomOutQry, "\n-- For binary upgrade, preserve pg_largeobject_metadata and index relfilenodes\n"); for (int i = 0; i < PQntuples(lo_res); ++i) { Oid oid; RelFileNumber relfilenumber; + PQExpBuffer horizonQry; + PQExpBuffer outQry; + + oid = atooid(PQgetvalue(lo_res, i, ii_oid)); + relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode)); - appendPQExpBuffer(loHorizonQry, "UPDATE pg_catalog.pg_class\n" + if (oid == LargeObjectRelationId || + oid == LargeObjectLOidPNIndexId) + { + horizonQry = loHorizonQry; + outQry = loOutQry; + } + else + { + horizonQry = lomHorizonQry; + outQry = lomOutQry; + } + + appendPQExpBuffer(horizonQry, "UPDATE pg_catalog.pg_class\n" "SET relfrozenxid = '%u', relminmxid = '%u'\n" "WHERE oid = %u;\n", atooid(PQgetvalue(lo_res, i, ii_relfrozenxid)), atooid(PQgetvalue(lo_res, i, ii_relminmxid)), atooid(PQgetvalue(lo_res, i, ii_oid))); - oid = atooid(PQgetvalue(lo_res, i, ii_oid)); - relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode)); - - if (oid == LargeObjectRelationId) - appendPQExpBuffer(loOutQry, + if (oid == LargeObjectRelationId || + oid == LargeObjectMetadataRelationId) + appendPQExpBuffer(outQry, "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n", relfilenumber); - else if (oid == LargeObjectLOidPNIndexId) - appendPQExpBuffer(loOutQry, + else if (oid == LargeObjectLOidPNIndexId || + oid == LargeObjectMetadataOidIndexId) + appendPQExpBuffer(outQry, "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", relfilenumber); } appendPQExpBufferStr(loOutQry, "TRUNCATE pg_catalog.pg_largeobject;\n"); + appendPQExpBufferStr(lomOutQry, + "TRUNCATE pg_catalog.pg_largeobject_metadata;\n"); + appendPQExpBufferStr(loOutQry, loHorizonQry->data); + appendPQExpBufferStr(lomOutQry, lomHorizonQry->data); ArchiveEntry(fout, nilCatalogId, createDumpId(), ARCHIVE_OPTS(.tag = "pg_largeobject", @@ -3699,11 +3744,20 @@ dumpDatabase(Archive *fout) .section = SECTION_PRE_DATA, .createStmt = loOutQry->data)); + if (fout->remoteVersion >= 160000) + ArchiveEntry(fout, nilCatalogId, createDumpId(), + ARCHIVE_OPTS(.tag = "pg_largeobject_metadata", + .description = "pg_largeobject_metadata", + .section = SECTION_PRE_DATA, + .createStmt = lomOutQry->data)); + PQclear(lo_res); destroyPQExpBuffer(loFrozenQry); destroyPQExpBuffer(loHorizonQry); + destroyPQExpBuffer(lomHorizonQry); destroyPQExpBuffer(loOutQry); + destroyPQExpBuffer(lomOutQry); } PQclear(res); diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index f83d2b5d30955..69fcf593caec9 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -3,8 +3,7 @@ PGFILEDESC = "pg_upgrade - an in-place binary upgrade utility" PGAPPICON = win32 -# required for 003_upgrade_logical_replication_slots.pl -EXTRA_INSTALL=contrib/test_decoding +EXTRA_INSTALL=contrib/test_decoding src/test/modules/dummy_seclabel subdir = src/bin/pg_upgrade top_builddir = ../../.. diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index c39eb077c2fae..7ce0827016803 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -498,7 +498,10 @@ get_rel_infos_query(void) * * pg_largeobject contains user data that does not appear in pg_dump * output, so we have to copy that system table. It's easiest to do that - * by treating it as a user table. + * by treating it as a user table. We can do the same for + * pg_largeobject_metadata for upgrades from v16 and newer. pg_upgrade + * can't copy/link the files from older versions because aclitem (needed + * by pg_largeobject_metadata.lomacl) changed its storage format in v16. */ appendPQExpBuffer(&query, "WITH regular_heap (reloid, indtable, toastheap) AS ( " @@ -514,10 +517,12 @@ get_rel_infos_query(void) " 'binary_upgrade', 'pg_toast') AND " " c.oid >= %u::pg_catalog.oid) OR " " (n.nspname = 'pg_catalog' AND " - " relname IN ('pg_largeobject') ))), ", + " relname IN ('pg_largeobject'%s) ))), ", (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? ", " CppAsString2(RELKIND_SEQUENCE) : "", - FirstNormalObjectId); + FirstNormalObjectId, + (GET_MAJOR_VERSION(old_cluster.major_version) >= 1600) ? + ", 'pg_largeobject_metadata'" : ""); /* * Add a CTE that collects OIDs of toast tables belonging to the tables diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index d5cd5bf0b3a6b..490e98fa26f2a 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -29,9 +29,9 @@ * We control all assignments of pg_enum.oid because these oids are stored * in user tables as enum values. * - * We control all assignments of pg_authid.oid for historical reasons (the - * oids used to be stored in pg_largeobject_metadata, which is now copied via - * SQL commands), that might change at some point in the future. + * We control all assignments of pg_authid.oid because the oids are stored in + * pg_largeobject_metadata, which is copied via file transfer for upgrades + * from v16 and newer. * * We control all assignments of pg_database.oid because we want the directory * names to match between the old and new cluster. diff --git a/src/bin/pg_upgrade/t/006_transfer_modes.pl b/src/bin/pg_upgrade/t/006_transfer_modes.pl index 348f402146234..2f68f0b56aa61 100644 --- a/src/bin/pg_upgrade/t/006_transfer_modes.pl +++ b/src/bin/pg_upgrade/t/006_transfer_modes.pl @@ -45,6 +45,22 @@ sub test_mode $old->append_conf('postgresql.conf', "allow_in_place_tablespaces = true"); } + # We can only test security labels if both the old and new installations + # have dummy_seclabel. + my $test_seclabel = 1; + $old->start; + if (!$old->check_extension('dummy_seclabel')) + { + $test_seclabel = 0; + } + $old->stop; + $new->start; + if (!$new->check_extension('dummy_seclabel')) + { + $test_seclabel = 0; + } + $new->stop; + # Create a small variety of simple test objects on the old cluster. We'll # check that these reach the new version after upgrading. $old->start; @@ -83,6 +99,29 @@ sub test_mode $old->safe_psql('testdb3', "CREATE TABLE test6 AS SELECT generate_series(607, 711)"); } + + # While we are here, test handling of large objects. + $old->safe_psql('postgres', q| + CREATE ROLE regress_lo_1; + CREATE ROLE regress_lo_2; + + SELECT lo_from_bytea(4532, '\xffffff00'); + COMMENT ON LARGE OBJECT 4532 IS 'test'; + + SELECT lo_from_bytea(4533, '\x0f0f0f0f'); + ALTER LARGE OBJECT 4533 OWNER TO regress_lo_1; + GRANT SELECT ON LARGE OBJECT 4533 TO regress_lo_2; + |); + + if ($test_seclabel) + { + $old->safe_psql('postgres', q| + CREATE EXTENSION dummy_seclabel; + + SELECT lo_from_bytea(4534, '\x00ffffff'); + SECURITY LABEL ON LARGE OBJECT 4534 IS 'classified'; + |); + } $old->stop; my $result = command_ok_or_fails_like( @@ -132,6 +171,34 @@ sub test_mode $result = $new->safe_psql('testdb3', "SELECT COUNT(*) FROM test6"); is($result, '105', "test6 data after pg_upgrade $mode"); } + + # Tests for large objects + $result = $new->safe_psql('postgres', "SELECT lo_get(4532)"); + is($result, '\xffffff00', "LO contents after upgrade"); + $result = $new->safe_psql('postgres', + "SELECT obj_description(4532, 'pg_largeobject')"); + is($result, 'test', "comment on LO after pg_upgrade"); + + $result = $new->safe_psql('postgres', "SELECT lo_get(4533)"); + is($result, '\x0f0f0f0f', "LO contents after upgrade"); + $result = $new->safe_psql('postgres', + "SELECT lomowner::regrole FROM pg_largeobject_metadata WHERE oid = 4533"); + is($result, 'regress_lo_1', "LO owner after upgrade"); + $result = $new->safe_psql('postgres', + "SELECT lomacl FROM pg_largeobject_metadata WHERE oid = 4533"); + is($result, '{regress_lo_1=rw/regress_lo_1,regress_lo_2=r/regress_lo_1}', + "LO ACL after upgrade"); + + if ($test_seclabel) + { + $result = $new->safe_psql('postgres', "SELECT lo_get(4534)"); + is($result, '\x00ffffff', "LO contents after upgrade"); + $result = $new->safe_psql('postgres', q| + SELECT label FROM pg_seclabel WHERE objoid = 4534 + AND classoid = 'pg_largeobject'::regclass + |); + is($result, 'classified', "seclabel on LO after pg_upgrade"); + } $new->stop; } From 9af672bcb245950e58198119ba6eb17043fd3a6d Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Mon, 8 Sep 2025 12:29:42 -0700 Subject: [PATCH 19/34] meson: build checksums with extra optimization flags. Use -funroll-loops and -ftree-vectorize when building checksum.c to match what autoconf does. Discussion: https://postgr.es/m/a81f2f7ef34afc24a89c613671ea017e3651329c.camel@j-davis.com Reviewed-by: Andres Freund --- src/backend/storage/page/meson.build | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/backend/storage/page/meson.build b/src/backend/storage/page/meson.build index c3e4a805862a9..112f00ff36552 100644 --- a/src/backend/storage/page/meson.build +++ b/src/backend/storage/page/meson.build @@ -1,7 +1,15 @@ # Copyright (c) 2022-2025, PostgreSQL Global Development Group +checksum_backend_lib = static_library('checksum_backend_lib', + 'checksum.c', + dependencies: backend_build_deps, + kwargs: internal_lib_args, + c_args: vectorize_cflags + unroll_loops_cflags, +) + +backend_link_with += checksum_backend_lib + backend_sources += files( 'bufpage.c', - 'checksum.c', 'itemptr.c', ) From 8ec97e78a7713a1ebf4976b55c19f6c9bc2716d9 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Mon, 8 Sep 2025 17:13:31 -0400 Subject: [PATCH 20/34] Add error codes when vacuum discovers VM corruption Commit fd6ec93bf890314a and other previous work established the principle that when an error is potentially reachable in case of on-disk corruption but is not expected to be reached otherwise, ERRCODE_DATA_CORRUPTED should be used. This allows log monitoring software to search for evidence of corruption by filtering on the error code. Enhance the existing log messages emitted when the heap page is found to be inconsistent with the VM by adding this error code. Suggested-by: Andrey Borodin Author: Melanie Plageman Reviewed-by: Robert Haas Discussion: https://postgr.es/m/87DD95AA-274F-4F4F-BAD9-7738E5B1F905%40yandex-team.ru --- src/backend/access/heap/vacuumlazy.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 932701d8420dc..981d9380a925c 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -2121,8 +2121,11 @@ lazy_scan_prune(LVRelState *vacrel, else if (all_visible_according_to_vm && !PageIsAllVisible(page) && visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer) != 0) { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - vacrel->relname, blkno); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + vacrel->relname, blkno))); + visibilitymap_clear(vacrel->rel, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); } @@ -2143,8 +2146,11 @@ lazy_scan_prune(LVRelState *vacrel, */ else if (presult.lpdead_items > 0 && PageIsAllVisible(page)) { - elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", - vacrel->relname, blkno); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", + vacrel->relname, blkno))); + PageClearAllVisible(page); MarkBufferDirty(buf); visibilitymap_clear(vacrel->rel, blkno, vmbuffer, From 5ac3c1ac22cb325844d0bee37f79f2c11931b32e Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Tue, 9 Sep 2025 03:18:22 +0000 Subject: [PATCH 21/34] Fix Coverity issue reported in commit a850be2fe. Address a potential SIGSEGV that may occur when the tablesync worker attempts to locate a deleted row while applying changes. This situation arises during conflict detection for update-deleted scenarios. To prevent this crash, ensure that the operation is errored out early if the leader apply worker is unavailable. Since the leader worker maintains the necessary conflict detection metadata, proceeding without it serves no purpose and risks reporting incorrect conflict type. In the passing, improve a nearby comment. Reported by Tom Lane as per Coverity Author: shveta malik Reviewed-by: Amit Kapila Discussion: https://postgr.es/m/334468.1757280992@sss.pgh.pa.us --- src/backend/replication/logical/worker.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index c0f6bef5c282c..b3cac1023731a 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -3266,12 +3266,18 @@ FindDeletedTupleInLocalRel(Relation localrel, Oid localidxoid, /* * Obtain the information from the leader apply worker as only the - * leader manages conflict retention (see + * leader manages oldest_nonremovable_xid (see * maybe_advance_nonremovable_xid() for details). */ LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); leader = logicalrep_worker_find(MyLogicalRepWorker->subid, InvalidOid, false); + if (!leader) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not detect conflict as the leader apply worker has exited"))); + } SpinLockAcquire(&leader->relmutex); oldestxmin = leader->oldest_nonremovable_xid; From faf071b553830d39fc583beabcaf56ed65259acc Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Tue, 9 Sep 2025 10:39:30 +0100 Subject: [PATCH 22/34] Add date and timestamp variants of random(min, max). This adds 3 new variants of the random() function: random(min date, max date) returns date random(min timestamp, max timestamp) returns timestamp random(min timestamptz, max timestamptz) returns timestamptz Each returns a random value x in the range min <= x <= max. Author: Damien Clochard Reviewed-by: Greg Sabino Mullane Reviewed-by: Dean Rasheed Reviewed-by: Vik Fearing Reviewed-by: Chao Li Discussion: https://postgr.es/m/f524d8cab5914613d9e624d9ce177d3d@dalibo.info --- doc/src/sgml/func/func-datetime.sgml | 30 +++++++ doc/src/sgml/func/func-math.sgml | 3 +- src/backend/utils/adt/pseudorandomfuncs.c | 104 ++++++++++++++++++++-- src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 12 +++ src/test/regress/expected/random.out | 87 ++++++++++++++++++ src/test/regress/sql/random.sql | 26 ++++++ 7 files changed, 254 insertions(+), 10 deletions(-) diff --git a/doc/src/sgml/func/func-datetime.sgml b/doc/src/sgml/func/func-datetime.sgml index 482fe45f42ebc..98dd60aa9a7ec 100644 --- a/doc/src/sgml/func/func-datetime.sgml +++ b/doc/src/sgml/func/func-datetime.sgml @@ -928,6 +928,36 @@ + + + + random + + random ( min date, max date ) + date + + + random ( min timestamp, max timestamp ) + timestamp + + + random ( min timestamptz, max timestamptz ) + timestamptz + + + Returns a random value in the range + min <= x <= max. + + + random('1979-02-08'::date,'2025-07-03'::date) + 1983-04-21 + + + random('2000-01-01'::timestamptz, now()) + 2015-09-27 09:11:33.732707+00 + + + diff --git a/doc/src/sgml/func/func-math.sgml b/doc/src/sgml/func/func-math.sgml index 7528dc4cea4b9..fd821c0e70677 100644 --- a/doc/src/sgml/func/func-math.sgml +++ b/doc/src/sgml/func/func-math.sgml @@ -1151,7 +1151,8 @@ The random() and random_normal() - functions listed in use a + functions listed in and + use a deterministic pseudo-random number generator. It is fast but not suitable for cryptographic applications; see the module for a more diff --git a/src/backend/utils/adt/pseudorandomfuncs.c b/src/backend/utils/adt/pseudorandomfuncs.c index e7b8045f92508..1d2a981491bf5 100644 --- a/src/backend/utils/adt/pseudorandomfuncs.c +++ b/src/backend/utils/adt/pseudorandomfuncs.c @@ -17,6 +17,7 @@ #include "common/pg_prng.h" #include "miscadmin.h" +#include "utils/date.h" #include "utils/fmgrprotos.h" #include "utils/numeric.h" #include "utils/timestamp.h" @@ -25,6 +26,18 @@ static pg_prng_state prng_state; static bool prng_seed_set = false; +/* + * Macro for checking the range bounds of random(min, max) functions. Throws + * an error if they're the wrong way round. + */ +#define CHECK_RANGE_BOUNDS(rmin, rmax) \ + do { \ + if ((rmin) > (rmax)) \ + ereport(ERROR, \ + errcode(ERRCODE_INVALID_PARAMETER_VALUE), \ + errmsg("lower bound must be less than or equal to upper bound")); \ + } while (0) + /* * initialize_prng() - * @@ -129,10 +142,7 @@ int4random(PG_FUNCTION_ARGS) int32 rmax = PG_GETARG_INT32(1); int32 result; - if (rmin > rmax) - ereport(ERROR, - errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("lower bound must be less than or equal to upper bound")); + CHECK_RANGE_BOUNDS(rmin, rmax); initialize_prng(); @@ -153,10 +163,7 @@ int8random(PG_FUNCTION_ARGS) int64 rmax = PG_GETARG_INT64(1); int64 result; - if (rmin > rmax) - ereport(ERROR, - errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("lower bound must be less than or equal to upper bound")); + CHECK_RANGE_BOUNDS(rmin, rmax); initialize_prng(); @@ -177,9 +184,90 @@ numeric_random(PG_FUNCTION_ARGS) Numeric rmax = PG_GETARG_NUMERIC(1); Numeric result; + /* Leave range bound checking to random_numeric() */ + initialize_prng(); result = random_numeric(&prng_state, rmin, rmax); PG_RETURN_NUMERIC(result); } + + +/* + * date_random() - + * + * Returns a random date chosen uniformly in the specified range. + */ +Datum +date_random(PG_FUNCTION_ARGS) +{ + int32 rmin = (int32) PG_GETARG_DATEADT(0); + int32 rmax = (int32) PG_GETARG_DATEADT(1); + DateADT result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (DATE_IS_NOBEGIN(rmin) || DATE_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (DateADT) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_DATEADT(result); +} + +/* + * timestamp_random() - + * + * Returns a random timestamp chosen uniformly in the specified range. + */ +Datum +timestamp_random(PG_FUNCTION_ARGS) +{ + int64 rmin = (int64) PG_GETARG_TIMESTAMP(0); + int64 rmax = (int64) PG_GETARG_TIMESTAMP(1); + Timestamp result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (TIMESTAMP_IS_NOBEGIN(rmin) || TIMESTAMP_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (Timestamp) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_TIMESTAMP(result); +} + +/* + * timestamptz_random() - + * + * Returns a random timestamptz chosen uniformly in the specified range. + */ +Datum +timestamptz_random(PG_FUNCTION_ARGS) +{ + int64 rmin = (int64) PG_GETARG_TIMESTAMPTZ(0); + int64 rmax = (int64) PG_GETARG_TIMESTAMPTZ(1); + TimestampTz result; + + CHECK_RANGE_BOUNDS(rmin, rmax); + + if (TIMESTAMP_IS_NOBEGIN(rmin) || TIMESTAMP_IS_NOEND(rmax)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lower and upper bounds must be finite")); + + initialize_prng(); + + result = (TimestampTz) pg_prng_int64_range(&prng_state, rmin, rmax); + + PG_RETURN_TIMESTAMPTZ(result); +} diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 836369f163ef5..ef0d0f92165eb 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202509021 +#define CATALOG_VERSION_NO 202509091 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 118d6da1ace0e..03e82d28c8767 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3503,6 +3503,18 @@ proname => 'random', provolatile => 'v', proparallel => 'r', prorettype => 'numeric', proargtypes => 'numeric numeric', proargnames => '{min,max}', prosrc => 'numeric_random' }, +{ oid => '6431', descr => 'random date in range', + proname => 'random', provolatile => 'v', proparallel => 'r', + prorettype => 'date', proargtypes => 'date date', + proargnames => '{min,max}', prosrc => 'date_random' }, +{ oid => '6432', descr => 'random timestamp in range', + proname => 'random', provolatile => 'v', proparallel => 'r', + prorettype => 'timestamp', proargtypes => 'timestamp timestamp', + proargnames => '{min,max}', prosrc => 'timestamp_random' }, +{ oid => '6433', descr => 'random timestamptz in range', + proname => 'random', provolatile => 'v', proparallel => 'r', + prorettype => 'timestamptz', proargtypes => 'timestamptz timestamptz', + proargnames => '{min,max}', prosrc => 'timestamptz_random' }, { oid => '1599', descr => 'set random seed', proname => 'setseed', provolatile => 'v', proparallel => 'r', prorettype => 'void', proargtypes => 'float8', prosrc => 'setseed' }, diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out index 43cf88a36341b..7f17b2a1b12f8 100644 --- a/src/test/regress/expected/random.out +++ b/src/test/regress/expected/random.out @@ -536,3 +536,90 @@ SELECT n, random(0, trim_scale(abs(1 - 10.0^(-n)))) FROM generate_series(-20, 20 20 | 0.60795101234744211935 (41 rows) +-- random dates +SELECT random('1979-02-08'::date,'2025-07-03'::date) AS random_date_multiple_years; + random_date_multiple_years +---------------------------- + 04-09-1986 +(1 row) + +SELECT random('4714-11-24 BC'::date,'5874897-12-31 AD'::date) AS random_date_maximum_range; + random_date_maximum_range +--------------------------- + 10-02-2898131 +(1 row) + +SELECT random('1979-02-08'::date,'1979-02-08'::date) AS random_date_empty_range; + random_date_empty_range +------------------------- + 02-08-1979 +(1 row) + +SELECT random('2024-12-31'::date, '2024-01-01'::date); -- fail +ERROR: lower bound must be less than or equal to upper bound +SELECT random('-infinity'::date, '2024-01-01'::date); -- fail +ERROR: lower and upper bounds must be finite +SELECT random('2024-12-31'::date, 'infinity'::date); -- fail +ERROR: lower and upper bounds must be finite +-- random timestamps +SELECT random('1979-02-08'::timestamp,'2025-07-03'::timestamp) AS random_timestamp_multiple_years; + random_timestamp_multiple_years +--------------------------------- + Fri Jan 27 18:52:05.366009 2017 +(1 row) + +SELECT random('4714-11-24 BC'::timestamp,'294276-12-31 23:59:59.999999'::timestamp) AS random_timestamp_maximum_range; + random_timestamp_maximum_range +----------------------------------- + Wed Mar 28 00:45:36.180395 226694 +(1 row) + +SELECT random('2024-07-01 12:00:00.000001'::timestamp, '2024-07-01 12:00:00.999999'::timestamp) AS random_narrow_range; + random_narrow_range +--------------------------------- + Mon Jul 01 12:00:00.999286 2024 +(1 row) + +SELECT random('1979-02-08'::timestamp,'1979-02-08'::timestamp) AS random_timestamp_empty_range; + random_timestamp_empty_range +------------------------------ + Thu Feb 08 00:00:00 1979 +(1 row) + +SELECT random('2024-12-31'::timestamp, '2024-01-01'::timestamp); -- fail +ERROR: lower bound must be less than or equal to upper bound +SELECT random('-infinity'::timestamp, '2024-01-01'::timestamp); -- fail +ERROR: lower and upper bounds must be finite +SELECT random('2024-12-31'::timestamp, 'infinity'::timestamp); -- fail +ERROR: lower and upper bounds must be finite +-- random timestamps with timezone +SELECT random('1979-02-08 +01'::timestamptz,'2025-07-03 +02'::timestamptz) AS random_timestamptz_multiple_years; + random_timestamptz_multiple_years +------------------------------------- + Tue Jun 14 04:41:16.652896 2016 PDT +(1 row) + +SELECT random('4714-11-24 BC +00'::timestamptz,'294276-12-31 23:59:59.999999 +00'::timestamptz) AS random_timestamptz_maximum_range; + random_timestamptz_maximum_range +-------------------------------------- + Wed Mar 26 14:07:16.980265 31603 PDT +(1 row) + +SELECT random('2024-07-01 12:00:00.000001 +04'::timestamptz, '2024-07-01 12:00:00.999999 +04'::timestamptz) AS random_timestamptz_narrow_range; + random_timestamptz_narrow_range +------------------------------------- + Mon Jul 01 01:00:00.835808 2024 PDT +(1 row) + +SELECT random('1979-02-08 +05'::timestamptz,'1979-02-08 +05'::timestamptz) AS random_timestamptz_empty_range; + random_timestamptz_empty_range +-------------------------------- + Wed Feb 07 11:00:00 1979 PST +(1 row) + +SELECT random('2024-01-01 +06'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +ERROR: lower bound must be less than or equal to upper bound +SELECT random('-infinity'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +ERROR: lower and upper bounds must be finite +SELECT random('2024-01-01 +06'::timestamptz, 'infinity'::timestamptz); -- fail +ERROR: lower and upper bounds must be finite diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql index ebfa7539ede25..890f14687ef98 100644 --- a/src/test/regress/sql/random.sql +++ b/src/test/regress/sql/random.sql @@ -277,3 +277,29 @@ SELECT random(-1e30, 1e30) FROM generate_series(1, 10); SELECT random(-0.4, 0.4) FROM generate_series(1, 10); SELECT random(0, 1 - 1e-30) FROM generate_series(1, 10); SELECT n, random(0, trim_scale(abs(1 - 10.0^(-n)))) FROM generate_series(-20, 20) n; + +-- random dates +SELECT random('1979-02-08'::date,'2025-07-03'::date) AS random_date_multiple_years; +SELECT random('4714-11-24 BC'::date,'5874897-12-31 AD'::date) AS random_date_maximum_range; +SELECT random('1979-02-08'::date,'1979-02-08'::date) AS random_date_empty_range; +SELECT random('2024-12-31'::date, '2024-01-01'::date); -- fail +SELECT random('-infinity'::date, '2024-01-01'::date); -- fail +SELECT random('2024-12-31'::date, 'infinity'::date); -- fail + +-- random timestamps +SELECT random('1979-02-08'::timestamp,'2025-07-03'::timestamp) AS random_timestamp_multiple_years; +SELECT random('4714-11-24 BC'::timestamp,'294276-12-31 23:59:59.999999'::timestamp) AS random_timestamp_maximum_range; +SELECT random('2024-07-01 12:00:00.000001'::timestamp, '2024-07-01 12:00:00.999999'::timestamp) AS random_narrow_range; +SELECT random('1979-02-08'::timestamp,'1979-02-08'::timestamp) AS random_timestamp_empty_range; +SELECT random('2024-12-31'::timestamp, '2024-01-01'::timestamp); -- fail +SELECT random('-infinity'::timestamp, '2024-01-01'::timestamp); -- fail +SELECT random('2024-12-31'::timestamp, 'infinity'::timestamp); -- fail + +-- random timestamps with timezone +SELECT random('1979-02-08 +01'::timestamptz,'2025-07-03 +02'::timestamptz) AS random_timestamptz_multiple_years; +SELECT random('4714-11-24 BC +00'::timestamptz,'294276-12-31 23:59:59.999999 +00'::timestamptz) AS random_timestamptz_maximum_range; +SELECT random('2024-07-01 12:00:00.000001 +04'::timestamptz, '2024-07-01 12:00:00.999999 +04'::timestamptz) AS random_timestamptz_narrow_range; +SELECT random('1979-02-08 +05'::timestamptz,'1979-02-08 +05'::timestamptz) AS random_timestamptz_empty_range; +SELECT random('2024-01-01 +06'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +SELECT random('-infinity'::timestamptz, '2024-01-01 +07'::timestamptz); -- fail +SELECT random('2024-01-01 +06'::timestamptz, 'infinity'::timestamptz); -- fail From 81a61fde84ffc74f7b3c7854ed4193cc4d31f78b Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 9 Sep 2025 15:33:46 +0200 Subject: [PATCH 23/34] Fix typo in comment Author: Alexandra Wang Discussion: https://www.postgresql.org/message-id/CAK98qZ0whQ%3Dc%2BJGXbGSEBxCtLgy6sf-YGYqsKTAGsS-wt0wj%2BA%40mail.gmail.com --- src/backend/utils/adt/jsonbsubs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/jsonbsubs.c b/src/backend/utils/adt/jsonbsubs.c index de64d49851251..e8626d3b4fc6e 100644 --- a/src/backend/utils/adt/jsonbsubs.c +++ b/src/backend/utils/adt/jsonbsubs.c @@ -51,7 +51,7 @@ jsonb_subscript_transform(SubscriptingRef *sbsref, /* * Transform and convert the subscript expressions. Jsonb subscripting - * does not support slices, look only and the upper index. + * does not support slices, look only at the upper index. */ foreach(idx, indirection) { From 530cfa8eb50ca5a2151dfc50a6a5999ec8aff148 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Tue, 9 Sep 2025 14:09:36 -0500 Subject: [PATCH 24/34] test_slru: Fix LWLock tranche allocation in EXEC_BACKEND builds. Currently, test_slru's shmem_startup_hook unconditionally generates new LWLock tranche IDs. This is fine on non-EXEC_BACKEND builds, where only the postmaster executes this hook, but on EXEC_BACKEND builds, every backend executes it, too. To fix, only generate the tranche IDs in the postmaster process by checking the IsUnderPostmaster variable. This is arguably a bug fix and could be back-patched, but since the damage is limited to some extra unused tranche IDs in a test module, I'm not going to bother. Reported-by: Sami Imseih Reviewed-by: Sami Imseih Discussion: https://postgr.es/m/CAA5RZ0vaAuonaf12CeDddQJu5xKL%2B6xVyS%2B_q1%2BcH%3D33JXV82w%40mail.gmail.com --- src/test/modules/test_slru/test_slru.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c index 8c0367eeee424..e963466aef1cd 100644 --- a/src/test/modules/test_slru/test_slru.c +++ b/src/test/modules/test_slru/test_slru.c @@ -219,8 +219,8 @@ test_slru_shmem_startup(void) */ const bool long_segment_names = true; const char slru_dir_name[] = "pg_test_slru"; - int test_tranche_id; - int test_buffer_tranche_id; + int test_tranche_id = -1; + int test_buffer_tranche_id = -1; if (prev_shmem_startup_hook) prev_shmem_startup_hook(); @@ -231,10 +231,18 @@ test_slru_shmem_startup(void) */ (void) MakePGDirectory(slru_dir_name); - /* initialize the SLRU facility */ - test_tranche_id = LWLockNewTrancheId("test_slru_tranche"); - - test_buffer_tranche_id = LWLockNewTrancheId("test_buffer_tranche"); + /* + * Initialize the SLRU facility. In EXEC_BACKEND builds, the + * shmem_startup_hook is called in the postmaster and in each backend, but + * we only need to generate the LWLock tranches once. Note that these + * tranche ID variables are not used by SimpleLruInit() when + * IsUnderPostmaster is true. + */ + if (!IsUnderPostmaster) + { + test_tranche_id = LWLockNewTrancheId("test_slru_tranche"); + test_buffer_tranche_id = LWLockNewTrancheId("test_buffer_tranche"); + } TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically; SimpleLruInit(TestSlruCtl, "TestSLRU", From d96c854dfc634212193007ca58f8978bc272d457 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Tue, 9 Sep 2025 14:35:30 -0500 Subject: [PATCH 25/34] Fix documentation for shmem_startup_hook. This section claims that each backend executes the shmem_startup_hook shortly after attaching to shared memory, which is true for EXEC_BACKEND builds, but not for others. This commit adds this important detail. Oversight in commit 964152c476. Reported-by: Sami Imseih Reviewed-by: Sami Imseih Discussion: https://postgr.es/m/CAA5RZ0vEGT1eigGbVt604LkXP6mUPMwPMxQoRCbFny44w%2B9EUQ%40mail.gmail.com Backpatch-through: 17 --- doc/src/sgml/xfunc.sgml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/xfunc.sgml b/doc/src/sgml/xfunc.sgml index da21ef5689184..04bf919b34384 100644 --- a/doc/src/sgml/xfunc.sgml +++ b/doc/src/sgml/xfunc.sgml @@ -3668,11 +3668,14 @@ LWLockRelease(AddinShmemInitLock); shmem_startup_hook provides a convenient place for the initialization code, but it is not strictly required that all such code - be placed in this hook. Each backend will execute the registered - shmem_startup_hook shortly after it attaches to shared - memory. Note that add-ins should still acquire + be placed in this hook. On Windows (and anywhere else where + EXEC_BACKEND is defined), each backend executes the + registered shmem_startup_hook shortly after it + attaches to shared memory, so add-ins should still acquire AddinShmemInitLock within this hook, as shown in the - example above. + example above. On other platforms, only the postmaster process executes + the shmem_startup_hook, and each backend automatically + inherits the pointers to shared memory. From 8c8f7b199d9095dbc2e101a4614043b5ae13bde3 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 10 Sep 2025 07:23:05 +0900 Subject: [PATCH 26/34] Fix leak with SMgrRelations in startup process The startup process does not process shared invalidation messages, only sending them, and never calls AtEOXact_SMgr() which clean up any unpinned SMgrRelations. Hence, it is never able to free SMgrRelations on a periodic basis, bloating its hashtable over time. Like the checkpointer and the bgwriter, this commit takes a conservative approach by freeing periodically SMgrRelations when replaying a checkpoint record, either online or shutdown, so as the startup process has a way to perform a periodic cleanup. Issue caused by 21d9c3ee4ef7, so backpatch down to v17. Author: Jingtang Zhang Reviewed-by: Yuhang Qiu Discussion: https://postgr.es/m/28C687D4-F335-417E-B06C-6612A0BD5A10@gmail.com Backpatch-through: 17 --- src/backend/access/transam/xlog.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7ffb217915190..0baf0ac6160af 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8385,6 +8385,14 @@ xlog_redo(XLogReaderState *record) checkPoint.ThisTimeLineID, replayTLI))); RecoveryRestartPoint(&checkPoint, record); + + /* + * After replaying a checkpoint record, free all smgr objects. + * Otherwise we would never do so for dropped relations, as the + * startup does not process shared invalidation messages or call + * AtEOXact_SMgr(). + */ + smgrdestroyall(); } else if (info == XLOG_CHECKPOINT_ONLINE) { @@ -8438,6 +8446,14 @@ xlog_redo(XLogReaderState *record) checkPoint.ThisTimeLineID, replayTLI))); RecoveryRestartPoint(&checkPoint, record); + + /* + * After replaying a checkpoint record, free all smgr objects. + * Otherwise we would never do so for dropped relations, as the + * startup does not process shared invalidation messages or call + * AtEOXact_SMgr(). + */ + smgrdestroyall(); } else if (info == XLOG_OVERWRITE_CONTRECORD) { From b1187266e077265cb061cbedd502e94179dc7b21 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 10 Sep 2025 11:20:46 +0900 Subject: [PATCH 27/34] Replace callers of dynahash.h's my_log() by equivalent in pg_bitutils.h All the calls replaced by this commit use 4-byte integers for their variables used in input of my_log2(). Hence, the limit against too-large inputs does not really apply. Thresholds are also applied, as of: - In nodeAgg.c, the number of partitions is limited by HASHAGG_MAX_PARTITIONS. - In nodeHash.c, ExecChooseHashTableSize() caps its maximum number of buckets based on HashJoinTuple and palloc() allocation limit. - In worker.c, the number of subxacts tracked by ApplySubXactData uses uint32, making pg_ceil_log2_64() safe to use directly. Several approaches have been discussed, like an integration with thresholds in pg_bitutils.h, but it was found confusing. This uses Dean's idea, which gives a simpler result than what I came up with to be able to remove dynahash.h. dynahash.h will be removed in a follow-up commit, removing some duplication with the ceil log2 routines. Reviewed-by: Peter Eisentraut Reviewed-by: Dean Rasheed Discussion: https://postgr.es/m/CAEZATCUJPQD_7sC-wErak2CQGNa6bj2hY-mr8wsBki=kX7f2_A@mail.gmail.com --- src/backend/executor/nodeAgg.c | 3 +-- src/backend/executor/nodeHash.c | 7 +++---- src/backend/replication/logical/worker.c | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 377e016d73225..a4f3d30f307cc 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -267,7 +267,6 @@ #include "utils/acl.h" #include "utils/builtins.h" #include "utils/datum.h" -#include "utils/dynahash.h" #include "utils/expandeddatum.h" #include "utils/injection_point.h" #include "utils/logtape.h" @@ -2115,7 +2114,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize, npartitions = (int) dpartitions; /* ceil(log2(npartitions)) */ - partition_bits = my_log2(npartitions); + partition_bits = pg_ceil_log2_32(npartitions); /* make sure that we don't exhaust the hash bits */ if (partition_bits + used_bits >= 32) diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 8d2201ab67fa5..a3415db4e20f5 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -36,7 +36,6 @@ #include "executor/nodeHashjoin.h" #include "miscadmin.h" #include "port/pg_bitutils.h" -#include "utils/dynahash.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/syscache.h" @@ -340,7 +339,7 @@ MultiExecParallelHash(HashState *node) */ hashtable->curbatch = -1; hashtable->nbuckets = pstate->nbuckets; - hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets); hashtable->totalTuples = pstate->total_tuples; /* @@ -480,7 +479,7 @@ ExecHashTableCreate(HashState *state) &nbuckets, &nbatch, &num_skew_mcvs); /* nbuckets must be a power of 2 */ - log2_nbuckets = my_log2(nbuckets); + log2_nbuckets = pg_ceil_log2_32(nbuckets); Assert(nbuckets == (1 << log2_nbuckets)); /* @@ -3499,7 +3498,7 @@ ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable, int batchno) dsa_get_address(hashtable->area, hashtable->batches[batchno].shared->buckets); hashtable->nbuckets = hashtable->parallel_state->nbuckets; - hashtable->log2_nbuckets = my_log2(hashtable->nbuckets); + hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets); hashtable->current_chunk = NULL; hashtable->current_chunk_shared = InvalidDsaPointer; hashtable->batches[batchno].at_least_one_chunk = false; diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index b3cac1023731a..ee6ac22329fdc 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -276,7 +276,6 @@ #include "storage/procarray.h" #include "tcop/tcopprot.h" #include "utils/acl.h" -#include "utils/dynahash.h" #include "utils/guc.h" #include "utils/inval.h" #include "utils/lsyscache.h" @@ -5115,7 +5114,7 @@ subxact_info_read(Oid subid, TransactionId xid) len = sizeof(SubXactInfo) * subxact_data.nsubxacts; /* we keep the maximum as a power of 2 */ - subxact_data.nsubxacts_max = 1 << my_log2(subxact_data.nsubxacts); + subxact_data.nsubxacts_max = 1 << pg_ceil_log2_32(subxact_data.nsubxacts); /* * Allocate subxact information in the logical streaming context. We need From e6da68a6e1d60a037b63a9c9ed36e5ef0a996769 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 10 Sep 2025 14:11:50 +0900 Subject: [PATCH 28/34] Remove dynahash.h All the callers of my_log2() are now limited inside dynahash.c, so let's remove this header. The same capability is provided by pg_bitutils.h already. Discussion: https://postgr.es/m/CAEZATCUJPQD_7sC-wErak2CQGNa6bj2hY-mr8wsBki=kX7f2_A@mail.gmail.com --- src/backend/utils/hash/dynahash.c | 4 ++-- src/include/utils/dynahash.h | 20 -------------------- 2 files changed, 2 insertions(+), 22 deletions(-) delete mode 100644 src/include/utils/dynahash.h diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 1aeee5be42acd..ac94b9e93c6e3 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -102,7 +102,6 @@ #include "port/pg_bitutils.h" #include "storage/shmem.h" #include "storage/spin.h" -#include "utils/dynahash.h" #include "utils/memutils.h" @@ -281,6 +280,7 @@ static bool init_htab(HTAB *hashp, int64 nelem); pg_noreturn static void hash_corrupted(HTAB *hashp); static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr); +static int my_log2(int64 num); static int64 next_pow2_int64(int64 num); static int next_pow2_int(int64 num); static void register_seq_scan(HTAB *hashp); @@ -1813,7 +1813,7 @@ hash_corrupted(HTAB *hashp) } /* calculate ceil(log base 2) of num */ -int +static int my_log2(int64 num) { /* diff --git a/src/include/utils/dynahash.h b/src/include/utils/dynahash.h deleted file mode 100644 index a4362d3f65e59..0000000000000 --- a/src/include/utils/dynahash.h +++ /dev/null @@ -1,20 +0,0 @@ -/*------------------------------------------------------------------------- - * - * dynahash.h - * POSTGRES dynahash.h file definitions - * - * - * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * IDENTIFICATION - * src/include/utils/dynahash.h - * - *------------------------------------------------------------------------- - */ -#ifndef DYNAHASH_H -#define DYNAHASH_H - -extern int my_log2(int64 num); - -#endif /* DYNAHASH_H */ From 33eec809402bfbf3eb0d01ad5b023d3d05fcb3bc Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 10 Sep 2025 11:49:53 +0200 Subject: [PATCH 29/34] Fix CREATE TABLE LIKE with not-valid check constraint In CREATE TABLE ... LIKE, any check constraints copied from the source table should be set to valid if they are ENFORCED (the default). Bug introduced in commit ca87c415e2f. Author: jian he Discussion: https://www.postgresql.org/message-id/CACJufxH%3D%2Bod8Wy0P4L3_GpapNwLUP3oAes5UFRJ7yTxrM_M5kg%40mail.gmail.com --- src/backend/parser/parse_utilcmd.c | 3 +-- src/test/regress/expected/create_table_like.out | 8 ++++++++ src/test/regress/sql/create_table_like.sql | 1 + 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index afcf54169c3b3..e96b38a59d503 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1461,7 +1461,6 @@ expandTableLikeClause(RangeVar *heapRel, TableLikeClause *table_like_clause) char *ccname = constr->check[ccnum].ccname; char *ccbin = constr->check[ccnum].ccbin; bool ccenforced = constr->check[ccnum].ccenforced; - bool ccvalid = constr->check[ccnum].ccvalid; bool ccnoinherit = constr->check[ccnum].ccnoinherit; Node *ccbin_node; bool found_whole_row; @@ -1492,7 +1491,7 @@ expandTableLikeClause(RangeVar *heapRel, TableLikeClause *table_like_clause) n->conname = pstrdup(ccname); n->location = -1; n->is_enforced = ccenforced; - n->initially_valid = ccvalid; + n->initially_valid = ccenforced; /* sic */ n->is_no_inherit = ccnoinherit; n->raw_expr = NULL; n->cooked_expr = nodeToString(ccbin_node); diff --git a/src/test/regress/expected/create_table_like.out b/src/test/regress/expected/create_table_like.out index 29a779c2e9072..d3c35c148475d 100644 --- a/src/test/regress/expected/create_table_like.out +++ b/src/test/regress/expected/create_table_like.out @@ -320,6 +320,7 @@ DROP TABLE inhz; -- including storage and comments CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) ENFORCED PRIMARY KEY, b text CHECK (length(b) > 100) NOT ENFORCED); +ALTER TABLE ctlt1 ADD CONSTRAINT cc CHECK (length(b) > 100) NOT VALID; CREATE INDEX ctlt1_b_key ON ctlt1 (b); CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b)); CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1; @@ -378,6 +379,7 @@ SELECT conname, description FROM pg_description, pg_constraint c WHERE classoid CREATE TABLE ctlt1_inh (LIKE ctlt1 INCLUDING CONSTRAINTS INCLUDING COMMENTS) INHERITS (ctlt1); NOTICE: merging column "a" with inherited definition NOTICE: merging column "b" with inherited definition +NOTICE: merging constraint "cc" with inherited definition NOTICE: merging constraint "ctlt1_a_check" with inherited definition NOTICE: merging constraint "ctlt1_b_check" with inherited definition \d+ ctlt1_inh @@ -387,6 +389,7 @@ NOTICE: merging constraint "ctlt1_b_check" with inherited definition a | text | | not null | | main | | A b | text | | | | extended | | B Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Not-null constraints: @@ -409,6 +412,7 @@ NOTICE: merging multiple inherited definitions of column "a" b | text | | | | extended | | c | text | | | | external | | Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED "ctlt3_a_check" CHECK (length(a) < 5) @@ -430,6 +434,7 @@ NOTICE: merging column "a" with inherited definition Indexes: "ctlt13_like_expr_idx" btree ((a || c)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED "ctlt3_a_check" CHECK (length(a) < 5) @@ -456,6 +461,7 @@ Indexes: "ctlt_all_b_idx" btree (b) "ctlt_all_expr_idx" btree ((a || b)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Statistics objects: @@ -499,6 +505,7 @@ Indexes: "pg_attrdef_b_idx" btree (b) "pg_attrdef_expr_idx" btree ((a || b)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Statistics objects: @@ -524,6 +531,7 @@ Indexes: "ctlt1_b_idx" btree (b) "ctlt1_expr_idx" btree ((a || b)) Check constraints: + "cc" CHECK (length(b) > 100) "ctlt1_a_check" CHECK (length(a) > 2) "ctlt1_b_check" CHECK (length(b) > 100) NOT ENFORCED Statistics objects: diff --git a/src/test/regress/sql/create_table_like.sql b/src/test/regress/sql/create_table_like.sql index bf8702116a74b..93389b57dbf95 100644 --- a/src/test/regress/sql/create_table_like.sql +++ b/src/test/regress/sql/create_table_like.sql @@ -130,6 +130,7 @@ DROP TABLE inhz; -- including storage and comments CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) ENFORCED PRIMARY KEY, b text CHECK (length(b) > 100) NOT ENFORCED); +ALTER TABLE ctlt1 ADD CONSTRAINT cc CHECK (length(b) > 100) NOT VALID; CREATE INDEX ctlt1_b_key ON ctlt1 (b); CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b)); CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1; From 9016fa7e3bcde8ae4c3d63c707143af147486a10 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Wed, 10 Sep 2025 11:21:12 -0500 Subject: [PATCH 30/34] meson: Build numeric.c with -ftree-vectorize. autoconf builds have compiled this file with -ftree-vectorize since commit 8870917623, but meson builds seem to have missed the memo. Reviewed-by: Jeff Davis Discussion: https://postgr.es/m/aL85CeasM51-0D1h%40nathan Backpatch-through: 16 --- src/backend/utils/adt/meson.build | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index dac372c3bea3b..12fa0c209127c 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -1,5 +1,15 @@ # Copyright (c) 2022-2025, PostgreSQL Global Development Group +# Some code in numeric.c benefits from auto-vectorization +numeric_backend_lib = static_library('numeric_backend_lib', + 'numeric.c', + dependencies: backend_build_deps, + kwargs: internal_lib_args, + c_args: vectorize_cflags, +) + +backend_link_with += numeric_backend_lib + backend_sources += files( 'acl.c', 'amutils.c', @@ -61,7 +71,6 @@ backend_sources += files( 'network_gist.c', 'network_selfuncs.c', 'network_spgist.c', - 'numeric.c', 'numutils.c', 'oid.c', 'oracle_compat.c', From abdeacdb0920d94dec7500d09f6f29fbb2f6310d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 10 Sep 2025 16:05:03 -0400 Subject: [PATCH 31/34] Fix memory leakage in nodeSubplan.c. If the hash functions used for hashing tuples leaked any memory, we failed to clean that up, resulting in query-lifespan memory leakage in queries using hashed subplans. One way that could happen is if the values being hashed require de-toasting, since most of our hash functions don't trouble to clean up de-toasted inputs. Prior to commit bf6c614a2, this leakage was largely masked because TupleHashTableMatch would reset hashtable->tempcxt (via execTuplesMatch). But it doesn't do that anymore, and that's not really the right place for this anyway: doing it there could reset the tempcxt many times per hash lookup, or not at all. Instead put reset calls into ExecHashSubPlan and buildSubPlanHash. Along the way to that, rearrange ExecHashSubPlan so that there's just one place to call MemoryContextReset instead of several. This amounts to accepting the de-facto API spec that the caller of the TupleHashTable routines is responsible for resetting the tempcxt adequately often. Although the other callers seem to get this right, it was not documented anywhere, so add a comment about it. Bug: #19040 Reported-by: Haiyang Li Author: Haiyang Li Reviewed-by: Fei Changhong Reviewed-by: Tom Lane Discussion: https://postgr.es/m/19040-c9b6073ef814f48c@postgresql.org Backpatch-through: 13 --- src/backend/executor/execGrouping.c | 6 +++ src/backend/executor/nodeSubplan.c | 70 +++++++++++------------------ 2 files changed, 33 insertions(+), 43 deletions(-) diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index b540074935386..75087204f0c69 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -156,6 +156,12 @@ execTuplesHashPrepare(int numCols, * * Note that the keyColIdx, hashfunctions, and collations arrays must be * allocated in storage that will live as long as the hashtable does. + * + * LookupTupleHashEntry, FindTupleHashEntry, and related functions may leak + * memory in the tempcxt. It is caller's responsibility to reset that context + * reasonably often, typically once per tuple. (We do it that way, rather + * than managing an extra context within the hashtable, because in many cases + * the caller can specify a tempcxt that it needs to reset per-tuple anyway.) */ TupleHashTable BuildTupleHashTable(PlanState *parent, diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index f7f6fc2da0b95..8e55dcc159b0b 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -102,6 +102,7 @@ ExecHashSubPlan(SubPlanState *node, ExprContext *econtext, bool *isNull) { + bool result = false; SubPlan *subplan = node->subplan; PlanState *planstate = node->planstate; TupleTableSlot *slot; @@ -132,14 +133,6 @@ ExecHashSubPlan(SubPlanState *node, node->projLeft->pi_exprContext = econtext; slot = ExecProject(node->projLeft); - /* - * Note: because we are typically called in a per-tuple context, we have - * to explicitly clear the projected tuple before returning. Otherwise, - * we'll have a double-free situation: the per-tuple context will probably - * be reset before we're called again, and then the tuple slot will think - * it still needs to free the tuple. - */ - /* * If the LHS is all non-null, probe for an exact match in the main hash * table. If we find one, the result is TRUE. Otherwise, scan the @@ -161,19 +154,10 @@ ExecHashSubPlan(SubPlanState *node, slot, node->cur_eq_comp, node->lhs_hash_expr) != NULL) - { - ExecClearTuple(slot); - return BoolGetDatum(true); - } - if (node->havenullrows && - findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + result = true; + else if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } - ExecClearTuple(slot); - return BoolGetDatum(false); } /* @@ -186,34 +170,31 @@ ExecHashSubPlan(SubPlanState *node, * aren't provably unequal to the LHS; if so, the result is UNKNOWN. * Otherwise, the result is FALSE. */ - if (node->hashnulls == NULL) - { - ExecClearTuple(slot); - return BoolGetDatum(false); - } - if (slotAllNulls(slot)) - { - ExecClearTuple(slot); + else if (node->hashnulls == NULL) + /* just return FALSE */ ; + else if (slotAllNulls(slot)) *isNull = true; - return BoolGetDatum(false); - } /* Scan partly-null table first, since more likely to get a match */ - if (node->havenullrows && - findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + else if (node->havenullrows && + findPartialMatch(node->hashnulls, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } - if (node->havehashrows && - findPartialMatch(node->hashtable, slot, node->cur_eq_funcs)) - { - ExecClearTuple(slot); + else if (node->havehashrows && + findPartialMatch(node->hashtable, slot, node->cur_eq_funcs)) *isNull = true; - return BoolGetDatum(false); - } + + /* + * Note: because we are typically called in a per-tuple context, we have + * to explicitly clear the projected tuple before returning. Otherwise, + * we'll have a double-free situation: the per-tuple context will probably + * be reset before we're called again, and then the tuple slot will think + * it still needs to free the tuple. + */ ExecClearTuple(slot); - return BoolGetDatum(false); + + /* Also must reset the hashtempcxt after each hashtable lookup. */ + MemoryContextReset(node->hashtempcxt); + + return BoolGetDatum(result); } /* @@ -642,6 +623,9 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) * during ExecProject. */ ResetExprContext(innerecontext); + + /* Also must reset the hashtempcxt after each hashtable lookup. */ + MemoryContextReset(node->hashtempcxt); } /* From bdc6cfcd12f5c95799328e05aa4bfa75cfe3e79f Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 10 Sep 2025 16:15:08 -0400 Subject: [PATCH 32/34] Eliminate duplicative hashtempcxt in nodeSubplan.c. Instead of building a separate memory context that's used just for running hash functions, make the hash functions run in the per-tuple context of the node's innerecontext. This saves a little space at runtime, and it avoids needing to reset two contexts instead of one inside buildSubPlanHash's main loop. This largely reverts commit 133924e13. That's safe to do now because bf6c614a2 decoupled the evaluation context used by TupleHashTableMatch from that used for hash function evaluation, so that there's no longer a risk of resetting the innerecontext too soon. Per discussion of bug #19040, although this is not directly a fix for that. Author: Tom Lane Reviewed-by: Haiyang Li Reviewed-by: Fei Changhong Discussion: https://postgr.es/m/19040-c9b6073ef814f48c@postgresql.org --- src/backend/executor/nodeSubplan.c | 19 +++++-------------- src/include/nodes/execnodes.h | 1 - 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index 8e55dcc159b0b..53fb56f7388e8 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -191,8 +191,8 @@ ExecHashSubPlan(SubPlanState *node, */ ExecClearTuple(slot); - /* Also must reset the hashtempcxt after each hashtable lookup. */ - MemoryContextReset(node->hashtempcxt); + /* Also must reset the innerecontext after each hashtable lookup. */ + ResetExprContext(node->innerecontext); return BoolGetDatum(result); } @@ -529,7 +529,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) 0, node->planstate->state->es_query_cxt, node->hashtablecxt, - node->hashtempcxt, + innerecontext->ecxt_per_tuple_memory, false); if (!subplan->unknownEqFalse) @@ -558,7 +558,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) 0, node->planstate->state->es_query_cxt, node->hashtablecxt, - node->hashtempcxt, + innerecontext->ecxt_per_tuple_memory, false); } else @@ -620,12 +620,9 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) /* * Reset innerecontext after each inner tuple to free any memory used - * during ExecProject. + * during ExecProject and hashtable lookup. */ ResetExprContext(innerecontext); - - /* Also must reset the hashtempcxt after each hashtable lookup. */ - MemoryContextReset(node->hashtempcxt); } /* @@ -842,7 +839,6 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) sstate->hashtable = NULL; sstate->hashnulls = NULL; sstate->hashtablecxt = NULL; - sstate->hashtempcxt = NULL; sstate->innerecontext = NULL; sstate->keyColIdx = NULL; sstate->tab_eq_funcoids = NULL; @@ -898,11 +894,6 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) AllocSetContextCreate(CurrentMemoryContext, "Subplan HashTable Context", ALLOCSET_DEFAULT_SIZES); - /* and a small one for the hash tables to use as temp storage */ - sstate->hashtempcxt = - AllocSetContextCreate(CurrentMemoryContext, - "Subplan HashTable Temp Context", - ALLOCSET_SMALL_SIZES); /* and a short-lived exprcontext for function evaluation */ sstate->innerecontext = CreateExprContext(estate); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index de782014b2d41..71857feae4823 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1020,7 +1020,6 @@ typedef struct SubPlanState bool havehashrows; /* true if hashtable is not empty */ bool havenullrows; /* true if hashnulls is not empty */ MemoryContext hashtablecxt; /* memory context containing hash tables */ - MemoryContext hashtempcxt; /* temp memory context for hash tables */ ExprContext *innerecontext; /* econtext for computing inner tuples */ int numCols; /* number of columns being hashed */ /* each of the remaining fields is an array of length numCols: */ From 09036dc71c682b0bf7234ed39c1429ed99fbe442 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 10 Sep 2025 17:51:24 -0400 Subject: [PATCH 33/34] Avoid faulty alignment of Datums in build_sorted_items(). If sizeof(Pointer) is 4 then sizeof(SortItem) will be 12, so that if data->numrows is odd then we placed the values array at a location that's not a multiple of 8. That was fine when sizeof(Datum) was also 4, but in the wake of commit 2a600a93c it makes some alignment-picky machines unhappy. (You need a 32-bit machine that nonetheless expects 8-byte alignment of 8-byte quantities, which is an odd-seeming combination but it does exist outside the Intel universe.) To fix, MAXALIGN the space allocated to the SortItem array. In passing, let's make the "len" variable be Size not int, just for paranoia's sake. This code was arguably not too safe even before 2a600a93c, but at present I don't see a strong argument for back-patching. Reported-by: Tomas Vondra Author: Tom Lane Discussion: https://postgr.es/m/87036018-8d70-40ad-a0ac-192b07bd7b04@vondra.me --- src/backend/statistics/extended_stats.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index af0b99243c614..3c3d2d315c6f4 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -986,10 +986,9 @@ build_sorted_items(StatsBuildData *data, int *nitems, { int i, j, - len, nrows; int nvalues = data->numrows * numattrs; - + Size len; SortItem *items; Datum *values; bool *isnull; @@ -997,14 +996,16 @@ build_sorted_items(StatsBuildData *data, int *nitems, int *typlen; /* Compute the total amount of memory we need (both items and values). */ - len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool)); + len = MAXALIGN(data->numrows * sizeof(SortItem)) + + nvalues * (sizeof(Datum) + sizeof(bool)); /* Allocate the memory and split it into the pieces. */ ptr = palloc0(len); /* items to sort */ items = (SortItem *) ptr; - ptr += data->numrows * sizeof(SortItem); + /* MAXALIGN ensures that the following Datums are suitably aligned */ + ptr += MAXALIGN(data->numrows * sizeof(SortItem)); /* values and null flags */ values = (Datum *) ptr; From c88ce73eda2e0a818d730c5b72475ef99cc9c4cf Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 11 Sep 2025 10:15:33 +0900 Subject: [PATCH 34/34] Fix incorrect file reference in guc.h GucSource_Names was documented as being in guc.c, but since 0a20ff54f5e6 it is located in guc_tables.c. The reference to the location of GucSource_Names is important, as GucSource needs to be kept in sync with GucSource_Names. Author: David G. Johnston Discussion: https://postgr.es/m/CAKFQuwYPgAHWPYjPzK7iXzhSZ6MKR8w20_Nz7ZXpOvx=kZbs7A@mail.gmail.com Backpatch-through: 16 --- src/include/utils/guc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 756e80a2c2fcc..f21ec37da8933 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -106,7 +106,7 @@ typedef enum * will show as "default" in pg_settings. If there is a specific reason not * to want that, use source == PGC_S_OVERRIDE. * - * NB: see GucSource_Names in guc.c if you change this. + * NB: see GucSource_Names in guc_tables.c if you change this. */ typedef enum {