Skip to content

Commit 37d10c5

Browse files
committed
Fix subtransaction cleanup after an outer-subtransaction portal fails.
Formerly, we treated only portals created in the current subtransaction as having failed during subtransaction abort. However, if the error occurred while running a portal created in an outer subtransaction (ie, a cursor declared before the last savepoint), that has to be considered broken too. To allow reliable detection of which ones those are, add a bookkeeping field to struct Portal that tracks the innermost subtransaction in which each portal has actually been executed. (Without this, we'd end up failing portals containing functions that had called the subtransaction, thereby breaking plpgsql exception blocks completely.) In addition, when we fail an outer-subtransaction Portal, transfer its resources into the subtransaction's resource owner, so that they're released early in cleanup of the subxact. This fixes a problem reported by Jim Nasby in which a function executed in an outer-subtransaction cursor could cause an Assert failure or crash by referencing a relation created within the inner subtransaction. The proximate cause of the Assert failure is that AtEOSubXact_RelationCache assumed it could blow away a relcache entry without first checking that the entry had zero refcount. That was a bad idea on its own terms, so add such a check there, and to the similar coding in AtEOXact_RelationCache. This provides an independent safety measure in case there are still ways to provoke the situation despite the Portal-level changes. This has been broken since subtransactions were invented, so back-patch to all supported branches. Tom Lane and Michael Paquier
1 parent 7f7fd9b commit 37d10c5

File tree

8 files changed

+203
-29
lines changed

8 files changed

+203
-29
lines changed

src/backend/access/transam/xact.c

+1
Original file line numberDiff line numberDiff line change
@@ -4352,6 +4352,7 @@ AbortSubTransaction(void)
43524352
AfterTriggerEndSubXact(false);
43534353
AtSubAbort_Portals(s->subTransactionId,
43544354
s->parent->subTransactionId,
4355+
s->curTransactionOwner,
43554356
s->parent->curTransactionOwner);
43564357
AtEOSubXact_LargeObject(false, s->subTransactionId,
43574358
s->parent->subTransactionId);

src/backend/commands/portalcmds.c

+1-5
Original file line numberDiff line numberDiff line change
@@ -335,11 +335,7 @@ PersistHoldablePortal(Portal portal)
335335
/*
336336
* Check for improper portal use, and mark portal active.
337337
*/
338-
if (portal->status != PORTAL_READY)
339-
ereport(ERROR,
340-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
341-
errmsg("portal \"%s\" cannot be run", portal->name)));
342-
portal->status = PORTAL_ACTIVE;
338+
MarkPortalActive(portal);
343339

344340
/*
345341
* Set up global portal context pointers.

src/backend/tcop/pquery.c

+2-10
Original file line numberDiff line numberDiff line change
@@ -734,11 +734,7 @@ PortalRun(Portal portal, long count, bool isTopLevel,
734734
/*
735735
* Check for improper portal use, and mark portal active.
736736
*/
737-
if (portal->status != PORTAL_READY)
738-
ereport(ERROR,
739-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
740-
errmsg("portal \"%s\" cannot be run", portal->name)));
741-
portal->status = PORTAL_ACTIVE;
737+
MarkPortalActive(portal);
742738

743739
/*
744740
* Set up global portal context pointers.
@@ -1398,11 +1394,7 @@ PortalRunFetch(Portal portal,
13981394
/*
13991395
* Check for improper portal use, and mark portal active.
14001396
*/
1401-
if (portal->status != PORTAL_READY)
1402-
ereport(ERROR,
1403-
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1404-
errmsg("portal \"%s\" cannot be run", portal->name)));
1405-
portal->status = PORTAL_ACTIVE;
1397+
MarkPortalActive(portal);
14061398

14071399
/*
14081400
* Set up global portal context pointers.

src/backend/utils/cache/relcache.c

+32-3
Original file line numberDiff line numberDiff line change
@@ -1957,7 +1957,9 @@ RelationClearRelation(Relation relation, bool rebuild)
19571957
{
19581958
/*
19591959
* As per notes above, a rel to be rebuilt MUST have refcnt > 0; while of
1960-
* course it would be a bad idea to blow away one with nonzero refcnt.
1960+
* course it would be an equally bad idea to blow away one with nonzero
1961+
* refcnt, since that would leave someone somewhere with a dangling
1962+
* pointer. All callers are expected to have verified that this holds.
19611963
*/
19621964
Assert(rebuild ?
19631965
!RelationHasReferenceCountZero(relation) :
@@ -2559,11 +2561,25 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
25592561
{
25602562
if (isCommit)
25612563
relation->rd_createSubid = InvalidSubTransactionId;
2562-
else
2564+
else if (RelationHasReferenceCountZero(relation))
25632565
{
25642566
RelationClearRelation(relation, false);
25652567
return;
25662568
}
2569+
else
2570+
{
2571+
/*
2572+
* Hmm, somewhere there's a (leaked?) reference to the relation.
2573+
* We daren't remove the entry for fear of dereferencing a
2574+
* dangling pointer later. Bleat, and mark it as not belonging to
2575+
* the current transaction. Hopefully it'll get cleaned up
2576+
* eventually. This must be just a WARNING to avoid
2577+
* error-during-error-recovery loops.
2578+
*/
2579+
relation->rd_createSubid = InvalidSubTransactionId;
2580+
elog(WARNING, "cannot remove relcache entry for \"%s\" because it has nonzero refcount",
2581+
RelationGetRelationName(relation));
2582+
}
25672583
}
25682584

25692585
/*
@@ -2652,11 +2668,24 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
26522668
{
26532669
if (isCommit)
26542670
relation->rd_createSubid = parentSubid;
2655-
else
2671+
else if (RelationHasReferenceCountZero(relation))
26562672
{
26572673
RelationClearRelation(relation, false);
26582674
return;
26592675
}
2676+
else
2677+
{
2678+
/*
2679+
* Hmm, somewhere there's a (leaked?) reference to the relation.
2680+
* We daren't remove the entry for fear of dereferencing a
2681+
* dangling pointer later. Bleat, and transfer it to the parent
2682+
* subtransaction so we can try again later. This must be just a
2683+
* WARNING to avoid error-during-error-recovery loops.
2684+
*/
2685+
relation->rd_createSubid = parentSubid;
2686+
elog(WARNING, "cannot remove relcache entry for \"%s\" because it has nonzero refcount",
2687+
RelationGetRelationName(relation));
2688+
}
26602689
}
26612690

26622691
/*

src/backend/utils/mmgr/portalmem.c

+74-8
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ CreatePortal(const char *name, bool allowDup, bool dupSilent)
232232
portal->status = PORTAL_NEW;
233233
portal->cleanup = PortalCleanup;
234234
portal->createSubid = GetCurrentSubTransactionId();
235+
portal->activeSubid = portal->createSubid;
235236
portal->strategy = PORTAL_MULTI_QUERY;
236237
portal->cursorOptions = CURSOR_OPT_NO_SCROLL;
237238
portal->atStart = true;
@@ -402,6 +403,25 @@ UnpinPortal(Portal portal)
402403
portal->portalPinned = false;
403404
}
404405

406+
/*
407+
* MarkPortalActive
408+
* Transition a portal from READY to ACTIVE state.
409+
*
410+
* NOTE: never set portal->status = PORTAL_ACTIVE directly; call this instead.
411+
*/
412+
void
413+
MarkPortalActive(Portal portal)
414+
{
415+
/* For safety, this is a runtime test not just an Assert */
416+
if (portal->status != PORTAL_READY)
417+
ereport(ERROR,
418+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
419+
errmsg("portal \"%s\" cannot be run", portal->name)));
420+
/* Perform the state transition */
421+
portal->status = PORTAL_ACTIVE;
422+
portal->activeSubid = GetCurrentSubTransactionId();
423+
}
424+
405425
/*
406426
* MarkPortalDone
407427
* Transition a portal from ACTIVE to DONE state.
@@ -690,6 +710,7 @@ PreCommit_Portals(bool isPrepare)
690710
* not belonging to this transaction.
691711
*/
692712
portal->createSubid = InvalidSubTransactionId;
713+
portal->activeSubid = InvalidSubTransactionId;
693714

694715
/* Report we changed state */
695716
result = true;
@@ -836,8 +857,8 @@ AtCleanup_Portals(void)
836857
/*
837858
* Pre-subcommit processing for portals.
838859
*
839-
* Reassign the portals created in the current subtransaction to the parent
840-
* subtransaction.
860+
* Reassign portals created or used in the current subtransaction to the
861+
* parent subtransaction.
841862
*/
842863
void
843864
AtSubCommit_Portals(SubTransactionId mySubid,
@@ -859,21 +880,24 @@ AtSubCommit_Portals(SubTransactionId mySubid,
859880
if (portal->resowner)
860881
ResourceOwnerNewParent(portal->resowner, parentXactOwner);
861882
}
883+
if (portal->activeSubid == mySubid)
884+
portal->activeSubid = parentSubid;
862885
}
863886
}
864887

865888
/*
866889
* Subtransaction abort handling for portals.
867890
*
868-
* Deactivate portals created during the failed subtransaction.
869-
* Note that per AtSubCommit_Portals, this will catch portals created
891+
* Deactivate portals created or used during the failed subtransaction.
892+
* Note that per AtSubCommit_Portals, this will catch portals created/used
870893
* in descendants of the subtransaction too.
871894
*
872895
* We don't destroy any portals here; that's done in AtSubCleanup_Portals.
873896
*/
874897
void
875898
AtSubAbort_Portals(SubTransactionId mySubid,
876899
SubTransactionId parentSubid,
900+
ResourceOwner myXactOwner,
877901
ResourceOwner parentXactOwner)
878902
{
879903
HASH_SEQ_STATUS status;
@@ -885,16 +909,58 @@ AtSubAbort_Portals(SubTransactionId mySubid,
885909
{
886910
Portal portal = hentry->portal;
887911

912+
/* Was it created in this subtransaction? */
888913
if (portal->createSubid != mySubid)
914+
{
915+
/* No, but maybe it was used in this subtransaction? */
916+
if (portal->activeSubid == mySubid)
917+
{
918+
/* Maintain activeSubid until the portal is removed */
919+
portal->activeSubid = parentSubid;
920+
921+
/*
922+
* Upper-level portals that failed while running in this
923+
* subtransaction must be forced into FAILED state, for the
924+
* same reasons discussed below.
925+
*
926+
* We assume we can get away without forcing upper-level READY
927+
* portals to fail, even if they were run and then suspended.
928+
* In theory a suspended upper-level portal could have
929+
* acquired some references to objects that are about to be
930+
* destroyed, but there should be sufficient defenses against
931+
* such cases: the portal's original query cannot contain such
932+
* references, and any references within, say, cached plans of
933+
* PL/pgSQL functions are not from active queries and should
934+
* be protected by revalidation logic.
935+
*/
936+
if (portal->status == PORTAL_ACTIVE)
937+
MarkPortalFailed(portal);
938+
939+
/*
940+
* Also, if we failed it during the current subtransaction
941+
* (either just above, or earlier), reattach its resource
942+
* owner to the current subtransaction's resource owner, so
943+
* that any resources it still holds will be released while
944+
* cleaning up this subtransaction. This prevents some corner
945+
* cases wherein we might get Asserts or worse while cleaning
946+
* up objects created during the current subtransaction
947+
* (because they're still referenced within this portal).
948+
*/
949+
if (portal->status == PORTAL_FAILED && portal->resowner)
950+
{
951+
ResourceOwnerNewParent(portal->resowner, myXactOwner);
952+
portal->resowner = NULL;
953+
}
954+
}
955+
/* Done if it wasn't created in this subtransaction */
889956
continue;
957+
}
890958

891959
/*
892960
* Force any live portals of my own subtransaction into FAILED state.
893961
* We have to do this because they might refer to objects created or
894-
* changed in the failed subtransaction, leading to crashes if
895-
* execution is resumed, or even if we just try to run ExecutorEnd.
896-
* (Note we do NOT do this to upper-level portals, since they cannot
897-
* have such references and hence may be able to continue.)
962+
* changed in the failed subtransaction, leading to crashes within
963+
* ExecutorEnd when portalcmds.c tries to close down the portal.
898964
*/
899965
if (portal->status == PORTAL_READY ||
900966
portal->status == PORTAL_ACTIVE)

src/include/utils/portal.h

+15-3
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,15 @@ typedef struct PortalData
119119
MemoryContext heap; /* subsidiary memory for portal */
120120
ResourceOwner resowner; /* resources owned by portal */
121121
void (*cleanup) (Portal portal); /* cleanup hook */
122-
SubTransactionId createSubid; /* the ID of the creating subxact */
123122

124123
/*
125-
* if createSubid is InvalidSubTransactionId, the portal is held over from
126-
* a previous transaction
124+
* State data for remembering which subtransaction(s) the portal was
125+
* created or used in. If the portal is held over from a previous
126+
* transaction, both subxids are InvalidSubTransactionId. Otherwise,
127+
* createSubid is the creating subxact and activeSubid is the last subxact
128+
* in which we ran the portal.
127129
*/
130+
SubTransactionId createSubid; /* the creating subxact */
128131

129132
/* The query or queries the portal will execute */
130133
const char *sourceText; /* text of query (as of 8.4, never NULL) */
@@ -175,6 +178,13 @@ typedef struct PortalData
175178
/* Presentation data, primarily used by the pg_cursors system view */
176179
TimestampTz creation_time; /* time at which this portal was defined */
177180
bool visible; /* include this portal in pg_cursors? */
181+
182+
/*
183+
* This field belongs with createSubid, but in pre-9.5 branches, add it
184+
* at the end to avoid creating an ABI break for extensions that examine
185+
* Portal structs.
186+
*/
187+
SubTransactionId activeSubid; /* the last subxact with activity */
178188
} PortalData;
179189

180190
/*
@@ -201,12 +211,14 @@ extern void AtSubCommit_Portals(SubTransactionId mySubid,
201211
ResourceOwner parentXactOwner);
202212
extern void AtSubAbort_Portals(SubTransactionId mySubid,
203213
SubTransactionId parentSubid,
214+
ResourceOwner myXactOwner,
204215
ResourceOwner parentXactOwner);
205216
extern void AtSubCleanup_Portals(SubTransactionId mySubid);
206217
extern Portal CreatePortal(const char *name, bool allowDup, bool dupSilent);
207218
extern Portal CreateNewPortal(void);
208219
extern void PinPortal(Portal portal);
209220
extern void UnpinPortal(Portal portal);
221+
extern void MarkPortalActive(Portal portal);
210222
extern void MarkPortalDone(Portal portal);
211223
extern void MarkPortalFailed(Portal portal);
212224
extern void PortalDrop(Portal portal, bool isTopCommit);

src/test/regress/expected/transactions.out

+46
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,52 @@ fetch from foo;
613613
(1 row)
614614

615615
abort;
616+
-- Test for proper cleanup after a failure in a cursor portal
617+
-- that was created in an outer subtransaction
618+
CREATE FUNCTION invert(x float8) RETURNS float8 LANGUAGE plpgsql AS
619+
$$ begin return 1/x; end $$;
620+
CREATE FUNCTION create_temp_tab() RETURNS text
621+
LANGUAGE plpgsql AS $$
622+
BEGIN
623+
CREATE TEMP TABLE new_table (f1 float8);
624+
-- case of interest is that we fail while holding an open
625+
-- relcache reference to new_table
626+
INSERT INTO new_table SELECT invert(0.0);
627+
RETURN 'foo';
628+
END $$;
629+
BEGIN;
630+
DECLARE ok CURSOR FOR SELECT * FROM int8_tbl;
631+
DECLARE ctt CURSOR FOR SELECT create_temp_tab();
632+
FETCH ok;
633+
q1 | q2
634+
-----+-----
635+
123 | 456
636+
(1 row)
637+
638+
SAVEPOINT s1;
639+
FETCH ok; -- should work
640+
q1 | q2
641+
-----+------------------
642+
123 | 4567890123456789
643+
(1 row)
644+
645+
FETCH ctt; -- error occurs here
646+
ERROR: division by zero
647+
CONTEXT: PL/pgSQL function invert(double precision) line 1 at RETURN
648+
SQL statement "INSERT INTO new_table SELECT invert(0.0)"
649+
PL/pgSQL function create_temp_tab() line 6 at SQL statement
650+
ROLLBACK TO s1;
651+
FETCH ok; -- should work
652+
q1 | q2
653+
------------------+-----
654+
4567890123456789 | 123
655+
(1 row)
656+
657+
FETCH ctt; -- must be rejected
658+
ERROR: portal "ctt" cannot be run
659+
COMMIT;
660+
DROP FUNCTION create_temp_tab();
661+
DROP FUNCTION invert(x float8);
616662
-- Test for successful cleanup of an aborted transaction at session exit.
617663
-- THIS MUST BE THE LAST TEST IN THIS FILE.
618664
begin;

src/test/regress/sql/transactions.sql

+32
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,38 @@ fetch from foo;
387387

388388
abort;
389389

390+
391+
-- Test for proper cleanup after a failure in a cursor portal
392+
-- that was created in an outer subtransaction
393+
CREATE FUNCTION invert(x float8) RETURNS float8 LANGUAGE plpgsql AS
394+
$$ begin return 1/x; end $$;
395+
396+
CREATE FUNCTION create_temp_tab() RETURNS text
397+
LANGUAGE plpgsql AS $$
398+
BEGIN
399+
CREATE TEMP TABLE new_table (f1 float8);
400+
-- case of interest is that we fail while holding an open
401+
-- relcache reference to new_table
402+
INSERT INTO new_table SELECT invert(0.0);
403+
RETURN 'foo';
404+
END $$;
405+
406+
BEGIN;
407+
DECLARE ok CURSOR FOR SELECT * FROM int8_tbl;
408+
DECLARE ctt CURSOR FOR SELECT create_temp_tab();
409+
FETCH ok;
410+
SAVEPOINT s1;
411+
FETCH ok; -- should work
412+
FETCH ctt; -- error occurs here
413+
ROLLBACK TO s1;
414+
FETCH ok; -- should work
415+
FETCH ctt; -- must be rejected
416+
COMMIT;
417+
418+
DROP FUNCTION create_temp_tab();
419+
DROP FUNCTION invert(x float8);
420+
421+
390422
-- Test for successful cleanup of an aborted transaction at session exit.
391423
-- THIS MUST BE THE LAST TEST IN THIS FILE.
392424

0 commit comments

Comments
 (0)