Skip to content

Commit b270713

Browse files
committed
Fix multiple crasher bugs in partitioned-table replication logic.
apply_handle_tuple_routing(), having detected and reported that the tuple it needed to update didn't exist, tried to update that tuple anyway, leading to a null-pointer dereference. logicalrep_partition_open() failed to ensure that the LogicalRepPartMapEntry it built for a partition was fully independent of that for the partition root, leading to trouble if the root entry was later freed or rebuilt. Meanwhile, on the publisher's side, pgoutput_change() sometimes attempted to apply execute_attr_map_tuple() to a NULL tuple. The first of these was reported by Sergey Bernikov in bug #17055; I found the other two while developing some test cases for this sadly under-tested code. Diagnosis and patch for the first issue by Amit Langote; patches for the others by me; new test cases by me. Back-patch to v13 where this logic came in. Discussion: https://postgr.es/m/17055-9ba800ec8522668b@postgresql.org
1 parent 218b101 commit b270713

File tree

5 files changed

+147
-29
lines changed

5 files changed

+147
-29
lines changed

src/backend/replication/logical/relation.c

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,9 @@ logicalrep_partmap_init(void)
589589
* logicalrep_partition_open
590590
*
591591
* Returned entry reuses most of the values of the root table's entry, save
592-
* the attribute map, which can be different for the partition.
592+
* the attribute map, which can be different for the partition. However,
593+
* we must physically copy all the data, in case the root table's entry
594+
* gets freed/rebuilt.
593595
*
594596
* Note there's no logicalrep_partition_close, because the caller closes the
595597
* component relation.
@@ -625,7 +627,7 @@ logicalrep_partition_open(LogicalRepRelMapEntry *root,
625627

626628
part_entry->partoid = partOid;
627629

628-
/* Remote relation is used as-is from the root entry. */
630+
/* Remote relation is copied as-is from the root entry. */
629631
entry = &part_entry->relmapentry;
630632
entry->remoterel.remoteid = remoterel->remoteid;
631633
entry->remoterel.nspname = pstrdup(remoterel->nspname);
@@ -668,7 +670,12 @@ logicalrep_partition_open(LogicalRepRelMapEntry *root,
668670
}
669671
}
670672
else
671-
entry->attrmap = attrmap;
673+
{
674+
/* Lacking copy_attmap, do this the hard way. */
675+
entry->attrmap = make_attrmap(attrmap->maplen);
676+
memcpy(entry->attrmap->attnums, attrmap->attnums,
677+
attrmap->maplen * sizeof(AttrNumber));
678+
}
672679

673680
entry->updatable = root->updatable;
674681

src/backend/replication/logical/worker.c

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -898,12 +898,13 @@ apply_handle_update_internal(ResultRelInfo *relinfo,
898898
else
899899
{
900900
/*
901-
* The tuple to be updated could not be found.
901+
* The tuple to be updated could not be found. Do nothing except for
902+
* emitting a log message.
902903
*
903-
* TODO what to do here, change the log level to LOG perhaps?
904+
* XXX should this be promoted to ereport(LOG) perhaps?
904905
*/
905906
elog(DEBUG1,
906-
"logical replication did not find row for update "
907+
"logical replication did not find row to be updated "
907908
"in replication target relation \"%s\"",
908909
RelationGetRelationName(localrel));
909910
}
@@ -1001,9 +1002,14 @@ apply_handle_delete_internal(ResultRelInfo *relinfo, EState *estate,
10011002
}
10021003
else
10031004
{
1004-
/* The tuple to be deleted could not be found. */
1005+
/*
1006+
* The tuple to be deleted could not be found. Do nothing except for
1007+
* emitting a log message.
1008+
*
1009+
* XXX should this be promoted to ereport(LOG) perhaps?
1010+
*/
10051011
elog(DEBUG1,
1006-
"logical replication could not find row for delete "
1012+
"logical replication did not find row to be deleted "
10071013
"in replication target relation \"%s\"",
10081014
RelationGetRelationName(localrel));
10091015
}
@@ -1145,30 +1151,31 @@ apply_handle_tuple_routing(ApplyExecutionData *edata,
11451151
found = FindReplTupleInLocalRel(estate, partrel,
11461152
&part_entry->remoterel,
11471153
remoteslot_part, &localslot);
1148-
1149-
oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
1150-
if (found)
1151-
{
1152-
/* Apply the update. */
1153-
slot_modify_cstrings(remoteslot_part, localslot,
1154-
part_entry,
1155-
newtup->values, newtup->changed);
1156-
MemoryContextSwitchTo(oldctx);
1157-
}
1158-
else
1154+
if (!found)
11591155
{
11601156
/*
1161-
* The tuple to be updated could not be found.
1157+
* The tuple to be updated could not be found. Do nothing
1158+
* except for emitting a log message.
11621159
*
1163-
* TODO what to do here, change the log level to LOG
1164-
* perhaps?
1160+
* XXX should this be promoted to ereport(LOG) perhaps?
11651161
*/
11661162
elog(DEBUG1,
1167-
"logical replication did not find row for update "
1168-
"in replication target relation \"%s\"",
1163+
"logical replication did not find row to be updated "
1164+
"in replication target relation's partition \"%s\"",
11691165
RelationGetRelationName(partrel));
1166+
return;
11701167
}
11711168

1169+
/*
1170+
* Apply the update to the local tuple, putting the result in
1171+
* remoteslot_part.
1172+
*/
1173+
oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
1174+
slot_modify_cstrings(remoteslot_part, localslot,
1175+
part_entry,
1176+
newtup->values, newtup->changed);
1177+
MemoryContextSwitchTo(oldctx);
1178+
11721179
/*
11731180
* Does the updated tuple still satisfy the current
11741181
* partition's constraint?

src/backend/replication/pgoutput/pgoutput.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -452,8 +452,11 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
452452
/* Convert tuples if needed. */
453453
if (relentry->map)
454454
{
455-
oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
456-
newtuple = execute_attr_map_tuple(newtuple, relentry->map);
455+
if (oldtuple)
456+
oldtuple = execute_attr_map_tuple(oldtuple,
457+
relentry->map);
458+
newtuple = execute_attr_map_tuple(newtuple,
459+
relentry->map);
457460
}
458461
}
459462

src/test/subscription/t/001_rep_changes.pl

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use warnings;
44
use PostgresNode;
55
use TestLib;
6-
use Test::More tests => 24;
6+
use Test::More tests => 27;
77

88
# Initialize publisher node
99
my $node_publisher = get_new_node('publisher');
@@ -115,7 +115,7 @@
115115
"INSERT INTO tab_mixed VALUES (2, 'bar', 2.2)");
116116

117117
$node_publisher->safe_psql('postgres',
118-
"INSERT INTO tab_full_pk VALUES (1, 'foo')");
118+
"INSERT INTO tab_full_pk VALUES (1, 'foo'), (2, 'baz')");
119119

120120
$node_publisher->safe_psql('postgres',
121121
"INSERT INTO tab_nothing VALUES (generate_series(1,20))");
@@ -197,6 +197,38 @@
197197
local|2.2|bar|2),
198198
'update works with different column order and subscriber local values');
199199

200+
$result = $node_subscriber->safe_psql('postgres',
201+
"SELECT * FROM tab_full_pk ORDER BY a");
202+
is( $result, qq(1|bar
203+
2|baz),
204+
'update works with REPLICA IDENTITY FULL and a primary key');
205+
206+
# Check that subscriber handles cases where update/delete target tuple
207+
# is missing. We have to look for the DEBUG1 log messages about that,
208+
# so temporarily bump up the log verbosity.
209+
$node_subscriber->append_conf('postgresql.conf', "log_min_messages = debug1");
210+
$node_subscriber->reload;
211+
212+
$node_subscriber->safe_psql('postgres', "DELETE FROM tab_full_pk");
213+
214+
$node_publisher->safe_psql('postgres',
215+
"UPDATE tab_full_pk SET b = 'quux' WHERE a = 1");
216+
$node_publisher->safe_psql('postgres', "DELETE FROM tab_full_pk WHERE a = 2");
217+
218+
$node_publisher->wait_for_catchup('tap_sub');
219+
220+
my $logfile = slurp_file($node_subscriber->logfile());
221+
ok( $logfile =~
222+
qr/logical replication did not find row to be updated in replication target relation "tab_full_pk"/,
223+
'update target row is missing');
224+
ok( $logfile =~
225+
qr/logical replication did not find row to be deleted in replication target relation "tab_full_pk"/,
226+
'delete target row is missing');
227+
228+
$node_subscriber->append_conf('postgresql.conf',
229+
"log_min_messages = warning");
230+
$node_subscriber->reload;
231+
200232
# check behavior with toasted values
201233

202234
$node_publisher->safe_psql('postgres',

src/test/subscription/t/013_partition.pl

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use warnings;
44
use PostgresNode;
55
use TestLib;
6-
use Test::More tests => 56;
6+
use Test::More tests => 62;
77

88
# setup
99

@@ -344,6 +344,46 @@ BEGIN
344344
$node_subscriber2->safe_psql('postgres', "SELECT a FROM tab1 ORDER BY 1");
345345
is($result, qq(), 'truncate of tab1 replicated');
346346

347+
# Check that subscriber handles cases where update/delete target tuple
348+
# is missing. We have to look for the DEBUG1 log messages about that,
349+
# so temporarily bump up the log verbosity.
350+
$node_subscriber1->append_conf('postgresql.conf',
351+
"log_min_messages = debug1");
352+
$node_subscriber1->reload;
353+
354+
$node_publisher->safe_psql('postgres',
355+
"INSERT INTO tab1 VALUES (1, 'foo'), (4, 'bar'), (10, 'baz')");
356+
357+
$node_publisher->wait_for_catchup('sub1');
358+
$node_publisher->wait_for_catchup('sub2');
359+
360+
$node_subscriber1->safe_psql('postgres', "DELETE FROM tab1");
361+
362+
$node_publisher->safe_psql('postgres',
363+
"UPDATE tab1 SET b = 'quux' WHERE a = 4");
364+
$node_publisher->safe_psql('postgres', "DELETE FROM tab1");
365+
366+
$node_publisher->wait_for_catchup('sub1');
367+
$node_publisher->wait_for_catchup('sub2');
368+
369+
my $logfile = slurp_file($node_subscriber1->logfile());
370+
ok( $logfile =~
371+
qr/logical replication did not find row to be updated in replication target relation's partition "tab1_2_2"/,
372+
'update target row is missing in tab1_2_2');
373+
ok( $logfile =~
374+
qr/logical replication did not find row to be deleted in replication target relation "tab1_1"/,
375+
'delete target row is missing in tab1_1');
376+
ok( $logfile =~
377+
qr/logical replication did not find row to be deleted in replication target relation "tab1_2_2"/,
378+
'delete target row is missing in tab1_2_2');
379+
ok( $logfile =~
380+
qr/logical replication did not find row to be deleted in replication target relation "tab1_def"/,
381+
'delete target row is missing in tab1_def');
382+
383+
$node_subscriber1->append_conf('postgresql.conf',
384+
"log_min_messages = warning");
385+
$node_subscriber1->reload;
386+
347387
# Tests for replication using root table identity and schema
348388

349389
# publisher
@@ -647,3 +687,32 @@ BEGIN
647687
pub_tab2|3|yyy
648688
pub_tab2|5|zzz
649689
xxx_c|6|aaa), 'inserts into tab2 replicated');
690+
691+
# Check that subscriber handles cases where update/delete target tuple
692+
# is missing. We have to look for the DEBUG1 log messages about that,
693+
# so temporarily bump up the log verbosity.
694+
$node_subscriber1->append_conf('postgresql.conf',
695+
"log_min_messages = debug1");
696+
$node_subscriber1->reload;
697+
698+
$node_subscriber1->safe_psql('postgres', "DELETE FROM tab2");
699+
700+
$node_publisher->safe_psql('postgres',
701+
"UPDATE tab2 SET b = 'quux' WHERE a = 5");
702+
$node_publisher->safe_psql('postgres', "DELETE FROM tab2 WHERE a = 1");
703+
704+
$node_publisher->wait_for_catchup('sub_viaroot');
705+
$node_publisher->wait_for_catchup('sub2');
706+
707+
$logfile = slurp_file($node_subscriber1->logfile());
708+
ok( $logfile =~
709+
qr/logical replication did not find row to be updated in replication target relation's partition "tab2_1"/,
710+
'update target row is missing in tab2_1');
711+
ok( $logfile =~
712+
qr/logical replication did not find row to be deleted in replication target relation "tab2_1"/,
713+
'delete target row is missing in tab2_1');
714+
715+
# No need for this until more tests are added.
716+
# $node_subscriber1->append_conf('postgresql.conf',
717+
# "log_min_messages = warning");
718+
# $node_subscriber1->reload;

0 commit comments

Comments
 (0)