Skip to content

Commit ad00eb1

Browse files
schmiddydvarrazzo
authored andcommitted
Several fixes for concurrent-DDL guard.
* KILL_COMPETING_LOCKS was using pg_cancel_backend() instead of pg_terminate_backend() * create kill_ddl() function for canceling+terminating any pending unsafe concurrent DDL, i.e. anyone hanging out waiting for an ACCESS EXCLUSIVE lock on our table. * create lock_access_share() function for reliably obtaining an ACCESS SHARE lock on the target table, killing off any queued ACCESS EXCLUSIVE lockers in the process via kill_ddl() * Avoid deadlock possible before we run: CREATE TABLE reorg.table_xxx AS SELECT ... FROM ONLY ... by using lock_access_share() * Fix a few calls in lock_exclusive() which were forgetting to specify the passed-in connection. These fixes are related to Issue #8. The main thing remaining AFAIK is to review or fix some of the unlikely-error handling bits; most of these should be marked with XXX now.
1 parent cf25780 commit ad00eb1

File tree

1 file changed

+163
-38
lines changed

1 file changed

+163
-38
lines changed

bin/pg_repack.c

Lines changed: 163 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -57,17 +57,17 @@ const char *PROGRAM_VERSION = "unknown";
5757
* target table, and our secondary conn is attempting to grab an AccessShare
5858
* lock. We know that "granted" must be false for these queries because
5959
* we already hold the AccessExclusive lock. Also, we only care about other
60-
* transactions trying to grab an ACCESS EXCLUSIVE lock, because that lock
61-
* level is needed for any of the disallowed DDL commands, e.g. ALTER TABLE
62-
* or TRUNCATE.
60+
* transactions trying to grab an ACCESS EXCLUSIVE lock, because we are only
61+
* trying to kill off disallowed DDL commands, e.g. ALTER TABLE or TRUNCATE.
6362
*/
6463
#define CANCEL_COMPETING_LOCKS \
6564
"SELECT pg_cancel_backend(pid) FROM pg_locks WHERE locktype = 'relation'"\
6665
" AND granted = false AND relation = %u"\
6766
" AND mode = 'AccessExclusiveLock' AND pid <> pg_backend_pid()"
6867

6968
#define KILL_COMPETING_LOCKS \
70-
"SELECT pg_cancel_backend(pid) FROM pg_locks WHERE locktype = 'relation'"\
69+
"SELECT pg_terminate_backend(pid) "\
70+
"FROM pg_locks WHERE locktype = 'relation'"\
7171
" AND granted = false AND relation = %u"\
7272
" AND mode = 'AccessExclusiveLock' AND pid <> pg_backend_pid()"
7373

@@ -114,6 +114,8 @@ static void repack_cleanup(bool fatal, void *userdata);
114114
static char *getstr(PGresult *res, int row, int col);
115115
static Oid getoid(PGresult *res, int row, int col);
116116
static void lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool start_xact);
117+
static bool kill_ddl(PGconn *conn, Oid relid, bool terminate);
118+
static void lock_access_share(PGconn *conn, Oid relid, const char *target_name);
117119

118120
#define SQLSTATE_INVALID_SCHEMA_NAME "3F000"
119121
#define SQLSTATE_QUERY_CANCELED "57014"
@@ -485,11 +487,17 @@ repack_one_table(const repack_table *table, const char *orderby)
485487
lock_conn_pid = strdup(PQgetvalue(res, 0, 0));
486488
PQclear(res);
487489

490+
/*
491+
* Not using lock_access_share() here since we know that
492+
* it's not possible to obtain the ACCESS SHARE lock right now
493+
* in conn2, since the primary connection holds ACCESS EXCLUSIVE.
494+
*/
488495
printfStringInfo(&sql, "LOCK TABLE %s IN ACCESS SHARE MODE",
489496
table->target_name);
490497
elog(DEBUG2, "LOCK TABLE %s IN ACCESS SHARE MODE", table->target_name);
491498
if (!(PQsendQuery(conn2, sql.data))) {
492499
printf("Error sending async query: %s\n%s", sql.data, PQerrorMessage(conn2));
500+
/* XXX: better error handling */
493501
exit(1);
494502
}
495503

@@ -500,40 +508,13 @@ repack_one_table(const repack_table *table, const char *orderby)
500508
* AccessExclusive lock before conn2 gets its AccessShare lock,
501509
* and perform unsafe DDL on the table.
502510
*
503-
* XXX: maybe we should use a loop canceling queries, as in
504-
* lock_exclusive().
511+
* Normally, lock_access_share() would take care of this for us,
512+
* but we're not able to use it here.
505513
*/
506-
printfStringInfo(&sql, CANCEL_COMPETING_LOCKS, table->target_oid);
507-
res = execute(sql.data, 0, NULL);
508-
if (PQresultStatus(res) != PGRES_TUPLES_OK)
514+
if (!(kill_ddl(connection, table->target_oid, true)))
509515
{
510-
printf("Error canceling competing queries: %s", PQerrorMessage(connection));
511-
PQclear(res);
512516
exit(1);
513517
}
514-
if (PQntuples(res) > 0)
515-
{
516-
elog(WARNING, "Canceled %d unsafe queries. Terminating any remaining PIDs.", PQntuples(res));
517-
518-
if (PQserverVersion(connection) >= 80400)
519-
{
520-
PQclear(res);
521-
printfStringInfo(&sql, KILL_COMPETING_LOCKS, table->target_oid);
522-
res = execute(sql.data, 0, NULL);
523-
if (PQresultStatus(res) != PGRES_TUPLES_OK)
524-
{
525-
printf("Error killing competing queries: %s", PQerrorMessage(connection));
526-
PQclear(res);
527-
exit(1);
528-
}
529-
}
530-
531-
}
532-
else
533-
{
534-
elog(DEBUG2, "No competing DDL to cancel.");
535-
}
536-
PQclear(res);
537518

538519

539520
/* We're finished killing off any unsafe DDL. COMMIT in our main
@@ -583,6 +564,21 @@ repack_one_table(const repack_table *table, const char *orderby)
583564
PQclear(res);
584565

585566
command(table->delete_log, 0, NULL);
567+
568+
/* We need to be able to obtain an AccessShare lock on the target table
569+
* for the create_table command to go through, so go ahead and obtain
570+
* the lock explicitly.
571+
*
572+
* Since conn2 has been diligently holding its AccessShare lock, it
573+
* is possible that another transaction has been waiting to acquire
574+
* an AccessExclusive lock on the table (e.g. a concurrent ALTER TABLE
575+
* or TRUNCATE which we must not allow). If there are any such
576+
* transactions, lock_access_share() will kill them so that our
577+
* CREATE TABLE ... AS SELECT does not deadlock waiting for an
578+
* AccessShare lock.
579+
*/
580+
lock_access_share(connection, table->target_oid, table->target_name);
581+
586582
command(table->create_table, 0, NULL);
587583
printfStringInfo(&sql, "SELECT repack.disable_autovacuum('repack.table_%u')", table->target_oid);
588584
if (table->drop_columns)
@@ -719,8 +715,134 @@ repack_one_table(const repack_table *table, const char *orderby)
719715
termStringInfo(&sql);
720716
}
721717

718+
/* Kill off any concurrent DDL (or any transaction attempting to take
719+
* an AccessExclusive lock) trying to run against our table. Note, we're
720+
* killing these queries off *before* they are granted an AccessExclusive
721+
* lock on our table.
722+
*
723+
* Returns true if no problems encountered, false otherwise.
724+
*/
725+
static bool
726+
kill_ddl(PGconn *conn, Oid relid, bool terminate)
727+
{
728+
bool ret = true;
729+
PGresult *res;
730+
StringInfoData sql;
731+
732+
initStringInfo(&sql);
733+
734+
printfStringInfo(&sql, CANCEL_COMPETING_LOCKS, relid);
735+
res = pgut_execute(conn, sql.data, 0, NULL);
736+
if (PQresultStatus(res) != PGRES_TUPLES_OK)
737+
{
738+
printf("Error canceling unsafe queries: %s", PQerrorMessage(conn));
739+
ret = false;
740+
}
741+
else if (PQntuples(res) > 0 && terminate && PQserverVersion(conn) >= 80400)
742+
{
743+
elog(WARNING, "Canceled %d unsafe queries. Terminating any remaining PIDs.", PQntuples(res));
744+
745+
PQclear(res);
746+
printfStringInfo(&sql, KILL_COMPETING_LOCKS, relid);
747+
res = pgut_execute(conn, sql.data, 0, NULL);
748+
if (PQresultStatus(res) != PGRES_TUPLES_OK)
749+
{
750+
printf("Error killing unsafe queries: %s", PQerrorMessage(conn));
751+
ret = false;
752+
}
753+
}
754+
else if (PQntuples(res) > 0)
755+
elog(NOTICE, "Canceled %d unsafe queries", PQntuples(res));
756+
else
757+
elog(DEBUG2, "No competing DDL to cancel.");
758+
759+
760+
PQclear(res);
761+
termStringInfo(&sql);
762+
763+
return ret;
764+
}
765+
766+
767+
/*
768+
* Try to acquire an ACCESS SHARE table lock, avoiding deadlocks and long
769+
* waits by killing off other sessions which may be stuck trying to obtain
770+
* an ACCESS EXCLUSIVE lock.
771+
*
772+
* Arguments:
773+
*
774+
* conn: connection to use
775+
* relid: OID of relation
776+
* target_name: name of table
777+
*/
778+
static void
779+
lock_access_share(PGconn *conn, Oid relid, const char *target_name)
780+
{
781+
StringInfoData sql;
782+
time_t start = time(NULL);
783+
int i;
784+
785+
initStringInfo(&sql);
786+
787+
for (i = 1; ; i++)
788+
{
789+
time_t duration;
790+
PGresult *res;
791+
int wait_msec;
792+
793+
duration = time(NULL) - start;
794+
795+
/* Cancel queries unconditionally, i.e. don't bother waiting
796+
* wait_timeout as lock_exclusive() does -- the only queries we
797+
* should be killing are disallowed DDL commands hanging around
798+
* for an AccessExclusive lock, which must be deadlocked at
799+
* this point anyway since conn2 holds its AccessShare lock
800+
* already.
801+
*/
802+
if (duration > (wait_timeout * 2))
803+
kill_ddl(conn, relid, true);
804+
else
805+
kill_ddl(conn, relid, false);
806+
807+
/* wait for a while to lock the table. */
808+
wait_msec = Min(1000, i * 100);
809+
printfStringInfo(&sql, "SET LOCAL statement_timeout = %d", wait_msec);
810+
pgut_command(conn, sql.data, 0, NULL);
811+
812+
printfStringInfo(&sql, "LOCK TABLE %s IN ACCESS SHARE MODE", target_name);
813+
res = pgut_execute_elevel(conn, sql.data, 0, NULL, DEBUG2);
814+
if (PQresultStatus(res) == PGRES_COMMAND_OK)
815+
{
816+
PQclear(res);
817+
break;
818+
}
819+
else if (sqlstate_equals(res, SQLSTATE_QUERY_CANCELED))
820+
{
821+
/* XXX: does this ROLLBACK need any rethinking wrt. two connections
822+
* now?
823+
*/
824+
/* retry if lock conflicted */
825+
PQclear(res);
826+
pgut_command(conn, "ROLLBACK", 0, NULL);
827+
continue;
828+
}
829+
else
830+
{
831+
/* exit otherwise */
832+
printf("%s", PQerrorMessage(connection));
833+
PQclear(res);
834+
exit(1);
835+
}
836+
}
837+
838+
termStringInfo(&sql);
839+
pgut_command(conn, "RESET statement_timeout", 0, NULL);
840+
}
841+
842+
722843
/*
723-
* Try acquire a table lock but avoid long time locks when conflict.
844+
* Try acquire an ACCESS EXCLUSIVE table lock, avoiding deadlocks and long
845+
* waits by killing off other sessions.
724846
* Arguments:
725847
*
726848
* conn: connection to use
@@ -772,7 +894,7 @@ lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool sta
772894
/* wait for a while to lock the table. */
773895
wait_msec = Min(1000, i * 100);
774896
snprintf(sql, lengthof(sql), "SET LOCAL statement_timeout = %d", wait_msec);
775-
command(sql, 0, NULL);
897+
pgut_command(conn, sql, 0, NULL);
776898

777899
res = pgut_execute_elevel(conn, lock_query, 0, NULL, DEBUG2);
778900
if (PQresultStatus(res) == PGRES_COMMAND_OK)
@@ -782,9 +904,12 @@ lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool sta
782904
}
783905
else if (sqlstate_equals(res, SQLSTATE_QUERY_CANCELED))
784906
{
907+
/* XXX: does this ROLLBACK need any rethinking wrt. two connections
908+
* now?
909+
*/
785910
/* retry if lock conflicted */
786911
PQclear(res);
787-
command("ROLLBACK", 0, NULL);
912+
pgut_command(conn, "ROLLBACK", 0, NULL);
788913
continue;
789914
}
790915
else
@@ -796,7 +921,7 @@ lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool sta
796921
}
797922
}
798923

799-
command("RESET statement_timeout", 0, NULL);
924+
pgut_command(conn, "RESET statement_timeout", 0, NULL);
800925
}
801926

802927
/*

0 commit comments

Comments
 (0)