@@ -57,17 +57,17 @@ const char *PROGRAM_VERSION = "unknown";
57
57
* target table, and our secondary conn is attempting to grab an AccessShare
58
58
* lock. We know that "granted" must be false for these queries because
59
59
* we already hold the AccessExclusive lock. Also, we only care about other
60
- * transactions trying to grab an ACCESS EXCLUSIVE lock, because that lock
61
- * level is needed for any of the disallowed DDL commands, e.g. ALTER TABLE
62
- * or TRUNCATE.
60
+ * transactions trying to grab an ACCESS EXCLUSIVE lock, because we are only
61
+ * trying to kill off disallowed DDL commands, e.g. ALTER TABLE or TRUNCATE.
63
62
*/
64
63
#define CANCEL_COMPETING_LOCKS \
65
64
"SELECT pg_cancel_backend(pid) FROM pg_locks WHERE locktype = 'relation'"\
66
65
" AND granted = false AND relation = %u"\
67
66
" AND mode = 'AccessExclusiveLock' AND pid <> pg_backend_pid()"
68
67
69
68
#define KILL_COMPETING_LOCKS \
70
- "SELECT pg_cancel_backend(pid) FROM pg_locks WHERE locktype = 'relation'"\
69
+ "SELECT pg_terminate_backend(pid) "\
70
+ "FROM pg_locks WHERE locktype = 'relation'"\
71
71
" AND granted = false AND relation = %u"\
72
72
" AND mode = 'AccessExclusiveLock' AND pid <> pg_backend_pid()"
73
73
@@ -114,6 +114,8 @@ static void repack_cleanup(bool fatal, void *userdata);
114
114
static char * getstr (PGresult * res , int row , int col );
115
115
static Oid getoid (PGresult * res , int row , int col );
116
116
static void lock_exclusive (PGconn * conn , const char * relid , const char * lock_query , bool start_xact );
117
+ static bool kill_ddl (PGconn * conn , Oid relid , bool terminate );
118
+ static void lock_access_share (PGconn * conn , Oid relid , const char * target_name );
117
119
118
120
#define SQLSTATE_INVALID_SCHEMA_NAME "3F000"
119
121
#define SQLSTATE_QUERY_CANCELED "57014"
@@ -485,11 +487,17 @@ repack_one_table(const repack_table *table, const char *orderby)
485
487
lock_conn_pid = strdup (PQgetvalue (res , 0 , 0 ));
486
488
PQclear (res );
487
489
490
+ /*
491
+ * Not using lock_access_share() here since we know that
492
+ * it's not possible to obtain the ACCESS SHARE lock right now
493
+ * in conn2, since the primary connection holds ACCESS EXCLUSIVE.
494
+ */
488
495
printfStringInfo (& sql , "LOCK TABLE %s IN ACCESS SHARE MODE" ,
489
496
table -> target_name );
490
497
elog (DEBUG2 , "LOCK TABLE %s IN ACCESS SHARE MODE" , table -> target_name );
491
498
if (!(PQsendQuery (conn2 , sql .data ))) {
492
499
printf ("Error sending async query: %s\n%s" , sql .data , PQerrorMessage (conn2 ));
500
+ /* XXX: better error handling */
493
501
exit (1 );
494
502
}
495
503
@@ -500,40 +508,13 @@ repack_one_table(const repack_table *table, const char *orderby)
500
508
* AccessExclusive lock before conn2 gets its AccessShare lock,
501
509
* and perform unsafe DDL on the table.
502
510
*
503
- * XXX: maybe we should use a loop canceling queries, as in
504
- * lock_exclusive() .
511
+ * Normally, lock_access_share() would take care of this for us,
512
+ * but we're not able to use it here .
505
513
*/
506
- printfStringInfo (& sql , CANCEL_COMPETING_LOCKS , table -> target_oid );
507
- res = execute (sql .data , 0 , NULL );
508
- if (PQresultStatus (res ) != PGRES_TUPLES_OK )
514
+ if (!(kill_ddl (connection , table -> target_oid , true)))
509
515
{
510
- printf ("Error canceling competing queries: %s" , PQerrorMessage (connection ));
511
- PQclear (res );
512
516
exit (1 );
513
517
}
514
- if (PQntuples (res ) > 0 )
515
- {
516
- elog (WARNING , "Canceled %d unsafe queries. Terminating any remaining PIDs." , PQntuples (res ));
517
-
518
- if (PQserverVersion (connection ) >= 80400 )
519
- {
520
- PQclear (res );
521
- printfStringInfo (& sql , KILL_COMPETING_LOCKS , table -> target_oid );
522
- res = execute (sql .data , 0 , NULL );
523
- if (PQresultStatus (res ) != PGRES_TUPLES_OK )
524
- {
525
- printf ("Error killing competing queries: %s" , PQerrorMessage (connection ));
526
- PQclear (res );
527
- exit (1 );
528
- }
529
- }
530
-
531
- }
532
- else
533
- {
534
- elog (DEBUG2 , "No competing DDL to cancel." );
535
- }
536
- PQclear (res );
537
518
538
519
539
520
/* We're finished killing off any unsafe DDL. COMMIT in our main
@@ -583,6 +564,21 @@ repack_one_table(const repack_table *table, const char *orderby)
583
564
PQclear (res );
584
565
585
566
command (table -> delete_log , 0 , NULL );
567
+
568
+ /* We need to be able to obtain an AccessShare lock on the target table
569
+ * for the create_table command to go through, so go ahead and obtain
570
+ * the lock explicitly.
571
+ *
572
+ * Since conn2 has been diligently holding its AccessShare lock, it
573
+ * is possible that another transaction has been waiting to acquire
574
+ * an AccessExclusive lock on the table (e.g. a concurrent ALTER TABLE
575
+ * or TRUNCATE which we must not allow). If there are any such
576
+ * transactions, lock_access_share() will kill them so that our
577
+ * CREATE TABLE ... AS SELECT does not deadlock waiting for an
578
+ * AccessShare lock.
579
+ */
580
+ lock_access_share (connection , table -> target_oid , table -> target_name );
581
+
586
582
command (table -> create_table , 0 , NULL );
587
583
printfStringInfo (& sql , "SELECT repack.disable_autovacuum('repack.table_%u')" , table -> target_oid );
588
584
if (table -> drop_columns )
@@ -719,8 +715,134 @@ repack_one_table(const repack_table *table, const char *orderby)
719
715
termStringInfo (& sql );
720
716
}
721
717
718
+ /* Kill off any concurrent DDL (or any transaction attempting to take
719
+ * an AccessExclusive lock) trying to run against our table. Note, we're
720
+ * killing these queries off *before* they are granted an AccessExclusive
721
+ * lock on our table.
722
+ *
723
+ * Returns true if no problems encountered, false otherwise.
724
+ */
725
+ static bool
726
+ kill_ddl (PGconn * conn , Oid relid , bool terminate )
727
+ {
728
+ bool ret = true;
729
+ PGresult * res ;
730
+ StringInfoData sql ;
731
+
732
+ initStringInfo (& sql );
733
+
734
+ printfStringInfo (& sql , CANCEL_COMPETING_LOCKS , relid );
735
+ res = pgut_execute (conn , sql .data , 0 , NULL );
736
+ if (PQresultStatus (res ) != PGRES_TUPLES_OK )
737
+ {
738
+ printf ("Error canceling unsafe queries: %s" , PQerrorMessage (conn ));
739
+ ret = false;
740
+ }
741
+ else if (PQntuples (res ) > 0 && terminate && PQserverVersion (conn ) >= 80400 )
742
+ {
743
+ elog (WARNING , "Canceled %d unsafe queries. Terminating any remaining PIDs." , PQntuples (res ));
744
+
745
+ PQclear (res );
746
+ printfStringInfo (& sql , KILL_COMPETING_LOCKS , relid );
747
+ res = pgut_execute (conn , sql .data , 0 , NULL );
748
+ if (PQresultStatus (res ) != PGRES_TUPLES_OK )
749
+ {
750
+ printf ("Error killing unsafe queries: %s" , PQerrorMessage (conn ));
751
+ ret = false;
752
+ }
753
+ }
754
+ else if (PQntuples (res ) > 0 )
755
+ elog (NOTICE , "Canceled %d unsafe queries" , PQntuples (res ));
756
+ else
757
+ elog (DEBUG2 , "No competing DDL to cancel." );
758
+
759
+
760
+ PQclear (res );
761
+ termStringInfo (& sql );
762
+
763
+ return ret ;
764
+ }
765
+
766
+
767
+ /*
768
+ * Try to acquire an ACCESS SHARE table lock, avoiding deadlocks and long
769
+ * waits by killing off other sessions which may be stuck trying to obtain
770
+ * an ACCESS EXCLUSIVE lock.
771
+ *
772
+ * Arguments:
773
+ *
774
+ * conn: connection to use
775
+ * relid: OID of relation
776
+ * target_name: name of table
777
+ */
778
+ static void
779
+ lock_access_share (PGconn * conn , Oid relid , const char * target_name )
780
+ {
781
+ StringInfoData sql ;
782
+ time_t start = time (NULL );
783
+ int i ;
784
+
785
+ initStringInfo (& sql );
786
+
787
+ for (i = 1 ; ; i ++ )
788
+ {
789
+ time_t duration ;
790
+ PGresult * res ;
791
+ int wait_msec ;
792
+
793
+ duration = time (NULL ) - start ;
794
+
795
+ /* Cancel queries unconditionally, i.e. don't bother waiting
796
+ * wait_timeout as lock_exclusive() does -- the only queries we
797
+ * should be killing are disallowed DDL commands hanging around
798
+ * for an AccessExclusive lock, which must be deadlocked at
799
+ * this point anyway since conn2 holds its AccessShare lock
800
+ * already.
801
+ */
802
+ if (duration > (wait_timeout * 2 ))
803
+ kill_ddl (conn , relid , true);
804
+ else
805
+ kill_ddl (conn , relid , false);
806
+
807
+ /* wait for a while to lock the table. */
808
+ wait_msec = Min (1000 , i * 100 );
809
+ printfStringInfo (& sql , "SET LOCAL statement_timeout = %d" , wait_msec );
810
+ pgut_command (conn , sql .data , 0 , NULL );
811
+
812
+ printfStringInfo (& sql , "LOCK TABLE %s IN ACCESS SHARE MODE" , target_name );
813
+ res = pgut_execute_elevel (conn , sql .data , 0 , NULL , DEBUG2 );
814
+ if (PQresultStatus (res ) == PGRES_COMMAND_OK )
815
+ {
816
+ PQclear (res );
817
+ break ;
818
+ }
819
+ else if (sqlstate_equals (res , SQLSTATE_QUERY_CANCELED ))
820
+ {
821
+ /* XXX: does this ROLLBACK need any rethinking wrt. two connections
822
+ * now?
823
+ */
824
+ /* retry if lock conflicted */
825
+ PQclear (res );
826
+ pgut_command (conn , "ROLLBACK" , 0 , NULL );
827
+ continue ;
828
+ }
829
+ else
830
+ {
831
+ /* exit otherwise */
832
+ printf ("%s" , PQerrorMessage (connection ));
833
+ PQclear (res );
834
+ exit (1 );
835
+ }
836
+ }
837
+
838
+ termStringInfo (& sql );
839
+ pgut_command (conn , "RESET statement_timeout" , 0 , NULL );
840
+ }
841
+
842
+
722
843
/*
723
- * Try acquire a table lock but avoid long time locks when conflict.
844
+ * Try acquire an ACCESS EXCLUSIVE table lock, avoiding deadlocks and long
845
+ * waits by killing off other sessions.
724
846
* Arguments:
725
847
*
726
848
* conn: connection to use
@@ -772,7 +894,7 @@ lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool sta
772
894
/* wait for a while to lock the table. */
773
895
wait_msec = Min (1000 , i * 100 );
774
896
snprintf (sql , lengthof (sql ), "SET LOCAL statement_timeout = %d" , wait_msec );
775
- command ( sql , 0 , NULL );
897
+ pgut_command ( conn , sql , 0 , NULL );
776
898
777
899
res = pgut_execute_elevel (conn , lock_query , 0 , NULL , DEBUG2 );
778
900
if (PQresultStatus (res ) == PGRES_COMMAND_OK )
@@ -782,9 +904,12 @@ lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool sta
782
904
}
783
905
else if (sqlstate_equals (res , SQLSTATE_QUERY_CANCELED ))
784
906
{
907
+ /* XXX: does this ROLLBACK need any rethinking wrt. two connections
908
+ * now?
909
+ */
785
910
/* retry if lock conflicted */
786
911
PQclear (res );
787
- command ( "ROLLBACK" , 0 , NULL );
912
+ pgut_command ( conn , "ROLLBACK" , 0 , NULL );
788
913
continue ;
789
914
}
790
915
else
@@ -796,7 +921,7 @@ lock_exclusive(PGconn *conn, const char *relid, const char *lock_query, bool sta
796
921
}
797
922
}
798
923
799
- command ( "RESET statement_timeout" , 0 , NULL );
924
+ pgut_command ( conn , "RESET statement_timeout" , 0 , NULL );
800
925
}
801
926
802
927
/*
0 commit comments