[Issue #308] test coverage and comments improvement

gsmolk · gsmolk · commit 94ada4c13745 · 2021-02-03T01:41:49.000+03:00
diff --git a/src/catalog.c b/src/catalog.c
@@ -166,6 +166,9 @@ write_backup_status(pgBackup *backup, BackupStatus status,
  *
  * TODO: lock-timeout as parameter
  * TODO: we must think about more fine grain unlock mechanism - separate unlock_backup() function.
+ * TODO: more accurate naming
+ * -> exclusive lock -> acquire HW_LATCH and wait until all LW_LATCHes are clear
+ * -> shared lock    -> acquire HW_LATCH, acquire LW_LATCH, release HW_LATCH
  */
 bool
 lock_backup(pgBackup *backup, bool strict, bool exclusive)
@@ -264,45 +267,13 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
 	int			empty_tries = LOCK_STALE_TIMEOUT;
 	int			len;
 	int			encoded_pid;
-	pid_t 		my_p_pid;
 
 	join_path_components(lock_file, backup->root_dir, BACKUP_LOCK_FILE);
 
-	/*
-	 * TODO: is this stuff with ppid below is relevant for us ?
-	 *
-	 * If the PID in the lockfile is our own PID or our parent's or
-	 * grandparent's PID, then the file must be stale (probably left over from
-	 * a previous system boot cycle).  We need to check this because of the
-	 * likelihood that a reboot will assign exactly the same PID as we had in
-	 * the previous reboot, or one that's only one or two counts larger and
-	 * hence the lockfile's PID now refers to an ancestor shell process.  We
-	 * allow pg_ctl to pass down its parent shell PID (our grandparent PID)
-	 * via the environment variable PG_GRANDPARENT_PID; this is so that
-	 * launching the postmaster via pg_ctl can be just as reliable as
-	 * launching it directly.  There is no provision for detecting
-	 * further-removed ancestor processes, but if the init script is written
-	 * carefully then all but the immediate parent shell will be root-owned
-	 * processes and so the kill test will fail with EPERM.  Note that we
-	 * cannot get a false negative this way, because an existing postmaster
-	 * would surely never launch a competing postmaster or pg_ctl process
-	 * directly.
-	 */
-#ifndef WIN32
-	my_p_pid = getppid();
-#else
-
-	/*
-	 * Windows hasn't got getppid(), but doesn't need it since it's not using
-	 * real kill() either...
-	 */
-	my_p_pid = 0;
-#endif
-
 	/*
 	 * We need a loop here because of race conditions.  But don't loop forever
 	 * (for example, a non-writable $backup_instance_path directory might cause a failure
-	 * that won't go away).  100 tries seems like plenty.
+	 * that won't go away).
 	 */
 	do
 	{
@@ -396,14 +367,12 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
 
 		/*
 		 * Check to see if the other process still exists
-		 *
-		 * Per discussion above, my_pid, my_p_pid can be
-		 * ignored as false matches.
-		 *
 		 * Normally kill() will fail with ESRCH if the given PID doesn't
 		 * exist.
 		 */
-		if (encoded_pid != my_pid && encoded_pid != my_p_pid)
+		if (encoded_pid == my_pid)
+			return 0;
+		else
 		{
 			if (kill(encoded_pid, 0) == 0)
 			{
@@ -508,6 +477,10 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
 				 lock_file, strerror(save_errno));
 	}
 
+//	elog(LOG, "Acquired exclusive lock for backup %s after %ds",
+//			base36enc(backup->start_time),
+//			LOCK_TIMEOUT - ntries + LOCK_STALE_TIMEOUT - empty_tries);
+
 	return 0;
 }
 
diff --git a/tests/helpers/ptrack_helpers.py b/tests/helpers/ptrack_helpers.py
@@ -757,7 +757,7 @@ def run_pb(self, command, asynchronous=False, gdb=False, old_binary=False, retur
                 return GDBobj([binary_path] + command, self.verbose)
             if asynchronous:
                 return subprocess.Popen(
-                    self.cmd,
+                    [binary_path] + command,
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
                     env=env
@@ -1133,8 +1133,8 @@ def show_archive(
             exit(1)
 
     def validate_pb(
-            self, backup_dir, instance=None,
-            backup_id=None, options=[], old_binary=False, gdb=False
+            self, backup_dir, instance=None, backup_id=None,
+            options=[], old_binary=False, gdb=False, asynchronous=False
             ):
 
         cmd_list = [
@@ -1146,11 +1146,11 @@ def validate_pb(
         if backup_id:
             cmd_list += ['-i', backup_id]
 
-        return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
+        return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb, asynchronous=asynchronous)
 
     def delete_pb(
-            self, backup_dir, instance,
-            backup_id=None, options=[], old_binary=False, gdb=False):
+            self, backup_dir, instance, backup_id=None,
+            options=[], old_binary=False, gdb=False, asynchronous=False):
         cmd_list = [
             'delete',
             '-B', backup_dir
@@ -1160,7 +1160,7 @@ def delete_pb(
         if backup_id:
             cmd_list += ['-i', backup_id]
 
-        return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
+        return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb, asynchronous=asynchronous)
 
     def delete_expired(
             self, backup_dir, instance, options=[], old_binary=False):
diff --git a/tests/locking.py b/tests/locking.py
@@ -535,6 +535,52 @@ def test_backup_directory_name(self):
         # Clean after yourself
         self.del_test_dir(module_name, fname)
 
+    def test_empty_lock_file(self):
+        """
+        Check that validate reports waiting on an empty lock file, see https://github.com/postgrespro/pg_probackup/issues/308
+        """
+        fname = self.id().split('.')[3]
+        node = self.make_simple_node(
+            base_dir=os.path.join(module_name, fname, 'node'),
+            initdb_params=['--data-checksums'])
+
+        backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
+        self.init_pb(backup_dir)
+        self.add_instance(backup_dir, 'node', node)
+        self.set_archiving(backup_dir, 'node', node)
+        node.slow_start()
+
+        # Fill with data
+        node.pgbench_init(scale=100)
+
+        # FULL
+        backup_id = self.backup_node(backup_dir, 'node', node)
+
+        lockfile = os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid')
+        with open(lockfile, "w+") as f:
+            f.truncate()
+
+        out = self.validate_pb(backup_dir, 'node', backup_id)
+
+        self.assertIn(
+            "Waiting 30 seconds on empty exclusive lock for backup", out)
+
+#        lockfile = os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid')
+#        with open(lockfile, "w+") as f:
+#            f.truncate()
+#
+#        p1 = self.validate_pb(backup_dir, 'node', backup_id, asynchronous=True,
+#            options=['--log-level-file=LOG', '--log-filename=validate.log'])
+#        sleep(3)
+#        p2 = self.delete_pb(backup_dir, 'node', backup_id, asynchronous=True,
+#            options=['--log-level-file=LOG', '--log-filename=delete.log'])
+#
+#        p1.wait()
+#        p2.wait()
+
+        # Clean after yourself
+        self.del_test_dir(module_name, fname)
+
 # TODO:
 # test that concurrent validation and restore are not locking each other
 # check that quick exclusive lock, when taking RO-lock, is really quick