Skip to content

Commit 94ada4c

Browse files
committed
[Issue #308] test coverage and comments improvement
1 parent d9d6c34 commit 94ada4c

File tree

3 files changed

+64
-45
lines changed

3 files changed

+64
-45
lines changed

src/catalog.c

Lines changed: 11 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ write_backup_status(pgBackup *backup, BackupStatus status,
166166
*
167167
* TODO: lock-timeout as parameter
168168
* TODO: we must think about more fine grain unlock mechanism - separate unlock_backup() function.
169+
* TODO: more accurate naming
170+
* -> exclusive lock -> acquire HW_LATCH and wait until all LW_LATCH`es are clear
171+
* -> shared lock -> acquire HW_LATCH, acquire LW_LATCH, release HW_LATCH
169172
*/
170173
bool
171174
lock_backup(pgBackup *backup, bool strict, bool exclusive)
@@ -264,45 +267,13 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
264267
int empty_tries = LOCK_STALE_TIMEOUT;
265268
int len;
266269
int encoded_pid;
267-
pid_t my_p_pid;
268270

269271
join_path_components(lock_file, backup->root_dir, BACKUP_LOCK_FILE);
270272

271-
/*
272-
* TODO: is this stuff with ppid below is relevant for us ?
273-
*
274-
* If the PID in the lockfile is our own PID or our parent's or
275-
* grandparent's PID, then the file must be stale (probably left over from
276-
* a previous system boot cycle). We need to check this because of the
277-
* likelihood that a reboot will assign exactly the same PID as we had in
278-
* the previous reboot, or one that's only one or two counts larger and
279-
* hence the lockfile's PID now refers to an ancestor shell process. We
280-
* allow pg_ctl to pass down its parent shell PID (our grandparent PID)
281-
* via the environment variable PG_GRANDPARENT_PID; this is so that
282-
* launching the postmaster via pg_ctl can be just as reliable as
283-
* launching it directly. There is no provision for detecting
284-
* further-removed ancestor processes, but if the init script is written
285-
* carefully then all but the immediate parent shell will be root-owned
286-
* processes and so the kill test will fail with EPERM. Note that we
287-
* cannot get a false negative this way, because an existing postmaster
288-
* would surely never launch a competing postmaster or pg_ctl process
289-
* directly.
290-
*/
291-
#ifndef WIN32
292-
my_p_pid = getppid();
293-
#else
294-
295-
/*
296-
* Windows hasn't got getppid(), but doesn't need it since it's not using
297-
* real kill() either...
298-
*/
299-
my_p_pid = 0;
300-
#endif
301-
302273
/*
303274
* We need a loop here because of race conditions. But don't loop forever
304275
* (for example, a non-writable $backup_instance_path directory might cause a failure
305-
* that won't go away). 100 tries seems like plenty.
276+
* that won't go away).
306277
*/
307278
do
308279
{
@@ -396,14 +367,12 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
396367

397368
/*
398369
* Check to see if the other process still exists
399-
*
400-
* Per discussion above, my_pid, my_p_pid can be
401-
* ignored as false matches.
402-
*
403370
* Normally kill() will fail with ESRCH if the given PID doesn't
404371
* exist.
405372
*/
406-
if (encoded_pid != my_pid && encoded_pid != my_p_pid)
373+
if (encoded_pid == my_pid)
374+
return 0;
375+
else
407376
{
408377
if (kill(encoded_pid, 0) == 0)
409378
{
@@ -508,6 +477,10 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
508477
lock_file, strerror(save_errno));
509478
}
510479

480+
// elog(LOG, "Acquired exclusive lock for backup %s after %ds",
481+
// base36enc(backup->start_time),
482+
// LOCK_TIMEOUT - ntries + LOCK_STALE_TIMEOUT - empty_tries);
483+
511484
return 0;
512485
}
513486

tests/helpers/ptrack_helpers.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -757,7 +757,7 @@ def run_pb(self, command, asynchronous=False, gdb=False, old_binary=False, retur
757757
return GDBobj([binary_path] + command, self.verbose)
758758
if asynchronous:
759759
return subprocess.Popen(
760-
self.cmd,
760+
[binary_path] + command,
761761
stdout=subprocess.PIPE,
762762
stderr=subprocess.PIPE,
763763
env=env
@@ -1133,8 +1133,8 @@ def show_archive(
11331133
exit(1)
11341134

11351135
def validate_pb(
1136-
self, backup_dir, instance=None,
1137-
backup_id=None, options=[], old_binary=False, gdb=False
1136+
self, backup_dir, instance=None, backup_id=None,
1137+
options=[], old_binary=False, gdb=False, asynchronous=False
11381138
):
11391139

11401140
cmd_list = [
@@ -1146,11 +1146,11 @@ def validate_pb(
11461146
if backup_id:
11471147
cmd_list += ['-i', backup_id]
11481148

1149-
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
1149+
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb, asynchronous=asynchronous)
11501150

11511151
def delete_pb(
1152-
self, backup_dir, instance,
1153-
backup_id=None, options=[], old_binary=False, gdb=False):
1152+
self, backup_dir, instance, backup_id=None,
1153+
options=[], old_binary=False, gdb=False, asynchronous=False):
11541154
cmd_list = [
11551155
'delete',
11561156
'-B', backup_dir
@@ -1160,7 +1160,7 @@ def delete_pb(
11601160
if backup_id:
11611161
cmd_list += ['-i', backup_id]
11621162

1163-
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
1163+
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb, asynchronous=asynchronous)
11641164

11651165
def delete_expired(
11661166
self, backup_dir, instance, options=[], old_binary=False):

tests/locking.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,52 @@ def test_backup_directory_name(self):
535535
# Clean after yourself
536536
self.del_test_dir(module_name, fname)
537537

538+
def test_empty_lock_file(self):
539+
"""
540+
https://github.com/postgrespro/pg_probackup/issues/308
541+
"""
542+
fname = self.id().split('.')[3]
543+
node = self.make_simple_node(
544+
base_dir=os.path.join(module_name, fname, 'node'),
545+
initdb_params=['--data-checksums'])
546+
547+
backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
548+
self.init_pb(backup_dir)
549+
self.add_instance(backup_dir, 'node', node)
550+
self.set_archiving(backup_dir, 'node', node)
551+
node.slow_start()
552+
553+
# Fill with data
554+
node.pgbench_init(scale=100)
555+
556+
# FULL
557+
backup_id = self.backup_node(backup_dir, 'node', node)
558+
559+
lockfile = os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid')
560+
with open(lockfile, "w+") as f:
561+
f.truncate()
562+
563+
out = self.validate_pb(backup_dir, 'node', backup_id)
564+
565+
self.assertIn(
566+
"Waiting 30 seconds on empty exclusive lock for backup", out)
567+
568+
# lockfile = os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid')
569+
# with open(lockfile, "w+") as f:
570+
# f.truncate()
571+
#
572+
# p1 = self.validate_pb(backup_dir, 'node', backup_id, asynchronous=True,
573+
# options=['--log-level-file=LOG', '--log-filename=validate.log'])
574+
# sleep(3)
575+
# p2 = self.delete_pb(backup_dir, 'node', backup_id, asynchronous=True,
576+
# options=['--log-level-file=LOG', '--log-filename=delete.log'])
577+
#
578+
# p1.wait()
579+
# p2.wait()
580+
581+
# Clean after yourself
582+
self.del_test_dir(module_name, fname)
583+
538584
# TODO:
539585
# test that concurrent validation and restore are not locking each other
540586
# check that quick exclusive lock, when taking RO-lock, is really quick

0 commit comments

Comments
 (0)