Skip to content

Commit 12fcfd2

Browse files
committed
Btrfs: tree logging unlink/rename fixes
The tree logging code allows individual files or directories to be logged without including operations on other files and directories in the FS. It tries to commit the minimal set of changes to disk in order to fsync the single file or directory that was sent to fsync or O_SYNC. The tree logging code was allowing files and directories to be unlinked if they were part of a rename operation where only one directory in the rename was in the fsync log. This patch adds a few new rules to the tree logging. 1) on rename or unlink, if the inode being unlinked isn't in the fsync log, we must force a full commit before doing an fsync of the directory where the unlink was done. The commit isn't done during the unlink, but it is forced the next time we try to log the parent directory. Solution: record transid of last unlink/rename per directory when the directory wasn't already logged. For renames this is only done when renaming to a different directory. mkdir foo/some_dir normal commit rename foo/some_dir foo2/some_dir mkdir foo/some_dir fsync foo/some_dir/some_file The fsync above will unlink the original some_dir without recording it in its new location (foo2). After a crash, some_dir will be gone unless the fsync of some_file forces a full commit 2) we must log any new names for any file or dir that is in the fsync log. This way we make sure not to lose files that are unlinked during the same transaction. 2a) we must log any new names for any file or dir during rename when the directory they are being removed from was logged. 2a is actually the more important variant. Without the extra logging a crash might unlink the old name without recreating the new one 3) after a crash, we must go through any directories with a link count of zero and redo the rm -rf mkdir f1/foo normal commit rm -rf f1/foo fsync(f1) The directory f1 was fully removed from the FS, but fsync was never called on f1, only its parent dir. After a crash the rm -rf must be replayed. This must be able to recurse down the entire directory tree. The inode link count fixup code takes care of the ugly details. Signed-off-by: Chris Mason <chris.mason@oracle.com>
1 parent a74ac32 commit 12fcfd2

File tree

7 files changed

+372
-98
lines changed

7 files changed

+372
-98
lines changed

fs/btrfs/btrfs_inode.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,6 @@ struct btrfs_inode {
8686
*/
8787
u64 logged_trans;
8888

89-
/*
90-
* trans that last made a change that should be fully fsync'd. This
91-
* gets reset to zero each time the inode is logged
92-
*/
93-
u64 log_dirty_trans;
94-
9589
/* total number of bytes pending delalloc, used by stat to calc the
9690
* real block usage of the file
9791
*/
@@ -121,6 +115,13 @@ struct btrfs_inode {
121115
/* the start of block group preferred for allocations. */
122116
u64 block_group;
123117

118+
/* the fsync log has some corner cases that mean we have to check
119+
* directories to see if any unlinks have been done before
120+
* the directory was logged. See tree-log.c for all the
121+
* details
122+
*/
123+
u64 last_unlink_trans;
124+
124125
struct inode vfs_inode;
125126
};
126127

fs/btrfs/ctree.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -695,7 +695,12 @@ struct btrfs_fs_info {
695695

696696
u64 generation;
697697
u64 last_trans_committed;
698-
u64 last_trans_new_blockgroup;
698+
699+
/*
700+
* this is updated to the current trans every time a full commit
701+
* is required instead of the faster short fsync log commits
702+
*/
703+
u64 last_trans_log_full_commit;
699704
u64 open_ioctl_trans;
700705
unsigned long mount_opt;
701706
u64 max_extent;

fs/btrfs/extent-tree.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5897,7 +5897,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
58975897

58985898
extent_root = root->fs_info->extent_root;
58995899

5900-
root->fs_info->last_trans_new_blockgroup = trans->transid;
5900+
root->fs_info->last_trans_log_full_commit = trans->transid;
59015901

59025902
cache = kzalloc(sizeof(*cache), GFP_NOFS);
59035903
if (!cache)

fs/btrfs/file.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1173,8 +1173,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
11731173
ret = btrfs_log_dentry_safe(trans, root,
11741174
file->f_dentry);
11751175
if (ret == 0) {
1176-
btrfs_sync_log(trans, root);
1177-
btrfs_end_transaction(trans, root);
1176+
ret = btrfs_sync_log(trans, root);
1177+
if (ret == 0)
1178+
btrfs_end_transaction(trans, root);
1179+
else
1180+
btrfs_commit_transaction(trans, root);
11781181
} else {
11791182
btrfs_commit_transaction(trans, root);
11801183
}
@@ -1266,8 +1269,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
12661269
if (ret > 0) {
12671270
ret = btrfs_commit_transaction(trans, root);
12681271
} else {
1269-
btrfs_sync_log(trans, root);
1270-
ret = btrfs_end_transaction(trans, root);
1272+
ret = btrfs_sync_log(trans, root);
1273+
if (ret == 0)
1274+
ret = btrfs_end_transaction(trans, root);
1275+
else
1276+
ret = btrfs_commit_transaction(trans, root);
12711277
}
12721278
mutex_lock(&dentry->d_inode->i_mutex);
12731279
out:

fs/btrfs/inode.c

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2246,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
22462246
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
22472247
inode, dir->i_ino);
22482248
BUG_ON(ret != 0 && ret != -ENOENT);
2249-
if (ret != -ENOENT)
2250-
BTRFS_I(dir)->log_dirty_trans = trans->transid;
22512249

22522250
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
22532251
dir, index);
@@ -2280,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
22802278
trans = btrfs_start_transaction(root, 1);
22812279

22822280
btrfs_set_trans_block_group(trans, dir);
2281+
2282+
btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2283+
22832284
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
22842285
dentry->d_name.name, dentry->d_name.len);
22852286

@@ -3042,7 +3043,7 @@ static noinline void init_btrfs_i(struct inode *inode)
30423043
bi->disk_i_size = 0;
30433044
bi->flags = 0;
30443045
bi->index_cnt = (u64)-1;
3045-
bi->log_dirty_trans = 0;
3046+
bi->last_unlink_trans = 0;
30463047
extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
30473048
extent_io_tree_init(&BTRFS_I(inode)->io_tree,
30483049
inode->i_mapping, GFP_NOFS);
@@ -3786,6 +3787,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
37863787
drop_inode = 1;
37873788

37883789
nr = trans->blocks_used;
3790+
3791+
btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
37893792
btrfs_end_transaction_throttle(trans, root);
37903793
fail:
37913794
if (drop_inode) {
@@ -4666,13 +4669,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
46664669

46674670
trans = btrfs_start_transaction(root, 1);
46684671

4672+
/*
4673+
* this is an ugly little race, but the rename is required to make
4674+
* sure that if we crash, the inode is either at the old name
4675+
* or the new one. pinning the log transaction lets us make sure
4676+
* we don't allow a log commit to come in after we unlink the
4677+
* name but before we add the new name back in.
4678+
*/
4679+
btrfs_pin_log_trans(root);
4680+
46694681
btrfs_set_trans_block_group(trans, new_dir);
46704682

46714683
btrfs_inc_nlink(old_dentry->d_inode);
46724684
old_dir->i_ctime = old_dir->i_mtime = ctime;
46734685
new_dir->i_ctime = new_dir->i_mtime = ctime;
46744686
old_inode->i_ctime = ctime;
46754687

4688+
if (old_dentry->d_parent != new_dentry->d_parent)
4689+
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
4690+
46764691
ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
46774692
old_dentry->d_name.name,
46784693
old_dentry->d_name.len);
@@ -4704,7 +4719,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
47044719
if (ret)
47054720
goto out_fail;
47064721

4722+
btrfs_log_new_name(trans, old_inode, old_dir,
4723+
new_dentry->d_parent);
47074724
out_fail:
4725+
4726+
/* this btrfs_end_log_trans just allows the current
4727+
* log-sub transaction to complete
4728+
*/
4729+
btrfs_end_log_trans(root);
47084730
btrfs_end_transaction_throttle(trans, root);
47094731
out_unlock:
47104732
return ret;

0 commit comments

Comments
 (0)