Skip to content

Commit 1418bae

Browse files
adam900710 authored and kdave committed
btrfs: qgroup: Move reserved data accounting from btrfs_delayed_ref_head to btrfs_qgroup_extent_record
[BUG] Btrfs/139 will fail with a high probability if the testing machine (VM) has only 2G RAM. As a result, the final write succeeds when it should fail with EDQUOT, and the fs will have quota exceeding the limit by 16K. The simplified reproducer will be: (needs a 2G RAM VM) $ mkfs.btrfs -f $dev $ mount $dev $mnt $ btrfs subv create $mnt/subv $ btrfs quota enable $mnt $ btrfs quota rescan -w $mnt $ btrfs qgroup limit -e 1G $mnt/subv $ for i in $(seq -w 1 8); do xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null; echo "file $i written" > /dev/kmsg; done $ sync $ btrfs qgroup show -pcre --raw $mnt The last pwrite will not trigger EDQUOT and the final 'qgroup show' will show something like: qgroupid rfer excl max_rfer max_excl parent child -------- ---- ---- -------- -------- ------ ----- 0/5 16384 16384 none none --- --- 0/256 1073758208 1073758208 none 1073741824 --- --- And 1073758208 is larger than 1073741824. [CAUSE] It's a bug in btrfs qgroup data reserved space management. For quota limit, we must ensure that: reserved (data + metadata) + rfer/excl <= limit Since rfer/excl is only updated at transaction commit time, reserved space needs special care. One important part of reserved space is data, and for a new data extent written to disk, we still need to keep holding the reserved space until the rfer/excl numbers get updated. Originally, when an ordered extent finishes, we migrate the reserved qgroup data space from the extent_io tree to the delayed ref head of the data extent, expecting that the delayed ref will only be cleaned up at commit transaction time. However, on a machine with little RAM, memory pressure can cause dirty pages to be flushed back to disk without committing a transaction. 
The related events will be something like: file 1 written btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840 btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096 btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344 btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192 btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344 cleanup_ref_head: num_bytes=54947840 cleanup_ref_head: num_bytes=5636096 cleanup_ref_head: num_bytes=569344 cleanup_ref_head: num_bytes=57344 cleanup_ref_head: num_bytes=8192 ^^^^^^^^^^^^^^^^ This will free qgroup data reserved space file 2 written ... file 8 written cleanup_ref_head: num_bytes=8192 ... btrfs_commit_transaction <<< the only transaction committed during the test When file 2 is written, we have already freed 128M reserved qgroup data space for ino 258. Thus later writes won't trigger EDQUOT. This allows us to write more data beyond the qgroup limit. In my 2G RAM VM, it could reach about 1.2G before hitting EDQUOT. [FIX] By moving reserved qgroup data space from btrfs_delayed_ref_head to btrfs_qgroup_extent_record, we can ensure that reserved qgroup data space won't be freed halfway, before the transaction is committed, thus fixing the problem. Fixes: f64d5ca ("btrfs: delayed_ref: Add new function to record reserved space into delayed ref") Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
1 parent 0ea8207 commit 1418bae

File tree

6 files changed

+30
-67
lines changed

6 files changed

+30
-67
lines changed

fs/btrfs/delayed-ref.c

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -602,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
602602
RB_CLEAR_NODE(&head_ref->href_node);
603603
head_ref->processing = 0;
604604
head_ref->total_ref_mod = count_mod;
605-
head_ref->qgroup_reserved = 0;
606-
head_ref->qgroup_ref_root = 0;
607605
spin_lock_init(&head_ref->lock);
608606
mutex_init(&head_ref->mutex);
609607

610608
if (qrecord) {
611609
if (ref_root && reserved) {
612-
head_ref->qgroup_ref_root = ref_root;
613-
head_ref->qgroup_reserved = reserved;
610+
qrecord->data_rsv = reserved;
611+
qrecord->data_rsv_refroot = ref_root;
614612
}
615-
616613
qrecord->bytenr = bytenr;
617614
qrecord->num_bytes = num_bytes;
618615
qrecord->old_roots = NULL;
@@ -651,10 +648,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
651648
existing = htree_insert(&delayed_refs->href_root,
652649
&head_ref->href_node);
653650
if (existing) {
654-
WARN_ON(qrecord && head_ref->qgroup_ref_root
655-
&& head_ref->qgroup_reserved
656-
&& existing->qgroup_ref_root
657-
&& existing->qgroup_reserved);
658651
update_existing_head_ref(trans, existing, head_ref,
659652
old_ref_mod);
660653
/*
@@ -770,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
770763

771764
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
772765
is_fstree(ref_root)) {
773-
record = kmalloc(sizeof(*record), GFP_NOFS);
766+
record = kzalloc(sizeof(*record), GFP_NOFS);
774767
if (!record) {
775768
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
776769
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
@@ -867,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
867860

868861
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
869862
is_fstree(ref_root)) {
870-
record = kmalloc(sizeof(*record), GFP_NOFS);
863+
record = kzalloc(sizeof(*record), GFP_NOFS);
871864
if (!record) {
872865
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
873866
kmem_cache_free(btrfs_delayed_ref_head_cachep,

fs/btrfs/delayed-ref.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -102,17 +102,6 @@ struct btrfs_delayed_ref_head {
102102
*/
103103
int ref_mod;
104104

105-
/*
106-
* For qgroup reserved space freeing.
107-
*
108-
* ref_root and reserved will be recorded after
109-
* BTRFS_ADD_DELAYED_EXTENT is called.
110-
* And will be used to free reserved qgroup space at
111-
* run_delayed_refs() time.
112-
*/
113-
u64 qgroup_ref_root;
114-
u64 qgroup_reserved;
115-
116105
/*
117106
* when a new extent is allocated, it is just reserved in memory
118107
* The actual extent isn't inserted into the extent allocation tree

fs/btrfs/extent-tree.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2492,9 +2492,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
24922492
}
24932493
}
24942494

2495-
/* Also free its reserved qgroup space */
2496-
btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2497-
head->qgroup_reserved);
24982495
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
24992496
}
25002497

fs/btrfs/qgroup.c

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1546,12 +1546,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
15461546
parent_node = *p;
15471547
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
15481548
node);
1549-
if (bytenr < entry->bytenr)
1549+
if (bytenr < entry->bytenr) {
15501550
p = &(*p)->rb_left;
1551-
else if (bytenr > entry->bytenr)
1551+
} else if (bytenr > entry->bytenr) {
15521552
p = &(*p)->rb_right;
1553-
else
1553+
} else {
1554+
if (record->data_rsv && !entry->data_rsv) {
1555+
entry->data_rsv = record->data_rsv;
1556+
entry->data_rsv_refroot =
1557+
record->data_rsv_refroot;
1558+
}
15541559
return 1;
1560+
}
15551561
}
15561562

15571563
rb_link_node(&record->node, parent_node, p);
@@ -1597,7 +1603,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
15971603
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
15981604
|| bytenr == 0 || num_bytes == 0)
15991605
return 0;
1600-
record = kmalloc(sizeof(*record), gfp_flag);
1606+
record = kzalloc(sizeof(*record), gfp_flag);
16011607
if (!record)
16021608
return -ENOMEM;
16031609

@@ -2517,6 +2523,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
25172523
goto cleanup;
25182524
}
25192525

2526+
/* Free the reserved data space */
2527+
btrfs_qgroup_free_refroot(fs_info,
2528+
record->data_rsv_refroot,
2529+
record->data_rsv,
2530+
BTRFS_QGROUP_RSV_DATA);
25202531
/*
25212532
* Use SEQ_LAST as time_seq to do special search, which
25222533
* doesn't lock tree or delayed_refs and search current

fs/btrfs/qgroup.h

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,17 @@ struct btrfs_qgroup_extent_record {
107107
struct rb_node node;
108108
u64 bytenr;
109109
u64 num_bytes;
110+
111+
/*
112+
* For qgroup reserved data space freeing.
113+
*
114+
* @data_rsv_refroot and @data_rsv will be recorded after
115+
* BTRFS_ADD_DELAYED_EXTENT is called.
116+
* And will be used to free reserved qgroup space at
117+
* transaction commit time.
118+
*/
119+
u32 data_rsv; /* reserved data space needs to be freed */
120+
u64 data_rsv_refroot; /* which root the reserved data belongs to */
110121
struct ulist *old_roots;
111122
};
112123

@@ -326,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
326337
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
327338
u64 ref_root, u64 num_bytes,
328339
enum btrfs_qgroup_rsv_type type);
329-
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
330-
u64 ref_root, u64 num_bytes)
331-
{
332-
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
333-
return;
334-
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
335-
btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
336-
BTRFS_QGROUP_RSV_DATA);
337-
}
338340

339341
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
340342
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,

include/trace/events/btrfs.h

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,35 +1513,6 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
15131513
TP_ARGS(inode, start, len, reserved, op)
15141514
);
15151515

1516-
DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
1517-
1518-
TP_PROTO(const struct btrfs_fs_info *fs_info,
1519-
u64 ref_root, u64 reserved),
1520-
1521-
TP_ARGS(fs_info, ref_root, reserved),
1522-
1523-
TP_STRUCT__entry_btrfs(
1524-
__field( u64, ref_root )
1525-
__field( u64, reserved )
1526-
),
1527-
1528-
TP_fast_assign_btrfs(fs_info,
1529-
__entry->ref_root = ref_root;
1530-
__entry->reserved = reserved;
1531-
),
1532-
1533-
TP_printk_btrfs("root=%llu reserved=%llu op=free",
1534-
__entry->ref_root, __entry->reserved)
1535-
);
1536-
1537-
DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
1538-
1539-
TP_PROTO(const struct btrfs_fs_info *fs_info,
1540-
u64 ref_root, u64 reserved),
1541-
1542-
TP_ARGS(fs_info, ref_root, reserved)
1543-
);
1544-
15451516
DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
15461517
TP_PROTO(const struct btrfs_fs_info *fs_info,
15471518
const struct btrfs_qgroup_extent_record *rec),

0 commit comments

Comments
 (0)