Skip to content

Commit 27dd438

Browse files
Lukas Czerner authored and tytso committed
ext4: introduce reserved space
Currently in ENOSPC condition when writing into unwritten space, or punching a hole, we might need to split the extent and grow extent tree. However since we can not allocate any new metadata blocks we'll have to zero out unwritten part of extent or punched out part of extent, or in the worst case return ENOSPC even though the user actually does not allocate any space. Also in delalloc path we do reserve metadata and data blocks for the time we're going to write out, however metadata block reservation is very tricky especially since we expect that logical connectivity implies physical connectivity, however that might not be the case and hence we might end up allocating more metadata blocks than previously reserved. So in future, metadata reservation checks should be removed since we can not assure that we do not under reserve. And this is where reserved space comes into the picture. When mounting the file system we slice off a little bit of the file system space (2% or 4096 clusters, whichever is smaller) which can be then used for the cases mentioned above to prevent costly zeroout, or unexpected ENOSPC. The number of reserved clusters can be set via sysfs, however it can never be bigger than number of free clusters in the file system. Note that this patch fixes the failure of xfstest 274 as expected. Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
1 parent f45a5ef commit 27dd438

File tree

6 files changed

+141
-23
lines changed

6 files changed

+141
-23
lines changed

Documentation/filesystems/ext4.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,17 @@ Files in /sys/fs/ext4/<devname>
494494
session_write_kbytes This file is read-only and shows the number of
495495
kilobytes of data that have been written to this
496496
filesystem since it was mounted.
497+
498+
reserved_clusters This is a RW file and contains the number of reserved
499+
clusters in the file system which will be used
500+
in the specific situations to avoid costly
501+
zeroout, unexpected ENOSPC, or possible data
502+
loss. The default is 2% or 4096 clusters,
503+
whichever is smaller and this can be changed
504+
however it can never exceed number of clusters
505+
in the file system. If there is not enough space
506+
for the reserved space when mounting the file system,
507+
mount will _not_ fail.
497508
..............................................................................
498509

499510
Ioctls

fs/ext4/balloc.c

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -499,36 +499,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
499499
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
500500
s64 nclusters, unsigned int flags)
501501
{
502-
s64 free_clusters, dirty_clusters, root_clusters;
502+
s64 free_clusters, dirty_clusters, rsv, resv_clusters;
503503
struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
504504
struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
505505

506506
free_clusters = percpu_counter_read_positive(fcc);
507507
dirty_clusters = percpu_counter_read_positive(dcc);
508+
resv_clusters = atomic64_read(&sbi->s_resv_clusters);
508509

509510
/*
510511
* r_blocks_count should always be multiple of the cluster ratio so
511512
* we are safe to do a plain bit shift only.
512513
*/
513-
root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
514+
rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
515+
resv_clusters;
514516

515-
if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
517+
if (free_clusters - (nclusters + rsv + dirty_clusters) <
516518
EXT4_FREECLUSTERS_WATERMARK) {
517519
free_clusters = percpu_counter_sum_positive(fcc);
518520
dirty_clusters = percpu_counter_sum_positive(dcc);
519521
}
520522
/* Check whether we have space after accounting for current
521523
* dirty clusters & root reserved clusters.
522524
*/
523-
if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
525+
if (free_clusters >= (rsv + nclusters + dirty_clusters))
524526
return 1;
525527

526528
/* Hm, nope. Are (enough) root reserved clusters available? */
527529
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
528530
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
529531
capable(CAP_SYS_RESOURCE) ||
530-
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
532+
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
531533

534+
if (free_clusters >= (nclusters + dirty_clusters +
535+
resv_clusters))
536+
return 1;
537+
}
538+
/* No free blocks. Let's see if we can dip into reserved pool */
539+
if (flags & EXT4_MB_USE_RESERVED) {
532540
if (free_clusters >= (nclusters + dirty_clusters))
533541
return 1;
534542
}

fs/ext4/ext4.h

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
121121
#define EXT4_MB_STREAM_ALLOC 0x0800
122122
/* Use reserved root blocks if needed */
123123
#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
124+
/* Use blocks from reserved pool */
125+
#define EXT4_MB_USE_RESERVED 0x2000
124126

125127
struct ext4_allocation_request {
126128
/* target inode for block we're allocating */
@@ -557,9 +559,8 @@ enum {
557559
#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
558560
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
559561
EXT4_GET_BLOCKS_CREATE)
560-
/* Caller is from the delayed allocation writeout path,
561-
so set the magic i_delalloc_reserve_flag after taking the
562-
inode allocation semaphore for */
562+
/* Caller is from the delayed allocation writeout path
563+
* finally doing the actual allocation of delayed blocks */
563564
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
564565
/* caller is from the direct IO path, request to creation of an
565566
uninitialized extents if not allocated, split the uninitialized
@@ -571,8 +572,9 @@ enum {
571572
/* Convert extent to initialized after IO complete */
572573
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
573574
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
574-
/* Punch out blocks of an extent */
575-
#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
575+
/* Eventual metadata allocation (due to growing extent tree)
576+
* should not fail, so try to use reserved blocks for that.*/
577+
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
576578
/* Don't normalize allocation size (used for fallocate) */
577579
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
578580
/* Request will not result in inode size update (used for fallocate) */
@@ -1188,6 +1190,7 @@ struct ext4_sb_info {
11881190
unsigned int s_mount_flags;
11891191
unsigned int s_def_mount_opt;
11901192
ext4_fsblk_t s_sb_block;
1193+
atomic64_t s_resv_clusters;
11911194
kuid_t s_resuid;
11921195
kgid_t s_resgid;
11931196
unsigned short s_mount_state;

fs/ext4/extents.c

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1942,8 +1942,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
19421942
* There is no free space in the found leaf.
19431943
* We're gonna add a new leaf in the tree.
19441944
*/
1945-
if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1946-
flags = EXT4_MB_USE_ROOT_BLOCKS;
1945+
if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
1946+
flags = EXT4_MB_USE_RESERVED;
19471947
err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
19481948
if (err)
19491949
goto cleanup;
@@ -2729,12 +2729,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
27292729

27302730
/*
27312731
* Split the extent in two so that 'end' is the last
2732-
* block in the first new extent
2732+
* block in the first new extent. Also we should not
2733+
* fail removing space due to ENOSPC so try to use
2734+
* reserved block if that happens.
27332735
*/
27342736
err = ext4_split_extent_at(handle, inode, path,
2735-
end + 1, split_flag,
2736-
EXT4_GET_BLOCKS_PRE_IO |
2737-
EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
2737+
end + 1, split_flag,
2738+
EXT4_GET_BLOCKS_PRE_IO |
2739+
EXT4_GET_BLOCKS_METADATA_NOFAIL);
27382740

27392741
if (err < 0)
27402742
goto out;
@@ -3209,7 +3211,8 @@ static int ext4_split_extent(handle_t *handle,
32093211
static int ext4_ext_convert_to_initialized(handle_t *handle,
32103212
struct inode *inode,
32113213
struct ext4_map_blocks *map,
3212-
struct ext4_ext_path *path)
3214+
struct ext4_ext_path *path,
3215+
int flags)
32133216
{
32143217
struct ext4_sb_info *sbi;
32153218
struct ext4_extent_header *eh;
@@ -3435,7 +3438,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
34353438
}
34363439

34373440
allocated = ext4_split_extent(handle, inode, path,
3438-
&split_map, split_flag, 0);
3441+
&split_map, split_flag, flags);
34393442
if (allocated < 0)
34403443
err = allocated;
34413444

@@ -3755,6 +3758,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
37553758
flags, allocated);
37563759
ext4_ext_show_leaf(inode, path);
37573760

3761+
/*
3762+
* When writing into uninitialized space, we should not fail to
3763+
* allocate metadata blocks for the new extent block if needed.
3764+
*/
3765+
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
3766+
37583767
trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
37593768
allocated, newblock);
37603769

@@ -3818,7 +3827,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
38183827
}
38193828

38203829
/* buffered write, writepage time, convert*/
3821-
ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3830+
ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
38223831
if (ret >= 0)
38233832
ext4_update_inode_fsync_trans(handle, inode, 1);
38243833
out:

fs/ext4/inode.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1688,12 +1688,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
16881688
*/
16891689
map.m_lblk = next;
16901690
map.m_len = max_blocks;
1691-
get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
1691+
/*
1692+
* We're in delalloc path and it is possible that we're going to
1693+
* need more metadata blocks than previously reserved. However
1694+
* we must not fail because we're in writeback and there is
1695+
* nothing we can do about it so it might result in data loss.
1696+
* So use reserved blocks to allocate metadata if possible.
1697+
*/
1698+
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
1699+
EXT4_GET_BLOCKS_METADATA_NOFAIL;
16921700
if (ext4_should_dioread_nolock(mpd->inode))
16931701
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
16941702
if (mpd->b_state & (1 << BH_Delay))
16951703
get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
16961704

1705+
16971706
blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
16981707
if (blks < 0) {
16991708
struct super_block *sb = mpd->inode->i_sb;

fs/ext4/super.c

Lines changed: 81 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
8181
static void ext4_destroy_lazyinit_thread(void);
8282
static void ext4_unregister_li_request(struct super_block *sb);
8383
static void ext4_clear_request_list(void);
84+
static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
8485

8586
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
8687
static struct file_system_type ext2_fs_type = {
@@ -2382,6 +2383,17 @@ struct ext4_attr {
23822383
int offset;
23832384
};
23842385

2386+
static int parse_strtoull(const char *buf,
2387+
unsigned long long max, unsigned long long *value)
2388+
{
2389+
int ret;
2390+
2391+
ret = kstrtoull(skip_spaces(buf), 0, value);
2392+
if (!ret && *value > max)
2393+
ret = -EINVAL;
2394+
return ret;
2395+
}
2396+
23852397
static int parse_strtoul(const char *buf,
23862398
unsigned long max, unsigned long *value)
23872399
{
@@ -2466,6 +2478,27 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
24662478
return count;
24672479
}
24682480

2481+
static ssize_t reserved_clusters_show(struct ext4_attr *a,
2482+
struct ext4_sb_info *sbi, char *buf)
2483+
{
2484+
return snprintf(buf, PAGE_SIZE, "%llu\n",
2485+
(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
2486+
}
2487+
2488+
static ssize_t reserved_clusters_store(struct ext4_attr *a,
2489+
struct ext4_sb_info *sbi,
2490+
const char *buf, size_t count)
2491+
{
2492+
unsigned long long val;
2493+
int ret;
2494+
2495+
if (parse_strtoull(buf, -1ULL, &val))
2496+
return -EINVAL;
2497+
ret = ext4_reserve_clusters(sbi, val);
2498+
2499+
return ret ? ret : count;
2500+
}
2501+
24692502
static ssize_t trigger_test_error(struct ext4_attr *a,
24702503
struct ext4_sb_info *sbi,
24712504
const char *buf, size_t count)
@@ -2503,6 +2536,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
25032536
EXT4_RO_ATTR(delayed_allocation_blocks);
25042537
EXT4_RO_ATTR(session_write_kbytes);
25052538
EXT4_RO_ATTR(lifetime_write_kbytes);
2539+
EXT4_RW_ATTR(reserved_clusters);
25062540
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
25072541
inode_readahead_blks_store, s_inode_readahead_blks);
25082542
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2520,6 +2554,7 @@ static struct attribute *ext4_attrs[] = {
25202554
ATTR_LIST(delayed_allocation_blocks),
25212555
ATTR_LIST(session_write_kbytes),
25222556
ATTR_LIST(lifetime_write_kbytes),
2557+
ATTR_LIST(reserved_clusters),
25232558
ATTR_LIST(inode_readahead_blks),
25242559
ATTR_LIST(inode_goal),
25252560
ATTR_LIST(mb_stats),
@@ -3195,6 +3230,40 @@ int ext4_calculate_overhead(struct super_block *sb)
31953230
return 0;
31963231
}
31973232

3233+
3234+
static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
3235+
{
3236+
ext4_fsblk_t resv_clusters;
3237+
3238+
/*
3239+
* By default we reserve 2% or 4096 clusters, whichever is smaller.
3240+
* This should cover the situations where we can not afford to run
3241+
* out of space like for example punch hole, or converting
3242+
* uninitialized extents in delalloc path. In most cases such
3243+
* allocation would require 1, or 2 blocks, higher numbers are
3244+
* very rare.
3245+
*/
3246+
resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
3247+
3248+
do_div(resv_clusters, 50);
3249+
resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
3250+
3251+
return resv_clusters;
3252+
}
3253+
3254+
3255+
static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
3256+
{
3257+
ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
3258+
sbi->s_cluster_bits;
3259+
3260+
if (count >= clusters)
3261+
return -EINVAL;
3262+
3263+
atomic64_set(&sbi->s_resv_clusters, count);
3264+
return 0;
3265+
}
3266+
31983267
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
31993268
{
32003269
char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -3918,6 +3987,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
39183987
"available");
39193988
}
39203989

3990+
err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
3991+
if (err) {
3992+
ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
3993+
"reserved pool", ext4_calculate_resv_clusters(sbi));
3994+
goto failed_mount4a;
3995+
}
3996+
39213997
err = ext4_setup_system_zone(sb);
39223998
if (err) {
39233999
ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -4750,9 +4826,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
47504826
struct super_block *sb = dentry->d_sb;
47514827
struct ext4_sb_info *sbi = EXT4_SB(sb);
47524828
struct ext4_super_block *es = sbi->s_es;
4753-
ext4_fsblk_t overhead = 0;
4829+
ext4_fsblk_t overhead = 0, resv_blocks;
47544830
u64 fsid;
47554831
s64 bfree;
4832+
resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
47564833

47574834
if (!test_opt(sb, MINIX_DF))
47584835
overhead = sbi->s_overhead;
@@ -4764,8 +4841,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
47644841
percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
47654842
/* prevent underflow in case only little free space is available */
47664843
buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
4767-
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4768-
if (buf->f_bfree < ext4_r_blocks_count(es))
4844+
buf->f_bavail = buf->f_bfree -
4845+
(ext4_r_blocks_count(es) + resv_blocks);
4846+
if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
47694847
buf->f_bavail = 0;
47704848
buf->f_files = le32_to_cpu(es->s_inodes_count);
47714849
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);

0 commit comments

Comments
 (0)