Commit b8a8684

Lukas Czerner authored and tytso committed
ext4: Introduce FALLOC_FL_ZERO_RANGE flag for fallocate
Introduce the new FALLOC_FL_ZERO_RANGE flag for fallocate. This has the same functionality as the xfs ioctl XFS_IOC_ZERO_RANGE. It can be used to convert a range of a file to zeros, preferably without issuing data IO. Blocks should be preallocated for the regions that span holes in the file, and the entire range is preferably converted to unwritten extents.

This can also be used to preallocate blocks past EOF in the same way as with fallocate. As with regular fallocate, the FALLOC_FL_KEEP_SIZE flag causes the inode size to remain the same.

Also add appropriate tracepoints.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
1 parent 0e8b687 commit b8a8684
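
For context, the new mode is driven from userspace through the existing fallocate(2) system call. Below is a minimal caller sketch (not part of this commit); it assumes a kernel and headers that expose FALLOC_FL_ZERO_RANGE via <linux/falloc.h>, and the file name, offset, and length are illustrative only.

/*
 * Hypothetical userspace sketch: zero out a byte range in place while
 * keeping the file size unchanged. Assumes <linux/falloc.h> defines
 * FALLOC_FL_ZERO_RANGE (kernels carrying this series).
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);	/* illustrative path */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * Convert 1 MiB starting at offset 4096 to zeros, preferably by
	 * flipping the extents to unwritten rather than writing data.
	 */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
		      4096, 1024 * 1024) < 0)
		perror("fallocate(FALLOC_FL_ZERO_RANGE)");

	close(fd);
	return 0;
}

On filesystems or files that cannot support the mode, the call fails with EOPNOTSUPP, as ext4 does in this patch for indirect-mapped (non-extent) files.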

File tree

4 files changed: +307 −53 lines changed

fs/ext4/ext4.h

Lines changed: 2 additions & 0 deletions
@@ -568,6 +568,8 @@ enum {
 #define EXT4_GET_BLOCKS_NO_LOCK		0x0100
 	/* Do not put hole in extent cache */
 #define EXT4_GET_BLOCKS_NO_PUT_HOLE	0x0200
+	/* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0400
 
 /*
  * The bit position of these flags must not overlap with any of the

fs/ext4/extents.c

Lines changed: 259 additions & 14 deletions
@@ -3602,6 +3602,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
  * b> Splits in two extents: Write is happening at either end of the extent
  * c> Splits in three extents: Somone is writing in middle of the extent
  *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
  * One of more index blocks maybe needed if the extent tree grow after
  * the uninitialized extent split. To prevent ENOSPC occur at the IO
  * complete, we need to split the uninitialized extent before DIO submit
@@ -3612,7 +3614,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
  *
  * Returns the size of uninitialized extent to be written on success.
  */
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
 			struct inode *inode,
 			struct ext4_map_blocks *map,
 			struct ext4_ext_path *path,
@@ -3624,9 +3626,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	unsigned int ee_len;
 	int split_flag = 0, depth;
 
-	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
-		"block %llu, max_blocks %u\n", inode->i_ino,
-		(unsigned long long)map->m_lblk, map->m_len);
+	ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+		  __func__, inode->i_ino,
+		  (unsigned long long)map->m_lblk, map->m_len);
 
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
@@ -3641,14 +3643,73 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
 
-	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
-	split_flag |= EXT4_EXT_MARK_UNINIT2;
-	if (flags & EXT4_GET_BLOCKS_CONVERT)
-		split_flag |= EXT4_EXT_DATA_VALID2;
+	/* Convert to unwritten */
+	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+		split_flag |= EXT4_EXT_DATA_VALID1;
+	/* Convert to initialized */
+	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+		split_flag |= ee_block + ee_len <= eof_block ?
+			      EXT4_EXT_MAY_ZEROOUT : 0;
+		split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
+	}
 	flags |= EXT4_GET_BLOCKS_PRE_IO;
 	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
 }
 
+static int ext4_convert_initialized_extents(handle_t *handle,
+					    struct inode *inode,
+					    struct ext4_map_blocks *map,
+					    struct ext4_ext_path *path)
+{
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block;
+	unsigned int ee_len;
+	int depth;
+	int err = 0;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+
+	ext_debug("%s: inode %lu, logical"
+		"block %llu, max_blocks %u\n", __func__, inode->i_ino,
+		(unsigned long long)ee_block, ee_len);
+
+	if (ee_block != map->m_lblk || ee_len > map->m_len) {
+		err = ext4_split_convert_extents(handle, inode, map, path,
+				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+		if (err < 0)
+			goto out;
+		ext4_ext_drop_refs(path);
+		path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out;
+		}
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+	}
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+	/* first mark the extent as uninitialized */
+	ext4_ext_mark_uninitialized(ex);
+
+	/* note: ext4_ext_correct_indexes() isn't needed here because
+	 * borders are not changed
+	 */
+	ext4_ext_try_to_merge(handle, inode, path, ex);
+
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+	ext4_ext_show_leaf(inode, path);
+	return err;
+}
+
+
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 						struct inode *inode,
 						struct ext4_map_blocks *map,
@@ -3682,8 +3743,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 		  inode->i_ino, (unsigned long long)ee_block, ee_len,
 		  (unsigned long long)map->m_lblk, map->m_len);
 #endif
-	err = ext4_split_unwritten_extents(handle, inode, map, path,
-					   EXT4_GET_BLOCKS_CONVERT);
+	err = ext4_split_convert_extents(handle, inode, map, path,
+					 EXT4_GET_BLOCKS_CONVERT);
 	if (err < 0)
 		goto out;
 	ext4_ext_drop_refs(path);
@@ -3883,6 +3944,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 	return allocated_clusters;
 }
 
+static int
+ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
+					struct ext4_map_blocks *map,
+					struct ext4_ext_path *path, int flags,
+					unsigned int allocated, ext4_fsblk_t newblock)
+{
+	int ret = 0;
+	int err = 0;
+
+	/*
+	 * Make sure that the extent is no bigger than we support with
+	 * uninitialized extent
+	 */
+	if (map->m_len > EXT_UNINIT_MAX_LEN)
+		map->m_len = EXT_UNINIT_MAX_LEN / 2;
+
+	ret = ext4_convert_initialized_extents(handle, inode, map,
+						path);
+	if (ret >= 0) {
+		ext4_update_inode_fsync_trans(handle, inode, 1);
+		err = check_eofblocks_fl(handle, inode, map->m_lblk,
+					 path, map->m_len);
+	} else
+		err = ret;
+	map->m_flags |= EXT4_MAP_UNWRITTEN;
+	if (allocated > map->m_len)
+		allocated = map->m_len;
+	map->m_len = allocated;
+
+	return err ? err : allocated;
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 					struct ext4_map_blocks *map,
@@ -3910,8 +4003,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
 	/* get_block() before submit the IO, split the extent */
 	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-		ret = ext4_split_unwritten_extents(handle, inode, map,
-						   path, flags);
+		ret = ext4_split_convert_extents(handle, inode, map,
+					 path, flags | EXT4_GET_BLOCKS_CONVERT);
 		if (ret <= 0)
 			goto out;
 		/*
@@ -4199,6 +4292,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
 		unsigned short ee_len;
 
+
 		/*
 		 * Uninitialized extents are treated as holes, except that
 		 * we split out initialized portions during a write.
@@ -4215,7 +4309,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
 			  ee_block, ee_len, newblock);
 
-		if (!ext4_ext_is_uninitialized(ex))
+		/*
+		 * If the extent is initialized check whether the
+		 * caller wants to convert it to unwritten.
+		 */
+		if ((!ext4_ext_is_uninitialized(ex)) &&
+		    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+			allocated = ext4_ext_convert_initialized_extent(
+					handle, inode, map, path, flags,
+					allocated, newblock);
+			goto out2;
+		} else if (!ext4_ext_is_uninitialized(ex))
 			goto out;
 
 		ret = ext4_ext_handle_uninitialized_extents(
@@ -4604,6 +4708,144 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 	return ret > 0 ? ret2 : ret;
 }
 
+static long ext4_zero_range(struct file *file, loff_t offset,
+			    loff_t len, int mode)
+{
+	struct inode *inode = file_inode(file);
+	handle_t *handle = NULL;
+	unsigned int max_blocks;
+	loff_t new_size = 0;
+	int ret = 0;
+	int flags;
+	int partial;
+	loff_t start, end;
+	ext4_lblk_t lblk;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned int blkbits = inode->i_blkbits;
+
+	trace_ext4_zero_range(inode, offset, len, mode);
+
+	/*
+	 * Write out all dirty pages to avoid race conditions
+	 * Then release them.
+	 */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		ret = filemap_write_and_wait_range(mapping, offset,
+						   offset + len - 1);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Round up offset. This is not fallocate, we neet to zero out
+	 * blocks, so convert interior block aligned part of the range to
+	 * unwritten and possibly manually zero out unaligned parts of the
+	 * range.
+	 */
+	start = round_up(offset, 1 << blkbits);
+	end = round_down((offset + len), 1 << blkbits);
+
+	if (start < offset || end > offset + len)
+		return -EINVAL;
+	partial = (offset + len) & ((1 << blkbits) - 1);
+
+	lblk = start >> blkbits;
+	max_blocks = (end >> blkbits);
+	if (max_blocks < lblk)
+		max_blocks = 0;
+	else
+		max_blocks -= lblk;
+
+	flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+		EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+	if (mode & FALLOC_FL_KEEP_SIZE)
+		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * Indirect files do not support unwritten extnets
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		ret = -EOPNOTSUPP;
+		goto out_mutex;
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	     offset + len > i_size_read(inode)) {
+		new_size = offset + len;
+		ret = inode_newsize_ok(inode, new_size);
+		if (ret)
+			goto out_mutex;
+		/*
+		 * If we have a partial block after EOF we have to allocate
+		 * the entire block.
+		 */
+		if (partial)
+			max_blocks += 1;
+	}
+
+	if (max_blocks > 0) {
+
+		/* Now release the pages and zero block aligned part of pages*/
+		truncate_pagecache_range(inode, start, end - 1);
+
+		/* Wait all existing dio workers, newcomers will block on i_mutex */
+		ext4_inode_block_unlocked_dio(inode);
+		inode_dio_wait(inode);
+
+		/*
+		 * Remove entire range from the extent status tree.
+		 */
+		ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+		if (ret)
+			goto out_dio;
+
+		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+					     mode);
+		if (ret)
+			goto out_dio;
+	}
+
+	handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		ext4_std_error(inode->i_sb, ret);
+		goto out_dio;
+	}
+
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+	if (!ret && new_size) {
+		if (new_size > i_size_read(inode))
+			i_size_write(inode, new_size);
+		if (new_size > EXT4_I(inode)->i_disksize)
+			ext4_update_i_disksize(inode, new_size);
+	} else if (!ret && !new_size) {
+		/*
+		 * Mark that we allocate beyond EOF so the subsequent truncate
+		 * can proceed even if the new size is the same as i_size.
+		 */
+		if ((offset + len) > i_size_read(inode))
+			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+	}
+
+	ext4_mark_inode_dirty(handle, inode);
+
+	/* Zero out partial block at the edges of the range */
+	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+	if (file->f_flags & O_SYNC)
+		ext4_handle_sync(handle);
+
+	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
 /*
  * preallocate space for a file. This implements ext4's fallocate file
  * operation, which gets called from sys_fallocate system call.
@@ -4625,7 +4867,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 	/* Return error if mode is not supported */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-		     FALLOC_FL_COLLAPSE_RANGE))
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4645,6 +4887,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
+	if (mode & FALLOC_FL_ZERO_RANGE)
+		return ext4_zero_range(file, offset, len, mode);
+
 	trace_ext4_fallocate_enter(inode, offset, len, mode);
 	lblk = offset >> blkbits;
 	/*
