Skip to content

Commit 416161d

Browse files
Mark Fasheh authored and Chris Mason committed
btrfs: offline dedupe
This patch adds an ioctl, BTRFS_IOC_FILE_EXTENT_SAME, which will try to de-duplicate a list of extents across a range of files. Internally, the ioctl re-uses code from the clone ioctl. This avoids rewriting a large chunk of extent handling code.

Userspace passes in an array of (file, offset) pairs along with a length argument. The ioctl will then (for each dedupe) do a byte-by-byte comparison of the user data before deduping the extent. Status and number of bytes deduped are returned for each operation.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
1 parent 4b38431 commit 416161d

File tree

2 files changed

+307
-0
lines changed

2 files changed

+307
-0
lines changed

fs/btrfs/ioctl.c

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
#include <linux/blkdev.h>
4444
#include <linux/uuid.h>
4545
#include <linux/btrfs.h>
46+
#include <linux/uaccess.h>
4647
#include "compat.h"
4748
#include "ctree.h"
4849
#include "disk-io.h"
@@ -57,6 +58,9 @@
5758
#include "send.h"
5859
#include "dev-replace.h"
5960

61+
static int btrfs_clone(struct inode *src, struct inode *inode,
62+
u64 off, u64 olen, u64 olen_aligned, u64 destoff);
63+
6064
/* Mask out flags that are inappropriate for the given type of inode. */
6165
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
6266
{
@@ -2470,6 +2474,34 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
24702474
return ret;
24712475
}
24722476

2477+
/*
 * Get the page cache page of @inode covering byte offset @off and make
 * sure its contents are up to date, reading it in if necessary.
 *
 * Returns the page unlocked, still holding the reference the caller must
 * drop with page_cache_release(), or NULL if the page could not be
 * obtained or brought up to date.
 */
static struct page *extent_same_get_page(struct inode *inode, u64 off)
{
	struct page *page;
	pgoff_t index;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;

	index = off >> PAGE_CACHE_SHIFT;

	page = grab_cache_page(inode->i_mapping, index);
	if (!page)
		return NULL;

	if (!PageUptodate(page)) {
		if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
						 0)) {
			/*
			 * Drop our page reference just like the failure path
			 * below does; returning NULL without it leaks the
			 * page.
			 * NOTE(review): assumes the read path has already
			 * unlocked the page on failure (the success path
			 * re-locks it below) - confirm.
			 */
			page_cache_release(page);
			return NULL;
		}
		/* Re-take the page lock before re-checking the uptodate bit. */
		lock_page(page);
		if (!PageUptodate(page)) {
			unlock_page(page);
			page_cache_release(page);
			return NULL;
		}
	}
	unlock_page(page);

	return page;
}
2504+
24732505
static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
24742506
{
24752507
/* do any pending delalloc/csum calc on src, one way or
@@ -2490,6 +2522,251 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
24902522
}
24912523
}
24922524

2525+
/*
 * Undo btrfs_double_lock(): release the extent range [loff, loff + len - 1]
 * in each inode's io_tree, then drop both i_mutex locks.
 */
static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
				struct inode *inode2, u64 loff2, u64 len)
{
	u64 last1 = loff1 + len - 1;
	u64 last2 = loff2 + len - 1;

	/* Extent ranges go first, the inode mutexes after them. */
	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, last1);
	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, last2);

	mutex_unlock(&inode1->i_mutex);
	mutex_unlock(&inode2->i_mutex);
}
2534+
2535+
/*
 * Take i_mutex and lock the extent range of @len bytes on both inodes.
 *
 * The inode with the higher address is always locked first so that two
 * callers passing the same pair in opposite order take the locks in the
 * same sequence. If both arguments are the same inode it is locked once.
 */
static void btrfs_double_lock(struct inode *inode1, u64 loff1,
			      struct inode *inode2, u64 loff2, u64 len)
{
	struct inode *first = inode1;
	struct inode *second = inode2;
	u64 first_off = loff1;
	u64 second_off = loff2;

	if (inode1 < inode2) {
		first = inode2;
		first_off = loff2;
		second = inode1;
		second_off = loff1;
	}

	mutex_lock_nested(&first->i_mutex, I_MUTEX_PARENT);
	lock_extent_range(first, first_off, len);

	if (first == second)
		return;

	mutex_lock_nested(&second->i_mutex, I_MUTEX_CHILD);
	lock_extent_range(second, second_off, len);
}
2550+
2551+
/*
 * Compare @len bytes of @src starting at @loff with @len bytes of @dst
 * starting at @dst_loff, one page at a time.
 *
 * Each comparison starts at the beginning of the mapped page, so callers
 * must pass page-aligned offsets.
 *
 * Returns 0 if the ranges are identical, BTRFS_SAME_DATA_DIFFERS if they
 * differ, or -EINVAL if a page could not be obtained.
 */
static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
			  u64 dst_loff, u64 len)
{
	int ret = 0;
	struct page *src_page, *dst_page;
	unsigned int cmp_len = PAGE_CACHE_SIZE;
	void *addr, *dst_addr;

	while (len) {
		/* Only the final chunk can be shorter than a full page. */
		if (len < PAGE_CACHE_SIZE)
			cmp_len = len;

		src_page = extent_same_get_page(src, loff);
		if (!src_page)
			return -EINVAL;
		dst_page = extent_same_get_page(dst, dst_loff);
		if (!dst_page) {
			page_cache_release(src_page);
			return -EINVAL;
		}
		addr = kmap_atomic(src_page);
		dst_addr = kmap_atomic(dst_page);

		flush_dcache_page(src_page);
		flush_dcache_page(dst_page);

		if (memcmp(addr, dst_addr, cmp_len))
			ret = BTRFS_SAME_DATA_DIFFERS;

		/*
		 * Atomic kmaps nest like a stack: release them in the reverse
		 * order they were taken.
		 */
		kunmap_atomic(dst_addr);
		kunmap_atomic(addr);
		page_cache_release(src_page);
		page_cache_release(dst_page);

		if (ret)
			break;

		loff += cmp_len;
		dst_loff += cmp_len;
		len -= cmp_len;
	}

	return ret;
}
2595+
2596+
/*
 * Validate the byte range [off, off + len) of @inode for dedupe.
 *
 * Returns -EINVAL if the range wraps, extends past i_size, or does not
 * start and end on a filesystem block boundary; 0 otherwise.
 */
static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
{
	u64 blocksize = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
	u64 end = off + len;

	/* Reject arithmetic wrap-around and ranges reaching beyond EOF. */
	if (end < off || end > inode->i_size)
		return -EINVAL;

	/* btrfs_clone() requires both ends to be block aligned. */
	if (IS_ALIGNED(off, blocksize) && IS_ALIGNED(end, blocksize))
		return 0;

	return -EINVAL;
}
2608+
2609+
/*
 * Dedupe @len bytes at @loff in @src against @len bytes at @dst_loff in
 * @dst: lock both ranges, validate them, compare the data and, only if it
 * is identical, clone the source range over the destination range.
 *
 * Returns 0 on success (including the trivial @len == 0 case),
 * BTRFS_SAME_DATA_DIFFERS if the data does not match, -EINVAL if @src and
 * @dst are the same inode, a range fails validation or the two inodes
 * disagree on BTRFS_INODE_NODATASUM, or whatever btrfs_clone() returns.
 */
static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
			     struct inode *dst, u64 dst_loff)
{
	int ret;

	/*
	 * btrfs_clone() can't handle extents in the same file
	 * yet. Once that works, we can drop this check and replace it
	 * with a check for the same inode, but overlapping extents.
	 */
	if (src == dst)
		return -EINVAL;

	/*
	 * Nothing to dedupe. Bail out before locking: with len == 0 the
	 * inclusive range end "loff + len - 1" used by the lock/unlock
	 * helpers would wrap to loff - 1.
	 */
	if (len == 0)
		return 0;

	btrfs_double_lock(src, loff, dst, dst_loff, len);

	ret = extent_same_check_offsets(src, loff, len);
	if (ret)
		goto out_unlock;

	ret = extent_same_check_offsets(dst, dst_loff, len);
	if (ret)
		goto out_unlock;

	/* don't make the dst file partly checksummed */
	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
	    (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Only share the extent when the contents are byte-for-byte equal. */
	ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
	if (ret == 0)
		ret = btrfs_clone(src, dst, loff, len, len, dst_loff);

out_unlock:
	btrfs_double_unlock(src, loff, dst, dst_loff, len);

	return ret;
}
2648+
2649+
#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
2650+
2651+
static long btrfs_ioctl_file_extent_same(struct file *file,
2652+
void __user *argp)
2653+
{
2654+
struct btrfs_ioctl_same_args *args = argp;
2655+
struct btrfs_ioctl_same_args same;
2656+
struct btrfs_ioctl_same_extent_info info;
2657+
struct inode *src = file->f_dentry->d_inode;
2658+
struct file *dst_file = NULL;
2659+
struct inode *dst;
2660+
u64 off;
2661+
u64 len;
2662+
int i;
2663+
int ret;
2664+
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
2665+
bool is_admin = capable(CAP_SYS_ADMIN);
2666+
2667+
if (!(file->f_mode & FMODE_READ))
2668+
return -EINVAL;
2669+
2670+
ret = mnt_want_write_file(file);
2671+
if (ret)
2672+
return ret;
2673+
2674+
if (copy_from_user(&same,
2675+
(struct btrfs_ioctl_same_args __user *)argp,
2676+
sizeof(same))) {
2677+
ret = -EFAULT;
2678+
goto out;
2679+
}
2680+
2681+
off = same.logical_offset;
2682+
len = same.length;
2683+
2684+
/*
2685+
* Limit the total length we will dedupe for each operation.
2686+
* This is intended to bound the total time spent in this
2687+
* ioctl to something sane.
2688+
*/
2689+
if (len > BTRFS_MAX_DEDUPE_LEN)
2690+
len = BTRFS_MAX_DEDUPE_LEN;
2691+
2692+
if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
2693+
/*
2694+
* Btrfs does not support blocksize < page_size. As a
2695+
* result, btrfs_cmp_data() won't correctly handle
2696+
* this situation without an update.
2697+
*/
2698+
ret = -EINVAL;
2699+
goto out;
2700+
}
2701+
2702+
ret = -EISDIR;
2703+
if (S_ISDIR(src->i_mode))
2704+
goto out;
2705+
2706+
ret = -EACCES;
2707+
if (!S_ISREG(src->i_mode))
2708+
goto out;
2709+
2710+
ret = 0;
2711+
for (i = 0; i < same.dest_count; i++) {
2712+
if (copy_from_user(&info, &args->info[i], sizeof(info))) {
2713+
ret = -EFAULT;
2714+
goto out;
2715+
}
2716+
2717+
info.bytes_deduped = 0;
2718+
2719+
dst_file = fget(info.fd);
2720+
if (!dst_file) {
2721+
info.status = -EBADF;
2722+
goto next;
2723+
}
2724+
2725+
if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
2726+
info.status = -EINVAL;
2727+
goto next;
2728+
}
2729+
2730+
info.status = -EXDEV;
2731+
if (file->f_path.mnt != dst_file->f_path.mnt)
2732+
goto next;
2733+
2734+
dst = dst_file->f_dentry->d_inode;
2735+
if (src->i_sb != dst->i_sb)
2736+
goto next;
2737+
2738+
if (S_ISDIR(dst->i_mode)) {
2739+
info.status = -EISDIR;
2740+
goto next;
2741+
}
2742+
2743+
if (!S_ISREG(dst->i_mode)) {
2744+
info.status = -EACCES;
2745+
goto next;
2746+
}
2747+
2748+
info.status = btrfs_extent_same(src, off, len, dst,
2749+
info.logical_offset);
2750+
if (info.status == 0)
2751+
info.bytes_deduped += len;
2752+
2753+
next:
2754+
if (dst_file)
2755+
fput(dst_file);
2756+
2757+
if (__put_user_unaligned(info.status, &args->info[i].status) ||
2758+
__put_user_unaligned(info.bytes_deduped,
2759+
&args->info[i].bytes_deduped)) {
2760+
ret = -EFAULT;
2761+
goto out;
2762+
}
2763+
}
2764+
2765+
out:
2766+
mnt_drop_write_file(file);
2767+
return ret;
2768+
}
2769+
24932770
/**
24942771
* btrfs_clone() - clone a range from inode file to another
24952772
*
@@ -4242,6 +4519,8 @@ long btrfs_ioctl(struct file *file, unsigned int
42424519
return btrfs_ioctl_get_fslabel(file, argp);
42434520
case BTRFS_IOC_SET_FSLABEL:
42444521
return btrfs_ioctl_set_fslabel(file, argp);
4522+
case BTRFS_IOC_FILE_EXTENT_SAME:
4523+
return btrfs_ioctl_file_extent_same(file, argp);
42454524
}
42464525

42474526
return -ENOTTY;

include/uapi/linux/btrfs.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,31 @@ struct btrfs_ioctl_clone_range_args {
305305
#define BTRFS_DEFRAG_RANGE_COMPRESS 1
306306
#define BTRFS_DEFRAG_RANGE_START_IO 2
307307

308+
#define BTRFS_SAME_DATA_DIFFERS 1
309+
/* For extent-same ioctl */
310+
struct btrfs_ioctl_same_extent_info {
311+
__s64 fd; /* in - destination file */
312+
__u64 logical_offset; /* in - start of extent in destination */
313+
__u64 bytes_deduped; /* out - total # of bytes we were able
314+
* to dedupe from this file */
315+
/* status of this dedupe operation:
316+
* 0 if dedup succeeds
317+
* < 0 for error
318+
* == BTRFS_SAME_DATA_DIFFERS if data differs
319+
*/
320+
__s32 status; /* out - see above description */
321+
__u32 reserved;
322+
};
323+
324+
struct btrfs_ioctl_same_args {
325+
__u64 logical_offset; /* in - start of extent in source */
326+
__u64 length; /* in - length of extent */
327+
__u16 dest_count; /* in - total elements in info array */
328+
__u16 reserved1;
329+
__u32 reserved2;
330+
struct btrfs_ioctl_same_extent_info info[0];
331+
};
332+
308333
struct btrfs_ioctl_space_info {
309334
__u64 flags;
310335
__u64 total_bytes;
@@ -579,4 +604,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code)
579604
struct btrfs_ioctl_get_dev_stats)
580605
#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
581606
struct btrfs_ioctl_dev_replace_args)
607+
#define BTRFS_IOC_FILE_EXTENT_SAME _IOWR(BTRFS_IOCTL_MAGIC, 54, \
608+
struct btrfs_ioctl_same_args)
609+
582610
#endif /* _UAPI_LINUX_BTRFS_H */

0 commit comments

Comments
 (0)