Skip to content

Commit 5500ab4

Browse files
Gang He authored and torvalds committed
ocfs2: fix the application IO timeout when fstrim is running
A user reported that upper-layer application IO timed out while fstrim was running on an ocfs2 partition. The application-monitoring resource agent then concluded that the application was not working, and the node was fenced by the cluster brain (e.g. pacemaker).

The root cause is that the fstrim thread always holds the main_bm meta-file related locks until all the cluster groups have been trimmed. This patch makes the fstrim thread release the main_bm meta-file related locks after each cluster group is trimmed, which gives the current application IO a chance to claim clusters from the main_bm meta-file.

Link: http://lkml.kernel.org/r/20190111090014.31645-1-ghe@suse.com
Signed-off-by: Gang He <ghe@suse.com>
Reviewed-by: Changwei Ge <ge.changwei@h3c.com>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent cc725ef commit 5500ab4

File tree

5 files changed

+106
-63
lines changed

5 files changed

+106
-63
lines changed

fs/ocfs2/alloc.c

Lines changed: 96 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -7532,18 +7532,18 @@ static int ocfs2_trim_group(struct super_block *sb,
75327532
return count;
75337533
}
75347534

7535-
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7535+
static
7536+
int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
75367537
{
75377538
struct ocfs2_super *osb = OCFS2_SB(sb);
7538-
u64 start, len, trimmed, first_group, last_group, group;
7539+
u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
75397540
int ret, cnt;
75407541
u32 first_bit, last_bit, minlen;
75417542
struct buffer_head *main_bm_bh = NULL;
75427543
struct inode *main_bm_inode = NULL;
75437544
struct buffer_head *gd_bh = NULL;
75447545
struct ocfs2_dinode *main_bm;
75457546
struct ocfs2_group_desc *gd = NULL;
7546-
struct ocfs2_trim_fs_info info, *pinfo = NULL;
75477547

75487548
start = range->start >> osb->s_clustersize_bits;
75497549
len = range->len >> osb->s_clustersize_bits;
@@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
75527552
if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
75537553
return -EINVAL;
75547554

7555+
trace_ocfs2_trim_mainbm(start, len, minlen);
7556+
7557+
next_group:
75557558
main_bm_inode = ocfs2_get_system_file_inode(osb,
75567559
GLOBAL_BITMAP_SYSTEM_INODE,
75577560
OCFS2_INVALID_SLOT);
@@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
75707573
}
75717574
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
75727575

7573-
if (start >= le32_to_cpu(main_bm->i_clusters)) {
7574-
ret = -EINVAL;
7575-
goto out_unlock;
7576-
}
7577-
7578-
len = range->len >> osb->s_clustersize_bits;
7579-
if (start + len > le32_to_cpu(main_bm->i_clusters))
7580-
len = le32_to_cpu(main_bm->i_clusters) - start;
7581-
7582-
trace_ocfs2_trim_fs(start, len, minlen);
7583-
7584-
ocfs2_trim_fs_lock_res_init(osb);
7585-
ret = ocfs2_trim_fs_lock(osb, NULL, 1);
7586-
if (ret < 0) {
7587-
if (ret != -EAGAIN) {
7588-
mlog_errno(ret);
7589-
ocfs2_trim_fs_lock_res_uninit(osb);
7576+
/*
7577+
* Do some check before trim the first group.
7578+
*/
7579+
if (!group) {
7580+
if (start >= le32_to_cpu(main_bm->i_clusters)) {
7581+
ret = -EINVAL;
75907582
goto out_unlock;
75917583
}
75927584

7593-
mlog(ML_NOTICE, "Wait for trim on device (%s) to "
7594-
"finish, which is running from another node.\n",
7595-
osb->dev_str);
7596-
ret = ocfs2_trim_fs_lock(osb, &info, 0);
7597-
if (ret < 0) {
7598-
mlog_errno(ret);
7599-
ocfs2_trim_fs_lock_res_uninit(osb);
7600-
goto out_unlock;
7601-
}
7585+
if (start + len > le32_to_cpu(main_bm->i_clusters))
7586+
len = le32_to_cpu(main_bm->i_clusters) - start;
76027587

7603-
if (info.tf_valid && info.tf_success &&
7604-
info.tf_start == start && info.tf_len == len &&
7605-
info.tf_minlen == minlen) {
7606-
/* Avoid sending duplicated trim to a shared device */
7607-
mlog(ML_NOTICE, "The same trim on device (%s) was "
7608-
"just done from node (%u), return.\n",
7609-
osb->dev_str, info.tf_nodenum);
7610-
range->len = info.tf_trimlen;
7611-
goto out_trimunlock;
7612-
}
7588+
/*
7589+
* Determine first and last group to examine based on
7590+
* start and len
7591+
*/
7592+
first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7593+
if (first_group == osb->first_cluster_group_blkno)
7594+
first_bit = start;
7595+
else
7596+
first_bit = start - ocfs2_blocks_to_clusters(sb,
7597+
first_group);
7598+
last_group = ocfs2_which_cluster_group(main_bm_inode,
7599+
start + len - 1);
7600+
group = first_group;
76137601
}
76147602

7615-
info.tf_nodenum = osb->node_num;
7616-
info.tf_start = start;
7617-
info.tf_len = len;
7618-
info.tf_minlen = minlen;
7619-
7620-
/* Determine first and last group to examine based on start and len */
7621-
first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7622-
if (first_group == osb->first_cluster_group_blkno)
7623-
first_bit = start;
7624-
else
7625-
first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
7626-
last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7627-
last_bit = osb->bitmap_cpg;
7628-
7629-
trimmed = 0;
7630-
for (group = first_group; group <= last_group;) {
7603+
do {
76317604
if (first_bit + len >= osb->bitmap_cpg)
76327605
last_bit = osb->bitmap_cpg;
76337606
else
@@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
76597632
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
76607633
else
76617634
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7662-
}
7663-
range->len = trimmed * sb->s_blocksize;
7635+
} while (0);
76647636

7665-
info.tf_trimlen = range->len;
7666-
info.tf_success = (ret ? 0 : 1);
7667-
pinfo = &info;
7668-
out_trimunlock:
7669-
ocfs2_trim_fs_unlock(osb, pinfo);
7670-
ocfs2_trim_fs_lock_res_uninit(osb);
76717637
out_unlock:
76727638
ocfs2_inode_unlock(main_bm_inode, 0);
76737639
brelse(main_bm_bh);
7640+
main_bm_bh = NULL;
76747641
out_mutex:
76757642
inode_unlock(main_bm_inode);
76767643
iput(main_bm_inode);
7644+
7645+
/*
7646+
* If all the groups trim are not done or failed, but we should release
7647+
* main_bm related locks for avoiding the current IO starve, then go to
7648+
* trim the next group
7649+
*/
7650+
if (ret >= 0 && group <= last_group)
7651+
goto next_group;
76777652
out:
7653+
range->len = trimmed * sb->s_blocksize;
7654+
return ret;
7655+
}
7656+
7657+
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7658+
{
7659+
int ret;
7660+
struct ocfs2_super *osb = OCFS2_SB(sb);
7661+
struct ocfs2_trim_fs_info info, *pinfo = NULL;
7662+
7663+
ocfs2_trim_fs_lock_res_init(osb);
7664+
7665+
trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
7666+
7667+
ret = ocfs2_trim_fs_lock(osb, NULL, 1);
7668+
if (ret < 0) {
7669+
if (ret != -EAGAIN) {
7670+
mlog_errno(ret);
7671+
ocfs2_trim_fs_lock_res_uninit(osb);
7672+
return ret;
7673+
}
7674+
7675+
mlog(ML_NOTICE, "Wait for trim on device (%s) to "
7676+
"finish, which is running from another node.\n",
7677+
osb->dev_str);
7678+
ret = ocfs2_trim_fs_lock(osb, &info, 0);
7679+
if (ret < 0) {
7680+
mlog_errno(ret);
7681+
ocfs2_trim_fs_lock_res_uninit(osb);
7682+
return ret;
7683+
}
7684+
7685+
if (info.tf_valid && info.tf_success &&
7686+
info.tf_start == range->start &&
7687+
info.tf_len == range->len &&
7688+
info.tf_minlen == range->minlen) {
7689+
/* Avoid sending duplicated trim to a shared device */
7690+
mlog(ML_NOTICE, "The same trim on device (%s) was "
7691+
"just done from node (%u), return.\n",
7692+
osb->dev_str, info.tf_nodenum);
7693+
range->len = info.tf_trimlen;
7694+
goto out;
7695+
}
7696+
}
7697+
7698+
info.tf_nodenum = osb->node_num;
7699+
info.tf_start = range->start;
7700+
info.tf_len = range->len;
7701+
info.tf_minlen = range->minlen;
7702+
7703+
ret = ocfs2_trim_mainbm(sb, range);
7704+
7705+
info.tf_trimlen = range->len;
7706+
info.tf_success = (ret < 0 ? 0 : 1);
7707+
pinfo = &info;
7708+
out:
7709+
ocfs2_trim_fs_unlock(osb, pinfo);
7710+
ocfs2_trim_fs_lock_res_uninit(osb);
76787711
return ret;
76797712
}

fs/ocfs2/dlmglue.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
686686
{
687687
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
688688

689+
/* Only one trimfs thread are allowed to work at the same time. */
690+
mutex_lock(&osb->obs_trim_fs_mutex);
691+
689692
ocfs2_lock_res_init_once(lockres);
690693
ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
691694
ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
698701

699702
ocfs2_simple_drop_lockres(osb, lockres);
700703
ocfs2_lock_res_free(lockres);
704+
705+
mutex_unlock(&osb->obs_trim_fs_mutex);
701706
}
702707

703708
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,

fs/ocfs2/ocfs2.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,7 @@ struct ocfs2_super
407407
struct ocfs2_lock_res osb_rename_lockres;
408408
struct ocfs2_lock_res osb_nfs_sync_lockres;
409409
struct ocfs2_lock_res osb_trim_fs_lockres;
410+
struct mutex obs_trim_fs_mutex;
410411
struct ocfs2_dlm_debug *osb_dlm_debug;
411412

412413
struct dentry *osb_debug_root;

fs/ocfs2/ocfs2_trace.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
712712

713713
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
714714

715+
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
716+
715717
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
716718

717719
/* End of trace events for fs/ocfs2/alloc.c. */

fs/ocfs2/super.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
18471847
if (ocfs2_is_hard_readonly(osb))
18481848
goto leave;
18491849

1850+
mutex_init(&osb->obs_trim_fs_mutex);
1851+
18501852
status = ocfs2_dlm_init(osb);
18511853
if (status < 0) {
18521854
mlog_errno(status);

0 commit comments

Comments
 (0)