Skip to content

Commit a563718

Browse files
committed
xfs: dispatch metadata scrub subcommands
Create structures needed to hold scrubbing context and dispatch incoming commands to the individual scrubbers.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
1 parent 36fd6e8 commit a563718

File tree

3 files changed

+262
-1
lines changed

3 files changed

+262
-1
lines changed

fs/xfs/scrub/scrub.c

Lines changed: 195 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,205 @@
4444
#include "scrub/scrub.h"
4545
#include "scrub/trace.h"
4646

47+
/*
48+
* Online Scrub and Repair
49+
*
50+
* Traditionally, XFS (the kernel driver) did not know how to check or
51+
* repair on-disk data structures. That task was left to the xfs_check
52+
* and xfs_repair tools, both of which require taking the filesystem
53+
* offline for a thorough but time consuming examination. Online
54+
* scrub & repair, on the other hand, enables us to check the metadata
55+
* for obvious errors while carefully stepping around the filesystem's
56+
* ongoing operations, locking rules, etc.
57+
*
58+
* Given that most XFS metadata consist of records stored in a btree,
59+
* most of the checking functions iterate the btree blocks themselves
60+
* looking for irregularities. When a record block is encountered, each
61+
* record can be checked for obviously bad values. Record values can
62+
* also be cross-referenced against other btrees to look for potential
63+
* misunderstandings between pieces of metadata.
64+
*
65+
* It is expected that the checkers responsible for per-AG metadata
66+
* structures will lock the AG headers (AGI, AGF, AGFL), iterate the
67+
* metadata structure, and perform any relevant cross-referencing before
68+
* unlocking the AG and returning the results to userspace. These
69+
* scrubbers must not keep an AG locked for too long to avoid tying up
70+
* the block and inode allocators.
71+
*
72+
* Block maps and b-trees rooted in an inode present a special challenge
73+
* because they can involve extents from any AG. The general scrubber
74+
* structure of lock -> check -> xref -> unlock still holds, but AG
75+
* locking order rules /must/ be obeyed to avoid deadlocks. The
76+
* ordering rule, of course, is that we must lock in increasing AG
77+
* order. Helper functions are provided to track which AG headers we've
78+
* already locked. If we detect an imminent locking order violation, we
79+
* can signal a potential deadlock, in which case the scrubber can jump
80+
* out to the top level, lock all the AGs in order, and retry the scrub.
81+
*
82+
* For file data (directories, extended attributes, symlinks) scrub, we
83+
* can simply lock the inode and walk the data. For btree data
84+
* (directories and attributes) we follow the same btree-scrubbing
85+
* strategy outlined previously to check the records.
86+
*
87+
* We use a bit of trickery with transactions to avoid buffer deadlocks
88+
* if there is a cycle in the metadata. The basic problem is that
89+
* travelling down a btree involves locking the current buffer at each
90+
* tree level. If a pointer should somehow point back to a buffer that
91+
* we've already examined, we will deadlock due to the second buffer
92+
* locking attempt. Note however that grabbing a buffer in transaction
93+
* context links the locked buffer to the transaction. If we try to
94+
* re-grab the buffer in the context of the same transaction, we avoid
95+
* the second lock attempt and continue. Between the verifier and the
96+
* scrubber, something will notice that something is amiss and report
97+
* the corruption. Therefore, each scrubber will allocate an empty
98+
* transaction, attach buffers to it, and cancel the transaction at the
99+
* end of the scrub run. Cancelling a non-dirty transaction simply
100+
* unlocks the buffers.
101+
*
102+
* There are four pieces of data that scrub can communicate to
103+
* userspace. The first is the error code (errno), which can be used to
104+
* communicate operational errors in performing the scrub. There are
105+
* also three flags that can be set in the scrub context. If the data
106+
* structure itself is corrupt, the CORRUPT flag will be set. If
107+
* the metadata is correct but otherwise suboptimal, the PREEN flag
108+
* will be set.
109+
*/
110+
111+
/* Scrub setup and teardown */
112+
113+
/* Free all the resources and finish the transactions. */
114+
STATIC int
115+
xfs_scrub_teardown(
116+
struct xfs_scrub_context *sc,
117+
int error)
118+
{
119+
if (sc->tp) {
120+
xfs_trans_cancel(sc->tp);
121+
sc->tp = NULL;
122+
}
123+
return error;
124+
}
125+
126+
/* Scrubbing dispatch. */
127+
128+
/*
 * Table of scrubber operations.  xfs_scrub_metadata() looks up the entry
 * for a request with &meta_scrub_ops[sm->sm_type], so an entry's position
 * in this table is its scrub type.  No scrubbers are registered here yet,
 * so the table is empty.
 */
static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
129+
};
130+
131+
/* This isn't a stable feature, warn once per day. */
132+
static inline void
133+
xfs_scrub_experimental_warning(
134+
struct xfs_mount *mp)
135+
{
136+
static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
137+
"xfs_scrub_warning", 86400 * HZ, 1);
138+
ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);
139+
140+
if (__ratelimit(&scrub_warning))
141+
xfs_alert(mp,
142+
"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
143+
}
144+
47145
/* Dispatch metadata scrubbing. */
48146
int
49147
xfs_scrub_metadata(
50148
struct xfs_inode *ip,
51149
struct xfs_scrub_metadata *sm)
52150
{
53-
return -EOPNOTSUPP;
151+
struct xfs_scrub_context sc;
152+
struct xfs_mount *mp = ip->i_mount;
153+
const struct xfs_scrub_meta_ops *ops;
154+
bool try_harder = false;
155+
int error = 0;
156+
157+
trace_xfs_scrub_start(ip, sm, error);
158+
159+
/* Forbidden if we are shut down or mounted norecovery. */
160+
error = -ESHUTDOWN;
161+
if (XFS_FORCED_SHUTDOWN(mp))
162+
goto out;
163+
error = -ENOTRECOVERABLE;
164+
if (mp->m_flags & XFS_MOUNT_NORECOVERY)
165+
goto out;
166+
167+
/* Check our inputs. */
168+
error = -EINVAL;
169+
sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
170+
if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
171+
goto out;
172+
if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
173+
goto out;
174+
175+
/* Do we know about this type of metadata? */
176+
error = -ENOENT;
177+
if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
178+
goto out;
179+
ops = &meta_scrub_ops[sm->sm_type];
180+
if (ops->scrub == NULL)
181+
goto out;
182+
183+
/*
184+
* We won't scrub any filesystem that doesn't have the ability
185+
* to record unwritten extents. The option was made default in
186+
* 2003, removed from mkfs in 2007, and cannot be disabled in
187+
* v5, so if we find a filesystem without this flag it's either
188+
* really old or totally unsupported. Avoid it either way.
189+
* We also don't support v1-v3 filesystems, which aren't
190+
* mountable.
191+
*/
192+
error = -EOPNOTSUPP;
193+
if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
194+
goto out;
195+
196+
/* Does this fs even support this type of metadata? */
197+
error = -ENOENT;
198+
if (ops->has && !ops->has(&mp->m_sb))
199+
goto out;
200+
201+
/* We don't know how to repair anything yet. */
202+
error = -EOPNOTSUPP;
203+
if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
204+
goto out;
205+
206+
xfs_scrub_experimental_warning(mp);
207+
208+
retry_op:
209+
/* Set up for the operation. */
210+
memset(&sc, 0, sizeof(sc));
211+
sc.mp = ip->i_mount;
212+
sc.sm = sm;
213+
sc.ops = ops;
214+
sc.try_harder = try_harder;
215+
error = sc.ops->setup(&sc, ip);
216+
if (error)
217+
goto out_teardown;
218+
219+
/* Scrub for errors. */
220+
error = sc.ops->scrub(&sc);
221+
if (!try_harder && error == -EDEADLOCK) {
222+
/*
223+
* Scrubbers return -EDEADLOCK to mean 'try harder'.
224+
* Tear down everything we hold, then set up again with
225+
* preparation for worst-case scenarios.
226+
*/
227+
error = xfs_scrub_teardown(&sc, 0);
228+
if (error)
229+
goto out;
230+
try_harder = true;
231+
goto retry_op;
232+
} else if (error)
233+
goto out_teardown;
234+
235+
if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
236+
XFS_SCRUB_OFLAG_XCORRUPT))
237+
xfs_alert_ratelimited(mp, "Corruption detected during scrub.");
238+
239+
out_teardown:
240+
error = xfs_scrub_teardown(&sc, error);
241+
out:
242+
trace_xfs_scrub_done(ip, sm, error);
243+
if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
244+
sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
245+
error = 0;
246+
}
247+
return error;
54248
}

fs/xfs/scrub/scrub.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,30 @@
2020
#ifndef __XFS_SCRUB_SCRUB_H__
2121
#define __XFS_SCRUB_SCRUB_H__
2222

23+
struct xfs_scrub_context;

/*
 * The set of methods that make up one metadata scrubber.
 */
struct xfs_scrub_meta_ops {
	/* Acquire whatever resources are needed for the operation. */
	int	(*setup)(struct xfs_scrub_context *sc, struct xfs_inode *ip);

	/* Examine metadata for errors. */
	int	(*scrub)(struct xfs_scrub_context *sc);

	/*
	 * Decide if we even have this piece of metadata.  The dispatcher
	 * skips this test when the pointer is NULL.
	 */
	bool	(*has)(struct xfs_sb *sbp);
};
36+
37+
/*
 * State carried through a single scrub operation.
 */
struct xfs_scrub_context {
	/* General scrub state. */
	struct xfs_mount		*mp;	/* filesystem being scrubbed */
	struct xfs_scrub_metadata	*sm;	/* the scrub request */
	const struct xfs_scrub_meta_ops	*ops;	/* scrubber methods in use */
	struct xfs_trans		*tp;	/* cancelled at teardown if set */
	struct xfs_inode		*ip;
	bool				try_harder; /* set for the retry pass */
};
46+
2347
/* Metadata scrubbers */
2448

2549
#endif /* __XFS_SCRUB_SCRUB_H__ */

fs/xfs/scrub/trace.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,49 @@
2525

2626
#include <linux/tracepoint.h>
2727

28+
/*
 * Event class shared by the scrub tracepoints.
 *
 * Each event takes the inode @ip, the scrub request @sm and an error code.
 * It records the device and inode number taken from @ip, the type, agno,
 * ino, gen and flags fields of @sm, and the error code as passed in.
 */
DECLARE_EVENT_CLASS(xfs_scrub_class,
29+
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
30+
int error),
31+
TP_ARGS(ip, sm, error),
32+
TP_STRUCT__entry(
33+
__field(dev_t, dev)
34+
__field(xfs_ino_t, ino)
35+
__field(unsigned int, type)
36+
__field(xfs_agnumber_t, agno)
37+
__field(xfs_ino_t, inum)
38+
__field(unsigned int, gen)
39+
__field(unsigned int, flags)
40+
__field(int, error)
41+
),
42+
TP_fast_assign(
43+
__entry->dev = ip->i_mount->m_super->s_dev;
44+
__entry->ino = ip->i_ino;
45+
__entry->type = sm->sm_type;
46+
__entry->agno = sm->sm_agno;
47+
__entry->inum = sm->sm_ino;
48+
__entry->gen = sm->sm_gen;
49+
__entry->flags = sm->sm_flags;
50+
__entry->error = error;
51+
),
52+
TP_printk("dev %d:%d ino %llu type %u agno %u inum %llu gen %u flags 0x%x error %d",
53+
MAJOR(__entry->dev), MINOR(__entry->dev),
54+
__entry->ino,
55+
__entry->type,
56+
__entry->agno,
57+
__entry->inum,
58+
__entry->gen,
59+
__entry->flags,
60+
__entry->error)
61+
)
62+
/*
 * Define one tracepoint named @name in xfs_scrub_class.  Every event
 * defined this way has the same (ip, sm, error) prototype.
 */
#define DEFINE_SCRUB_EVENT(name) \
63+
DEFINE_EVENT(xfs_scrub_class, name, \
64+
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, \
65+
int error), \
66+
TP_ARGS(ip, sm, error))
67+
68+
/* Emitted by xfs_scrub_metadata() on entry and just before it returns. */
DEFINE_SCRUB_EVENT(xfs_scrub_start);
69+
DEFINE_SCRUB_EVENT(xfs_scrub_done);
70+
2871
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
2972

3073
#undef TRACE_INCLUDE_PATH

0 commit comments

Comments
 (0)