|
44 | 44 | #include "scrub/scrub.h"
|
45 | 45 | #include "scrub/trace.h"
|
46 | 46 |
|
| 47 | +/* |
| 48 | + * Online Scrub and Repair |
| 49 | + * |
| 50 | + * Traditionally, XFS (the kernel driver) did not know how to check or |
| 51 | + * repair on-disk data structures. That task was left to the xfs_check |
| 52 | + * and xfs_repair tools, both of which require taking the filesystem |
| 53 | + * offline for a thorough but time-consuming examination. Online |
| 54 | + * scrub & repair, on the other hand, enables us to check the metadata |
| 55 | + * for obvious errors while carefully stepping around the filesystem's |
| 56 | + * ongoing operations, locking rules, etc. |
| 57 | + * |
| 58 | + * Given that most XFS metadata consist of records stored in a btree, |
| 59 | + * most of the checking functions iterate the btree blocks themselves |
| 60 | + * looking for irregularities. When a record block is encountered, each |
| 61 | + * record can be checked for obviously bad values. Record values can |
| 62 | + * also be cross-referenced against other btrees to look for potential |
| 63 | + * misunderstandings between pieces of metadata. |
| 64 | + * |
| 65 | + * It is expected that the checkers responsible for per-AG metadata |
| 66 | + * structures will lock the AG headers (AGI, AGF, AGFL), iterate the |
| 67 | + * metadata structure, and perform any relevant cross-referencing before |
| 68 | + * unlocking the AG and returning the results to userspace. These |
| 69 | + * scrubbers must not keep an AG locked for too long to avoid tying up |
| 70 | + * the block and inode allocators. |
| 71 | + * |
| 72 | + * Block maps and b-trees rooted in an inode present a special challenge |
| 73 | + * because they can involve extents from any AG. The general scrubber |
| 74 | + * structure of lock -> check -> xref -> unlock still holds, but AG |
| 75 | + * locking order rules /must/ be obeyed to avoid deadlocks. The |
| 76 | + * ordering rule, of course, is that we must lock in increasing AG |
| 77 | + * order. Helper functions are provided to track which AG headers we've |
| 78 | + * already locked. If we detect an imminent locking order violation, we |
| 79 | + * can signal a potential deadlock, in which case the scrubber can jump |
| 80 | + * out to the top level, lock all the AGs in order, and retry the scrub. |
| 81 | + * |
| 82 | + * For file data (directories, extended attributes, symlinks) scrub, we |
| 83 | + * can simply lock the inode and walk the data. For btree data |
| 84 | + * (directories and attributes) we follow the same btree-scrubbing |
| 85 | + * strategy outlined previously to check the records. |
| 86 | + * |
| 87 | + * We use a bit of trickery with transactions to avoid buffer deadlocks |
| 88 | + * if there is a cycle in the metadata. The basic problem is that |
| 89 | + * travelling down a btree involves locking the current buffer at each |
| 90 | + * tree level. If a pointer should somehow point back to a buffer that |
| 91 | + * we've already examined, we will deadlock due to the second buffer |
| 92 | + * locking attempt. Note however that grabbing a buffer in transaction |
| 93 | + * context links the locked buffer to the transaction. If we try to |
| 94 | + * re-grab the buffer in the context of the same transaction, we avoid |
| 95 | + * the second lock attempt and continue. Between the verifier and the |
| 96 | + * scrubber, something will notice that something is amiss and report |
| 97 | + * the corruption. Therefore, each scrubber will allocate an empty |
| 98 | + * transaction, attach buffers to it, and cancel the transaction at the |
| 99 | + * end of the scrub run. Cancelling a non-dirty transaction simply |
| 100 | + * unlocks the buffers. |
| 101 | + * |
| 102 | + * There are four pieces of data that scrub can communicate to |
| 103 | + * userspace. The first is the error code (errno), which can be used to |
| 104 | + * communicate operational errors in performing the scrub. There are |
| 105 | + * also three flags that can be set in the scrub context. If the data |
| 106 | + * structure itself is corrupt, the CORRUPT flag will be set. If |
| 107 | + * the metadata is correct but otherwise suboptimal, the PREEN flag |
| 108 | + * will be set. |
| 109 | + */ |
| 110 | + |
| 111 | +/* Scrub setup and teardown */ |
| 112 | + |
| 113 | +/* Free all the resources and finish the transactions. */ |
| 114 | +STATIC int |
| 115 | +xfs_scrub_teardown( |
| 116 | + struct xfs_scrub_context *sc, |
| 117 | + int error) |
| 118 | +{ |
| 119 | + if (sc->tp) { |
| 120 | + xfs_trans_cancel(sc->tp); |
| 121 | + sc->tp = NULL; |
| 122 | + } |
| 123 | + return error; |
| 124 | +} |
| 125 | + |
| 126 | +/* Scrubbing dispatch. */ |
| 127 | + |
| 128 | +static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { |
| 129 | +}; |
| 130 | + |
| 131 | +/* This isn't a stable feature, warn once per day. */ |
| 132 | +static inline void |
| 133 | +xfs_scrub_experimental_warning( |
| 134 | + struct xfs_mount *mp) |
| 135 | +{ |
| 136 | + static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT( |
| 137 | + "xfs_scrub_warning", 86400 * HZ, 1); |
| 138 | + ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE); |
| 139 | + |
| 140 | + if (__ratelimit(&scrub_warning)) |
| 141 | + xfs_alert(mp, |
| 142 | +"EXPERIMENTAL online scrub feature in use. Use at your own risk!"); |
| 143 | +} |
| 144 | + |
47 | 145 | /* Dispatch metadata scrubbing. */
|
48 | 146 | int
|
49 | 147 | xfs_scrub_metadata(
|
50 | 148 | struct xfs_inode *ip,
|
51 | 149 | struct xfs_scrub_metadata *sm)
|
52 | 150 | {
|
53 |
| - return -EOPNOTSUPP; |
| 151 | + struct xfs_scrub_context sc; |
| 152 | + struct xfs_mount *mp = ip->i_mount; |
| 153 | + const struct xfs_scrub_meta_ops *ops; |
| 154 | + bool try_harder = false; |
| 155 | + int error = 0; |
| 156 | + |
| 157 | + trace_xfs_scrub_start(ip, sm, error); |
| 158 | + |
| 159 | + /* Forbidden if we are shut down or mounted norecovery. */ |
| 160 | + error = -ESHUTDOWN; |
| 161 | + if (XFS_FORCED_SHUTDOWN(mp)) |
| 162 | + goto out; |
| 163 | + error = -ENOTRECOVERABLE; |
| 164 | + if (mp->m_flags & XFS_MOUNT_NORECOVERY) |
| 165 | + goto out; |
| 166 | + |
| 167 | + /* Check our inputs. */ |
| 168 | + error = -EINVAL; |
| 169 | + sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; |
| 170 | + if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN) |
| 171 | + goto out; |
| 172 | + if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved))) |
| 173 | + goto out; |
| 174 | + |
| 175 | + /* Do we know about this type of metadata? */ |
| 176 | + error = -ENOENT; |
| 177 | + if (sm->sm_type >= XFS_SCRUB_TYPE_NR) |
| 178 | + goto out; |
| 179 | + ops = &meta_scrub_ops[sm->sm_type]; |
| 180 | + if (ops->scrub == NULL) |
| 181 | + goto out; |
| 182 | + |
| 183 | + /* |
| 184 | + * We won't scrub any filesystem that doesn't have the ability |
| 185 | + * to record unwritten extents. The option was made default in |
| 186 | + * 2003, removed from mkfs in 2007, and cannot be disabled in |
| 187 | + * v5, so if we find a filesystem without this flag it's either |
| 188 | + * really old or totally unsupported. Avoid it either way. |
| 189 | + * We also don't support v1-v3 filesystems, which aren't |
| 190 | + * mountable. |
| 191 | + */ |
| 192 | + error = -EOPNOTSUPP; |
| 193 | + if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) |
| 194 | + goto out; |
| 195 | + |
| 196 | + /* Does this fs even support this type of metadata? */ |
| 197 | + error = -ENOENT; |
| 198 | + if (ops->has && !ops->has(&mp->m_sb)) |
| 199 | + goto out; |
| 200 | + |
| 201 | + /* We don't know how to repair anything yet. */ |
| 202 | + error = -EOPNOTSUPP; |
| 203 | + if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) |
| 204 | + goto out; |
| 205 | + |
| 206 | + xfs_scrub_experimental_warning(mp); |
| 207 | + |
| 208 | +retry_op: |
| 209 | + /* Set up for the operation. */ |
| 210 | + memset(&sc, 0, sizeof(sc)); |
| 211 | + sc.mp = ip->i_mount; |
| 212 | + sc.sm = sm; |
| 213 | + sc.ops = ops; |
| 214 | + sc.try_harder = try_harder; |
| 215 | + error = sc.ops->setup(&sc, ip); |
| 216 | + if (error) |
| 217 | + goto out_teardown; |
| 218 | + |
| 219 | + /* Scrub for errors. */ |
| 220 | + error = sc.ops->scrub(&sc); |
| 221 | + if (!try_harder && error == -EDEADLOCK) { |
| 222 | + /* |
| 223 | + * Scrubbers return -EDEADLOCK to mean 'try harder'. |
| 224 | + * Tear down everything we hold, then set up again with |
| 225 | + * preparation for worst-case scenarios. |
| 226 | + */ |
| 227 | + error = xfs_scrub_teardown(&sc, 0); |
| 228 | + if (error) |
| 229 | + goto out; |
| 230 | + try_harder = true; |
| 231 | + goto retry_op; |
| 232 | + } else if (error) |
| 233 | + goto out_teardown; |
| 234 | + |
| 235 | + if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | |
| 236 | + XFS_SCRUB_OFLAG_XCORRUPT)) |
| 237 | + xfs_alert_ratelimited(mp, "Corruption detected during scrub."); |
| 238 | + |
| 239 | +out_teardown: |
| 240 | + error = xfs_scrub_teardown(&sc, error); |
| 241 | +out: |
| 242 | + trace_xfs_scrub_done(ip, sm, error); |
| 243 | + if (error == -EFSCORRUPTED || error == -EFSBADCRC) { |
| 244 | + sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; |
| 245 | + error = 0; |
| 246 | + } |
| 247 | + return error; |
54 | 248 | }
|
0 commit comments