Skip to content

Commit 76d771b

Browse files
Christoph Hellwig authored and djwong committed
xfs: use per-AG reservations for the finobt
Currently we try to rely on the global reserved block pool for block allocations for the free inode btree, but I have customer reports (fairly complex workload, need to find an easier reproducer) where that is not enough, as the AG where we free an inode that requires a new finobt block is entirely full. This causes us to cancel a dirty transaction, which in turn shuts down the file system.

I think the right way to guard against this is to treat the finobt the same way as the refcount btree and have a per-AG reservation for the possible worst case size of it, and the patch below implements that.

Note that this could increase mount times with large finobt trees. In an ideal world we would have added a field for the number of finobt blocks to the AGI, similar to what we did for the refcount blocks. We should add it the next time we rev the AGI or AGF format by adding new fields.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
1 parent 4dfa2b8 commit 76d771b

File tree

5 files changed

+144
-20
lines changed

5 files changed

+144
-20
lines changed

fs/xfs/libxfs/xfs_ag_resv.c

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include "xfs_rmap_btree.h"
4040
#include "xfs_btree.h"
4141
#include "xfs_refcount_btree.h"
42+
#include "xfs_ialloc_btree.h"
4243

4344
/*
4445
* Per-AG Block Reservations
@@ -210,6 +211,9 @@ __xfs_ag_resv_init(
210211
if (error) {
211212
trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
212213
error, _RET_IP_);
214+
xfs_warn(mp,
215+
"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
216+
pag->pag_agno);
213217
return error;
214218
}
215219

@@ -228,6 +232,8 @@ int
228232
xfs_ag_resv_init(
229233
struct xfs_perag *pag)
230234
{
235+
struct xfs_mount *mp = pag->pag_mount;
236+
xfs_agnumber_t agno = pag->pag_agno;
231237
xfs_extlen_t ask;
232238
xfs_extlen_t used;
233239
int error = 0;
@@ -236,23 +242,45 @@ xfs_ag_resv_init(
236242
if (pag->pag_meta_resv.ar_asked == 0) {
237243
ask = used = 0;
238244

239-
error = xfs_refcountbt_calc_reserves(pag->pag_mount,
240-
pag->pag_agno, &ask, &used);
245+
error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
241246
if (error)
242247
goto out;
243248

244-
error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
245-
ask, used);
249+
error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
246250
if (error)
247251
goto out;
252+
253+
error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
254+
ask, used);
255+
if (error) {
256+
/*
257+
* Because we didn't have per-AG reservations when the
258+
* finobt feature was added we might not be able to
259+
* reserve all needed blocks. Warn and fall back to the
260+
* old and potentially buggy code in that case, but
261+
* ensure we do have the reservation for the refcountbt.
262+
*/
263+
ask = used = 0;
264+
265+
mp->m_inotbt_nores = true;
266+
267+
error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
268+
&used);
269+
if (error)
270+
goto out;
271+
272+
error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
273+
ask, used);
274+
if (error)
275+
goto out;
276+
}
248277
}
249278

250279
/* Create the AGFL metadata reservation */
251280
if (pag->pag_agfl_resv.ar_asked == 0) {
252281
ask = used = 0;
253282

254-
error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
255-
&ask, &used);
283+
error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
256284
if (error)
257285
goto out;
258286

@@ -261,9 +289,16 @@ xfs_ag_resv_init(
261289
goto out;
262290
}
263291

292+
#ifdef DEBUG
293+
/* need to read in the AGF for the ASSERT below to work */
294+
error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
295+
if (error)
296+
return error;
297+
264298
ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
265299
xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
266300
pag->pagf_freeblks + pag->pagf_flcount);
301+
#endif
267302
out:
268303
return error;
269304
}

fs/xfs/libxfs/xfs_ialloc_btree.c

Lines changed: 87 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,12 @@ xfs_finobt_set_root(
8282
}
8383

8484
STATIC int
85-
xfs_inobt_alloc_block(
85+
__xfs_inobt_alloc_block(
8686
struct xfs_btree_cur *cur,
8787
union xfs_btree_ptr *start,
8888
union xfs_btree_ptr *new,
89-
int *stat)
89+
int *stat,
90+
enum xfs_ag_resv_type resv)
9091
{
9192
xfs_alloc_arg_t args; /* block allocation args */
9293
int error; /* error return value */
@@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
103104
args.maxlen = 1;
104105
args.prod = 1;
105106
args.type = XFS_ALLOCTYPE_NEAR_BNO;
107+
args.resv = resv;
106108

107109
error = xfs_alloc_vextent(&args);
108110
if (error) {
@@ -122,6 +124,27 @@ xfs_inobt_alloc_block(
122124
return 0;
123125
}
124126

/*
 * Thin wrapper around __xfs_inobt_alloc_block(): forwards cur, start, new
 * and stat unchanged, passes XFS_AG_RESV_NONE as the reservation type, and
 * returns whatever the helper returns.
 */
127+
STATIC int
128+
xfs_inobt_alloc_block(
129+
struct xfs_btree_cur *cur,
130+
union xfs_btree_ptr *start,
131+
union xfs_btree_ptr *new,
132+
int *stat)
133+
{
134+
return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
135+
}
136+
/*
 * Block allocation hook installed as .alloc_block in xfs_finobt_ops.
 * Forwards cur, start, new and stat unchanged to __xfs_inobt_alloc_block(),
 * but passes XFS_AG_RESV_METADATA as the reservation type, and returns
 * whatever the helper returns.
 */
137+
STATIC int
138+
xfs_finobt_alloc_block(
139+
struct xfs_btree_cur *cur,
140+
union xfs_btree_ptr *start,
141+
union xfs_btree_ptr *new,
142+
int *stat)
143+
{
144+
return __xfs_inobt_alloc_block(cur, start, new, stat,
145+
XFS_AG_RESV_METADATA);
146+
}
147+
125148
STATIC int
126149
xfs_inobt_free_block(
127150
struct xfs_btree_cur *cur,
@@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
328351

329352
.dup_cursor = xfs_inobt_dup_cursor,
330353
.set_root = xfs_finobt_set_root,
331-
.alloc_block = xfs_inobt_alloc_block,
354+
.alloc_block = xfs_finobt_alloc_block,
332355
.free_block = xfs_inobt_free_block,
333356
.get_minrecs = xfs_inobt_get_minrecs,
334357
.get_maxrecs = xfs_inobt_get_maxrecs,
@@ -480,3 +503,64 @@ xfs_inobt_rec_check_count(
480503
return 0;
481504
}
482505
#endif /* DEBUG */
506+
/*
 * Return the result of xfs_btree_calc_size() for an inode btree sized with
 * mp->m_inobt_mnr and holding
 *	sb_agblocks * sb_inopblock / XFS_INODES_PER_CHUNK
 * records (the product is computed in 64 bits before the division).
 * Returns 0 without calling xfs_btree_calc_size() when m_inobt_mxr[0] is
 * still zero.
 *
 * NOTE(review): xfs_btree_calc_size() is not visible here; presumably the
 * result is a block count for the largest possible tree in one AG -- confirm.
 */
507+
static xfs_extlen_t
508+
xfs_inobt_max_size(
509+
struct xfs_mount *mp)
510+
{
511+
/* Bail out if we're uninitialized, which can happen in mkfs. */
512+
if (mp->m_inobt_mxr[0] == 0)
513+
return 0;
514+
515+
return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
516+
(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
517+
XFS_INODES_PER_CHUNK);
518+
}
519+
/*
 * Count the blocks of the inode btree selected by btnum in AG agno.
 *
 * Reads the AGI buffer with xfs_ialloc_read_agi(), builds a cursor on it
 * with xfs_inobt_init_cursor(), and lets xfs_btree_count_blocks() store the
 * count through tree_blocks.  The cursor is then deleted (with
 * XFS_BTREE_ERROR if the count failed, XFS_BTREE_NOERROR otherwise) and the
 * AGI buffer is released.
 *
 * Returns the error from xfs_ialloc_read_agi() if the read fails (nothing
 * else is done in that case), otherwise the return value of
 * xfs_btree_count_blocks().
 *
 * NOTE(review): the NULL arguments are presumably "no transaction" --
 * confirm against the callee prototypes.
 */
520+
static int
521+
xfs_inobt_count_blocks(
522+
struct xfs_mount *mp,
523+
xfs_agnumber_t agno,
524+
xfs_btnum_t btnum,
525+
xfs_extlen_t *tree_blocks)
526+
{
527+
struct xfs_buf *agbp;
528+
struct xfs_btree_cur *cur;
529+
int error;
530+
531+
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
532+
if (error)
533+
return error;
534+
535+
cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
536+
error = xfs_btree_count_blocks(cur, tree_blocks);
537+
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
538+
xfs_buf_relse(agbp);
539+
540+
return error;
541+
}
542+
543+
/*
544+
* Figure out how many blocks to reserve and how many are used by this btree.
*
* Returns 0 and leaves *ask and *used untouched when the superblock does
* not have the finobt feature.  Otherwise the blocks of the XFS_BTNUM_FINO
* tree in AG agno are counted with xfs_inobt_count_blocks(); if that fails
* its error is returned, again without touching *ask or *used.
*
* On success xfs_inobt_max_size(mp) is ADDED to *ask and the counted tree
* length is ADDED to *used -- the values are accumulated, not overwritten,
* so the caller must initialise both before the call.
545+
*/
546+
int
547+
xfs_finobt_calc_reserves(
548+
struct xfs_mount *mp,
549+
xfs_agnumber_t agno,
550+
xfs_extlen_t *ask,
551+
xfs_extlen_t *used)
552+
{
553+
xfs_extlen_t tree_len = 0;
554+
int error;
555+
556+
if (!xfs_sb_version_hasfinobt(&mp->m_sb))
557+
return 0;
558+
559+
error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
560+
if (error)
561+
return error;
562+
563+
*ask += xfs_inobt_max_size(mp);
564+
*used += tree_len;
565+
return 0;
566+
}

fs/xfs/libxfs/xfs_ialloc_btree.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
7272
#define xfs_inobt_rec_check_count(mp, rec) 0
7373
#endif /* DEBUG */
7474

75+
int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
76+
xfs_extlen_t *ask, xfs_extlen_t *used);
77+
7578
#endif /* __XFS_IALLOC_BTREE_H__ */

fs/xfs/xfs_inode.c

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1792,22 +1792,23 @@ xfs_inactive_ifree(
17921792
int error;
17931793

17941794
/*
1795-
* The ifree transaction might need to allocate blocks for record
1796-
* insertion to the finobt. We don't want to fail here at ENOSPC, so
1797-
* allow ifree to dip into the reserved block pool if necessary.
1798-
*
1799-
* Freeing large sets of inodes generally means freeing inode chunks,
1800-
* directory and file data blocks, so this should be relatively safe.
1801-
* Only under severe circumstances should it be possible to free enough
1802-
* inodes to exhaust the reserve block pool via finobt expansion while
1803-
* at the same time not creating free space in the filesystem.
1795+
* We try to use a per-AG reservation for any block needed by the finobt
1796+
* tree, but as the finobt feature predates the per-AG reservation
1797+
* support a degraded file system might not have enough space for the
1798+
* reservation at mount time. In that case try to dip into the reserved
1799+
* pool and pray.
18041800
*
18051801
* Send a warning if the reservation does happen to fail, as the inode
18061802
* now remains allocated and sits on the unlinked list until the fs is
18071803
* repaired.
18081804
*/
1809-
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1810-
XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
1805+
if (unlikely(mp->m_inotbt_nores)) {
1806+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1807+
XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
1808+
&tp);
1809+
} else {
1810+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
1811+
}
18111812
if (error) {
18121813
if (error == -ENOSPC) {
18131814
xfs_warn_ratelimited(mp,

fs/xfs/xfs_mount.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ typedef struct xfs_mount {
140140
int m_fixedfsid[2]; /* unchanged for life of FS */
141141
uint m_dmevmask; /* DMI events for this FS */
142142
__uint64_t m_flags; /* global mount flags */
143+
bool m_inotbt_nores; /* no per-AG finobt resv. */
143144
int m_ialloc_inos; /* inodes in inode allocation */
144145
int m_ialloc_blks; /* blocks in inode allocation */
145146
int m_ialloc_min_blks;/* min blocks in sparse inode

0 commit comments

Comments
 (0)