Skip to content

Commit acdda3a

Browse files
Christoph Hellwig and dchinner
authored and committed
xfs: use iomap_dio_rw
Straight switch over to using iomap for direct I/O - we already have the non-COW dio path in write_begin for DAX and files with extent size hints, so nothing to add there. The COW path is ported over from the old get_blocks version and a bit of a mess, but I have some work in progress to make it look more like the buffered I/O COW path.

This gets rid of xfs_get_blocks_direct and the last caller of xfs_get_blocks with the create flag set, so all that code can be removed.

Last but not least I've removed a comment in xfs_filemap_fault that refers to xfs_get_blocks entirely instead of updating it - while the reference is correct, the whole DAX fault path looks different than the non-DAX one, so it seems rather pointless.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
1 parent ff6a929 commit acdda3a

File tree

4 files changed

+110
-386
lines changed

4 files changed

+110
-386
lines changed

fs/xfs/xfs_aops.c

Lines changed: 8 additions & 283 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,6 @@
3737
#include <linux/pagevec.h>
3838
#include <linux/writeback.h>
3939

40-
/* flags for direct write completions */
41-
#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
42-
#define XFS_DIO_FLAG_APPEND (1 << 1)
43-
#define XFS_DIO_FLAG_COW (1 << 2)
44-
4540
/*
4641
* structure owned by writepages passed to individual writepage calls
4742
*/
@@ -1175,45 +1170,6 @@ xfs_vm_releasepage(
11751170
return try_to_free_buffers(page);
11761171
}
11771172

1178-
/*
1179-
* When we map a DIO buffer, we may need to pass flags to
1180-
* xfs_end_io_direct_write to tell it what kind of write IO we are doing.
1181-
*
1182-
* Note that for DIO, an IO to the highest supported file block offset (i.e.
1183-
* 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
1184-
* bit variable. Hence if we see this overflow, we have to assume that the IO is
1185-
* extending the file size. We won't know for sure until IO completion is run
1186-
* and the actual max write offset is communicated to the IO completion
1187-
* routine.
1188-
*/
1189-
static void
1190-
xfs_map_direct(
1191-
struct inode *inode,
1192-
struct buffer_head *bh_result,
1193-
struct xfs_bmbt_irec *imap,
1194-
xfs_off_t offset,
1195-
bool is_cow)
1196-
{
1197-
uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
1198-
xfs_off_t size = bh_result->b_size;
1199-
1200-
trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
1201-
ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
1202-
XFS_IO_OVERWRITE, imap);
1203-
1204-
if (ISUNWRITTEN(imap)) {
1205-
*flags |= XFS_DIO_FLAG_UNWRITTEN;
1206-
set_buffer_defer_completion(bh_result);
1207-
} else if (is_cow) {
1208-
*flags |= XFS_DIO_FLAG_COW;
1209-
set_buffer_defer_completion(bh_result);
1210-
}
1211-
if (offset + size > i_size_read(inode) || offset + size < 0) {
1212-
*flags |= XFS_DIO_FLAG_APPEND;
1213-
set_buffer_defer_completion(bh_result);
1214-
}
1215-
}
1216-
12171173
/*
12181174
* If this is O_DIRECT or the mpage code calling tell them how large the mapping
12191175
* is, so that we can avoid repeated get_blocks calls.
@@ -1254,51 +1210,12 @@ xfs_map_trim_size(
12541210
bh_result->b_size = mapping_size;
12551211
}
12561212

1257-
/* Bounce unaligned directio writes to the page cache. */
12581213
static int
1259-
xfs_bounce_unaligned_dio_write(
1260-
struct xfs_inode *ip,
1261-
xfs_fileoff_t offset_fsb,
1262-
struct xfs_bmbt_irec *imap)
1263-
{
1264-
struct xfs_bmbt_irec irec;
1265-
xfs_fileoff_t delta;
1266-
bool shared;
1267-
bool x;
1268-
int error;
1269-
1270-
irec = *imap;
1271-
if (offset_fsb > irec.br_startoff) {
1272-
delta = offset_fsb - irec.br_startoff;
1273-
irec.br_blockcount -= delta;
1274-
irec.br_startblock += delta;
1275-
irec.br_startoff = offset_fsb;
1276-
}
1277-
error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
1278-
if (error)
1279-
return error;
1280-
1281-
/*
1282-
* We're here because we're trying to do a directio write to a
1283-
* region that isn't aligned to a filesystem block. If any part
1284-
* of the extent is shared, fall back to buffered mode to handle
1285-
* the RMW. This is done by returning -EREMCHG ("remote addr
1286-
* changed"), which is caught further up the call stack.
1287-
*/
1288-
if (shared) {
1289-
trace_xfs_reflink_bounce_dio_write(ip, imap);
1290-
return -EREMCHG;
1291-
}
1292-
return 0;
1293-
}
1294-
1295-
STATIC int
1296-
__xfs_get_blocks(
1214+
xfs_get_blocks(
12971215
struct inode *inode,
12981216
sector_t iblock,
12991217
struct buffer_head *bh_result,
1300-
int create,
1301-
bool direct)
1218+
int create)
13021219
{
13031220
struct xfs_inode *ip = XFS_I(inode);
13041221
struct xfs_mount *mp = ip->i_mount;
@@ -1309,10 +1226,8 @@ __xfs_get_blocks(
13091226
int nimaps = 1;
13101227
xfs_off_t offset;
13111228
ssize_t size;
1312-
int new = 0;
1313-
bool is_cow = false;
13141229

1315-
BUG_ON(create && !direct);
1230+
BUG_ON(create);
13161231

13171232
if (XFS_FORCED_SHUTDOWN(mp))
13181233
return -EIO;
@@ -1321,7 +1236,7 @@ __xfs_get_blocks(
13211236
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
13221237
size = bh_result->b_size;
13231238

1324-
if (!create && offset >= i_size_read(inode))
1239+
if (offset >= i_size_read(inode))
13251240
return 0;
13261241

13271242
/*
@@ -1336,73 +1251,12 @@ __xfs_get_blocks(
13361251
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
13371252
offset_fsb = XFS_B_TO_FSBT(mp, offset);
13381253

1339-
if (create && direct && xfs_is_reflink_inode(ip)) {
1340-
is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
1341-
ASSERT(!is_cow || !isnullstartblock(imap.br_startblock));
1342-
}
1343-
1344-
if (!is_cow) {
1345-
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1346-
&imap, &nimaps, XFS_BMAPI_ENTIRE);
1347-
/*
1348-
* Truncate an overwrite extent if there's a pending CoW
1349-
* reservation before the end of this extent. This
1350-
* forces us to come back to get_blocks to take care of
1351-
* the CoW.
1352-
*/
1353-
if (create && direct && nimaps &&
1354-
imap.br_startblock != HOLESTARTBLOCK &&
1355-
imap.br_startblock != DELAYSTARTBLOCK &&
1356-
!ISUNWRITTEN(&imap))
1357-
xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
1358-
&imap);
1359-
}
1254+
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1255+
&imap, &nimaps, XFS_BMAPI_ENTIRE);
13601256
if (error)
13611257
goto out_unlock;
13621258

1363-
/*
1364-
* The only time we can ever safely find delalloc blocks on direct I/O
1365-
* is a dio write to post-eof speculative preallocation. All other
1366-
* scenarios are indicative of a problem or misuse (such as mixing
1367-
* direct and mapped I/O).
1368-
*
1369-
* The file may be unmapped by the time we get here so we cannot
1370-
* reliably fail the I/O based on mapping. Instead, fail the I/O if this
1371-
* is a read or a write within eof. Otherwise, carry on but warn as a
1372-
* precuation if the file happens to be mapped.
1373-
*/
1374-
if (direct && imap.br_startblock == DELAYSTARTBLOCK) {
1375-
if (!create || offset < i_size_read(VFS_I(ip))) {
1376-
WARN_ON_ONCE(1);
1377-
error = -EIO;
1378-
goto out_unlock;
1379-
}
1380-
WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping));
1381-
}
1382-
1383-
/* for DAX, we convert unwritten extents directly */
1384-
if (create &&
1385-
(!nimaps ||
1386-
(imap.br_startblock == HOLESTARTBLOCK ||
1387-
imap.br_startblock == DELAYSTARTBLOCK) ||
1388-
(IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
1389-
/*
1390-
* xfs_iomap_write_direct() expects the shared lock. It
1391-
* is unlocked on return.
1392-
*/
1393-
if (lockmode == XFS_ILOCK_EXCL)
1394-
xfs_ilock_demote(ip, lockmode);
1395-
1396-
error = xfs_iomap_write_direct(ip, offset, size,
1397-
&imap, nimaps);
1398-
if (error)
1399-
return error;
1400-
new = 1;
1401-
1402-
trace_xfs_get_blocks_alloc(ip, offset, size,
1403-
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1404-
: XFS_IO_DELALLOC, &imap);
1405-
} else if (nimaps) {
1259+
if (nimaps) {
14061260
trace_xfs_get_blocks_found(ip, offset, size,
14071261
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
14081262
: XFS_IO_OVERWRITE, &imap);
@@ -1412,12 +1266,6 @@ __xfs_get_blocks(
14121266
goto out_unlock;
14131267
}
14141268

1415-
if (IS_DAX(inode) && create) {
1416-
ASSERT(!ISUNWRITTEN(&imap));
1417-
/* zeroing is not needed at a higher layer */
1418-
new = 0;
1419-
}
1420-
14211269
/* trim mapping down to size requested */
14221270
xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
14231271

@@ -1427,144 +1275,21 @@ __xfs_get_blocks(
14271275
*/
14281276
if (imap.br_startblock != HOLESTARTBLOCK &&
14291277
imap.br_startblock != DELAYSTARTBLOCK &&
1430-
(create || !ISUNWRITTEN(&imap))) {
1431-
if (create && direct && !is_cow) {
1432-
error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
1433-
&imap);
1434-
if (error)
1435-
return error;
1436-
}
1437-
1278+
!ISUNWRITTEN(&imap))
14381279
xfs_map_buffer(inode, bh_result, &imap, offset);
1439-
if (ISUNWRITTEN(&imap))
1440-
set_buffer_unwritten(bh_result);
1441-
/* direct IO needs special help */
1442-
if (create)
1443-
xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
1444-
}
14451280

14461281
/*
14471282
* If this is a realtime file, data may be on a different device
14481283
* to that pointed to from the buffer_head b_bdev currently.
14491284
*/
14501285
bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1451-
1452-
/*
1453-
* If we previously allocated a block out beyond eof and we are now
1454-
* coming back to use it then we will need to flag it as new even if it
1455-
* has a disk address.
1456-
*
1457-
* With sub-block writes into unwritten extents we also need to mark
1458-
* the buffer as new so that the unwritten parts of the buffer gets
1459-
* correctly zeroed.
1460-
*/
1461-
if (create &&
1462-
((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1463-
(offset >= i_size_read(inode)) ||
1464-
(new || ISUNWRITTEN(&imap))))
1465-
set_buffer_new(bh_result);
1466-
14671286
return 0;
14681287

14691288
out_unlock:
14701289
xfs_iunlock(ip, lockmode);
14711290
return error;
14721291
}
14731292

1474-
int
1475-
xfs_get_blocks(
1476-
struct inode *inode,
1477-
sector_t iblock,
1478-
struct buffer_head *bh_result,
1479-
int create)
1480-
{
1481-
return __xfs_get_blocks(inode, iblock, bh_result, create, false);
1482-
}
1483-
1484-
int
1485-
xfs_get_blocks_direct(
1486-
struct inode *inode,
1487-
sector_t iblock,
1488-
struct buffer_head *bh_result,
1489-
int create)
1490-
{
1491-
return __xfs_get_blocks(inode, iblock, bh_result, create, true);
1492-
}
1493-
1494-
/*
1495-
* Complete a direct I/O write request.
1496-
*
1497-
* xfs_map_direct passes us some flags in the private data to tell us what to
1498-
* do. If no flags are set, then the write IO is an overwrite wholly within
1499-
* the existing allocated file size and so there is nothing for us to do.
1500-
*
1501-
* Note that in this case the completion can be called in interrupt context,
1502-
* whereas if we have flags set we will always be called in task context
1503-
* (i.e. from a workqueue).
1504-
*/
1505-
int
1506-
xfs_end_io_direct_write(
1507-
struct kiocb *iocb,
1508-
loff_t offset,
1509-
ssize_t size,
1510-
void *private)
1511-
{
1512-
struct inode *inode = file_inode(iocb->ki_filp);
1513-
struct xfs_inode *ip = XFS_I(inode);
1514-
uintptr_t flags = (uintptr_t)private;
1515-
int error = 0;
1516-
1517-
trace_xfs_end_io_direct_write(ip, offset, size);
1518-
1519-
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1520-
return -EIO;
1521-
1522-
if (size <= 0)
1523-
return size;
1524-
1525-
/*
1526-
* The flags tell us whether we are doing unwritten extent conversions
1527-
* or an append transaction that updates the on-disk file size. These
1528-
* cases are the only cases where we should *potentially* be needing
1529-
* to update the VFS inode size.
1530-
*/
1531-
if (flags == 0) {
1532-
ASSERT(offset + size <= i_size_read(inode));
1533-
return 0;
1534-
}
1535-
1536-
/*
1537-
* We need to update the in-core inode size here so that we don't end up
1538-
* with the on-disk inode size being outside the in-core inode size. We
1539-
* have no other method of updating EOF for AIO, so always do it here
1540-
* if necessary.
1541-
*
1542-
* We need to lock the test/set EOF update as we can be racing with
1543-
* other IO completions here to update the EOF. Failing to serialise
1544-
* here can result in EOF moving backwards and Bad Things Happen when
1545-
* that occurs.
1546-
*/
1547-
spin_lock(&ip->i_flags_lock);
1548-
if (offset + size > i_size_read(inode))
1549-
i_size_write(inode, offset + size);
1550-
spin_unlock(&ip->i_flags_lock);
1551-
1552-
if (flags & XFS_DIO_FLAG_COW)
1553-
error = xfs_reflink_end_cow(ip, offset, size);
1554-
if (flags & XFS_DIO_FLAG_UNWRITTEN) {
1555-
trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
1556-
1557-
error = xfs_iomap_write_unwritten(ip, offset, size);
1558-
}
1559-
if (flags & XFS_DIO_FLAG_APPEND) {
1560-
trace_xfs_end_io_direct_write_append(ip, offset, size);
1561-
1562-
error = xfs_setfilesize(ip, offset, size);
1563-
}
1564-
1565-
return error;
1566-
}
1567-
15681293
STATIC ssize_t
15691294
xfs_vm_direct_IO(
15701295
struct kiocb *iocb,

fs/xfs/xfs_aops.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,6 @@ struct xfs_ioend {
5555

5656
extern const struct address_space_operations xfs_address_space_operations;
5757

58-
int xfs_get_blocks(struct inode *inode, sector_t offset,
59-
struct buffer_head *map_bh, int create);
60-
int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
61-
struct buffer_head *map_bh, int create);
62-
int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
63-
ssize_t size, void *private);
6458
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
6559

6660
extern void xfs_count_page_state(struct page *, int *, int *);

0 commit comments

Comments
 (0)