Skip to content

Commit d00806b

Browse files
Nick Piggin authored and Linus Torvalds committed
mm: fix fault vs invalidate race for linear mappings
Fix the race between invalidate_inode_pages and do_no_page. Andrea Arcangeli identified a subtle race between invalidation of pages from pagecache with userspace mappings, and do_no_page. The issue is that invalidation has to shoot down all mappings to the page, before it can be discarded from the pagecache. Between shooting down ptes to a particular page, and actually dropping the struct page from the pagecache, do_no_page from any process might fault on that page and establish a new mapping to the page just before it gets discarded from the pagecache. The most common case where such invalidation is used is in file truncation. This case was catered for by doing a sort of open-coded seqlock between the file's i_size, and its truncate_count. Truncation will decrease i_size, then increment truncate_count before unmapping userspace pages; do_no_page will read truncate_count, then find the page if it is within i_size, and then check truncate_count under the page table lock and back out and retry if it had subsequently been changed (ptl will serialise against unmapping, and ensure a potentially updated truncate_count is actually visible). Complexity and documentation issues aside, the locking protocol fails in the case where we would like to invalidate pagecache inside i_size. do_no_page can come in anytime and filemap_nopage is not aware of the invalidation in progress (as it is when it is outside i_size). The end result is that dangling (->mapping == NULL) pages that appear to be from a particular file may be mapped into userspace with nonsense data. Valid mappings to the same place will see a different page. Andrea implemented two working fixes, one using a real seqlock, another using a page->flags bit. He also proposed using the page lock in do_no_page, but that was initially considered too heavyweight. 
However, it is not a global or per-file lock, and the page cacheline is modified in do_no_page to increment _count and _mapcount anyway, so a further modification should not be a large performance hit. Scalability is not an issue. This patch implements this latter approach. ->nopage implementations return with the page locked if it is possible for their underlying file to be invalidated (in that case, they must set a special vm_flags bit to indicate so). do_no_page only unlocks the page after setting up the mapping completely. invalidation is excluded because it holds the page lock during invalidation of each page (and ensures that the page is not mapped while holding the lock). This also allows significant simplifications in do_no_page, because we have the page locked in the right place in the pagecache from the start. Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 589f1e8 commit d00806b

File tree

10 files changed

+127
-116
lines changed

10 files changed

+127
-116
lines changed

fs/gfs2/ops_file.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
364364
else
365365
vma->vm_ops = &gfs2_vm_ops_private;
366366

367+
vma->vm_flags |= VM_CAN_INVALIDATE;
368+
367369
gfs2_glock_dq_uninit(&i_gh);
368370

369371
return error;

fs/gfs2/ops_vm.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,8 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
138138
if (alloc_required) {
139139
error = alloc_page_backing(ip, result);
140140
if (error) {
141+
if (area->vm_flags & VM_CAN_INVALIDATE)
142+
unlock_page(result);
141143
page_cache_release(result);
142144
result = NULL;
143145
goto out;

fs/ncpfs/mmap.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
123123
return -EFBIG;
124124

125125
vma->vm_ops = &ncp_file_mmap;
126+
vma->vm_flags |= VM_CAN_INVALIDATE;
126127
file_accessed(file);
127128
return 0;
128129
}

fs/ocfs2/mmap.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
226226
ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
227227
out:
228228
vma->vm_ops = &ocfs2_file_vm_ops;
229+
vma->vm_flags |= VM_CAN_INVALIDATE;
229230
return 0;
230231
}
231232

fs/xfs/linux-2.6/xfs_file.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,7 @@ xfs_file_mmap(
310310
struct vm_area_struct *vma)
311311
{
312312
vma->vm_ops = &xfs_file_vm_ops;
313+
vma->vm_flags |= VM_CAN_INVALIDATE;
313314

314315
#ifdef CONFIG_XFS_DMAPI
315316
if (vn_from_inode(filp->f_path.dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI)

include/linux/mm.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,12 @@ extern unsigned int kobjsize(const void *objp);
168168
#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */
169169
#define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */
170170

171+
#define VM_CAN_INVALIDATE 0x08000000 /* The mapping may be invalidated,
172+
* eg. truncate or invalidate_inode_*.
173+
* In this case, do_no_page must
174+
* return with the page locked.
175+
*/
176+
171177
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
172178
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
173179
#endif

mm/filemap.c

Lines changed: 20 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1325,9 +1325,10 @@ struct page *filemap_nopage(struct vm_area_struct *area,
13251325
unsigned long size, pgoff;
13261326
int did_readaround = 0, majmin = VM_FAULT_MINOR;
13271327

1328+
BUG_ON(!(area->vm_flags & VM_CAN_INVALIDATE));
1329+
13281330
pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
13291331

1330-
retry_all:
13311332
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
13321333
if (pgoff >= size)
13331334
goto outside_data_content;
@@ -1349,7 +1350,7 @@ struct page *filemap_nopage(struct vm_area_struct *area,
13491350
* Do we have something in the page cache already?
13501351
*/
13511352
retry_find:
1352-
page = find_get_page(mapping, pgoff);
1353+
page = find_lock_page(mapping, pgoff);
13531354
if (!page) {
13541355
unsigned long ra_pages;
13551356

@@ -1383,7 +1384,7 @@ struct page *filemap_nopage(struct vm_area_struct *area,
13831384
start = pgoff - ra_pages / 2;
13841385
do_page_cache_readahead(mapping, file, start, ra_pages);
13851386
}
1386-
page = find_get_page(mapping, pgoff);
1387+
page = find_lock_page(mapping, pgoff);
13871388
if (!page)
13881389
goto no_cached_page;
13891390
}
@@ -1392,13 +1393,19 @@ struct page *filemap_nopage(struct vm_area_struct *area,
13921393
ra->mmap_hit++;
13931394

13941395
/*
1395-
* Ok, found a page in the page cache, now we need to check
1396-
* that it's up-to-date.
1396+
* We have a locked page in the page cache, now we need to check
1397+
* that it's up-to-date. If not, it is going to be due to an error.
13971398
*/
1398-
if (!PageUptodate(page))
1399+
if (unlikely(!PageUptodate(page)))
13991400
goto page_not_uptodate;
14001401

1401-
success:
1402+
/* Must recheck i_size under page lock */
1403+
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1404+
if (unlikely(pgoff >= size)) {
1405+
unlock_page(page);
1406+
goto outside_data_content;
1407+
}
1408+
14021409
/*
14031410
* Found the page and have a reference on it.
14041411
*/
@@ -1440,6 +1447,7 @@ struct page *filemap_nopage(struct vm_area_struct *area,
14401447
return NOPAGE_SIGBUS;
14411448

14421449
page_not_uptodate:
1450+
/* IO error path */
14431451
if (!did_readaround) {
14441452
majmin = VM_FAULT_MAJOR;
14451453
count_vm_event(PGMAJFAULT);
@@ -1451,37 +1459,15 @@ struct page *filemap_nopage(struct vm_area_struct *area,
14511459
* because there really aren't any performance issues here
14521460
* and we need to check for errors.
14531461
*/
1454-
lock_page(page);
1455-
1456-
/* Somebody truncated the page on us? */
1457-
if (!page->mapping) {
1458-
unlock_page(page);
1459-
page_cache_release(page);
1460-
goto retry_all;
1461-
}
1462-
1463-
/* Somebody else successfully read it in? */
1464-
if (PageUptodate(page)) {
1465-
unlock_page(page);
1466-
goto success;
1467-
}
14681462
ClearPageError(page);
14691463
error = mapping->a_ops->readpage(file, page);
1470-
if (!error) {
1471-
wait_on_page_locked(page);
1472-
if (PageUptodate(page))
1473-
goto success;
1474-
} else if (error == AOP_TRUNCATED_PAGE) {
1475-
page_cache_release(page);
1464+
page_cache_release(page);
1465+
1466+
if (!error || error == AOP_TRUNCATED_PAGE)
14761467
goto retry_find;
1477-
}
14781468

1479-
/*
1480-
* Things didn't work out. Return zero to tell the
1481-
* mm layer so, possibly freeing the page cache page first.
1482-
*/
1469+
/* Things didn't work out. Return zero to tell the mm layer so. */
14831470
shrink_readahead_size_eio(file, ra);
1484-
page_cache_release(page);
14851471
return NOPAGE_SIGBUS;
14861472
}
14871473
EXPORT_SYMBOL(filemap_nopage);
@@ -1674,6 +1660,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
16741660
return -ENOEXEC;
16751661
file_accessed(file);
16761662
vma->vm_ops = &generic_file_vm_ops;
1663+
vma->vm_flags |= VM_CAN_INVALIDATE;
16771664
return 0;
16781665
}
16791666

0 commit comments

Comments (0)