Skip to content

Commit 05f65b5

Browse files
David Herrmann, torvalds
authored and committed
shm: wait for pins to be released when sealing
If we set SEAL_WRITE on a file, we must make sure there cannot be any ongoing write operations on the file. For write() calls, we simply lock the inode mutex; for mmap(), we simply verify there are no writable mappings. However, there might be pages pinned by AIO, Direct-IO and similar operations via GUP. We must make sure those do not write to the memfd file after we set SEAL_WRITE. As there is no way to notify GUP users to drop pages or to wait for them to be done, we implement the wait ourselves: when setting SEAL_WRITE, we check the ref-count of every page. If it is bigger than 1, we know there is some user of the page. We then mark the page and wait for up to 150ms for those ref-counts to be dropped. If the ref-counts are not dropped in time, we refuse the seal operation. Signed-off-by: David Herrmann <dh.herrmann@gmail.com> Acked-by: Hugh Dickins <hughd@google.com> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Ryan Lortie <desrt@desrt.ca> Cc: Lennart Poettering <lennart@poettering.net> Cc: Daniel Mack <zonque@gmail.com> Cc: Andy Lutomirski <luto@amacapital.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 87b2d44 commit 05f65b5

File tree

1 file changed

+109
-1
lines changed

1 file changed

+109
-1
lines changed

mm/shmem.c

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1828,9 +1828,117 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
18281828
return offset;
18291829
}
18301830

1831+
/*
1832+
* We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
1833+
* so reuse a tag which we firmly believe is never set or cleared on shmem.
1834+
*/
1835+
#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE
1836+
/*
 * Index of the final scan in shmem_wait_for_pins(). Scan 0 does not sleep;
 * each scan 1..LAST_SCAN first sleeps (HZ << scan) / 200 jiffies, so the
 * sleeps add up to HZ * (2 + 4 + 8 + 16) / 200 jiffies.
 */
#define LAST_SCAN 4 /* about 150ms max */
1837+
1838+
/*
 * Walk every slot of @mapping's page tree and set SHMEM_TAG_PINNED on each
 * page whose page_count() exceeds its page_mapcount() by more than one.
 * The walk runs under rcu_read_lock(); tree_lock is taken only around the
 * individual tag updates.
 */
static void shmem_tag_pins(struct address_space *mapping)
1839+
{
1840+
struct radix_tree_iter iter;
1841+
void **slot;
1842+
pgoff_t start;
1843+
struct page *page;
1844+
1845+
/* NOTE(review): presumably flushes this CPU's pending LRU batches so they
 * do not show up as extra page references - confirm. */
lru_add_drain();
1846+
start = 0;
1847+
rcu_read_lock();
1848+
1849+
restart:
1850+
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
1851+
page = radix_tree_deref_slot(slot);
1852+
/* Empty and exceptional entries are never tagged. */
if (!page || radix_tree_exception(page)) {
1853+
/* A retry entry means the lookup must be redone: walk again from 'start'. */
if (radix_tree_deref_retry(page))
1854+
goto restart;
1855+
/* More than one reference not accounted for by mappings: treat as pinned. */
} else if (page_count(page) - page_mapcount(page) > 1) {
1856+
/* Tag changes are made under tree_lock with interrupts disabled. */
spin_lock_irq(&mapping->tree_lock);
1857+
radix_tree_tag_set(&mapping->page_tree, iter.index,
1858+
SHMEM_TAG_PINNED);
1859+
spin_unlock_irq(&mapping->tree_lock);
1860+
}
1861+
1862+
/*
 * If a reschedule is due, yield via cond_resched_rcu() and then resume
 * the walk at the index following the current one.
 */
if (need_resched()) {
1863+
cond_resched_rcu();
1864+
start = iter.index + 1;
1865+
goto restart;
1866+
}
1867+
}
1868+
rcu_read_unlock();
1869+
}
1870+
1871+
/*
1872+
* Setting SEAL_WRITE requires us to verify there's no pending writer. However,
1873+
* via get_user_pages(), drivers might have some pending I/O without any active
1874+
* user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
1875+
* and see whether it has an elevated ref-count. If so, we tag them and wait for
1876+
* them to be dropped.
1877+
* The caller must guarantee that no new user will acquire writable references
1878+
* to those pages to avoid races.
1879+
*/
18311880
/*
 * Returns 0 when no tagged page is still found pinned, or -EBUSY when, on
 * the last scan, at least one tagged page still has
 * page_count() - page_mapcount() != 1. Every SHMEM_TAG_PINNED tag visited
 * by the last scan is cleared, whether or not its page is still pinned.
 */
static int shmem_wait_for_pins(struct address_space *mapping)
18321881
{
1833-
return 0;
1882+
struct radix_tree_iter iter;
1883+
void **slot;
1884+
pgoff_t start;
1885+
struct page *page;
1886+
int error, scan;
1887+
1888+
/* First pass: tag every page that currently looks pinned. */
shmem_tag_pins(mapping);
1889+
1890+
error = 0;
1891+
/* Up to LAST_SCAN + 1 passes over the tagged pages (scan 0..LAST_SCAN). */
for (scan = 0; scan <= LAST_SCAN; scan++) {
1892+
/* Nothing tagged any more: no pinned pages remain to wait for. */
if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
1893+
break;
1894+
1895+
/*
 * Scan 0 calls lru_add_drain_all() instead of sleeping. Later scans
 * sleep (HZ << scan) / 200 jiffies first, doubling each time. A
 * non-zero return from schedule_timeout_killable() makes this pass
 * the final one.
 * NOTE(review): presumably that means the sleep was cut short by a
 * fatal signal - confirm.
 */
if (!scan)
1896+
lru_add_drain_all();
1897+
else if (schedule_timeout_killable((HZ << scan) / 200))
1898+
scan = LAST_SCAN;
1899+
1900+
start = 0;
1901+
rcu_read_lock();
1902+
restart:
1903+
/* Visit only the entries that still carry SHMEM_TAG_PINNED. */
radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
1904+
start, SHMEM_TAG_PINNED) {
1905+
1906+
page = radix_tree_deref_slot(slot);
1907+
if (radix_tree_exception(page)) {
1908+
/* A retry entry means the lookup must be redone from 'start'. */
if (radix_tree_deref_retry(page))
1909+
goto restart;
1910+
1911+
/* Any other exceptional entry is not a page: fall through and drop its tag. */
page = NULL;
1912+
}
1913+
1914+
/* Still pinned: reference count beyond mappings is not exactly one. */
if (page &&
1915+
page_count(page) - page_mapcount(page) != 1) {
1916+
/* Not the final pass yet: keep the tag so a later scan rechecks this page. */
if (scan < LAST_SCAN)
1917+
goto continue_resched;
1918+
1919+
/*
1920+
* On the last scan, we clean up all those tags
1921+
* we inserted; but make a note that we still
1922+
* found pages pinned.
1923+
*/
1924+
error = -EBUSY;
1925+
}
1926+
1927+
/* Tag changes are made under tree_lock with interrupts disabled. */
spin_lock_irq(&mapping->tree_lock);
1928+
radix_tree_tag_clear(&mapping->page_tree,
1929+
iter.index, SHMEM_TAG_PINNED);
1930+
spin_unlock_irq(&mapping->tree_lock);
1931+
continue_resched:
1932+
/*
 * If a reschedule is due, yield via cond_resched_rcu() and then resume
 * the tagged walk at the index following the current one.
 */
if (need_resched()) {
1933+
cond_resched_rcu();
1934+
start = iter.index + 1;
1935+
goto restart;
1936+
}
1937+
}
1938+
rcu_read_unlock();
1939+
}
1940+
1941+
return error;
18341942
}
18351943

18361944
#define F_ALL_SEALS (F_SEAL_SEAL | \

0 commit comments

Comments
 (0)