
Commit c1a4de9

aagit authored and torvalds committed
userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation
This implements mcopy_atomic and mfill_zeropage, the low-level VM methods that are invoked respectively by the UFFDIO_COPY and UFFDIO_ZEROPAGE userfaultfd commands.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 1f1c6f0 commit c1a4de9
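
Since this commit only adds the low-level methods, the ioctl plumbing that calls them lands later in the series. Below is a minimal userspace sketch of how the two commands drive these methods once that plumbing exists; the uffdio_copy and uffdio_zeropage layouts are taken from the eventual userfaultfd ABI, not from this patch.

/* Userspace sketch (assumption: the UFFDIO_COPY/UFFDIO_ZEROPAGE ABI
 * wired up later in this series). Resolves a missing-page fault by
 * copying a prepared page, or by mapping the zero page, into the
 * faulting range of the registered vma. */
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <string.h>

static int resolve_with_copy(int uffd, unsigned long fault_addr,
			     void *src_page, unsigned long page_size)
{
	struct uffdio_copy copy;

	memset(&copy, 0, sizeof(copy));
	/* dst and len must be page aligned; __mcopy_atomic BUG()s otherwise */
	copy.dst = fault_addr & ~(page_size - 1);
	copy.src = (unsigned long) src_page;
	copy.len = page_size;
	if (ioctl(uffd, UFFDIO_COPY, &copy))
		return -1;
	/* copy.copy reports the bytes actually mapped into the target mm */
	return 0;
}

static int resolve_with_zeropage(int uffd, unsigned long fault_addr,
				 unsigned long page_size)
{
	struct uffdio_zeropage zero;

	memset(&zero, 0, sizeof(zero));
	zero.range.start = fault_addr & ~(page_size - 1);
	zero.range.len = page_size;
	return ioctl(uffd, UFFDIO_ZEROPAGE, &zero) ? -1 : 0;
}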

3 files changed, +276 -0 lines changed


include/linux/userfaultfd_k.h

Lines changed: 6 additions & 0 deletions
@@ -30,6 +30,12 @@
 extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 			    unsigned int flags, unsigned long reason);
 
+extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+			    unsigned long src_start, unsigned long len);
+extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
+			      unsigned long dst_start,
+			      unsigned long len);
+
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 						   struct vm_userfaultfd_ctx vm_ctx)
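
These prototypes are the entry points for the UFFDIO_COPY/UFFDIO_ZEROPAGE ioctl handlers added later in the series. A kernel-side sketch of such a caller follows; userfaultfd_ctx, its mm back-pointer, and the uffdio_copy layout are assumptions drawn from the rest of the series, not from this patch.

/* Sketch only, under the assumptions noted above: drives mcopy_atomic()
 * from an ioctl handler and reports a possibly partial copy back to
 * userland through the copy field of the ABI struct. */
static int userfaultfd_copy(struct userfaultfd_ctx *ctx, unsigned long arg)
{
	struct uffdio_copy uffdio_copy;
	struct uffdio_copy __user *user_uffdio_copy;
	ssize_t ret;

	user_uffdio_copy = (struct uffdio_copy __user *) arg;
	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
			   sizeof(uffdio_copy)))
		return -EFAULT;

	/* mcopy_atomic() returns bytes copied or a negative errno */
	ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
			   uffdio_copy.len);
	/* report bytes copied (or the error) back through the ABI */
	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
		return -EFAULT;
	if (ret < 0)
		return ret;
	/* a short fill means userland should retry the remainder */
	return ret == uffdio_copy.len ? 0 : -EAGAIN;
}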

mm/Makefile

Lines changed: 1 addition & 0 deletions
@@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o

mm/userfaultfd.c

Lines changed: 269 additions & 0 deletions
@@ -0,0 +1,269 @@
/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include "internal.h"

static int mcopy_atomic_pte(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr)
{
	struct mem_cgroup *memcg;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	struct page *page;
	void *page_kaddr;
	int ret;

	ret = -ENOMEM;
	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
	if (!page)
		goto out;

	page_kaddr = kmap(page);
	ret = -EFAULT;
	if (copy_from_user(page_kaddr, (const void __user *) src_addr,
			   PAGE_SIZE))
		goto out_kunmap_release;
	kunmap(page);

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	ret = -ENOMEM;
	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
		goto out_release;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));

	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_release_uncharge_unlock;

	inc_mm_counter(dst_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, dst_vma, dst_addr);
	mem_cgroup_commit_charge(page, memcg, false);
	lru_cache_add_active_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);

	pte_unmap_unlock(dst_pte, ptl);
	ret = 0;
out:
	return ret;
out_release_uncharge_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	mem_cgroup_cancel_charge(page, memcg);
out_release:
	page_cache_release(page);
	goto out;
out_kunmap_release:
	kunmap(page);
	goto out_release;
}

static int mfill_zeropage_pte(struct mm_struct *dst_mm,
			      pmd_t *dst_pmd,
			      struct vm_area_struct *dst_vma,
			      unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_unlock;
	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	return ret;
}

static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, address);
	pud = pud_alloc(mm, pgd, address);
	if (pud)
		/*
		 * Note that we didn't run this because the pmd was
		 * missing; the *pmd may already be established and in
		 * turn it may also be a trans_huge_pmd.
		 */
		pmd = pmd_alloc(mm, pud, address);
	return pmd;
}

static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied = 0;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	down_read(&dst_mm->mmap_sem);

	/*
	 * Make sure the vma is not shared, and that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	err = -EINVAL;
	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
		goto out;
	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		goto out;

	/*
	 * Be strict and only allow __mcopy_atomic on userfaultfd
	 * registered ranges to prevent userland errors going
	 * unnoticed. As far as the VM consistency is concerned, it
	 * would be perfectly safe to remove this check, but there's
	 * no useful usage for __mcopy_atomic outside of userfaultfd
	 * registered ranges. This is after all why these are ioctls
	 * belonging to the userfaultfd and not syscalls.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out;

	/*
	 * FIXME: only allow copying on anonymous vmas, tmpfs should
	 * be added.
	 */
	if (dst_vma->vm_ops)
		goto out;

	/*
	 * Ensure the dst_vma has an anon_vma, or this page
	 * would get a NULL anon_vma when moved into the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (unlikely(anon_vma_prepare(dst_vma)))
		goto out;

	for (src_addr = src_start, dst_addr = dst_start;
	     src_addr < src_start + len; ) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
					 dst_addr))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		if (!zeropage)
			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
					       dst_addr, src_addr);
		else
			err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
						 dst_addr);

		cond_resched();

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out:
	up_read(&dst_mm->mmap_sem);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}

ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
		     unsigned long src_start, unsigned long len)
{
	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
}

ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len)
{
	return __mcopy_atomic(dst_mm, start, 0, len, true);
}
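
Note the return convention of __mcopy_atomic() above: a positive value is the number of bytes filled (possibly short of len), a negative value is an errno, and zero is never returned for a non-empty range (BUG_ON(!copied && !err)). A hypothetical caller-side retry loop illustrating that convention; mcopy_atomic_all is not part of this patch:

/* Hypothetical caller illustrating the return convention: positive =
 * bytes filled (possibly short), negative = errno, never 0 for a
 * non-empty range. Retries until the whole range is filled or an
 * error occurs before any progress is made. */
static ssize_t mcopy_atomic_all(struct mm_struct *dst_mm,
				unsigned long dst_start,
				unsigned long src_start,
				unsigned long len)
{
	unsigned long done = 0;
	ssize_t ret;

	while (done < len) {
		ret = mcopy_atomic(dst_mm, dst_start + done,
				   src_start + done, len - done);
		if (ret < 0)
			/* report partial progress if any, else the error */
			return done ? (ssize_t) done : ret;
		done += ret;
	}
	return done;
}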
