
Commit 36bc08c

Gu Zheng authored and bcrl committed
fs/aio: Add support to aio ring pages migration
Because an aio job pins its ring pages, memory migration of those pages fails. To fix this, use an anon inode to manage the aio ring pages and set up the migratepage callback in the anon inode's address space, so that during memory migration the aio ring pages can be moved safely to another memory node.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
1 parent: 5570869 · commit: 36bc08c
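
The heart of the change is the new migratepage callback. The following restates aio_migratepage() exactly as it appears in the fs/aio.c diff below, with explanatory comments added; the comments are editorial, not part of the commit:

static int aio_migratepage(struct address_space *mapping, struct page *new,
			struct page *old, enum migrate_mode mode)
{
	struct kioctx *ctx = mapping->private_data;
	unsigned long flags;
	unsigned idx = old->index;
	int rc;

	/* Writeback must be complete */
	BUG_ON(PageWriteback(old));
	/* Drop the extra reference the ring holds so migration may proceed. */
	put_page(old);

	/* Atomically replace 'old' with 'new' in the anon inode's mapping. */
	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
	if (rc != MIGRATEPAGE_SUCCESS) {
		/* Migration refused: restore the reference dropped above. */
		get_page(old);
		return rc;
	}

	/* The ring now holds the replacement page instead. */
	get_page(new);

	/* Block event completion while contents and the cached pointer move. */
	spin_lock_irqsave(&ctx->completion_lock, flags);
	migrate_page_copy(new, old);
	ctx->ring_pages[idx] = new;
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	return rc;
}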

File tree: 3 files changed, 112 additions & 12 deletions

fs/aio.c

Lines changed: 108 additions & 11 deletions
@@ -35,6 +35,9 @@
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
+#include <linux/anon_inodes.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -110,6 +113,7 @@ struct kioctx {
 	} ____cacheline_aligned_in_smp;
 
 	struct page		*internal_pages[AIO_RING_PAGES];
+	struct file		*aio_ring_file;
 };
 
 /*------ sysctl variables----*/
@@ -138,36 +142,124 @@ __initcall(aio_setup);
 
 static void aio_free_ring(struct kioctx *ctx)
 {
-	long i;
+	int i;
+	struct file *aio_ring_file = ctx->aio_ring_file;
 
-	for (i = 0; i < ctx->nr_pages; i++)
+	for (i = 0; i < ctx->nr_pages; i++) {
+		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+				page_count(ctx->ring_pages[i]));
 		put_page(ctx->ring_pages[i]);
+	}
 
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
 		kfree(ctx->ring_pages);
+
+	if (aio_ring_file) {
+		truncate_setsize(aio_ring_file->f_inode, 0);
+		pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n",
+			current->pid, aio_ring_file->f_inode->i_nlink,
+			aio_ring_file->f_path.dentry->d_count,
+			d_unhashed(aio_ring_file->f_path.dentry),
+			atomic_read(&aio_ring_file->f_inode->i_count));
+		fput(aio_ring_file);
+		ctx->aio_ring_file = NULL;
+	}
 }
 
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+static const struct file_operations aio_ring_fops = {
+	.mmap = aio_ring_mmap,
+};
+
+static int aio_set_page_dirty(struct page *page)
+{
+	return 0;
+}
+
+static int aio_migratepage(struct address_space *mapping, struct page *new,
+			struct page *old, enum migrate_mode mode)
+{
+	struct kioctx *ctx = mapping->private_data;
+	unsigned long flags;
+	unsigned idx = old->index;
+	int rc;
+
+	/* Writeback must be complete */
+	BUG_ON(PageWriteback(old));
+	put_page(old);
+
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	if (rc != MIGRATEPAGE_SUCCESS) {
+		get_page(old);
+		return rc;
+	}
+
+	get_page(new);
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	return rc;
+}
+
+static const struct address_space_operations aio_ctx_aops = {
+	.set_page_dirty	= aio_set_page_dirty,
+	.migratepage	= aio_migratepage,
+};
+
 static int aio_setup_ring(struct kioctx *ctx)
 {
 	struct aio_ring *ring;
 	unsigned nr_events = ctx->max_reqs;
 	struct mm_struct *mm = current->mm;
 	unsigned long size, populate;
 	int nr_pages;
+	int i;
+	struct file *file;
 
 	/* Compensate for the ring buffer's head/tail overlap entry */
 	nr_events += 2;	/* 1 is required, 2 for good luck */
 
 	size = sizeof(struct aio_ring);
 	size += sizeof(struct io_event) * nr_events;
-	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
 
+	nr_pages = PFN_UP(size);
 	if (nr_pages < 0)
 		return -EINVAL;
 
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
+	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
+	if (IS_ERR(file)) {
+		ctx->aio_ring_file = NULL;
+		return -EAGAIN;
+	}
+
+	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
+	file->f_inode->i_mapping->private_data = ctx;
+	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+		page = find_or_create_page(file->f_inode->i_mapping,
+					   i, GFP_HIGHUSER | __GFP_ZERO);
+		if (!page)
+			break;
+		pr_debug("pid(%d) page[%d]->count=%d\n",
+			 current->pid, i, page_count(page));
+		SetPageUptodate(page);
+		SetPageDirty(page);
+		unlock_page(page);
+	}
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);
 
-	ctx->nr_events = 0;
 	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
 		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
@@ -178,28 +270,31 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
 	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+
 	down_write(&mm->mmap_sem);
-	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
-				       PROT_READ|PROT_WRITE,
-				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
+				       PROT_READ | PROT_WRITE,
+				       MAP_SHARED | MAP_POPULATE, 0, &populate);
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
+	up_write(&mm->mmap_sem);
+
+	mm_populate(ctx->mmap_base, populate);
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
 	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
 				       1, 0, ctx->ring_pages, NULL);
-	up_write(&mm->mmap_sem);
+	for (i = 0; i < ctx->nr_pages; i++)
+		put_page(ctx->ring_pages[i]);
 
 	if (unlikely(ctx->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
-	if (populate)
-		mm_populate(ctx->mmap_base, populate);
 
 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events;	/* trusted copy */
@@ -399,6 +494,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	err = -EAGAIN;
 	aio_free_ring(ctx);
 out_freectx:
+	if (ctx->aio_ring_file)
+		fput(ctx->aio_ring_file);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
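
(Editorial note on the new page lifetime, inferred from the diff above.) The ring pages are now kept alive by the anon inode's page cache rather than by long-lived get_user_pages() pins: aio_setup_ring() drops the get_user_pages() references as soon as the page pointers are cached, and aio_free_ring() truncates the backing file before the final fput(). The put_page()/get_page() pair in aio_migratepage() then hands the ring's remaining reference from the old page to the new one, which is what lets the migration core treat ring pages as ordinary movable page-cache pages.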

include/linux/migrate.h

Lines changed: 3 additions & 0 deletions
@@ -55,6 +55,9 @@ extern int migrate_vmas(struct mm_struct *mm,
 extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
+extern int migrate_page_move_mapping(struct address_space *mapping,
+		struct page *newpage, struct page *page,
+		struct buffer_head *head, enum migrate_mode mode);
 #else
 
 static inline void putback_lru_pages(struct list_head *l) {}

mm/migrate.c

Lines changed: 1 addition & 1 deletion
@@ -307,7 +307,7 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
  * 2 for pages with a mapping
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
-static int migrate_page_move_mapping(struct address_space *mapping,
+int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page,
 		struct buffer_head *head, enum migrate_mode mode)
 {
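
For comparison (an editorial addition, not part of the commit), the generic migrate_page() entry point in the same file is built on the two helpers that aio_migratepage() now reuses: the newly exported migrate_page_move_mapping() moves the page-cache slot, then migrate_page_copy() transfers contents and page state. Paraphrased from mm/migrate.c of this era, it looks roughly like this; a NULL buffer_head is passed for mappings without buffers, exactly as fs/aio.c does above:

int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	/* Move the page-cache slot first; fail early if the page is busy. */
	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
	if (rc != MIGRATEPAGE_SUCCESS)
		return rc;

	/* Then transfer contents and page state to the new page. */
	migrate_page_copy(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}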
