Skip to content

Commit 6df3868

Browse files
Vladimir Davydovtorvalds
Vladimir Davydov
authored andcommitted
mm: memcontrol: fix possible memcg leak due to interrupted reclaim
Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break() once enough pages have been reclaimed, in which case, in contrast to a full round-trip over a cgroup sub-tree, the current position stored in mem_cgroup_reclaim_iter of the target cgroup does not get invalidated and so is left holding the reference to the last scanned cgroup. If the target cgroup does not get scanned again (we might have just reclaimed the last page or all processes might exit and free their memory voluntary), we will leak it, because there is nobody to put the reference held by the iterator. The problem is easy to reproduce by running the following command sequence in a loop: mkdir /sys/fs/cgroup/memory/test echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs memhog 150M echo $$ > /sys/fs/cgroup/memory/cgroup.procs rmdir test The cgroups generated by it will never get freed. This patch fixes this issue by making mem_cgroup_iter avoid taking reference to the current position. In order not to hit use-after-free bug while running reclaim in parallel with cgroup deletion, we make use of ->css_released cgroup callback to clear references to the dying cgroup in all reclaim iterators that might refer to it. This callback is called right before scheduling rcu work which will free css, so if we access iter->position from rcu read section, we might be sure it won't go away under us. [hannes@cmpxchg.org: clean up css ref handling] Fixes: 5ac8fb3 ("mm: memcontrol: convert reclaim iterator to simple css refcounting") Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Michal Hocko <mhocko@kernel.org> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: <stable@vger.kernel.org> [3.19+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 5c9ee4c commit 6df3868

File tree

1 file changed

+46
-14
lines changed

1 file changed

+46
-14
lines changed

mm/memcontrol.c

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -903,14 +903,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
903903
if (prev && reclaim->generation != iter->generation)
904904
goto out_unlock;
905905

906-
do {
906+
while (1) {
907907
pos = READ_ONCE(iter->position);
908+
if (!pos || css_tryget(&pos->css))
909+
break;
908910
/*
909-
* A racing update may change the position and
910-
* put the last reference, hence css_tryget(),
911-
* or retry to see the updated position.
911+
* css reference reached zero, so iter->position will
912+
* be cleared by ->css_released. However, we should not
913+
* rely on this happening soon, because ->css_released
914+
* is called from a work queue, and by busy-waiting we
915+
* might block it. So we clear iter->position right
916+
* away.
912917
*/
913-
} while (pos && !css_tryget(&pos->css));
918+
(void)cmpxchg(&iter->position, pos, NULL);
919+
}
914920
}
915921

916922
if (pos)
@@ -956,17 +962,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
956962
}
957963

958964
if (reclaim) {
959-
if (cmpxchg(&iter->position, pos, memcg) == pos) {
960-
if (memcg)
961-
css_get(&memcg->css);
962-
if (pos)
963-
css_put(&pos->css);
964-
}
965-
966965
/*
967-
* pairs with css_tryget when dereferencing iter->position
968-
* above.
966+
* The position could have already been updated by a competing
967+
* thread, so check that the value hasn't changed since we read
968+
* it to avoid reclaiming from the same cgroup twice.
969969
*/
970+
(void)cmpxchg(&iter->position, pos, memcg);
971+
970972
if (pos)
971973
css_put(&pos->css);
972974

@@ -999,6 +1001,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
9991001
css_put(&prev->css);
10001002
}
10011003

1004+
static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1005+
{
1006+
struct mem_cgroup *memcg = dead_memcg;
1007+
struct mem_cgroup_reclaim_iter *iter;
1008+
struct mem_cgroup_per_zone *mz;
1009+
int nid, zid;
1010+
int i;
1011+
1012+
while ((memcg = parent_mem_cgroup(memcg))) {
1013+
for_each_node(nid) {
1014+
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1015+
mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
1016+
for (i = 0; i <= DEF_PRIORITY; i++) {
1017+
iter = &mz->iter[i];
1018+
cmpxchg(&iter->position,
1019+
dead_memcg, NULL);
1020+
}
1021+
}
1022+
}
1023+
}
1024+
}
1025+
10021026
/*
10031027
* Iteration constructs for visiting all cgroups (under a tree). If
10041028
* loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -4324,6 +4348,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
43244348
wb_memcg_offline(memcg);
43254349
}
43264350

4351+
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
4352+
{
4353+
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4354+
4355+
invalidate_reclaim_iterators(memcg);
4356+
}
4357+
43274358
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
43284359
{
43294360
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5185,6 +5216,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
51855216
.css_alloc = mem_cgroup_css_alloc,
51865217
.css_online = mem_cgroup_css_online,
51875218
.css_offline = mem_cgroup_css_offline,
5219+
.css_released = mem_cgroup_css_released,
51885220
.css_free = mem_cgroup_css_free,
51895221
.css_reset = mem_cgroup_css_reset,
51905222
.can_attach = mem_cgroup_can_attach,

0 commit comments

Comments
 (0)