Skip to content

Commit 30467e0

Browse files
rientjestorvalds
authored andcommitted
mm, hotplug: fix concurrent memory hot-add deadlock
There's a deadlock when concurrently hot-adding memory through the probe interface and switching a memory block from offline to online. When hot-adding memory via the probe interface, add_memory() first takes mem_hotplug_begin() and then device_lock() is later taken when registering the newly initialized memory block. This creates a lock dependency of (1) mem_hotplug.lock (2) dev->mutex. When switching a memory block from offline to online, dev->mutex is first grabbed in device_online() when the write(2) transitions an existing memory block from offline to online, and then online_pages() will take mem_hotplug_begin(). This creates a lock inversion between mem_hotplug.lock and dev->mutex. Vitaly reports that this deadlock can happen when kworker handling a probe event races with systemd-udevd switching a memory block's state. This patch requires the state transition to take mem_hotplug_begin() before dev->mutex. Hot-adding memory via the probe interface creates a memory block while holding mem_hotplug_begin(), there is no way to take dev->mutex first in this case. online_pages() and offline_pages() are only called when transitioning memory block state. We now require that mem_hotplug_begin() is taken before calling them -- this requires exporting the mem_hotplug_begin() and mem_hotplug_done() to generic code. In all hot-add and hot-remove cases, mem_hotplug_begin() is done prior to device_online(). This is all that is needed to avoid the deadlock. Signed-off-by: David Rientjes <rientjes@google.com> Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com> Tested-by: Vitaly Kuznetsov <vkuznets@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: "K. Y. Srinivasan" <kys@microsoft.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Zhang Zhen <zhenzhang.zhang@huawei.com> Cc: Vladimir Davydov <vdavydov@parallels.com> Cc: Wang Nan <wangnan0@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 17e0db8 commit 30467e0

File tree

3 files changed

+30
-28
lines changed

3 files changed

+30
-28
lines changed

drivers/base/memory.c

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ static bool pages_correctly_reserved(unsigned long start_pfn)
219219
/*
220220
* MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
221221
* OK to have direct references to sparsemem variables in here.
222+
* Must already be protected by mem_hotplug_begin().
222223
*/
223224
static int
224225
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
@@ -286,6 +287,7 @@ static int memory_subsys_online(struct device *dev)
286287
if (mem->online_type < 0)
287288
mem->online_type = MMOP_ONLINE_KEEP;
288289

290+
/* Already under protection of mem_hotplug_begin() */
289291
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
290292

291293
/* clear online_type */
@@ -328,17 +330,19 @@ store_mem_state(struct device *dev,
328330
goto err;
329331
}
330332

333+
/*
334+
* Memory hotplug needs to hold mem_hotplug_begin() for probe to find
335+
* the correct memory block to online before doing device_online(dev),
336+
* which will take dev->mutex. Take the lock early to prevent an
337+
* inversion, memory_subsys_online() callbacks will be implemented by
338+
* assuming it's already protected.
339+
*/
340+
mem_hotplug_begin();
341+
331342
switch (online_type) {
332343
case MMOP_ONLINE_KERNEL:
333344
case MMOP_ONLINE_MOVABLE:
334345
case MMOP_ONLINE_KEEP:
335-
/*
336-
* mem->online_type is not protected so there can be a
337-
* race here. However, when racing online, the first
338-
* will succeed and the second will just return as the
339-
* block will already be online. The online type
340-
* could be either one, but that is expected.
341-
*/
342346
mem->online_type = online_type;
343347
ret = device_online(&mem->dev);
344348
break;
@@ -349,6 +353,7 @@ store_mem_state(struct device *dev,
349353
ret = -EINVAL; /* should never happen */
350354
}
351355

356+
mem_hotplug_done();
352357
err:
353358
unlock_device_hotplug();
354359

include/linux/memory_hotplug.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,9 @@ extern void get_page_bootmem(unsigned long ingo, struct page *page,
192192
void get_online_mems(void);
193193
void put_online_mems(void);
194194

195+
void mem_hotplug_begin(void);
196+
void mem_hotplug_done(void);
197+
195198
#else /* ! CONFIG_MEMORY_HOTPLUG */
196199
/*
197200
* Stub functions for when hotplug is off
@@ -231,6 +234,9 @@ static inline int try_online_node(int nid)
231234
static inline void get_online_mems(void) {}
232235
static inline void put_online_mems(void) {}
233236

237+
static inline void mem_hotplug_begin(void) {}
238+
static inline void mem_hotplug_done(void) {}
239+
234240
#endif /* ! CONFIG_MEMORY_HOTPLUG */
235241

236242
#ifdef CONFIG_MEMORY_HOTREMOVE

mm/memory_hotplug.c

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ void put_online_mems(void)
104104

105105
}
106106

107-
static void mem_hotplug_begin(void)
107+
void mem_hotplug_begin(void)
108108
{
109109
mem_hotplug.active_writer = current;
110110

@@ -119,7 +119,7 @@ static void mem_hotplug_begin(void)
119119
}
120120
}
121121

122-
static void mem_hotplug_done(void)
122+
void mem_hotplug_done(void)
123123
{
124124
mem_hotplug.active_writer = NULL;
125125
mutex_unlock(&mem_hotplug.lock);
@@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
959959
}
960960

961961

962+
/* Must be protected by mem_hotplug_begin() */
962963
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
963964
{
964965
unsigned long flags;
@@ -969,29 +970,27 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
969970
int ret;
970971
struct memory_notify arg;
971972

972-
mem_hotplug_begin();
973973
/*
974974
* This doesn't need a lock to do pfn_to_page().
975975
* The section can't be removed here because of the
976976
* memory_block->state_mutex.
977977
*/
978978
zone = page_zone(pfn_to_page(pfn));
979979

980-
ret = -EINVAL;
981980
if ((zone_idx(zone) > ZONE_NORMAL ||
982981
online_type == MMOP_ONLINE_MOVABLE) &&
983982
!can_online_high_movable(zone))
984-
goto out;
983+
return -EINVAL;
985984

986985
if (online_type == MMOP_ONLINE_KERNEL &&
987986
zone_idx(zone) == ZONE_MOVABLE) {
988987
if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
989-
goto out;
988+
return -EINVAL;
990989
}
991990
if (online_type == MMOP_ONLINE_MOVABLE &&
992991
zone_idx(zone) == ZONE_MOVABLE - 1) {
993992
if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
994-
goto out;
993+
return -EINVAL;
995994
}
996995

997996
/* Previous code may changed the zone of the pfn range */
@@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
10071006
ret = notifier_to_errno(ret);
10081007
if (ret) {
10091008
memory_notify(MEM_CANCEL_ONLINE, &arg);
1010-
goto out;
1009+
return ret;
10111010
}
10121011
/*
10131012
* If this zone is not populated, then it is not in zonelist.
@@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
10311030
(((unsigned long long) pfn + nr_pages)
10321031
<< PAGE_SHIFT) - 1);
10331032
memory_notify(MEM_CANCEL_ONLINE, &arg);
1034-
goto out;
1033+
return ret;
10351034
}
10361035

10371036
zone->present_pages += onlined_pages;
@@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
10611060

10621061
if (onlined_pages)
10631062
memory_notify(MEM_ONLINE, &arg);
1064-
out:
1065-
mem_hotplug_done();
1066-
return ret;
1063+
return 0;
10671064
}
10681065
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
10691066

@@ -1688,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn,
16881685
if (!test_pages_in_a_zone(start_pfn, end_pfn))
16891686
return -EINVAL;
16901687

1691-
mem_hotplug_begin();
1692-
16931688
zone = page_zone(pfn_to_page(start_pfn));
16941689
node = zone_to_nid(zone);
16951690
nr_pages = end_pfn - start_pfn;
16961691

1697-
ret = -EINVAL;
16981692
if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
1699-
goto out;
1693+
return -EINVAL;
17001694

17011695
/* set above range as isolated */
17021696
ret = start_isolate_page_range(start_pfn, end_pfn,
17031697
MIGRATE_MOVABLE, true);
17041698
if (ret)
1705-
goto out;
1699+
return ret;
17061700

17071701
arg.start_pfn = start_pfn;
17081702
arg.nr_pages = nr_pages;
@@ -1795,7 +1789,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
17951789
writeback_set_ratelimit();
17961790

17971791
memory_notify(MEM_OFFLINE, &arg);
1798-
mem_hotplug_done();
17991792
return 0;
18001793

18011794
failed_removal:
@@ -1805,12 +1798,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
18051798
memory_notify(MEM_CANCEL_OFFLINE, &arg);
18061799
/* pushback to free area */
18071800
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1808-
1809-
out:
1810-
mem_hotplug_done();
18111801
return ret;
18121802
}
18131803

1804+
/* Must be protected by mem_hotplug_begin() */
18141805
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
18151806
{
18161807
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);

0 commit comments

Comments
 (0)