Skip to content

Commit 485761b

Browse files
Andi KleenLinus Torvalds
authored andcommitted
[PATCH] x86_64: Tell VM about holes in nodes
Some nodes can have large holes on x86-64. This fixes problems with the VM allowing too many dirty pages because it overestimates the amount of RAM available in a node. In extreme cases you can end up with all RAM filled with dirty pages, which can lead to deadlocks and other nasty behaviour. This patch just tells the VM about the known holes from e820. Reserved memory (like the kernel text or mem_map) is still not taken into account, but that should only be a few percent of error now. A small detail is that the flat setup now uses the NUMA free_area_init_node() too, because it offers more flexibility. (akpm: lotsa thanks to Martin for working this problem out) Cc: Martin Bligh <mbligh@mbligh.org> Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
1 parent bebf468 commit 485761b

File tree

4 files changed

+55
-5
lines changed

4 files changed

+55
-5
lines changed

arch/x86_64/kernel/e820.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,40 @@ unsigned long __init e820_end_of_ram(void)
185185
}
186186

187187
/*
188+
* Compute how much memory is missing in a range.
189+
* Unlike the other functions in this file the arguments are in page numbers.
190+
*/
191+
unsigned long __init
192+
e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
193+
{
194+
unsigned long ram = 0;
195+
unsigned long start = start_pfn << PAGE_SHIFT;
196+
unsigned long end = end_pfn << PAGE_SHIFT;
197+
int i;
198+
for (i = 0; i < e820.nr_map; i++) {
199+
struct e820entry *ei = &e820.map[i];
200+
unsigned long last, addr;
201+
202+
if (ei->type != E820_RAM ||
203+
ei->addr+ei->size <= start ||
204+
ei->addr >= end)
205+
continue;
206+
207+
addr = round_up(ei->addr, PAGE_SIZE);
208+
if (addr < start)
209+
addr = start;
210+
211+
last = round_down(ei->addr + ei->size, PAGE_SIZE);
212+
if (last >= end)
213+
last = end;
214+
215+
if (last > addr)
216+
ram += last - addr;
217+
}
218+
return ((end - start) - ram) >> PAGE_SHIFT;
219+
}
220+
221+
/*
188222
* Mark e820 reserved areas as busy for the resource manager.
189223
*/
190224
void __init e820_reserve_resources(void)

arch/x86_64/mm/init.c

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -322,18 +322,26 @@ void zap_low_mappings(void)
322322
/*
 * Rendered diff of paging_init(). A bare marker line "N-" precedes a line
 * removed by the patch, "N+" precedes an added line, and a marker made of
 * two fused numbers precedes an unchanged context line.
 *
 * After the patch the function sizes ZONE_DMA and ZONE_NORMAL from end_pfn
 * and max_dma, computes the per-zone hole counts with e820_hole_size(), and
 * hands both arrays to free_area_init_node() for node 0 instead of calling
 * free_area_init().
 */
void __init paging_init(void)
323323
{
324324
{
325-
unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
325+
unsigned long zones_size[MAX_NR_ZONES];
326+
/* Per-zone count of pages that are holes, filled from e820_hole_size(). */
unsigned long holes[MAX_NR_ZONES];
326327
unsigned int max_dma;
327328

329+
/* Both arrays start zeroed; zones not assigned below stay empty. */
memset(zones_size, 0, sizeof(zones_size));
330+
memset(holes, 0, sizeof(holes));
331+
328332
/* Physical address of MAX_DMA_ADDRESS converted to a page frame number. */
max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
329333

330-
if (end_pfn < max_dma)
334+
/* end_pfn below max_dma: only ZONE_DMA is sized, ZONE_NORMAL stays 0. */
if (end_pfn < max_dma) {
331335
zones_size[ZONE_DMA] = end_pfn;
332-
else {
336+
holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
337+
} else {
333338
/* Otherwise ZONE_DMA covers [0, max_dma) and ZONE_NORMAL the rest. */
zones_size[ZONE_DMA] = max_dma;
339+
holes[ZONE_DMA] = e820_hole_size(0, max_dma);
334340
zones_size[ZONE_NORMAL] = end_pfn - max_dma;
341+
holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn);
335342
}
336-
free_area_init(zones_size);
343+
/* Node 0 is initialised with the zone sizes and the hole counts. */
free_area_init_node(0, NODE_DATA(0), zones_size,
344+
__pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
337345
}
338346
return;
339347
}

arch/x86_64/mm/numa.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,11 @@ void __init setup_node_zones(int nodeid)
126126
{
127127
unsigned long start_pfn, end_pfn;
128128
unsigned long zones[MAX_NR_ZONES];
129+
unsigned long holes[MAX_NR_ZONES];
129130
unsigned long dma_end_pfn;
130131

131132
memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
133+
memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
132134

133135
start_pfn = node_start_pfn(nodeid);
134136
end_pfn = node_end_pfn(nodeid);
@@ -139,13 +141,17 @@ void __init setup_node_zones(int nodeid)
139141
dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
140142
if (start_pfn < dma_end_pfn) {
141143
zones[ZONE_DMA] = dma_end_pfn - start_pfn;
144+
holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
142145
zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
146+
holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
147+
143148
} else {
144149
zones[ZONE_NORMAL] = end_pfn - start_pfn;
150+
holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
145151
}
146152

147153
free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
148-
start_pfn, NULL);
154+
start_pfn, holes);
149155
}
150156

151157
void __init numa_init_array(void)

include/asm-x86_64/e820.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ extern int e820_mapped(unsigned long start, unsigned long end, unsigned type);
5151

5252
extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
5353
extern void e820_setup_gap(void);
54+
extern unsigned long e820_hole_size(unsigned long start_pfn,
55+
unsigned long end_pfn);
5456

5557
extern void __init parse_memopt(char *p, char **end);
5658

0 commit comments

Comments
 (0)