
Commit 4ffe713

kvaneesh authored and mpe committed
powerpc/mm: Increase the max addressable memory to 2PB
Currently we limit the max addressable memory to 128TB. This patch increases the limit to 2PB. We can have devices like nvdimm which add memory above the 512TB limit.

We still don't support regular system RAM above 512TB. One of the challenges there is the percpu allocator, which allocates per-node memory and uses the max distance between nodes as the percpu offset. With a large gap in the address space (system RAM above 1PB) we would run out of vmalloc space to map the percpu allocations.

In order to support addressable memory above 512TB, the kernel must be able to linear map that range. To do that with hash translation we now add 4 contexts to the kernel linear map region. Our per-context addressable range is 512TB. We still keep the VMALLOC and VMEMMAP regions at their old size. The SLB miss handlers are updated to validate these limits.

We also limit this update to SPARSEMEM_VMEMMAP and SPARSEMEM_EXTREME configurations.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
1 parent c9f8073 commit 4ffe713
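Working through the sizes quoted above: 2PB is 2^51 bytes and each hash context can address 2^49 bytes (512TB), so the kernel linear map needs 2^(51-49) = 4 contexts. The vmemmap comfortably stays within its single context: at 64 bytes of struct page per page frame, 2PB needs at most 2^51 * 64 / 4096 = 32TB of memmap even in the 4K-page worst case, and far less with the 64K pages the new configuration requires.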

File tree

4 files changed, +87 -41 lines changed

arch/powerpc/include/asm/book3s/64/mmu-hash.h

Lines changed: 57 additions & 25 deletions
@@ -521,13 +521,9 @@ extern void slb_set_size(u16 size);
  * from mmu context id and effective segment id of the address.
  *
  * For user processes max context id is limited to MAX_USER_CONTEXT.
-
- * For kernel space, we use context ids 1-4 to map addresses as below:
- * NOTE: each context only support 64TB now.
- * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ]
+ * more details in get_user_context
+ *
+ * For kernel space get_kernel_context
  *
  * The proto-VSIDs are then scrambled into real VSIDs with the
  * multiplicative hash:
@@ -567,6 +563,21 @@ extern void slb_set_size(u16 size);
 #define ESID_BITS_MASK		((1 << ESID_BITS) - 1)
 #define ESID_BITS_1T_MASK	((1 << ESID_BITS_1T) - 1)
 
+/*
+ * Now certain configs support MAX_PHYSMEM more than 512TB. Hence we will need
+ * to use more than one context for linear mapping the kernel.
+ * For vmalloc and memmap, we use just one context with 512TB. With 64 byte
+ * struct page size, we need only 32TB in memmap for 2PB (51 bits (MAX_PHYSMEM_BITS)).
+ */
+#if (MAX_PHYSMEM_BITS > MAX_EA_BITS_PER_CONTEXT)
+#define MAX_KERNEL_CTX_CNT	(1UL << (MAX_PHYSMEM_BITS - MAX_EA_BITS_PER_CONTEXT))
+#else
+#define MAX_KERNEL_CTX_CNT	1
+#endif
+
+#define MAX_VMALLOC_CTX_CNT	1
+#define MAX_MEMMAP_CTX_CNT	1
+
 /*
  * 256MB segment
  * The proto-VSID space has 2^(CONTEXT_BITS + ESID_BITS) - 1 segments
@@ -577,12 +588,13 @@ extern void slb_set_size(u16 size);
  * We also need to avoid the last segment of the last context, because that
  * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
  * because of the modulo operation in vsid scramble.
+ *
+ * We add one extra context to MIN_USER_CONTEXT so that we can map kernel
+ * context easily. The +1 is to map the unused 0xe region mapping.
  */
 #define MAX_USER_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 2)
-#define MIN_USER_CONTEXT	(5)
-
-/* Would be nice to use KERNEL_REGION_ID here */
-#define KERNEL_REGION_CONTEXT_OFFSET	(0xc - 1)
+#define MIN_USER_CONTEXT	(MAX_KERNEL_CTX_CNT + MAX_VMALLOC_CTX_CNT + \
+				 MAX_MEMMAP_CTX_CNT + 2)
 
 /*
  * For platforms that support only 65-bit VA we limit the context bits
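Plugging in the 2PB configuration: MAX_KERNEL_CTX_CNT = 1 << (51 - 49) = 4, so MIN_USER_CONTEXT = 4 + 1 + 1 + 2 = 8. Kernel context ids 1-4 cover the linear map, 5 covers vmalloc, 6 is the unused 0xe region, 7 covers vmemmap, and user contexts start at 8. On the default 46-bit configuration the same formula yields 1 + 1 + 1 + 2 = 5, matching the previous hard-coded value.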
@@ -742,6 +754,39 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
 	return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits);
 }
 
+/*
+ * For kernel space, we use context ids as described
+ * below. Range is 512TB per context.
+ *
+ * 0x00001 - [ 0xc000000000000000 - 0xc001ffffffffffff ]
+ * 0x00002 - [ 0xc002000000000000 - 0xc003ffffffffffff ]
+ * 0x00003 - [ 0xc004000000000000 - 0xc005ffffffffffff ]
+ * 0x00004 - [ 0xc006000000000000 - 0xc007ffffffffffff ]
+ *
+ * 0x00005 - [ 0xd000000000000000 - 0xd001ffffffffffff ]
+ * 0x00006 - Not used - Can map 0xe000000000000000 range.
+ * 0x00007 - [ 0xf000000000000000 - 0xf001ffffffffffff ]
+ *
+ * So for the non-linear regions we compute the context from the
+ * region id (top nibble) minus 0xc, plus MAX_KERNEL_CTX_CNT.
+ */
+static inline unsigned long get_kernel_context(unsigned long ea)
+{
+	unsigned long region_id = REGION_ID(ea);
+	unsigned long ctx;
+	/*
+	 * For linear mapping we do support multiple contexts.
+	 */
+	if (region_id == KERNEL_REGION_ID) {
+		/*
+		 * We already verified ea to be not beyond the addr limit.
+		 */
+		ctx = 1 + ((ea & ~REGION_MASK) >> MAX_EA_BITS_PER_CONTEXT);
+	} else
+		ctx = (region_id - 0xc) + MAX_KERNEL_CTX_CNT;
+	return ctx;
+}
+
 /*
  * This is only valid for addresses >= PAGE_OFFSET
  */
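To see the mapping in action, here is a hypothetical standalone model of get_kernel_context(): the REGION_ID/REGION_MASK stand-ins, the 49-bit per-context size, and the test addresses are assumptions made for illustration, mirroring the kernel's values rather than quoting this commit.

#include <stdio.h>

/* Illustrative stand-ins for the kernel definitions */
#define REGION_ID(ea)            ((ea) >> 60)   /* top nibble of the EA */
#define REGION_MASK              (0xfUL << 60)
#define KERNEL_REGION_ID         0xcUL
#define MAX_EA_BITS_PER_CONTEXT  49             /* 512TB per context */
#define MAX_KERNEL_CTX_CNT       4              /* 1 << (51 - 49) */

static unsigned long kernel_context(unsigned long ea)
{
        unsigned long region_id = REGION_ID(ea);

        if (region_id == KERNEL_REGION_ID)
                /* linear map: contexts 1..4, one per 512TB of offset */
                return 1 + ((ea & ~REGION_MASK) >> MAX_EA_BITS_PER_CONTEXT);
        /* vmalloc (0xd) -> 5, unused (0xe) -> 6, vmemmap (0xf) -> 7 */
        return (region_id - 0xc) + MAX_KERNEL_CTX_CNT;
}

int main(void)
{
        printf("%lu\n", kernel_context(0xc000000000000000UL)); /* 1 */
        printf("%lu\n", kernel_context(0xc002000000000000UL)); /* 2 */
        printf("%lu\n", kernel_context(0xd000000000000000UL)); /* 5 */
        printf("%lu\n", kernel_context(0xf000000000000000UL)); /* 7 */
        return 0;
}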
@@ -752,20 +797,7 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
 	if (!is_kernel_addr(ea))
 		return 0;
 
-	/*
-	 * For kernel space, we use context ids 1-4 to map the address space as
-	 * below:
-	 *
-	 * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ]
-	 * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ]
-	 * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ]
-	 * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ]
-	 *
-	 * So we can compute the context from the region (top nibble) by
-	 * subtracting 11, or 0xc - 1.
-	 */
-	context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET;
-
+	context = get_kernel_context(ea);
 	return get_vsid(context, ea, ssize);
 }
 
arch/powerpc/include/asm/mmu.h

Lines changed: 15 additions & 0 deletions
@@ -309,6 +309,21 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
  */
 #define MMU_PAGE_COUNT	16
 
+/*
+ * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS.
+ * If we increase SECTIONS_WIDTH we will not store node details in page->flags and
+ * page_to_nid does a page->section->node lookup.
+ * Hence only increase for VMEMMAP. Further, depending on SPARSEMEM_EXTREME, reduce
+ * memory requirements with a large number of sections.
+ * 51 bits is the max physical real address on POWER9.
+ */
+#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \
+	defined(CONFIG_PPC_64K_PAGES)
+#define MAX_PHYSMEM_BITS	51
+#else
+#define MAX_PHYSMEM_BITS	46
+#endif
+
 #ifdef CONFIG_PPC_BOOK3S_64
 #include <asm/book3s/64/mmu.h>
 #else /* CONFIG_PPC_BOOK3S_64 */
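The SPARSEMEM_EXTREME requirement follows from the section count: with MAX_PHYSMEM_BITS = 51 and SECTION_SIZE_BITS = 24 there can be up to 2^(51-24) = 2^27 (about 134 million) memory sections, so the flat statically sized mem_section array used by plain SPARSEMEM would be prohibitively large; SPARSEMEM_EXTREME allocates the section roots dynamically instead.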

arch/powerpc/include/asm/sparsemem.h

Lines changed: 0 additions & 11 deletions
@@ -9,17 +9,6 @@
  * MAX_PHYSMEM_BITS	2^N: how much memory we can have in that space
  */
 #define SECTION_SIZE_BITS	24
-/*
- * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS
- * if we increase SECTIONS_WIDTH we will not store node details in page->flags and
- * page_to_nid does a page->section->node lookup
- * Hence only increase for VMEMMAP.
- */
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-#define MAX_PHYSMEM_BITS	47
-#else
-#define MAX_PHYSMEM_BITS	46
-#endif
 
 #endif /* CONFIG_SPARSEMEM */
 
arch/powerpc/mm/slb.c

Lines changed: 15 additions & 5 deletions
@@ -693,16 +693,27 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id)
 	unsigned long flags;
 	int ssize;
 
-	if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT))
-		return -EFAULT;
-
 	if (id == KERNEL_REGION_ID) {
+
+		/* We only support up to MAX_PHYSMEM_BITS */
+		if ((ea & ~REGION_MASK) > (1UL << MAX_PHYSMEM_BITS))
+			return -EFAULT;
+
 		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 	} else if (id == VMEMMAP_REGION_ID) {
+
+		if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT))
+			return -EFAULT;
+
 		flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
 #endif
 	} else if (id == VMALLOC_REGION_ID) {
+
+		if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT))
+			return -EFAULT;
+
 		if (ea < H_VMALLOC_END)
 			flags = get_paca()->vmalloc_sllp;
 		else
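As a concrete example, an SLB miss at ea = 0xc004000000000000 (1PB into the linear map) used to fail the old MAX_EA_BITS_PER_CONTEXT check (2^50 >= 2^49); it now passes the MAX_PHYSMEM_BITS bound and is installed under context 3, since get_kernel_context() returns 1 + (2^50 >> 49) = 3. Vmalloc and vmemmap misses are still rejected beyond a single 512TB context.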
@@ -715,8 +726,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id)
 	if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
 		ssize = MMU_SEGSIZE_256M;
 
-	context = id - KERNEL_REGION_CONTEXT_OFFSET;
-
+	context = get_kernel_context(ea);
 	return slb_insert_entry(ea, context, flags, ssize, true);
 }
 