Skip to content

Commit 5434ae7

Browse files
npiggin authored and mpe committed
powerpc/64s/hash: Add a SLB preload cache
When switching processes, currently all user SLBEs are cleared, and a few (exec_base, pc, and stack) are preloaded. In trivial testing with small apps, this tends to miss the heap and low 256MB segments, and it will also miss commonly accessed segments on large memory workloads. Add a simple round-robin preload cache that just inserts the last SLB miss into the head of the cache and preloads those at context switch time. Every 256 context switches, the oldest entry is removed from the cache to shrink the cache and require fewer slbmte if they are unused. Much more could go into this, including into the SLB entry reclaim side to track some LRU information etc, which would require a study of large memory workloads. But this is a simple thing we can do now that is an obvious win for common workloads. With the full series, process switching speed on the context_switch benchmark on POWER9/hash (with kernel speculation security masures disabled) increases from 140K/s to 178K/s (27%). POWER8 does not change much (within 1%), it's unclear why it does not see a big gain like POWER9. Booting to busybox init with 256MB segments has SLB misses go down from 945 to 69, and with 1T segments 900 to 21. These could almost all be eliminated by preloading a bit more carefully with ELF binary loading. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
1 parent 425d331 commit 5434ae7

File tree

5 files changed

+181
-44
lines changed

5 files changed

+181
-44
lines changed

arch/powerpc/include/asm/processor.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ struct thread_struct {
273273
#endif /* CONFIG_HAVE_HW_BREAKPOINT */
274274
struct arch_hw_breakpoint hw_brk; /* info on the hardware breakpoint */
275275
unsigned long trap_nr; /* last trap # on this thread */
276+
u8 load_slb; /* Ages out SLB preload cache entries */
276277
u8 load_fp;
277278
#ifdef CONFIG_ALTIVEC
278279
u8 load_vec;

arch/powerpc/include/asm/thread_info.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <asm/page.h>
3030
#include <asm/accounting.h>
3131

32+
#define SLB_PRELOAD_NR 16U
3233
/*
3334
* low level task data.
3435
*/
@@ -44,6 +45,10 @@ struct thread_info {
4445
#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32)
4546
struct cpu_accounting_data accounting;
4647
#endif
48+
unsigned char slb_preload_nr;
49+
unsigned char slb_preload_tail;
50+
u32 slb_preload_esid[SLB_PRELOAD_NR];
51+
4752
/* low level flags - has atomic operations done on it */
4853
unsigned long flags ____cacheline_aligned_in_smp;
4954
};

arch/powerpc/kernel/process.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1727,13 +1727,19 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
17271727
return 0;
17281728
}
17291729

1730+
void preload_new_slb_context(unsigned long start, unsigned long sp);
1731+
17301732
/*
17311733
* Set up a thread for executing a new program
17321734
*/
17331735
void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
17341736
{
17351737
#ifdef CONFIG_PPC64
17361738
unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */
1739+
1740+
#ifdef CONFIG_PPC_BOOK3S_64
1741+
preload_new_slb_context(start, sp);
1742+
#endif
17371743
#endif
17381744

17391745
/*
@@ -1824,6 +1830,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
18241830
#ifdef CONFIG_VSX
18251831
current->thread.used_vsr = 0;
18261832
#endif
1833+
current->thread.load_slb = 0;
18271834
current->thread.load_fp = 0;
18281835
memset(&current->thread.fp_state, 0, sizeof(current->thread.fp_state));
18291836
current->thread.fp_save_area = NULL;

arch/powerpc/mm/mmu_context_book3s64.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ int hash__alloc_context_id(void)
5353
}
5454
EXPORT_SYMBOL_GPL(hash__alloc_context_id);
5555

56+
void slb_setup_new_exec(void);
57+
5658
static int hash__init_new_context(struct mm_struct *mm)
5759
{
5860
int index;
@@ -87,6 +89,8 @@ static int hash__init_new_context(struct mm_struct *mm)
8789
void hash__setup_new_exec(void)
8890
{
8991
slice_setup_new_exec();
92+
93+
slb_setup_new_exec();
9094
}
9195

9296
static int radix__init_new_context(struct mm_struct *mm)

arch/powerpc/mm/slb.c

Lines changed: 164 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -257,41 +257,148 @@ void slb_vmalloc_update(void)
257257
slb_flush_and_rebolt();
258258
}
259259

260-
/* Helper function to compare esids. There are four cases to handle.
261-
* 1. The system is not 1T segment size capable. Use the GET_ESID compare.
262-
* 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare.
263-
* 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match.
264-
* 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare.
265-
*/
266-
static inline int esids_match(unsigned long addr1, unsigned long addr2)
260+
static bool preload_hit(struct thread_info *ti, unsigned long esid)
267261
{
268-
int esid_1t_count;
262+
unsigned char i;
269263

270-
/* System is not 1T segment size capable. */
271-
if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
272-
return (GET_ESID(addr1) == GET_ESID(addr2));
264+
for (i = 0; i < ti->slb_preload_nr; i++) {
265+
unsigned char idx;
266+
267+
idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
268+
if (esid == ti->slb_preload_esid[idx])
269+
return true;
270+
}
271+
return false;
272+
}
273+
274+
static bool preload_add(struct thread_info *ti, unsigned long ea)
275+
{
276+
unsigned char idx;
277+
unsigned long esid;
278+
279+
if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
280+
/* EAs are stored >> 28 so 256MB segments don't need clearing */
281+
if (ea & ESID_MASK_1T)
282+
ea &= ESID_MASK_1T;
283+
}
273284

274-
esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) +
275-
((addr2 >> SID_SHIFT_1T) != 0));
285+
esid = ea >> SID_SHIFT;
276286

277-
/* both addresses are < 1T */
278-
if (esid_1t_count == 0)
279-
return (GET_ESID(addr1) == GET_ESID(addr2));
287+
if (preload_hit(ti, esid))
288+
return false;
280289

281-
/* One address < 1T, the other > 1T. Not a match */
282-
if (esid_1t_count == 1)
283-
return 0;
290+
idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
291+
ti->slb_preload_esid[idx] = esid;
292+
if (ti->slb_preload_nr == SLB_PRELOAD_NR)
293+
ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
294+
else
295+
ti->slb_preload_nr++;
284296

285-
/* Both addresses are > 1T. */
286-
return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2));
297+
return true;
287298
}
288299

300+
static void preload_age(struct thread_info *ti)
301+
{
302+
if (!ti->slb_preload_nr)
303+
return;
304+
ti->slb_preload_nr--;
305+
ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
306+
}
307+
308+
void slb_setup_new_exec(void)
309+
{
310+
struct thread_info *ti = current_thread_info();
311+
struct mm_struct *mm = current->mm;
312+
unsigned long exec = 0x10000000;
313+
314+
WARN_ON(irqs_disabled());
315+
316+
/*
317+
* preload cache can only be used to determine whether a SLB
318+
* entry exists if it does not start to overflow.
319+
*/
320+
if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
321+
return;
322+
323+
hard_irq_disable();
324+
325+
/*
326+
* We have no good place to clear the slb preload cache on exec,
327+
* flush_thread is about the earliest arch hook but that happens
328+
* after we switch to the mm and have aleady preloaded the SLBEs.
329+
*
330+
* For the most part that's probably okay to use entries from the
331+
* previous exec, they will age out if unused. It may turn out to
332+
* be an advantage to clear the cache before switching to it,
333+
* however.
334+
*/
335+
336+
/*
337+
* preload some userspace segments into the SLB.
338+
* Almost all 32 and 64bit PowerPC executables are linked at
339+
* 0x10000000 so it makes sense to preload this segment.
340+
*/
341+
if (!is_kernel_addr(exec)) {
342+
if (preload_add(ti, exec))
343+
slb_allocate_user(mm, exec);
344+
}
345+
346+
/* Libraries and mmaps. */
347+
if (!is_kernel_addr(mm->mmap_base)) {
348+
if (preload_add(ti, mm->mmap_base))
349+
slb_allocate_user(mm, mm->mmap_base);
350+
}
351+
352+
/* see switch_slb */
353+
asm volatile("isync" : : : "memory");
354+
355+
local_irq_enable();
356+
}
357+
358+
void preload_new_slb_context(unsigned long start, unsigned long sp)
359+
{
360+
struct thread_info *ti = current_thread_info();
361+
struct mm_struct *mm = current->mm;
362+
unsigned long heap = mm->start_brk;
363+
364+
WARN_ON(irqs_disabled());
365+
366+
/* see above */
367+
if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
368+
return;
369+
370+
hard_irq_disable();
371+
372+
/* Userspace entry address. */
373+
if (!is_kernel_addr(start)) {
374+
if (preload_add(ti, start))
375+
slb_allocate_user(mm, start);
376+
}
377+
378+
/* Top of stack, grows down. */
379+
if (!is_kernel_addr(sp)) {
380+
if (preload_add(ti, sp))
381+
slb_allocate_user(mm, sp);
382+
}
383+
384+
/* Bottom of heap, grows up. */
385+
if (heap && !is_kernel_addr(heap)) {
386+
if (preload_add(ti, heap))
387+
slb_allocate_user(mm, heap);
388+
}
389+
390+
/* see switch_slb */
391+
asm volatile("isync" : : : "memory");
392+
393+
local_irq_enable();
394+
}
395+
396+
289397
/* Flush all user entries from the segment table of the current processor. */
290398
void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
291399
{
292-
unsigned long pc = KSTK_EIP(tsk);
293-
unsigned long stack = KSTK_ESP(tsk);
294-
unsigned long exec_base;
400+
struct thread_info *ti = task_thread_info(tsk);
401+
unsigned char i;
295402

296403
/*
297404
* We need interrupts hard-disabled here, not just soft-disabled,
@@ -300,23 +407,22 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
300407
* which would update the slb_cache/slb_cache_ptr fields in the PACA.
301408
*/
302409
hard_irq_disable();
410+
asm volatile("isync" : : : "memory");
303411
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
304412
/*
305413
* SLBIA IH=3 invalidates all Class=1 SLBEs and their
306414
* associated lookaside structures, which matches what
307415
* switch_slb wants. So ARCH_300 does not use the slb
308416
* cache.
309417
*/
310-
asm volatile("isync ; " PPC_SLBIA(3)" ; isync");
418+
asm volatile(PPC_SLBIA(3));
311419
} else {
312420
unsigned long offset = get_paca()->slb_cache_ptr;
313421

314422
if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
315423
offset <= SLB_CACHE_ENTRIES) {
316424
unsigned long slbie_data = 0;
317-
int i;
318425

319-
asm volatile("isync" : : : "memory");
320426
for (i = 0; i < offset; i++) {
321427
/* EA */
322428
slbie_data = (unsigned long)
@@ -331,16 +437,14 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
331437
if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
332438
asm volatile("slbie %0" : : "r" (slbie_data));
333439

334-
asm volatile("isync" : : : "memory");
335440
} else {
336441
struct slb_shadow *p = get_slb_shadow();
337442
unsigned long ksp_esid_data =
338443
be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
339444
unsigned long ksp_vsid_data =
340445
be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
341446

342-
asm volatile("isync\n"
343-
PPC_SLBIA(1) "\n"
447+
asm volatile(PPC_SLBIA(1) "\n"
344448
"slbmte %0,%1\n"
345449
"isync"
346450
:: "r"(ksp_vsid_data),
@@ -356,24 +460,35 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
356460
copy_mm_to_paca(mm);
357461

358462
/*
359-
* preload some userspace segments into the SLB.
360-
* Almost all 32 and 64bit PowerPC executables are linked at
361-
* 0x10000000 so it makes sense to preload this segment.
463+
* We gradually age out SLBs after a number of context switches to
464+
* reduce reload overhead of unused entries (like we do with FP/VEC
465+
* reload). Each time we wrap 256 switches, take an entry out of the
466+
* SLB preload cache.
362467
*/
363-
exec_base = 0x10000000;
468+
tsk->thread.load_slb++;
469+
if (!tsk->thread.load_slb) {
470+
unsigned long pc = KSTK_EIP(tsk);
364471

365-
if (is_kernel_addr(pc) || is_kernel_addr(stack) ||
366-
is_kernel_addr(exec_base))
367-
return;
472+
preload_age(ti);
473+
preload_add(ti, pc);
474+
}
475+
476+
for (i = 0; i < ti->slb_preload_nr; i++) {
477+
unsigned char idx;
478+
unsigned long ea;
368479

369-
slb_allocate_user(mm, pc);
480+
idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
481+
ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
370482

371-
if (!esids_match(pc, stack))
372-
slb_allocate_user(mm, stack);
483+
slb_allocate_user(mm, ea);
484+
}
373485

374-
if (!esids_match(pc, exec_base) &&
375-
!esids_match(stack, exec_base))
376-
slb_allocate_user(mm, exec_base);
486+
/*
487+
* Synchronize slbmte preloads with possible subsequent user memory
488+
* address accesses by the kernel (user mode won't happen until
489+
* rfid, which is safe).
490+
*/
491+
asm volatile("isync" : : : "memory");
377492
}
378493

379494
void slb_set_size(u16 size)
@@ -642,11 +757,16 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea)
642757
return slb_allocate_kernel(ea, id);
643758
} else {
644759
struct mm_struct *mm = current->mm;
760+
long err;
645761

646762
if (unlikely(!mm))
647763
return -EFAULT;
648764

649-
return slb_allocate_user(mm, ea);
765+
err = slb_allocate_user(mm, ea);
766+
if (!err)
767+
preload_add(current_thread_info(), ea);
768+
769+
return err;
650770
}
651771
}
652772

0 commit comments

Comments
 (0)