
Commit 70fe3d9

cyrilbur-ibm authored and mpe committed
powerpc: Restore FPU/VEC/VSX if previously used
Currently the FPU, VEC and VSX facilities are lazily loaded. This is not a problem unless a process is using these facilities.

Modern versions of GCC are very good at automatically vectorising code, new and modernised workloads make use of floating point and vector facilities, and even the kernel makes use of vectorised memcpy.

All this combined greatly increases the cost of a syscall, since the kernel uses the facilities sometimes even in the syscall fast path, making it increasingly common for a thread to take an *_unavailable exception soon after a syscall, not to mention potentially taking all three.

The obvious overcompensation for this problem is to simply always load all the facilities on every exit to userspace. But loading up all FPU, VEC and VSX registers every time can be expensive, and if a workload does avoid using them it should not be forced to incur this penalty.

An 8-bit counter is used to detect if the registers have been used in the past, and the registers are always loaded until the value wraps back to zero.

Several versions of the assembly in entry_64.S were tested:

1. Always calling C.
2. Performing a common case check and then calling C.
3. A complex check in asm.

After some benchmarking it was determined that avoiding C in the common case is a performance benefit (option 2). The full check in asm (option 3) greatly complicated that codepath for a negligible performance gain, and the trade-off was deemed not worth it.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
[mpe: Move load_vec in the struct to fill an existing hole, reword change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
1 parent d272f66 commit 70fe3d9
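The 8-bit counter scheme described in the change log is easiest to see in isolation. Below is a minimal, self-contained C sketch of the heuristic: the counter is bumped when the facility is first used and again on every eager reload, and eager reloading stops once the byte wraps back to zero. The names here (fake_thread, facility_unavailable, exit_to_user_restore) are illustrative stand-ins, not kernel APIs; only load_fp mirrors the new thread_struct field.

#include <stdint.h>
#include <stdio.h>

struct fake_thread {
	uint8_t load_fp;	/* models thread_struct.load_fp */
};

/* Models the *_unavailable exception path: the thread really used FP. */
static void facility_unavailable(struct fake_thread *t)
{
	t->load_fp++;	/* overflow is fine: wrapping to zero re-enables lazy mode */
}

/* Models exit to userspace: eagerly reload only while the counter is non-zero. */
static int exit_to_user_restore(struct fake_thread *t)
{
	if (!t->load_fp)
		return 0;	/* counter wrapped: fall back to lazy loading */
	t->load_fp++;		/* each eager reload also counts toward the wrap */
	return 1;		/* the kernel would load the FP registers here */
}

int main(void)
{
	struct fake_thread t = { 0 };

	facility_unavailable(&t);	/* first FP use */
	for (int i = 0; i < 300; i++) {
		if (!exit_to_user_restore(&t)) {
			printf("back to lazy loading after %d exits\n", i);
			break;
		}
	}
	return 0;
}

Run as-is, this prints that lazy loading resumes after 255 exits, i.e. a thread that stops touching a facility pays at most 255 extra reloads before the kernel stops restoring it.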

File tree

6 files changed: 107 additions & 14 deletions


arch/powerpc/include/asm/processor.h

Lines changed: 2 additions & 0 deletions
@@ -236,7 +236,9 @@ struct thread_struct {
 #endif
 	struct arch_hw_breakpoint hw_brk; /* info on the hardware breakpoint */
 	unsigned long	trap_nr;	/* last trap # on this thread */
+	u8 load_fp;
 #ifdef CONFIG_ALTIVEC
+	u8 load_vec;
 	struct thread_vr_state vr_state;
 	struct thread_vr_state *vr_save_area;
 	unsigned long	vrsave;

arch/powerpc/kernel/asm-offsets.c

Lines changed: 2 additions & 0 deletions
@@ -95,12 +95,14 @@ int main(void)
 	DEFINE(THREAD_FPSTATE, offsetof(struct thread_struct, fp_state));
 	DEFINE(THREAD_FPSAVEAREA, offsetof(struct thread_struct, fp_save_area));
 	DEFINE(FPSTATE_FPSCR, offsetof(struct thread_fp_state, fpscr));
+	DEFINE(THREAD_LOAD_FP, offsetof(struct thread_struct, load_fp));
 #ifdef CONFIG_ALTIVEC
 	DEFINE(THREAD_VRSTATE, offsetof(struct thread_struct, vr_state));
 	DEFINE(THREAD_VRSAVEAREA, offsetof(struct thread_struct, vr_save_area));
 	DEFINE(THREAD_VRSAVE, offsetof(struct thread_struct, vrsave));
 	DEFINE(THREAD_USED_VR, offsetof(struct thread_struct, used_vr));
 	DEFINE(VRSTATE_VSCR, offsetof(struct thread_vr_state, vscr));
+	DEFINE(THREAD_LOAD_VEC, offsetof(struct thread_struct, load_vec));
 #endif /* CONFIG_ALTIVEC */
 #ifdef CONFIG_VSX
 	DEFINE(THREAD_USED_VSR, offsetof(struct thread_struct, used_vsr));

arch/powerpc/kernel/entry_64.S

Lines changed: 18 additions & 3 deletions
@@ -210,7 +210,20 @@ system_call:		/* label this so stack traces look sane */
 	li	r11,-MAX_ERRNO
 	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
 	bne-	syscall_exit_work
-	cmpld	r3,r11
+
+	andi.	r0,r8,MSR_FP
+	beq	2f
+#ifdef CONFIG_ALTIVEC
+	andis.	r0,r8,MSR_VEC@h
+	bne	3f
+#endif
+2:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	restore_math
+	ld	r8,_MSR(r1)
+	ld	r3,RESULT(r1)
+	li	r11,-MAX_ERRNO
+
+3:	cmpld	r3,r11
 	ld	r5,_CCR(r1)
 	bge-	syscall_error
 .Lsyscall_error_cont:
@@ -602,8 +615,8 @@ _GLOBAL(ret_from_except_lite)
 
 	/* Check current_thread_info()->flags */
 	andi.	r0,r4,_TIF_USER_WORK_MASK
-#ifdef CONFIG_PPC_BOOK3E
 	bne	1f
+#ifdef CONFIG_PPC_BOOK3E
 	/*
 	 * Check to see if the dbcr0 register is set up to debug.
 	 * Use the internal debug mode bit to do this.
@@ -618,7 +631,9 @@ _GLOBAL(ret_from_except_lite)
 	mtspr	SPRN_DBSR,r10
 	b	restore
 #else
-	beq	restore
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	restore_math
+	b	restore
 #endif
 1:	andi.	r0,r4,_TIF_NEED_RESCHED
 	beq	2f
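For readers not fluent in ppc64 asm, the syscall-exit hunk above is option 2 from the change log: a cheap common-case test before dropping into C. A rough C model of that branch logic follows; syscall_exit_restore is a made-up name for illustration, and in the real asm r8 holds the saved user MSR.

/* Sketch only: with CONFIG_ALTIVEC, the call is skipped exactly when
 * FP and VEC are both already live in the user MSR; in every other
 * case the slow path in restore_math() decides what to reload. */
static void syscall_exit_restore(struct pt_regs *regs)
{
	if (regs->msr & MSR_FP) {
#ifdef CONFIG_ALTIVEC
		if (regs->msr & MSR_VEC)
			return;	/* everything already loaded: fast path */
#endif
	}
	restore_math(regs);
}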

arch/powerpc/kernel/fpu.S

Lines changed: 4 additions & 0 deletions
@@ -130,6 +130,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	or	r12,r12,r4
 	std	r12,_MSR(r1)
 #endif
+	/* Don't care if r4 overflows, this is desired behaviour */
+	lbz	r4,THREAD_LOAD_FP(r5)
+	addi	r4,r4,1
+	stb	r4,THREAD_LOAD_FP(r5)
 	addi	r10,r5,THREAD_FPSTATE
 	lfd	fr0,FPSTATE_FPSCR(r10)
 	MTFSF_L(fr0)

arch/powerpc/kernel/process.c

Lines changed: 77 additions & 11 deletions
@@ -187,9 +187,22 @@ void enable_kernel_fp(void)
 	}
 }
 EXPORT_SYMBOL(enable_kernel_fp);
+
+static int restore_fp(struct task_struct *tsk) {
+	if (tsk->thread.load_fp) {
+		load_fp_state(&current->thread.fp_state);
+		current->thread.load_fp++;
+		return 1;
+	}
+	return 0;
+}
+#else
+static int restore_fp(struct task_struct *tsk) { return 0; }
 #endif /* CONFIG_PPC_FPU */
 
 #ifdef CONFIG_ALTIVEC
+#define loadvec(thr) ((thr).load_vec)
+
 void giveup_altivec(struct task_struct *tsk)
 {
 	check_if_tm_restore_required(tsk);
@@ -229,6 +242,21 @@ void flush_altivec_to_thread(struct task_struct *tsk)
 	}
 }
 EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
+
+static int restore_altivec(struct task_struct *tsk)
+{
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) && tsk->thread.load_vec) {
+		load_vr_state(&tsk->thread.vr_state);
+		tsk->thread.used_vr = 1;
+		tsk->thread.load_vec++;
+
+		return 1;
+	}
+	return 0;
+}
+#else
+#define loadvec(thr) 0
+static inline int restore_altivec(struct task_struct *tsk) { return 0; }
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_VSX
@@ -275,6 +303,18 @@ void flush_vsx_to_thread(struct task_struct *tsk)
 	}
 }
 EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
+
+static int restore_vsx(struct task_struct *tsk)
+{
+	if (cpu_has_feature(CPU_FTR_VSX)) {
+		tsk->thread.used_vsr = 1;
+		return 1;
+	}
+
+	return 0;
+}
+#else
+static inline int restore_vsx(struct task_struct *tsk) { return 0; }
 #endif /* CONFIG_VSX */
 
 #ifdef CONFIG_SPE
@@ -374,6 +414,36 @@ void giveup_all(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(giveup_all);
 
+void restore_math(struct pt_regs *regs)
+{
+	unsigned long msr;
+
+	if (!current->thread.load_fp && !loadvec(current->thread))
+		return;
+
+	msr = regs->msr;
+	msr_check_and_set(msr_all_available);
+
+	/*
+	 * Only reload if the bit is not set in the user MSR, the bit BEING set
+	 * indicates that the registers are hot
+	 */
+	if ((!(msr & MSR_FP)) && restore_fp(current))
+		msr |= MSR_FP | current->thread.fpexc_mode;
+
+	if ((!(msr & MSR_VEC)) && restore_altivec(current))
+		msr |= MSR_VEC;
+
+	if ((msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC) &&
+			restore_vsx(current)) {
+		msr |= MSR_VSX;
+	}
+
+	msr_check_and_clear(msr_all_available);
+
+	regs->msr = msr;
+}
+
 void flush_all_to_thread(struct task_struct *tsk)
 {
 	if (tsk->thread.regs) {
@@ -832,17 +902,9 @@ void restore_tm_state(struct pt_regs *regs)
 
 	msr_diff = current->thread.ckpt_regs.msr & ~regs->msr;
 	msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
-	if (msr_diff & MSR_FP) {
-		msr_check_and_set(MSR_FP);
-		load_fp_state(&current->thread.fp_state);
-		msr_check_and_clear(MSR_FP);
-		regs->msr |= current->thread.fpexc_mode;
-	}
-	if (msr_diff & MSR_VEC) {
-		msr_check_and_set(MSR_VEC);
-		load_vr_state(&current->thread.vr_state);
-		msr_check_and_clear(MSR_VEC);
-	}
+
+	restore_math(regs);
+
 	regs->msr |= msr_diff;
 }

@@ -1006,6 +1068,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
 		batch = this_cpu_ptr(&ppc64_tlb_batch);
 		batch->active = 1;
 	}
+
+	if (current_thread_info()->task->thread.regs)
+		restore_math(current_thread_info()->task->thread.regs);
+
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 	return last;

arch/powerpc/kernel/vector.S

Lines changed: 4 additions & 0 deletions
@@ -91,6 +91,10 @@ _GLOBAL(load_up_altivec)
 	oris	r12,r12,MSR_VEC@h
 	std	r12,_MSR(r1)
 #endif
+	/* Don't care if r4 overflows, this is desired behaviour */
+	lbz	r4,THREAD_LOAD_VEC(r5)
+	addi	r4,r4,1
+	stb	r4,THREAD_LOAD_VEC(r5)
 	addi	r6,r5,THREAD_VRSTATE
 	li	r4,1
 	li	r10,VRSTATE_VSCR
