Skip to content

Commit 7cba160

Browse files
shreyasbpmpe
authored and committed
powernv/cpuidle: Redesign idle states management
Deep idle states like sleep and winkle are per core idle states. A core enters these states only when all the threads enter either the particular idle state or a deeper one. There are tasks like fastsleep hardware bug workaround and hypervisor core state save which have to be done only by the last thread of the core entering deep idle state and similarly tasks like timebase resync, hypervisor core register restore that have to be done only by the first thread waking up from these state. The current idle state management does not have a way to distinguish the first/last thread of the core waking/entering idle states. Tasks like timebase resync are done for all the threads. This is not only is suboptimal, but can cause functionality issues when subcores and kvm is involved. This patch adds the necessary infrastructure to track idle states of threads in a per-core structure. It uses this info to perform tasks like fastsleep workaround and timebase resync only once per core. Signed-off-by: Shreyas B. Prabhu <shreyas@linux.vnet.ibm.com> Originally-by: Preeti U. Murthy <preeti@linux.vnet.ibm.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Rafael J. Wysocki <rjw@rjwysocki.net> Cc: linux-pm@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
1 parent 8eb8ac8 commit 7cba160

File tree

11 files changed

+296
-58
lines changed

11 files changed

+296
-58
lines changed

arch/powerpc/include/asm/cpuidle.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#ifndef _ASM_POWERPC_CPUIDLE_H
2+
#define _ASM_POWERPC_CPUIDLE_H
3+
4+
#ifdef CONFIG_PPC_POWERNV
5+
/* Used in powernv idle state management */
6+
#define PNV_THREAD_RUNNING 0
7+
#define PNV_THREAD_NAP 1
8+
#define PNV_THREAD_SLEEP 2
9+
#define PNV_THREAD_WINKLE 3
10+
#define PNV_CORE_IDLE_LOCK_BIT 0x100
11+
#define PNV_CORE_IDLE_THREAD_BITS 0x0FF
12+
13+
#ifndef __ASSEMBLY__
14+
extern u32 pnv_fastsleep_workaround_at_entry[];
15+
extern u32 pnv_fastsleep_workaround_at_exit[];
16+
#endif
17+
18+
#endif
19+
20+
#endif

arch/powerpc/include/asm/opal.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ struct opal_sg_list {
160160
#define OPAL_PCI_ERR_INJECT 96
161161
#define OPAL_PCI_EEH_FREEZE_SET 97
162162
#define OPAL_HANDLE_HMI 98
163+
#define OPAL_CONFIG_CPU_IDLE_STATE 99
163164
#define OPAL_REGISTER_DUMP_REGION 101
164165
#define OPAL_UNREGISTER_DUMP_REGION 102
165166
#define OPAL_WRITE_TPO 103
@@ -175,6 +176,7 @@ struct opal_sg_list {
175176
*/
176177
#define OPAL_PM_NAP_ENABLED 0x00010000
177178
#define OPAL_PM_SLEEP_ENABLED 0x00020000
179+
#define OPAL_PM_SLEEP_ENABLED_ER1 0x00080000
178180

179181
#ifndef __ASSEMBLY__
180182

arch/powerpc/include/asm/paca.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,14 @@ struct paca_struct {
152152
u64 tm_scratch; /* TM scratch area for reclaim */
153153
#endif
154154

155+
#ifdef CONFIG_PPC_POWERNV
156+
/* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */
157+
u32 *core_idle_state_ptr;
158+
u8 thread_idle_state; /* PNV_THREAD_RUNNING/NAP/SLEEP */
159+
/* Mask to indicate thread id in core */
160+
u8 thread_mask;
161+
#endif
162+
155163
#ifdef CONFIG_PPC_BOOK3S_64
156164
/* Exclusive emergency stack pointer for machine check exception. */
157165
void *mc_emergency_sp;

arch/powerpc/include/asm/processor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
452452

453453
extern int powersave_nap; /* set if nap mode can be used in idle loop */
454454
extern unsigned long power7_nap(int check_irq);
455-
extern void power7_sleep(void);
455+
extern unsigned long power7_sleep(void);
456456
extern void flush_instruction_cache(void);
457457
extern void hard_reset_now(void);
458458
extern void poweroff_now(void);

arch/powerpc/kernel/asm-offsets.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,5 +726,14 @@ int main(void)
726726
arch.timing_last_enter.tv32.tbl));
727727
#endif
728728

729+
#ifdef CONFIG_PPC_POWERNV
730+
DEFINE(PACA_CORE_IDLE_STATE_PTR,
731+
offsetof(struct paca_struct, core_idle_state_ptr));
732+
DEFINE(PACA_THREAD_IDLE_STATE,
733+
offsetof(struct paca_struct, thread_idle_state));
734+
DEFINE(PACA_THREAD_MASK,
735+
offsetof(struct paca_struct, thread_mask));
736+
#endif
737+
729738
return 0;
730739
}

arch/powerpc/kernel/exceptions-64s.S

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <asm/hw_irq.h>
1616
#include <asm/exception-64s.h>
1717
#include <asm/ptrace.h>
18+
#include <asm/cpuidle.h>
1819

1920
/*
2021
* We layout physical memory as follows:
@@ -109,15 +110,19 @@ BEGIN_FTR_SECTION
109110
rlwinm. r13,r13,47-31,30,31
110111
beq 9f
111112

112-
/* waking up from powersave (nap) state */
113-
cmpwi cr1,r13,2
114-
/* Total loss of HV state is fatal, we could try to use the
115-
* PIR to locate a PACA, then use an emergency stack etc...
116-
* OPAL v3 based powernv platforms have new idle states
117-
* which fall in this catagory.
118-
*/
119-
bgt cr1,8f
113+
cmpwi cr3,r13,2
114+
120115
GET_PACA(r13)
116+
lbz r0,PACA_THREAD_IDLE_STATE(r13)
117+
cmpwi cr2,r0,PNV_THREAD_NAP
118+
bgt cr2,8f /* Either sleep or Winkle */
119+
120+
/* Waking up from nap should not cause hypervisor state loss */
121+
bgt cr3,.
122+
123+
/* Waking up from nap */
124+
li r0,PNV_THREAD_RUNNING
125+
stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */
121126

122127
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
123128
li r0,KVM_HWTHREAD_IN_KERNEL
@@ -133,7 +138,7 @@ BEGIN_FTR_SECTION
133138

134139
/* Return SRR1 from power7_nap() */
135140
mfspr r3,SPRN_SRR1
136-
beq cr1,2f
141+
beq cr3,2f
137142
b power7_wakeup_noloss
138143
2: b power7_wakeup_loss
139144

@@ -1382,6 +1387,7 @@ machine_check_handle_early:
13821387
MACHINE_CHECK_HANDLER_WINDUP
13831388
GET_PACA(r13)
13841389
ld r1,PACAR1(r13)
1390+
li r3,PNV_THREAD_NAP
13851391
b power7_enter_nap_mode
13861392
4:
13871393
#endif

arch/powerpc/kernel/idle_power7.S

Lines changed: 153 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <asm/hw_irq.h>
1919
#include <asm/kvm_book3s_asm.h>
2020
#include <asm/opal.h>
21+
#include <asm/cpuidle.h>
2122

2223
#undef DEBUG
2324

@@ -37,8 +38,7 @@
3738

3839
/*
3940
* Pass requested state in r3:
40-
* 0 - nap
41-
* 1 - sleep
41+
* r3 - PNV_THREAD_NAP/SLEEP/WINKLE
4242
*
4343
* To check IRQ_HAPPENED in r4
4444
* 0 - don't check
@@ -123,12 +123,58 @@ power7_enter_nap_mode:
123123
li r4,KVM_HWTHREAD_IN_NAP
124124
stb r4,HSTATE_HWTHREAD_STATE(r13)
125125
#endif
126-
cmpwi cr0,r3,1
127-
beq 2f
126+
stb r3,PACA_THREAD_IDLE_STATE(r13)
127+
cmpwi cr1,r3,PNV_THREAD_SLEEP
128+
bge cr1,2f
128129
IDLE_STATE_ENTER_SEQ(PPC_NAP)
129130
/* No return */
130-
2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
131-
/* No return */
131+
2:
132+
/* Sleep or winkle */
133+
lbz r7,PACA_THREAD_MASK(r13)
134+
ld r14,PACA_CORE_IDLE_STATE_PTR(r13)
135+
lwarx_loop1:
136+
lwarx r15,0,r14
137+
andc r15,r15,r7 /* Clear thread bit */
138+
139+
andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS
140+
141+
/*
142+
* If cr0 = 0, then current thread is the last thread of the core entering
143+
* sleep. Last thread needs to execute the hardware bug workaround code if
144+
* required by the platform.
145+
* Make the workaround call unconditionally here. The below branch call is
146+
* patched out when the idle states are discovered if the platform does not
147+
* require it.
148+
*/
149+
.global pnv_fastsleep_workaround_at_entry
150+
pnv_fastsleep_workaround_at_entry:
151+
beq fastsleep_workaround_at_entry
152+
153+
stwcx. r15,0,r14
154+
bne- lwarx_loop1
155+
isync
156+
157+
common_enter: /* common code for all the threads entering sleep */
158+
IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
159+
160+
fastsleep_workaround_at_entry:
161+
ori r15,r15,PNV_CORE_IDLE_LOCK_BIT
162+
stwcx. r15,0,r14
163+
bne- lwarx_loop1
164+
isync
165+
166+
/* Fast sleep workaround */
167+
li r3,1
168+
li r4,1
169+
li r0,OPAL_CONFIG_CPU_IDLE_STATE
170+
bl opal_call_realmode
171+
172+
/* Clear Lock bit */
173+
li r0,0
174+
lwsync
175+
stw r0,0(r14)
176+
b common_enter
177+
132178

133179
_GLOBAL(power7_idle)
134180
/* Now check if user or arch enabled NAP mode */
@@ -141,49 +187,16 @@ _GLOBAL(power7_idle)
141187

142188
_GLOBAL(power7_nap)
143189
mr r4,r3
144-
li r3,0
190+
li r3,PNV_THREAD_NAP
145191
b power7_powersave_common
146192
/* No return */
147193

148194
_GLOBAL(power7_sleep)
149-
li r3,1
195+
li r3,PNV_THREAD_SLEEP
150196
li r4,1
151197
b power7_powersave_common
152198
/* No return */
153199

154-
/*
155-
* Make opal call in realmode. This is a generic function to be called
156-
* from realmode from reset vector. It handles endianess.
157-
*
158-
* r13 - paca pointer
159-
* r1 - stack pointer
160-
* r3 - opal token
161-
*/
162-
opal_call_realmode:
163-
mflr r12
164-
std r12,_LINK(r1)
165-
ld r2,PACATOC(r13)
166-
/* Set opal return address */
167-
LOAD_REG_ADDR(r0,return_from_opal_call)
168-
mtlr r0
169-
/* Handle endian-ness */
170-
li r0,MSR_LE
171-
mfmsr r12
172-
andc r12,r12,r0
173-
mtspr SPRN_HSRR1,r12
174-
mr r0,r3 /* Move opal token to r0 */
175-
LOAD_REG_ADDR(r11,opal)
176-
ld r12,8(r11)
177-
ld r2,0(r11)
178-
mtspr SPRN_HSRR0,r12
179-
hrfid
180-
181-
return_from_opal_call:
182-
FIXUP_ENDIAN
183-
ld r0,_LINK(r1)
184-
mtlr r0
185-
blr
186-
187200
#define CHECK_HMI_INTERRUPT \
188201
mfspr r0,SPRN_SRR1; \
189202
BEGIN_FTR_SECTION_NESTED(66); \
@@ -197,7 +210,7 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
197210
ld r2,PACATOC(r13); \
198211
ld r1,PACAR1(r13); \
199212
std r3,ORIG_GPR3(r1); /* Save original r3 */ \
200-
li r3,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \
213+
li r0,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \
201214
bl opal_call_realmode; \
202215
ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \
203216
20: nop;
@@ -206,16 +219,105 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
206219
_GLOBAL(power7_wakeup_tb_loss)
207220
ld r2,PACATOC(r13);
208221
ld r1,PACAR1(r13)
222+
/*
223+
* Before entering any idle state, the NVGPRs are saved in the stack
224+
* and they are restored before switching to the process context. Hence
225+
* until they are restored, they are free to be used.
226+
*
227+
* Save SRR1 in a NVGPR as it might be clobbered in opal_call_realmode
228+
* (called in CHECK_HMI_INTERRUPT). SRR1 is required to determine the
229+
* wakeup reason if we branch to kvm_start_guest.
230+
*/
209231

232+
mfspr r16,SPRN_SRR1
210233
BEGIN_FTR_SECTION
211234
CHECK_HMI_INTERRUPT
212235
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
236+
237+
lbz r7,PACA_THREAD_MASK(r13)
238+
ld r14,PACA_CORE_IDLE_STATE_PTR(r13)
239+
lwarx_loop2:
240+
lwarx r15,0,r14
241+
andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT
242+
/*
243+
* Lock bit is set in one of the 2 cases-
244+
* a. In the sleep/winkle enter path, the last thread is executing
245+
* fastsleep workaround code.
246+
* b. In the wake up path, another thread is executing fastsleep
247+
* workaround undo code or resyncing timebase or restoring context
248+
* In either case loop until the lock bit is cleared.
249+
*/
250+
bne core_idle_lock_held
251+
252+
cmpwi cr2,r15,0
253+
or r15,r15,r7 /* Set thread bit */
254+
255+
beq cr2,first_thread
256+
257+
/* Not first thread in core to wake up */
258+
stwcx. r15,0,r14
259+
bne- lwarx_loop2
260+
isync
261+
b common_exit
262+
263+
core_idle_lock_held:
264+
HMT_LOW
265+
core_idle_lock_loop:
266+
lwz r15,0(r14)
267+
andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT
268+
bne core_idle_lock_loop
269+
HMT_MEDIUM
270+
b lwarx_loop2
271+
272+
first_thread:
273+
/* First thread in core to wakeup */
274+
ori r15,r15,PNV_CORE_IDLE_LOCK_BIT
275+
stwcx. r15,0,r14
276+
bne- lwarx_loop2
277+
isync
278+
279+
/*
280+
* First thread in the core waking up from fastsleep. It needs to
281+
* call the fastsleep workaround code if the platform requires it.
282+
* Call it unconditionally here. The below branch instruction will
283+
* be patched out when the idle states are discovered if platform
284+
* does not require workaround.
285+
*/
286+
.global pnv_fastsleep_workaround_at_exit
287+
pnv_fastsleep_workaround_at_exit:
288+
b fastsleep_workaround_at_exit
289+
290+
timebase_resync:
291+
/* Do timebase resync if we are waking up from sleep. Use cr3 value
292+
* set in exceptions-64s.S */
293+
ble cr3,clear_lock
213294
/* Time base re-sync */
214-
li r3,OPAL_RESYNC_TIMEBASE
295+
li r0,OPAL_RESYNC_TIMEBASE
215296
bl opal_call_realmode;
216-
217297
/* TODO: Check r3 for failure */
218298

299+
clear_lock:
300+
andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS
301+
lwsync
302+
stw r15,0(r14)
303+
304+
common_exit:
305+
li r5,PNV_THREAD_RUNNING
306+
stb r5,PACA_THREAD_IDLE_STATE(r13)
307+
308+
mtspr SPRN_SRR1,r16
309+
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
310+
li r0,KVM_HWTHREAD_IN_KERNEL
311+
stb r0,HSTATE_HWTHREAD_STATE(r13)
312+
/* Order setting hwthread_state vs. testing hwthread_req */
313+
sync
314+
lbz r0,HSTATE_HWTHREAD_REQ(r13)
315+
cmpwi r0,0
316+
beq 6f
317+
b kvm_start_guest
318+
6:
319+
#endif
320+
219321
REST_NVGPRS(r1)
220322
REST_GPR(2, r1)
221323
ld r3,_CCR(r1)
@@ -228,6 +330,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
228330
mtspr SPRN_SRR0,r5
229331
rfid
230332

333+
fastsleep_workaround_at_exit:
334+
li r3,1
335+
li r4,0
336+
li r0,OPAL_CONFIG_CPU_IDLE_STATE
337+
bl opal_call_realmode
338+
b timebase_resync
339+
231340
/*
232341
* R3 here contains the value that will be returned to the caller
233342
* of power7_nap.

0 commit comments

Comments
 (0)