Skip to content

Commit 243e251

Browse files
ozbenhmpe
authored and committed
powerpc/xive: Native exploitation of the XIVE interrupt controller
The XIVE interrupt controller is the new interrupt controller found in POWER9. It supports advanced virtualization capabilities among other things. Currently we use a set of firmware calls that simulate the old "XICS" interrupt controller but this is fairly inefficient. This adds the framework for using XIVE along with a native backend which uses OPAL for configuration. Later, a backend allowing the use in a KVM or PowerVM guest will also be provided. This disables some fast paths for interrupts in KVM when XIVE is enabled as these rely on the firmware emulation code which is no longer available when the XIVE is used natively by Linux. A later patch will make KVM also directly exploit the XIVE, thus recovering the lost performance (and more). Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> [mpe: Fixup pr_xxx("XIVE:"...), don't split pr_xxx() strings, tweak Kconfig so XIVE_NATIVE selects XIVE and depends on POWERNV, fix build errors when SMP=n, fold in fixes from Ben: Don't call cpu_online() on an invalid CPU number Fix irq target selection returning out of bounds cpu# Extra sanity checks on cpu numbers ] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
1 parent a978e13 commit 243e251

File tree

15 files changed

+2427
-12
lines changed

15 files changed

+2427
-12
lines changed

arch/powerpc/include/asm/xive-regs.h

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*
2+
* Copyright 2016,2017 IBM Corporation.
3+
*
4+
* This program is free software; you can redistribute it and/or
5+
* modify it under the terms of the GNU General Public License
6+
* as published by the Free Software Foundation; either version
7+
* 2 of the License, or (at your option) any later version.
8+
*/
9+
#ifndef _ASM_POWERPC_XIVE_REGS_H
10+
#define _ASM_POWERPC_XIVE_REGS_H
11+
12+
/*
13+
* Thread Management (aka "TM") registers
14+
*/
15+
16+
/* TM register offsets */
17+
#define TM_QW0_USER 0x000 /* All rings */
18+
#define TM_QW1_OS 0x010 /* Ring 0..2 */
19+
#define TM_QW2_HV_POOL 0x020 /* Ring 0..1 */
20+
#define TM_QW3_HV_PHYS 0x030 /* Ring 0..1 */
21+
22+
/* Byte offsets inside a QW QW0 QW1 QW2 QW3 */
23+
#define TM_NSR 0x0 /* + + - + */
24+
#define TM_CPPR 0x1 /* - + - + */
25+
#define TM_IPB 0x2 /* - + + + */
26+
#define TM_LSMFB 0x3 /* - + + + */
27+
#define TM_ACK_CNT 0x4 /* - + - - */
28+
#define TM_INC 0x5 /* - + - + */
29+
#define TM_AGE 0x6 /* - + - + */
30+
#define TM_PIPR 0x7 /* - + - + */
31+
32+
#define TM_WORD0 0x0
33+
#define TM_WORD1 0x4
34+
35+
/*
36+
* QW word 2 contains the valid bit at the top and other fields
37+
* depending on the QW.
38+
*/
39+
#define TM_WORD2 0x8
40+
#define TM_QW0W2_VU PPC_BIT32(0)
41+
#define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ?
42+
#define TM_QW1W2_VO PPC_BIT32(0)
43+
#define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31)
44+
#define TM_QW2W2_VP PPC_BIT32(0)
45+
#define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31)
46+
#define TM_QW3W2_VT PPC_BIT32(0)
47+
#define TM_QW3W2_LP PPC_BIT32(6)
48+
#define TM_QW3W2_LE PPC_BIT32(7)
49+
#define TM_QW3W2_T PPC_BIT32(31)
50+
51+
/*
52+
* In addition to normal loads to "peek" and writes (only when invalid)
53+
* using 4 and 8 bytes accesses, the above registers support these
54+
* "special" byte operations:
55+
*
56+
* - Byte load from QW0[NSR] - User level NSR (EBB)
57+
* - Byte store to QW0[NSR] - User level NSR (EBB)
58+
* - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
59+
* - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
60+
* otherwise VT||0000000
61+
* - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
62+
*
63+
* Then we have all these "special" CI ops at these offsets that trigger
64+
* all sorts of side effects:
65+
*/
66+
#define TM_SPC_ACK_EBB 0x800 /* Load8 ack EBB to reg*/
67+
#define TM_SPC_ACK_OS_REG 0x810 /* Load16 ack OS irq to reg */
68+
#define TM_SPC_PUSH_USR_CTX 0x808 /* Store32 Push/Validate user context */
69+
#define TM_SPC_PULL_USR_CTX 0x808 /* Load32 Pull/Invalidate user context */
70+
#define TM_SPC_SET_OS_PENDING 0x812 /* Store8 Set OS irq pending bit */
71+
#define TM_SPC_PULL_OS_CTX 0x818 /* Load32/Load64 Pull/Invalidate OS context to reg */
72+
#define TM_SPC_PULL_POOL_CTX 0x828 /* Load32/Load64 Pull/Invalidate Pool context to reg*/
73+
#define TM_SPC_ACK_HV_REG 0x830 /* Load16 ack HV irq to reg */
74+
#define TM_SPC_PULL_USR_CTX_OL 0xc08 /* Store8 Pull/Inval usr ctx to odd line */
75+
#define TM_SPC_ACK_OS_EL 0xc10 /* Store8 ack OS irq to even line */
76+
#define TM_SPC_ACK_HV_POOL_EL 0xc20 /* Store8 ack HV evt pool to even line */
77+
#define TM_SPC_ACK_HV_EL 0xc30 /* Store8 ack HV irq to even line */
78+
/* XXX more... */
79+
80+
/* NSR fields for the various QW ack types */
81+
#define TM_QW0_NSR_EB PPC_BIT8(0)
82+
#define TM_QW1_NSR_EO PPC_BIT8(0)
83+
#define TM_QW3_NSR_HE PPC_BITMASK8(0,1)
84+
#define TM_QW3_NSR_HE_NONE 0
85+
#define TM_QW3_NSR_HE_POOL 1
86+
#define TM_QW3_NSR_HE_PHYS 2
87+
#define TM_QW3_NSR_HE_LSI 3
88+
#define TM_QW3_NSR_I PPC_BIT8(2)
89+
/* Multi-bit field spanning bits 3..7: needs the two-argument mask macro, not the single-bit PPC_BIT8() */
#define TM_QW3_NSR_GRP_LVL PPC_BITMASK8(3,7)
90+
91+
/* Utilities to manipulate these (originaly from OPAL) */
92+
#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1)
93+
#define GETFIELD(m, v) (((v) & (m)) >> MASK_TO_LSH(m))
94+
#define SETFIELD(m, v, val) \
95+
(((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
96+
97+
#endif /* _ASM_POWERPC_XIVE_REGS_H */

arch/powerpc/include/asm/xive.h

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
/*
2+
* Copyright 2016,2017 IBM Corporation.
3+
*
4+
* This program is free software; you can redistribute it and/or
5+
* modify it under the terms of the GNU General Public License
6+
* as published by the Free Software Foundation; either version
7+
* 2 of the License, or (at your option) any later version.
8+
*/
9+
#ifndef _ASM_POWERPC_XIVE_H
10+
#define _ASM_POWERPC_XIVE_H
11+
12+
#define XIVE_INVALID_VP 0xffffffff
13+
14+
#ifdef CONFIG_PPC_XIVE
15+
16+
/*
17+
* Thread Interrupt Management Area (TIMA)
18+
*
19+
* This is a global MMIO region divided in 4 pages of varying access
20+
* permissions, providing access to per-cpu interrupt management
21+
* functions. It always identifies the CPU doing the access based
22+
* on the PowerBus initiator ID, thus we always access via the
23+
* same offset regardless of where the code is executing
24+
*/
25+
extern void __iomem *xive_tima;
26+
27+
/*
28+
* Offset in the TM area of our current execution level (provided by
29+
* the backend)
30+
*/
31+
extern u32 xive_tima_offset;
32+
33+
/*
34+
* Per-irq data (irq_get_handler_data for normal IRQs), IPIs
35+
* have it stored in the xive_cpu structure. We also cache
36+
* for normal interrupts the current target CPU.
37+
*
38+
* This structure is setup by the backend for each interrupt.
39+
*/
40+
struct xive_irq_data {
41+
u64 flags;
42+
u64 eoi_page;
43+
void __iomem *eoi_mmio;
44+
u64 trig_page;
45+
void __iomem *trig_mmio;
46+
u32 esb_shift;
47+
int src_chip;
48+
49+
/* Setup/used by frontend */
50+
int target;
51+
bool saved_p;
52+
};
53+
#define XIVE_IRQ_FLAG_STORE_EOI 0x01
54+
#define XIVE_IRQ_FLAG_LSI 0x02
55+
#define XIVE_IRQ_FLAG_SHIFT_BUG 0x04
56+
#define XIVE_IRQ_FLAG_MASK_FW 0x08
57+
#define XIVE_IRQ_FLAG_EOI_FW 0x10
58+
59+
#define XIVE_INVALID_CHIP_ID -1
60+
61+
/* A queue tracking structure in a CPU */
62+
struct xive_q {
63+
__be32 *qpage;
64+
u32 msk;
65+
u32 idx;
66+
u32 toggle;
67+
u64 eoi_phys;
68+
u32 esc_irq;
69+
atomic_t count;
70+
atomic_t pending_count;
71+
};
72+
73+
/*
74+
* "magic" Event State Buffer (ESB) MMIO offsets.
75+
*
76+
* Each interrupt source has a 2-bit state machine called ESB
77+
* which can be controlled by MMIO. It's made of 2 bits, P and
78+
* Q. P indicates that an interrupt is pending (has been sent
79+
* to a queue and is waiting for an EOI). Q indicates that the
80+
* interrupt has been triggered while pending.
81+
*
82+
* This acts as a coalescing mechanism in order to guarantee
83+
* that a given interrupt only occurs at most once in a queue.
84+
*
85+
* When doing an EOI, the Q bit will indicate if the interrupt
86+
* needs to be re-triggered.
87+
*
88+
* The following offsets into the ESB MMIO allow to read or
89+
* manipulate the PQ bits. They must be used with an 8-bytes
90+
* load instruction. They all return the previous state of the
91+
* interrupt (atomically).
92+
*
93+
* Additionally, some ESB pages support doing an EOI via a
94+
* store at 0 and some ESBs support doing a trigger via a
95+
* separate trigger page.
96+
*/
97+
#define XIVE_ESB_GET 0x800
98+
#define XIVE_ESB_SET_PQ_00 0xc00
99+
#define XIVE_ESB_SET_PQ_01 0xd00
100+
#define XIVE_ESB_SET_PQ_10 0xe00
101+
#define XIVE_ESB_SET_PQ_11 0xf00
102+
#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01
103+
104+
#define XIVE_ESB_VAL_P 0x2
105+
#define XIVE_ESB_VAL_Q 0x1
106+
107+
/* Global enable flags for the XIVE support */
108+
extern bool __xive_enabled;
109+
110+
static inline bool xive_enabled(void) { return __xive_enabled; }
111+
112+
extern bool xive_native_init(void);
113+
extern void xive_smp_probe(void);
114+
extern int xive_smp_prepare_cpu(unsigned int cpu);
115+
extern void xive_smp_setup_cpu(void);
116+
extern void xive_smp_disable_cpu(void);
117+
extern void xive_kexec_teardown_cpu(int secondary);
118+
extern void xive_shutdown(void);
119+
extern void xive_flush_interrupt(void);
120+
121+
/* xmon hook */
122+
extern void xmon_xive_do_dump(int cpu);
123+
124+
/* APIs used by KVM */
125+
extern u32 xive_native_default_eq_shift(void);
126+
extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
127+
extern void xive_native_free_vp_block(u32 vp_base);
128+
extern int xive_native_populate_irq_data(u32 hw_irq,
129+
struct xive_irq_data *data);
130+
extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
131+
extern u32 xive_native_alloc_irq(void);
132+
extern void xive_native_free_irq(u32 irq);
133+
extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
134+
135+
extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
136+
__be32 *qpage, u32 order, bool can_escalate);
137+
extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
138+
139+
extern bool __xive_irq_trigger(struct xive_irq_data *xd);
140+
extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
141+
extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
142+
143+
extern bool is_xive_irq(struct irq_chip *chip);
144+
145+
#else
146+
147+
static inline bool xive_enabled(void) { return false; }
148+
149+
static inline bool xive_native_init(void) { return false; }
150+
static inline void xive_smp_probe(void) { }
151+
/* !CONFIG_PPC_XIVE stub: static inline like the sibling stubs, so each includer gets a private copy and no external symbol is emitted or required */
static inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
152+
static inline void xive_smp_setup_cpu(void) { }
153+
static inline void xive_smp_disable_cpu(void) { }
154+
static inline void xive_kexec_teardown_cpu(int secondary) { }
155+
static inline void xive_shutdown(void) { }
156+
static inline void xive_flush_interrupt(void) { }
157+
158+
static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; }
159+
static inline void xive_native_free_vp_block(u32 vp_base) { }
160+
161+
#endif
162+
163+
#endif /* _ASM_POWERPC_XIVE_H */

arch/powerpc/include/asm/xmon.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { };
2929
extern int cpus_are_in_xmon(void);
3030
#endif
3131

32+
extern void xmon_printf(const char *format, ...);
33+
3234
#endif /* __KERNEL __ */
3335
#endif /* __ASM_POWERPC_XMON_H */

arch/powerpc/kvm/book3s_hv_builtin.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <asm/kvm_book3s.h>
2424
#include <asm/archrandom.h>
2525
#include <asm/xics.h>
26+
#include <asm/xive.h>
2627
#include <asm/dbell.h>
2728
#include <asm/cputhreads.h>
2829
#include <asm/io.h>
@@ -224,6 +225,10 @@ void kvmhv_rm_send_ipi(int cpu)
224225
return;
225226
}
226227

228+
/* We should never reach this */
229+
if (WARN_ON_ONCE(xive_enabled()))
230+
return;
231+
227232
/* Else poke the target with an IPI */
228233
xics_phys = paca[cpu].kvm_hstate.xics_phys;
229234
if (xics_phys)
@@ -386,6 +391,9 @@ long kvmppc_read_intr(void)
386391
long rc;
387392
bool again;
388393

394+
if (xive_enabled())
395+
return 1;
396+
389397
do {
390398
again = false;
391399
rc = kvmppc_read_one_intr(&again);

arch/powerpc/platforms/powernv/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ config PPC_POWERNV
44
select PPC_NATIVE
55
select PPC_XICS
66
select PPC_ICP_NATIVE
7+
select PPC_XIVE_NATIVE
78
select PPC_P7_NAP
89
select PCI
910
select PCI_MSI

arch/powerpc/platforms/powernv/setup.c

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <asm/machdep.h>
3333
#include <asm/firmware.h>
3434
#include <asm/xics.h>
35+
#include <asm/xive.h>
3536
#include <asm/opal.h>
3637
#include <asm/kexec.h>
3738
#include <asm/smp.h>
@@ -76,7 +77,9 @@ static void __init pnv_init(void)
7677

7778
static void __init pnv_init_IRQ(void)
7879
{
79-
xics_init();
80+
/* Try using a XIVE if available, otherwise use a XICS */
81+
if (!xive_native_init())
82+
xics_init();
8083

8184
WARN_ON(!ppc_md.get_irq);
8285
}
@@ -218,10 +221,12 @@ static void pnv_kexec_wait_secondaries_down(void)
218221

219222
static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
220223
{
221-
xics_kexec_teardown_cpu(secondary);
224+
if (xive_enabled())
225+
xive_kexec_teardown_cpu(secondary);
226+
else
227+
xics_kexec_teardown_cpu(secondary);
222228

223229
/* On OPAL, we return all CPUs to firmware */
224-
225230
if (!firmware_has_feature(FW_FEATURE_OPAL))
226231
return;
227232

@@ -237,6 +242,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
237242
/* Primary waits for the secondaries to have reached OPAL */
238243
pnv_kexec_wait_secondaries_down();
239244

245+
/* Switch XIVE back to emulation mode */
246+
if (xive_enabled())
247+
xive_shutdown();
248+
240249
/*
241250
* We might be running as little-endian - now that interrupts
242251
* are disabled, reset the HILE bit to big-endian so we don't

0 commit comments

Comments
 (0)