Skip to content

Commit 00e37bd

Browse files
olafhering authored and konradwilk committed
xen PVonHVM: move shared_info to MMIO before kexec
Currently kexec in a PVonHVM guest fails with a triple fault because the new kernel overwrites the shared info page. The exact failure depends on the size of the kernel image. This patch moves the pfn from RAM into MMIO space before the kexec boot. The pfn containing the shared_info is located somewhere in RAM. This will cause trouble if the current kernel is doing a kexec boot into a new kernel. The new kernel (and its startup code) cannot know where the pfn is, so it cannot reserve the page. The hypervisor will continue to update the pfn, and as a result memory corruption occurs in the new kernel. One way to work around this issue is to allocate a page in the xen-platform pci device's BAR memory range. But pci init is done very late and the shared_info page is already in use very early to read the pvclock. So moving the pfn from RAM to MMIO is racy because some code paths on other vcpus could access the pfn during the small window when the old pfn is moved to the new pfn. There is even a small window where the old pfn is not backed by a mfn, and during that time all reads return -1. Because it is not known upfront where the MMIO region is located, it cannot be used right from the start in xen_hvm_init_shared_info. To minimise trouble the move of the pfn is done shortly before kexec. This does not eliminate the race because all vcpus are still online when the syscore_ops are called. But hopefully there is no work pending at this point in time. Also the syscore_op is run last which reduces the risk further. Signed-off-by: Olaf Hering <olaf@aepfle.de> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
1 parent 4ff2d06 commit 00e37bd

File tree

5 files changed

+126
-13
lines changed

5 files changed

+126
-13
lines changed

arch/x86/xen/enlighten.c

Lines changed: 107 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <linux/pci.h>
3232
#include <linux/gfp.h>
3333
#include <linux/memblock.h>
34+
#include <linux/syscore_ops.h>
3435

3536
#include <xen/xen.h>
3637
#include <xen/interface/xen.h>
@@ -1471,38 +1472,130 @@ asmlinkage void __init xen_start_kernel(void)
14711472
#endif
14721473
}
14731474

1474-
void __ref xen_hvm_init_shared_info(void)
1475+
#ifdef CONFIG_XEN_PVHVM
1476+
/*
1477+
* The pfn containing the shared_info is located somewhere in RAM. This
1478+
* will cause trouble if the current kernel is doing a kexec boot into a
1479+
* new kernel. The new kernel (and its startup code) can not know where
1480+
* the pfn is, so it can not reserve the page. The hypervisor will
1481+
* continue to update the pfn, and as a result memory corruption occurs
1482+
* in the new kernel.
1483+
*
1484+
* One way to work around this issue is to allocate a page in the
1485+
* xen-platform pci device's BAR memory range. But pci init is done very
1486+
* late and the shared_info page is already in use very early to read
1487+
* the pvclock. So moving the pfn from RAM to MMIO is racy because some
1488+
* code paths on other vcpus could access the pfn during the small
1489+
* window when the old pfn is moved to the new pfn. There is even a
1490+
* small window where the old pfn is not backed by a mfn, and during that
1491+
* time all reads return -1.
1492+
*
1493+
* Because it is not known upfront where the MMIO region is located it
1494+
* can not be used right from the start in xen_hvm_init_shared_info.
1495+
*
1496+
* To minimise trouble the move of the pfn is done shortly before kexec.
1497+
* This does not eliminate the race because all vcpus are still online
1498+
* when the syscore_ops will be called. But hopefully there is no work
1499+
* pending at this point in time. Also the syscore_op is run last which
1500+
* reduces the risk further.
1501+
*/
1502+
1503+
static struct shared_info *xen_hvm_shared_info;
1504+
1505+
static void xen_hvm_connect_shared_info(unsigned long pfn)
14751506
{
1476-
int cpu;
14771507
struct xen_add_to_physmap xatp;
1478-
static struct shared_info *shared_info_page = 0;
14791508

1480-
if (!shared_info_page)
1481-
shared_info_page = (struct shared_info *)
1482-
extend_brk(PAGE_SIZE, PAGE_SIZE);
14831509
xatp.domid = DOMID_SELF;
14841510
xatp.idx = 0;
14851511
xatp.space = XENMAPSPACE_shared_info;
1486-
xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
1512+
xatp.gpfn = pfn;
14871513
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
14881514
BUG();
14891515

1490-
HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
1516+
}
1517+
static void xen_hvm_set_shared_info(struct shared_info *sip)
1518+
{
1519+
int cpu;
1520+
1521+
HYPERVISOR_shared_info = sip;
14911522

14921523
/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
14931524
* page, we use it in the event channel upcall and in some pvclock
14941525
* related functions. We don't need the vcpu_info placement
14951526
* optimizations because we don't use any pv_mmu or pv_irq op on
14961527
* HVM.
1497-
* When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
1498-
* online but xen_hvm_init_shared_info is run at resume time too and
1528+
* When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
1529+
* online but xen_hvm_set_shared_info is run at resume time too and
14991530
* in that case multiple vcpus might be online. */
15001531
for_each_online_cpu(cpu) {
15011532
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
15021533
}
15031534
}
15041535

1505-
#ifdef CONFIG_XEN_PVHVM
1536+
/* Reconnect the shared_info pfn to a mfn */
1537+
void xen_hvm_resume_shared_info(void)
1538+
{
1539+
xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1540+
}
1541+
1542+
#ifdef CONFIG_KEXEC
1543+
static struct shared_info *xen_hvm_shared_info_kexec;
1544+
static unsigned long xen_hvm_shared_info_pfn_kexec;
1545+
1546+
/* Remember a pfn in MMIO space for kexec reboot */
1547+
void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
1548+
{
1549+
xen_hvm_shared_info_kexec = sip;
1550+
xen_hvm_shared_info_pfn_kexec = pfn;
1551+
}
1552+
1553+
static void xen_hvm_syscore_shutdown(void)
1554+
{
1555+
struct xen_memory_reservation reservation = {
1556+
.domid = DOMID_SELF,
1557+
.nr_extents = 1,
1558+
};
1559+
unsigned long prev_pfn;
1560+
int rc;
1561+
1562+
if (!xen_hvm_shared_info_kexec)
1563+
return;
1564+
1565+
prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
1566+
set_xen_guest_handle(reservation.extent_start, &prev_pfn);
1567+
1568+
/* Move pfn to MMIO, disconnects previous pfn from mfn */
1569+
xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
1570+
1571+
/* Update pointers, following hypercall is also a memory barrier */
1572+
xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
1573+
1574+
/* Allocate new mfn for previous pfn */
1575+
do {
1576+
rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
1577+
if (rc == 0)
1578+
msleep(123);
1579+
} while (rc == 0);
1580+
1581+
/* Make sure the previous pfn is really connected to a (new) mfn */
1582+
BUG_ON(rc != 1);
1583+
}
1584+
1585+
static struct syscore_ops xen_hvm_syscore_ops = {
1586+
.shutdown = xen_hvm_syscore_shutdown,
1587+
};
1588+
#endif
1589+
1590+
/* Use a pfn in RAM, may move to MMIO before kexec. */
1591+
static void __init xen_hvm_init_shared_info(void)
1592+
{
1593+
/* Remember pointer for resume */
1594+
xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
1595+
xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
1596+
xen_hvm_set_shared_info(xen_hvm_shared_info);
1597+
}
1598+
15061599
static void __init init_hvm_pv_info(void)
15071600
{
15081601
int major, minor;
@@ -1553,6 +1646,9 @@ static void __init xen_hvm_guest_init(void)
15531646
init_hvm_pv_info();
15541647

15551648
xen_hvm_init_shared_info();
1649+
#ifdef CONFIG_KEXEC
1650+
register_syscore_ops(&xen_hvm_syscore_ops);
1651+
#endif
15561652

15571653
if (xen_feature(XENFEAT_hvm_callback_vector))
15581654
xen_have_vector_callback = 1;

arch/x86/xen/suspend.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
3030
{
3131
#ifdef CONFIG_XEN_PVHVM
3232
int cpu;
33-
xen_hvm_init_shared_info();
33+
xen_hvm_resume_shared_info();
3434
xen_callback_vector();
3535
xen_unplug_emulated_devices();
3636
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {

arch/x86/xen/xen-ops.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ void xen_enable_syscall(void);
4141
void xen_vcpu_restore(void);
4242

4343
void xen_callback_vector(void);
44-
void xen_hvm_init_shared_info(void);
44+
void xen_hvm_resume_shared_info(void);
4545
void xen_unplug_emulated_devices(void);
4646

4747
void __init xen_build_dynamic_phys_to_machine(void);

drivers/xen/platform-pci.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,19 @@ static int platform_pci_resume(struct pci_dev *pdev)
101101
return 0;
102102
}
103103

104+
/*
 * Set aside one page of the platform device's MMIO range as the future
 * home of shared_info and hand it to xen_hvm_prepare_kexec().
 *
 * Compiles to an empty function without CONFIG_KEXEC.
 *
 * If the page cannot be mapped, nothing is registered.  The kexec
 * shutdown hook returns early when no page was registered, so the only
 * consequence is that shared_info is not relocated.
 */
static void __devinit prepare_shared_info(void)
{
#ifdef CONFIG_KEXEC
	unsigned long addr;
	struct shared_info *hvm_shared_info;

	addr = alloc_xen_mmio(PAGE_SIZE);
	hvm_shared_info = ioremap(addr, PAGE_SIZE);
	/* ioremap() can fail; never memset() through a NULL mapping */
	if (!hvm_shared_info)
		return;

	memset(hvm_shared_info, 0, PAGE_SIZE);
	xen_hvm_prepare_kexec(hvm_shared_info, addr >> PAGE_SHIFT);
#endif
}
116+
104117
static int __devinit platform_pci_init(struct pci_dev *pdev,
105118
const struct pci_device_id *ent)
106119
{
@@ -138,6 +151,8 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
138151
platform_mmio = mmio_addr;
139152
platform_mmiolen = mmio_len;
140153

154+
prepare_shared_info();
155+
141156
if (!xen_have_vector_callback) {
142157
ret = xen_allocate_irq(pdev);
143158
if (ret) {

include/xen/events.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ void notify_remote_via_irq(int irq);
5858

5959
void xen_irq_resume(void);
6060

61+
void xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn);
62+
6163
/* Clear an irq's pending state, in preparation for polling on it */
6264
void xen_clear_irq_pending(int irq);
6365
void xen_set_irq_pending(int irq);

0 commit comments

Comments
 (0)