
Commit 2923b27

Merge tag 'libnvdimm-for-4.19_dax-memory-failure' of gitolite.kernel.org:pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm memory-failure update from Dave Jiang:
 "As it stands, memory_failure() gets thoroughly confused by dev_pagemap
  backed mappings. The recovery code has specific enabling for several
  possible page states and needs new enabling to handle poison in dax
  mappings.

  In order to support reliable reverse mapping of user space addresses:

   1/ Add new locking in the memory_failure() rmap path to prevent races
      that would typically be handled by the page lock.

   2/ Since dev_pagemap pages are hidden from the page allocator and the
      "compound page" accounting machinery, add a mechanism to determine
      the size of the mapping that encompasses a given poisoned pfn.

   3/ Given pmem errors can be repaired, change the speculatively
      accessed poison protection, mce_unmap_kpfn(), to be reversible and
      otherwise allow ongoing access from the kernel.

  A side effect of this enabling is that MADV_HWPOISON becomes usable
  for dax mappings, however the primary motivation is to allow the
  system to survive userspace consumption of hardware-poison via dax.
  Specifically the current behavior is:

     mce: Uncorrected hardware memory error in user-access at af34214200
     {1}[Hardware Error]: It has been corrected by h/w and requires no further action
     mce: [Hardware Error]: Machine check events logged
     {1}[Hardware Error]: event severity: corrected
     Memory failure: 0xaf34214: reserved kernel page still referenced by 1 users
     [..]
     Memory failure: 0xaf34214: recovery action for reserved kernel page: Failed
     mce: Memory error not recovered
     <reboot>

  ...and with these changes:

     Injecting memory failure for pfn 0x20cb00 at process virtual address 0x7f763dd00000
     Memory failure: 0x20cb00: Killing dax-pmd:5421 due to hardware memory corruption
     Memory failure: 0x20cb00: recovery action for dax page: Recovered

  Given all the cross dependencies I propose taking this through
  nvdimm.git with acks from Naoya, x86/core, x86/RAS, and of course dax
  folks"

* tag 'libnvdimm-for-4.19_dax-memory-failure' of gitolite.kernel.org:pub/scm/linux/kernel/git/nvdimm/nvdimm:
  libnvdimm, pmem: Restore page attributes when clearing errors
  x86/memory_failure: Introduce {set, clear}_mce_nospec()
  x86/mm/pat: Prepare {reserve, free}_memtype() for "decoy" addresses
  mm, memory_failure: Teach memory_failure() about dev_pagemap pages
  filesystem-dax: Introduce dax_lock_mapping_entry()
  mm, memory_failure: Collect mapping size in collect_procs()
  mm, madvise_inject_error: Let memory_failure() optionally take a page reference
  mm, dev_pagemap: Do not clear ->mapping on final put
  mm, madvise_inject_error: Disable MADV_SOFT_OFFLINE for ZONE_DEVICE pages
  filesystem-dax: Set page->index
  device-dax: Set page->index
  device-dax: Enable page_mapping()
  device-dax: Convert to vmf_insert_mixed and vm_fault_t
2 parents 828bf6e + c953cc9 commit 2923b27
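
For context on the MADV_HWPOISON behavior mentioned in the message, the following is a minimal userspace sketch (not part of the series) of how poison consumption through a dax mapping can be exercised. The file path is hypothetical, and it assumes a filesystem-dax mount, CAP_SYS_ADMIN, a 4K page size, and a kernel built with CONFIG_MEMORY_FAILURE:

/* Hypothetical test: poison one page of a dax-backed file, then touch it.
 * The path and page size are assumptions for illustration only. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/pmem/victim", O_RDWR);	/* assumed fsdax file */
	if (fd < 0) { perror("open"); return 1; }

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) { perror("mmap"); return 1; }

	/* Requires CAP_SYS_ADMIN; drives memory_failure() for the backing pfn,
	 * producing the "Injecting memory failure for pfn ..." log above. */
	if (madvise(p, 4096, MADV_HWPOISON))
		perror("madvise(MADV_HWPOISON)");

	/* With this series applied, the access below is expected to raise
	 * SIGBUS ("recovery action for dax page: Recovered") rather than the
	 * pre-patch "Memory error not recovered" machine-wide failure. */
	printf("%c\n", p[0]);
	return 0;
}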

File tree: 17 files changed (+481, -135 lines)

arch/x86/include/asm/set_memory.h

Lines changed: 42 additions & 0 deletions
@@ -89,4 +89,46 @@ extern int kernel_set_to_readonly;
 void set_kernel_text_rw(void);
 void set_kernel_text_ro(void);
 
+#ifdef CONFIG_X86_64
+static inline int set_mce_nospec(unsigned long pfn)
+{
+        unsigned long decoy_addr;
+        int rc;
+
+        /*
+         * Mark the linear address as UC to make sure we don't log more
+         * errors because of speculative access to the page.
+         * We would like to just call:
+         *      set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
+         * but doing that would radically increase the odds of a
+         * speculative access to the poison page because we'd have
+         * the virtual address of the kernel 1:1 mapping sitting
+         * around in registers.
+         * Instead we get tricky. We create a non-canonical address
+         * that looks just like the one we want, but has bit 63 flipped.
+         * This relies on set_memory_uc() properly sanitizing any __pa()
+         * results with __PHYSICAL_MASK or PTE_PFN_MASK.
+         */
+        decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+
+        rc = set_memory_uc(decoy_addr, 1);
+        if (rc)
+                pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+        return rc;
+}
+#define set_mce_nospec set_mce_nospec
+
+/* Restore full speculative operation to the pfn. */
+static inline int clear_mce_nospec(unsigned long pfn)
+{
+        return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
+}
+#define clear_mce_nospec clear_mce_nospec
+#else
+/*
+ * Few people would run a 32-bit kernel on a machine that supports
+ * recoverable errors because they have too much memory to boot 32-bit.
+ */
+#endif
+
 #endif /* _ASM_X86_SET_MEMORY_H */
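
As an aside (illustrative only, not part of the diff), the decoy-address arithmetic described in the comment above can be checked in isolation. A minimal userspace sketch follows; PAGE_OFFSET and the physical-address mask are assumed example values, since the real ones depend on kernel configuration and KASLR:

/* Illustrative-only arithmetic for the "decoy" address trick above. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define BIT63           (1ULL << 63)
#define PAGE_OFFSET     0xffff888000000000ULL   /* assumed x86_64 direct-map base */
#define PHYSICAL_MASK   ((1ULL << 46) - 1)      /* assumed __PHYSICAL_MASK */

int main(void)
{
        uint64_t pfn = 0x20cb00;                /* pfn from the commit message example */
        uint64_t phys = pfn << PAGE_SHIFT;

        /* Same arithmetic as set_mce_nospec(): flip bit 63 of the 1:1 mapping. */
        uint64_t decoy = phys + (PAGE_OFFSET ^ BIT63);

        /* What __pa() would compute for the decoy (decoy - PAGE_OFFSET):
         * the real physical address plus bit 63. */
        uint64_t decoy_pa = decoy - PAGE_OFFSET;

        printf("decoy virt   = %#llx (non-canonical, bit 63 clear)\n",
               (unsigned long long)decoy);
        printf("decoy __pa() = %#llx\n", (unsigned long long)decoy_pa);
        printf("sanitized    = %#llx (matches phys %#llx)\n",
               (unsigned long long)(decoy_pa & PHYSICAL_MASK),
               (unsigned long long)phys);
        return 0;
}

The point is that the decoy never looks like a usable kernel pointer, yet subtracting PAGE_OFFSET and masking recovers exactly the poisoned physical address, which is what the arch/x86/mm/pat.c sanitize_phys() hunk below relies on.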

arch/x86/kernel/cpu/mcheck/mce-internal.h

Lines changed: 0 additions & 15 deletions
@@ -113,21 +113,6 @@ static inline void mce_register_injector_chain(struct notifier_block *nb) { }
 static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
 #endif
 
-#ifndef CONFIG_X86_64
-/*
- * On 32-bit systems it would be difficult to safely unmap a poison page
- * from the kernel 1:1 map because there are no non-canonical addresses that
- * we can use to refer to the address without risking a speculative access.
- * However, this isn't much of an issue because:
- * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which
- *    are only mapped into the kernel as needed
- * 2) Few people would run a 32-bit kernel on a machine that supports
- *    recoverable errors because they have too much memory to boot 32-bit.
- */
-static inline void mce_unmap_kpfn(unsigned long pfn) {}
-#define mce_unmap_kpfn mce_unmap_kpfn
-#endif
-
 struct mca_config {
         bool dont_log_ce;
         bool cmci_disabled;
arch/x86/kernel/cpu/mcheck/mce.c

Lines changed: 3 additions & 35 deletions
@@ -42,6 +42,7 @@
 #include <linux/irq_work.h>
 #include <linux/export.h>
 #include <linux/jump_label.h>
+#include <linux/set_memory.h>
 
 #include <asm/intel-family.h>
 #include <asm/processor.h>
@@ -50,7 +51,6 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/reboot.h>
-#include <asm/set_memory.h>
 
 #include "mce-internal.h"
 
@@ -108,10 +108,6 @@ static struct irq_work mce_irq_work;
 
 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 
-#ifndef mce_unmap_kpfn
-static void mce_unmap_kpfn(unsigned long pfn);
-#endif
-
 /*
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
@@ -602,7 +598,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
         if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
                 pfn = mce->addr >> PAGE_SHIFT;
                 if (!memory_failure(pfn, 0))
-                        mce_unmap_kpfn(pfn);
+                        set_mce_nospec(pfn);
         }
 
         return NOTIFY_OK;
@@ -1072,38 +1068,10 @@ static int do_memory_failure(struct mce *m)
         if (ret)
                 pr_err("Memory error not recovered");
         else
-                mce_unmap_kpfn(m->addr >> PAGE_SHIFT);
+                set_mce_nospec(m->addr >> PAGE_SHIFT);
         return ret;
 }
 
-#ifndef mce_unmap_kpfn
-static void mce_unmap_kpfn(unsigned long pfn)
-{
-        unsigned long decoy_addr;
-
-        /*
-         * Unmap this page from the kernel 1:1 mappings to make sure
-         * we don't log more errors because of speculative access to
-         * the page.
-         * We would like to just call:
-         *      set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
-         * but doing that would radically increase the odds of a
-         * speculative access to the poison page because we'd have
-         * the virtual address of the kernel 1:1 mapping sitting
-         * around in registers.
-         * Instead we get tricky. We create a non-canonical address
-         * that looks just like the one we want, but has bit 63 flipped.
-         * This relies on set_memory_np() not checking whether we passed
-         * a legal address.
-         */
-
-        decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
-
-        if (set_memory_np(decoy_addr, 1))
-                pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
-}
-#endif
-
 
 /*
  * Cases where we avoid rendezvous handler timeout:

arch/x86/mm/pat.c

Lines changed: 16 additions & 0 deletions
@@ -512,6 +512,17 @@ static int free_ram_pages_type(u64 start, u64 end)
         return 0;
 }
 
+static u64 sanitize_phys(u64 address)
+{
+        /*
+         * When changing the memtype for pages containing poison allow
+         * for a "decoy" virtual address (bit 63 clear) passed to
+         * set_memory_X(). __pa() on a "decoy" address results in a
+         * physical address with bit 63 set.
+         */
+        return address & __PHYSICAL_MASK;
+}
+
 /*
  * req_type typically has one of the:
  * - _PAGE_CACHE_MODE_WB
@@ -533,6 +544,8 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
         int is_range_ram;
         int err = 0;
 
+        start = sanitize_phys(start);
+        end = sanitize_phys(end);
         BUG_ON(start >= end); /* end is exclusive */
 
         if (!pat_enabled()) {
@@ -609,6 +622,9 @@ int free_memtype(u64 start, u64 end)
         if (!pat_enabled())
                 return 0;
 
+        start = sanitize_phys(start);
+        end = sanitize_phys(end);
+
         /* Low ISA region is always mapped WB. No need to track */
         if (x86_platform.is_untracked_pat_range(start, end))
                 return 0;

drivers/dax/device.c

Lines changed: 48 additions & 27 deletions
@@ -248,13 +248,12 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
         return -1;
 }
 
-static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
+                                struct vm_fault *vmf, pfn_t *pfn)
 {
         struct device *dev = &dev_dax->dev;
         struct dax_region *dax_region;
-        int rc = VM_FAULT_SIGBUS;
         phys_addr_t phys;
-        pfn_t pfn;
         unsigned int fault_size = PAGE_SIZE;
 
         if (check_vma(dev_dax, vmf->vma, __func__))
@@ -276,26 +275,19 @@ static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
                 return VM_FAULT_SIGBUS;
         }
 
-        pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
-
-        rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
-
-        if (rc == -ENOMEM)
-                return VM_FAULT_OOM;
-        if (rc < 0 && rc != -EBUSY)
-                return VM_FAULT_SIGBUS;
+        *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-        return VM_FAULT_NOPAGE;
+        return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
 }
 
-static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
+                                struct vm_fault *vmf, pfn_t *pfn)
 {
         unsigned long pmd_addr = vmf->address & PMD_MASK;
         struct device *dev = &dev_dax->dev;
         struct dax_region *dax_region;
         phys_addr_t phys;
         pgoff_t pgoff;
-        pfn_t pfn;
         unsigned int fault_size = PMD_SIZE;
 
         if (check_vma(dev_dax, vmf->vma, __func__))
@@ -331,21 +323,21 @@ static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
                 return VM_FAULT_SIGBUS;
         }
 
-        pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+        *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-        return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
+        return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn,
                         vmf->flags & FAULT_FLAG_WRITE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
+                                struct vm_fault *vmf, pfn_t *pfn)
 {
         unsigned long pud_addr = vmf->address & PUD_MASK;
         struct device *dev = &dev_dax->dev;
         struct dax_region *dax_region;
         phys_addr_t phys;
         pgoff_t pgoff;
-        pfn_t pfn;
         unsigned int fault_size = PUD_SIZE;
 
 
@@ -382,23 +374,26 @@ static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
                 return VM_FAULT_SIGBUS;
         }
 
-        pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+        *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-        return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
+        return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn,
                         vmf->flags & FAULT_FLAG_WRITE);
 }
 #else
-static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
+                                struct vm_fault *vmf, pfn_t *pfn)
 {
         return VM_FAULT_FALLBACK;
 }
 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static int dev_dax_huge_fault(struct vm_fault *vmf,
+static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
                 enum page_entry_size pe_size)
 {
-        int rc, id;
         struct file *filp = vmf->vma->vm_file;
+        unsigned long fault_size;
+        int rc, id;
+        pfn_t pfn;
         struct dev_dax *dev_dax = filp->private_data;
 
         dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
@@ -408,23 +403,49 @@ static int dev_dax_huge_fault(struct vm_fault *vmf,
         id = dax_read_lock();
         switch (pe_size) {
         case PE_SIZE_PTE:
-                rc = __dev_dax_pte_fault(dev_dax, vmf);
+                fault_size = PAGE_SIZE;
+                rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
                 break;
         case PE_SIZE_PMD:
-                rc = __dev_dax_pmd_fault(dev_dax, vmf);
+                fault_size = PMD_SIZE;
+                rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
                 break;
         case PE_SIZE_PUD:
-                rc = __dev_dax_pud_fault(dev_dax, vmf);
+                fault_size = PUD_SIZE;
+                rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
                 break;
         default:
                 rc = VM_FAULT_SIGBUS;
         }
+
+        if (rc == VM_FAULT_NOPAGE) {
+                unsigned long i;
+                pgoff_t pgoff;
+
+                /*
+                 * In the device-dax case the only possibility for a
+                 * VM_FAULT_NOPAGE result is when device-dax capacity is
+                 * mapped. No need to consider the zero page, or racing
+                 * conflicting mappings.
+                 */
+                pgoff = linear_page_index(vmf->vma, vmf->address
+                                & ~(fault_size - 1));
+                for (i = 0; i < fault_size / PAGE_SIZE; i++) {
+                        struct page *page;
+
+                        page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+                        if (page->mapping)
+                                continue;
+                        page->mapping = filp->f_mapping;
+                        page->index = pgoff + i;
+                }
+        }
         dax_read_unlock(id);
 
         return rc;
 }
 
-static int dev_dax_fault(struct vm_fault *vmf)
+static vm_fault_t dev_dax_fault(struct vm_fault *vmf)
 {
         return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
 }
drivers/nvdimm/pmem.c

Lines changed: 26 additions & 0 deletions
@@ -20,6 +20,7 @@
 #include <linux/hdreg.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
+#include <linux/set_memory.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/badblocks.h>
@@ -51,6 +52,30 @@ static struct nd_region *to_region(struct pmem_device *pmem)
         return to_nd_region(to_dev(pmem)->parent);
 }
 
+static void hwpoison_clear(struct pmem_device *pmem,
+                phys_addr_t phys, unsigned int len)
+{
+        unsigned long pfn_start, pfn_end, pfn;
+
+        /* only pmem in the linear map supports HWPoison */
+        if (is_vmalloc_addr(pmem->virt_addr))
+                return;
+
+        pfn_start = PHYS_PFN(phys);
+        pfn_end = pfn_start + PHYS_PFN(len);
+        for (pfn = pfn_start; pfn < pfn_end; pfn++) {
+                struct page *page = pfn_to_page(pfn);
+
+                /*
+                 * Note, no need to hold a get_dev_pagemap() reference
+                 * here since we're in the driver I/O path and
+                 * outstanding I/O requests pin the dev_pagemap.
+                 */
+                if (test_and_clear_pmem_poison(page))
+                        clear_mce_nospec(pfn);
+        }
+}
+
 static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
                 phys_addr_t offset, unsigned int len)
 {
@@ -65,6 +90,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
         if (cleared < len)
                 rc = BLK_STS_IOERR;
         if (cleared > 0 && cleared / 512) {
+                hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
                 cleared /= 512;
                 dev_dbg(dev, "%#llx clear %ld sector%s\n",
                                 (unsigned long long) sector, cleared,