Skip to content

Commit 8fc5c73

Browse files
committed
acpi/nfit, device-dax: Identify differentiated memory with a unique numa-node
Persistent memory, as described by the ACPI NFIT (NVDIMM Firmware Interface Table), is the first known instance of a memory range described by a unique "target" proximity domain. The distinction between "initiator" and "target" proximity domains is the approach that the ACPI HMAT (Heterogeneous Memory Attributes Table) uses to describe the unique performance properties of a memory range relative to a given initiator (e.g. CPU or DMA device). Currently the numa-node for a /dev/pmemX block-device or /dev/daxX.Y char-device follows the traditional notion of 'numa-node' where the attribute conveys the closest online numa-node. That numa-node attribute is useful for cpu-binding and memory-binding processes *near* the device. However, when the memory range backing a 'pmem' or 'dax' device is onlined (memory hot-add) the memory-only-numa-node representing that address range needs to be differentiated from the set of online nodes. In other words, the numa-node association of the device depends on whether you can bind processes *near* the cpu-numa-node in the offline device-case, or bind processes *on* the memory-range directly after the backing address range is onlined. Allow for the case that platform firmware describes persistent memory with a unique proximity domain, i.e. when it is distinct from the proximity of DRAM and CPUs that are on the same socket. Plumb the Linux numa-node translation of that proximity through the libnvdimm region device to namespaces that are in device-dax mode. With this in place the proposed kmem driver [1] can optionally discover a unique numa-node number for the address range as it transitions the memory from an offline state managed by a device-driver to an online memory range managed by the core-mm.
[1]: https://lore.kernel.org/lkml/20181022201317.8558C1D8@viggo.jf.intel.com Reported-by: Fan Du <fan.du@intel.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: "Oliver O'Halloran" <oohall@gmail.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Jérôme Glisse <jglisse@redhat.com> Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
1 parent 730926c commit 8fc5c73

File tree

13 files changed

+30
-6
lines changed

13 files changed

+30
-6
lines changed

arch/powerpc/platforms/pseries/papr_scm.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
236236
memset(&ndr_desc, 0, sizeof(ndr_desc));
237237
ndr_desc.attr_groups = region_attr_groups;
238238
ndr_desc.numa_node = dev_to_node(&p->pdev->dev);
239+
ndr_desc.target_node = ndr_desc.numa_node;
239240
ndr_desc.res = &p->res;
240241
ndr_desc.of_node = p->dn;
241242
ndr_desc.provider_data = p;

drivers/acpi/nfit/core.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2869,11 +2869,15 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
28692869
ndr_desc->res = &res;
28702870
ndr_desc->provider_data = nfit_spa;
28712871
ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
2872-
if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
2872+
if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) {
28732873
ndr_desc->numa_node = acpi_map_pxm_to_online_node(
28742874
spa->proximity_domain);
2875-
else
2875+
ndr_desc->target_node = acpi_map_pxm_to_node(
2876+
spa->proximity_domain);
2877+
} else {
28762878
ndr_desc->numa_node = NUMA_NO_NODE;
2879+
ndr_desc->target_node = NUMA_NO_NODE;
2880+
}
28772881

28782882
/*
28792883
* Persistence domain bits are hierarchical, if

drivers/acpi/numa.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ int acpi_map_pxm_to_node(int pxm)
8484

8585
return node;
8686
}
87+
EXPORT_SYMBOL(acpi_map_pxm_to_node);
8788

8889
/**
8990
* acpi_map_pxm_to_online_node - Map proximity ID to online node

drivers/dax/bus.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ static void dax_region_unregister(void *region)
214214
}
215215

216216
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
217-
struct resource *res, unsigned int align,
217+
struct resource *res, int target_node, unsigned int align,
218218
unsigned long pfn_flags)
219219
{
220220
struct dax_region *dax_region;
@@ -244,6 +244,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
244244
dax_region->id = region_id;
245245
dax_region->align = align;
246246
dax_region->dev = parent;
247+
dax_region->target_node = target_node;
247248
if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
248249
kfree(dax_region);
249250
return NULL;
@@ -348,6 +349,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
348349

349350
dev_dax->dax_dev = dax_dev;
350351
dev_dax->region = dax_region;
352+
dev_dax->target_node = dax_region->target_node;
351353
kref_get(&dax_region->kref);
352354

353355
inode = dax_inode(dax_dev);

drivers/dax/bus.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ struct dax_device;
1010
struct dax_region;
1111
void dax_region_put(struct dax_region *dax_region);
1212
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
13-
struct resource *res, unsigned int align, unsigned long flags);
13+
struct resource *res, int target_node, unsigned int align,
14+
unsigned long flags);
1415

1516
enum dev_dax_subsys {
1617
DEV_DAX_BUS,

drivers/dax/dax-private.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ void dax_bus_exit(void);
2626
/**
2727
* struct dax_region - mapping infrastructure for dax devices
2828
* @id: kernel-wide unique region for a memory range
29+
* @target_node: effective numa node if this memory range is onlined
2930
* @kref: to pin while other agents have a need to do lookups
3031
* @dev: parent device backing this region
3132
* @align: allocation and mapping alignment for child dax devices
@@ -34,6 +35,7 @@ void dax_bus_exit(void);
3435
*/
3536
struct dax_region {
3637
int id;
38+
int target_node;
3739
struct kref kref;
3840
struct device *dev;
3941
unsigned int align;
@@ -46,6 +48,7 @@ struct dax_region {
4648
* data while the device is activated in the driver.
4749
* @region - parent region
4850
* @dax_dev - core dax functionality
51+
* @target_node: effective numa node if dev_dax memory range is onlined
4952
* @dev - device core
5053
* @pgmap - pgmap for memmap setup / lifetime (driver owned)
5154
* @ref: pgmap reference count (driver owned)
@@ -54,6 +57,7 @@ struct dax_region {
5457
struct dev_dax {
5558
struct dax_region *region;
5659
struct dax_device *dax_dev;
60+
int target_node;
5761
struct device dev;
5862
struct dev_pagemap pgmap;
5963
struct percpu_ref ref;

drivers/dax/pmem/core.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
2020
struct nd_namespace_common *ndns;
2121
struct nd_dax *nd_dax = to_nd_dax(dev);
2222
struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
23+
struct nd_region *nd_region = to_nd_region(dev->parent);
2324

2425
ndns = nvdimm_namespace_common_probe(dev);
2526
if (IS_ERR(ndns))
@@ -52,7 +53,8 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
5253
memcpy(&res, &pgmap.res, sizeof(res));
5354
res.start += offset;
5455
dax_region = alloc_dax_region(dev, region_id, &res,
55-
le32_to_cpu(pfn_sb->align), PFN_DEV|PFN_MAP);
56+
nd_region->target_node, le32_to_cpu(pfn_sb->align),
57+
PFN_DEV|PFN_MAP);
5658
if (!dax_region)
5759
return ERR_PTR(-ENOMEM);
5860

drivers/nvdimm/e820.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ static int e820_register_one(struct resource *res, void *data)
4747
ndr_desc.res = res;
4848
ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
4949
ndr_desc.numa_node = e820_range_to_nid(res->start);
50+
ndr_desc.target_node = ndr_desc.numa_node;
5051
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
5152
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
5253
return -ENXIO;

drivers/nvdimm/nd.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ struct nd_region {
153153
u16 ndr_mappings;
154154
u64 ndr_size;
155155
u64 ndr_start;
156-
int id, num_lanes, ro, numa_node;
156+
int id, num_lanes, ro, numa_node, target_node;
157157
void *provider_data;
158158
struct kernfs_node *bb_state;
159159
struct badblocks bb;

drivers/nvdimm/of_pmem.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ static int of_pmem_region_probe(struct platform_device *pdev)
6868
memset(&ndr_desc, 0, sizeof(ndr_desc));
6969
ndr_desc.attr_groups = region_attr_groups;
7070
ndr_desc.numa_node = dev_to_node(&pdev->dev);
71+
ndr_desc.target_node = ndr_desc.numa_node;
7172
ndr_desc.res = &pdev->resource[i];
7273
ndr_desc.of_node = np;
7374
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);

drivers/nvdimm/region_devs.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,6 +1065,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
10651065
nd_region->flags = ndr_desc->flags;
10661066
nd_region->ro = ro;
10671067
nd_region->numa_node = ndr_desc->numa_node;
1068+
nd_region->target_node = ndr_desc->target_node;
10681069
ida_init(&nd_region->ns_ida);
10691070
ida_init(&nd_region->btt_ida);
10701071
ida_init(&nd_region->pfn_ida);

include/linux/acpi.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,12 +400,17 @@ extern bool acpi_osi_is_win8(void);
400400

401401
#ifdef CONFIG_ACPI_NUMA
402402
int acpi_map_pxm_to_online_node(int pxm);
403+
int acpi_map_pxm_to_node(int pxm);
403404
int acpi_get_node(acpi_handle handle);
404405
#else
405406
static inline int acpi_map_pxm_to_online_node(int pxm)
406407
{
407408
return 0;
408409
}
410+
static inline int acpi_map_pxm_to_node(int pxm)
411+
{
412+
return 0;
413+
}
409414
static inline int acpi_get_node(acpi_handle handle)
410415
{
411416
return 0;

include/linux/libnvdimm.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ struct nd_region_desc {
128128
void *provider_data;
129129
int num_lanes;
130130
int numa_node;
131+
int target_node;
131132
unsigned long flags;
132133
struct device_node *of_node;
133134
};

0 commit comments

Comments
 (0)