From 65128868bb3b0621d2d8e71f19852675a064b373 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:04 -0700 Subject: mm/memory_hotplug: Update comment for hotplug memory callback priorities Add clarification to comment for memory hotplug callback ordering as the current comment does not provide clear language on which callback happens first. Acked-by: David Hildenbrand Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- include/linux/memory.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/memory.h') diff --git a/include/linux/memory.h b/include/linux/memory.h index 40eb70ccb09d..1305102688d0 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -115,8 +115,8 @@ struct notifier_block; struct mem_section; /* - * Priorities for the hotplug memory callback routines (stored in decreasing - * order in the callback chain) + * Priorities for the hotplug memory callback routines. Invoked from + * high to low. Higher priorities correspond to higher numbers. */ #define DEFAULT_CALLBACK_PRI 0 #define SLAB_CALLBACK_PRI 1 -- cgit v1.2.3 From 2e454fb8056df6da4bba7d89a57bf60e217463c0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:06 -0700 Subject: cxl, acpi/hmat: Update CXL access coordinates directly instead of through HMAT The current implementation of CXL memory hotplug notifier gets called before the HMAT memory hotplug notifier. The CXL driver calculates the access coordinates (bandwidth and latency values) for the CXL end to end path (i.e. CPU to endpoint). When the CXL region is onlined, the CXL memory hotplug notifier writes the access coordinates to the HMAT target structs. Then the HMAT memory hotplug notifier is called and it creates the access coordinates for the node sysfs attributes. 
During testing on an Intel platform, it was found that although the newly calculated coordinates were pushed to sysfs, the sysfs attributes for the access coordinates showed up with the wrong initiator. The system has 4 nodes (0, 1, 2, 3) where nodes 0 and 1 are CPU nodes and nodes 2 and 3 are CXL nodes. The expectation is that node 2 would show up as a target to node 0: /sys/devices/system/node/node2/access0/initiators/node0 However, it was observed that node 2 showed up as a target under node 1: /sys/devices/system/node/node2/access0/initiators/node1 The original intent of the 'ext_updated' flag in the HMAT handling code was to stop the HMAT memory hotplug callback from clobbering the access coordinates after CXL has injected its calculated coordinates and replaced the generic target access coordinates provided by the HMAT table in the HMAT target structs. However, the flag is hacky at best and blocks the updates from other CXL regions that are onlined in the same node later on. Remove the 'ext_updated' flag usage and just update the access coordinates for the nodes directly without touching HMAT target data. The hotplug memory callback ordering is changed. Instead of changing the CXL priority, HMAT_CALLBACK_PRI is raised above CXL_CALLBACK_PRI; this leaves room between the levels rather than having CXL share the same level as SLAB_CALLBACK_PRI. The change will result in the CXL callback being executed after the HMAT callback. With the change, the CXL hotplug memory notifier runs after the HMAT callback. The HMAT callback will create the node sysfs attributes for access coordinates. The CXL callback will write the access coordinates to the now-created node sysfs attributes directly and will not pollute the HMAT target values. A nodemask is introduced to keep track of whether a node has been updated and to prevent further updates. 
Fixes: 067353a46d8c ("cxl/region: Add memory hotplug notifier for cxl region") Cc: stable@vger.kernel.org Tested-by: Marc Herbert Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-4-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/acpi/numa/hmat.c | 6 ------ drivers/cxl/core/cdat.c | 5 ----- drivers/cxl/core/core.h | 1 - drivers/cxl/core/region.c | 20 ++++++++++++-------- include/linux/memory.h | 2 +- 5 files changed, 13 insertions(+), 21 deletions(-) (limited to 'include/linux/memory.h') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 4958301f5417..5d32490dc4ab 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -74,7 +74,6 @@ struct memory_target { struct node_cache_attrs cache_attrs; u8 gen_port_device_handle[ACPI_SRAT_DEVICE_HANDLE_SIZE]; bool registered; - bool ext_updated; /* externally updated */ }; struct memory_initiator { @@ -391,7 +390,6 @@ int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, coord->read_bandwidth, access); hmat_update_target_access(target, ACPI_HMAT_WRITE_BANDWIDTH, coord->write_bandwidth, access); - target->ext_updated = true; return 0; } @@ -773,10 +771,6 @@ static void hmat_update_target_attrs(struct memory_target *target, u32 best = 0; int i; - /* Don't update if an external agent has changed the data. 
*/ - if (target->ext_updated) - return; - /* Don't update for generic port if there's no device handle */ if ((access == NODE_ACCESS_CLASS_GENPORT_SINK_LOCAL || access == NODE_ACCESS_CLASS_GENPORT_SINK_CPU) && diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index c0af645425f4..c891fd618cfd 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -1081,8 +1081,3 @@ int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, { return hmat_update_target_coordinates(nid, &cxlr->coord[access], access); } - -bool cxl_need_node_perf_attrs_update(int nid) -{ - return !acpi_node_backed_by_real_pxm(nid); -} diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 2669f251d677..a253d308f3c9 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -139,7 +139,6 @@ long cxl_pci_get_latency(struct pci_dev *pdev); int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, enum access_coordinate_class access); -bool cxl_need_node_perf_attrs_update(int nid); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 71cc42d05248..0ed95cbc5d5b 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -30,6 +30,12 @@ * 3. Decoder targets */ +/* + * nodemask that sets per node when the access_coordinates for the node has + * been updated by the CXL memory hotplug notifier. 
+ */ +static nodemask_t nodemask_region_seen = NODE_MASK_NONE; + static struct cxl_region *to_cxl_region(struct device *dev); #define __ACCESS_ATTR_RO(_level, _name) { \ @@ -2442,14 +2448,8 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { if (cxlr->coord[i].read_bandwidth) { - rc = 0; - if (cxl_need_node_perf_attrs_update(nid)) - node_set_perf_attrs(nid, &cxlr->coord[i], i); - else - rc = cxl_update_hmat_access_coordinates(nid, cxlr, i); - - if (rc == 0) - cset++; + node_update_perf_attrs(nid, &cxlr->coord[i], i); + cset++; } } @@ -2487,6 +2487,10 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, if (nid != region_nid) return NOTIFY_DONE; + /* No action needed if node bit already set */ + if (node_test_and_set(nid, nodemask_region_seen)) + return NOTIFY_DONE; + if (!cxl_region_update_coordinates(cxlr, nid)) return NOTIFY_DONE; diff --git a/include/linux/memory.h b/include/linux/memory.h index 1305102688d0..0b755d1ef1ec 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -120,8 +120,8 @@ struct mem_section; */ #define DEFAULT_CALLBACK_PRI 0 #define SLAB_CALLBACK_PRI 1 -#define HMAT_CALLBACK_PRI 2 #define CXL_CALLBACK_PRI 5 +#define HMAT_CALLBACK_PRI 6 #define MM_COMPUTE_BATCH_PRI 10 #define CPUSET_CALLBACK_PRI 10 #define MEMTIER_HOTPLUG_PRI 100 -- cgit v1.2.3 From b8179af120943e2fc099ea87caa234039a709a66 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 29 Jul 2025 08:46:35 +0200 Subject: mm/memory_hotplug: activate node before adding new memory blocks The sysfs attributes for memory blocks require the node ID to be set and initialized, so move the node activation before adding new memory blocks. This also has the nice side effect that the BUG_ON() can be converted into a WARN_ON() as we now can handle registration errors. 
Link: https://lkml.kernel.org/r/20250729064637.51662-3-hare@kernel.org Fixes: b9ff036082cd ("mm/memory_hotplug.c: make add_memory_resource use __try_online_node") Signed-off-by: Hannes Reinecke Acked-by: David Hildenbrand Acked-by: Oscar Salvador Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 2 +- mm/memory_hotplug.c | 32 +++++++++++++++++--------------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux/memory.h') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 894d3891292b..fb212a889e65 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -879,7 +879,7 @@ static void remove_memory_block(struct memory_block *memory) * Called under device_hotplug_lock. */ int create_memory_block_devices(unsigned long start, unsigned long size, - struct vmem_altmap *altmap, + int nid, struct vmem_altmap *altmap, struct memory_group *group) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); @@ -893,7 +893,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_OFFLINE, altmap, group); + ret = add_memory_block(block_id, nid, MEM_OFFLINE, altmap, group); if (ret) break; } diff --git a/include/linux/memory.h b/include/linux/memory.h index 40eb70ccb09d..4a29153e372e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -159,7 +159,7 @@ static inline unsigned long memory_block_advised_max_size(void) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - struct vmem_altmap *altmap, + int nid, struct vmem_altmap *altmap, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); 
extern void memory_dev_init(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e9f14de4a9c9..0be83039c3b5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1477,7 +1477,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, } /* create memory block devices after memory was added */ - ret = create_memory_block_devices(cur_start, memblock_size, + ret = create_memory_block_devices(cur_start, memblock_size, nid, params.altmap, group); if (ret) { arch_remove_memory(cur_start, memblock_size, NULL); @@ -1539,8 +1539,16 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) ret = __try_online_node(nid, false); if (ret < 0) - goto error; - new_node = ret; + goto error_memblock_remove; + if (ret) { + node_set_online(nid); + ret = register_one_node(nid); + if (WARN_ON(ret)) { + node_set_offline(nid); + goto error_memblock_remove; + } + new_node = true; + } /* * Self hosted memmap array @@ -1556,24 +1564,13 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, NULL, group); + ret = create_memory_block_devices(start, size, nid, NULL, group); if (ret) { arch_remove_memory(start, size, params.altmap); goto error; } } - if (new_node) { - /* If sysfs file of new node can't be created, cpu on the node - * can't be hot-added. There is no rollback way now. - * So, check by BUG_ON() to catch it reluctantly.. - * We online node here. We can't roll back from here. 
- */ - node_set_online(nid); - ret = register_one_node(nid); - BUG_ON(ret); - } - register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); @@ -1597,6 +1594,11 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) return ret; error: + if (new_node) { + node_set_offline(nid); + unregister_one_node(nid); + } +error_memblock_remove: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); error_mem_hotplug_end: -- cgit v1.2.3 From 0a947c14e48cbf9de222836170282e0167a9e096 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 29 Jul 2025 08:46:36 +0200 Subject: drivers/base: move memory_block_add_nid() into the caller Now the node id only needs to be set for early memory, so move memory_block_add_nid() into the caller and rename it into memory_block_add_nid_early(). This allows us to further simplify the code by dropping the 'context' argument to do_register_memory_block_under_node(). Link: https://lkml.kernel.org/r/20250729064637.51662-4-hare@kernel.org Suggested-by: David Hildenbrand Signed-off-by: Hannes Reinecke Acked-by: David Hildenbrand Acked-by: Oscar Salvador Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- drivers/base/memory.c | 36 ++++++++++++++++++------------------ drivers/base/node.c | 10 ++++------ include/linux/memory.h | 3 +-- 3 files changed, 23 insertions(+), 26 deletions(-) (limited to 'include/linux/memory.h') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index fb212a889e65..6d84a02cfa5d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -769,21 +769,22 @@ static struct zone *early_node_zone_for_memory_block(struct memory_block *mem, #ifdef CONFIG_NUMA /** - * memory_block_add_nid() - Indicate that system RAM falling into this memory - * block device (partially) belongs to the given node. + * memory_block_add_nid_early() - Indicate that early system RAM falling into + * this memory block device (partially) belongs + * to the given node. 
* @mem: The memory block device. * @nid: The node id. - * @context: The memory initialization context. * - * Indicate that system RAM falling into this memory block (partially) belongs - * to the given node. If the context indicates ("early") that we are adding the - * node during node device subsystem initialization, this will also properly - * set/adjust mem->zone based on the zone ranges of the given node. + * Indicate that early system RAM falling into this memory block (partially) + * belongs to the given node. This will also properly set/adjust mem->zone based + * on the zone ranges of the given node. + * + * Memory hotplug handles this on memory block creation, where we can only have + * a single nid span a memory block. */ -void memory_block_add_nid(struct memory_block *mem, int nid, - enum meminit_context context) +void memory_block_add_nid_early(struct memory_block *mem, int nid) { - if (context == MEMINIT_EARLY && mem->nid != nid) { + if (mem->nid != nid) { /* * For early memory we have to determine the zone when setting * the node id and handle multiple nodes spanning a single @@ -797,15 +798,14 @@ void memory_block_add_nid(struct memory_block *mem, int nid, mem->zone = early_node_zone_for_memory_block(mem, nid); else mem->zone = NULL; + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. If we span multiple nodes (not applicable + * to hotplugged memory), zone == NULL will prohibit memory offlining + * and consequently unplug. + */ + mem->nid = nid; } - - /* - * If this memory block spans multiple nodes, we only indicate - * the last processed node. If we span multiple nodes (not applicable - * to hotplugged memory), zone == NULL will prohibit memory offlining - * and consequently unplug. 
- */ - mem->nid = nid; } #endif diff --git a/drivers/base/node.c b/drivers/base/node.c index 67b01d579737..6b6e55a98b79 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -781,13 +781,10 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG static void do_register_memory_block_under_node(int nid, - struct memory_block *mem_blk, - enum meminit_context context) + struct memory_block *mem_blk) { int ret; - memory_block_add_nid(mem_blk, nid, context); - ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, kobject_name(&mem_blk->dev.kobj)); @@ -815,7 +812,7 @@ static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk, { int nid = *(int *)arg; - do_register_memory_block_under_node(nid, mem_blk, MEMINIT_HOTPLUG); + do_register_memory_block_under_node(nid, mem_blk); return 0; } @@ -855,7 +852,8 @@ static void register_memory_blocks_under_nodes(void) if (!mem) continue; - do_register_memory_block_under_node(nid, mem, MEMINIT_EARLY); + memory_block_add_nid_early(mem, nid); + do_register_memory_block_under_node(nid, mem); put_device(&mem->dev); } diff --git a/include/linux/memory.h b/include/linux/memory.h index 4a29153e372e..43d378038ce2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -202,8 +202,7 @@ static inline unsigned long phys_to_block_id(unsigned long phys) } #ifdef CONFIG_NUMA -void memory_block_add_nid(struct memory_block *mem, int nid, - enum meminit_context context); +void memory_block_add_nid_early(struct memory_block *mem, int nid); #endif /* CONFIG_NUMA */ int memory_block_advise_max_size(unsigned long size); unsigned long memory_block_advised_max_size(void); -- cgit v1.2.3