From d8010d4ba43e9f790925375a7de100604a5e2dba Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 11 Sep 2024 10:53:08 +0200 Subject: x86/bugs: Add a Transient Scheduler Attacks mitigation Add the required features detection glue to bugs.c et all in order to support the TSA mitigation. Co-developed-by: Kim Phillips Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Pawan Gupta --- include/linux/cpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 96a3a0d6a60e..6378370a952f 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -82,6 +82,7 @@ extern ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From 51a4273dcab39dd1e850870945ccec664352d383 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Tue, 8 Apr 2025 15:02:11 +0530 Subject: KVM: SVM: Add missing member in SNP_LAUNCH_START command structure The sev_data_snp_launch_start structure should include a 4-byte desired_tsc_khz field before the gosvw field, which was missed in the initial implementation. As a result, the structure is 4 bytes shorter than expected by the firmware, causing the gosvw field to start 4 bytes early. Fix this by adding the missing 4-byte member for the desired TSC frequency. Fixes: 3a45dc2b419e ("crypto: ccp: Define the SEV-SNP commands") Cc: stable@vger.kernel.org Suggested-by: Tom Lendacky Reviewed-by: Tom Lendacky Tested-by: Vaishali Thakkar Signed-off-by: Nikunj A Dadhania Link: https://lore.kernel.org/r/20250408093213.57962-3-nikunj@amd.com Signed-off-by: Sean Christopherson --- include/linux/psp-sev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 0b3a36bdaa90..0f5f94137f6d 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -594,6 +594,7 @@ struct sev_data_snp_addr { * @imi_en: launch flow is launching an IMI (Incoming Migration Image) for the * purpose of guest-assisted migration. * @rsvd: reserved + * @desired_tsc_khz: hypervisor desired mean TSC freq in kHz of the guest * @gosvw: guest OS-visible workarounds, as defined by hypervisor */ struct sev_data_snp_launch_start { @@ -603,6 +604,7 @@ struct sev_data_snp_launch_start { u32 ma_en:1; /* In */ u32 imi_en:1; /* In */ u32 rsvd:30; + u32 desired_tsc_khz; /* In */ u8 gosvw[16]; /* In */ } __packed; -- cgit v1.2.3 From c5fd399a24c8e2865524361f7dc4d4a6899be4f4 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Tue, 1 Jul 2025 17:55:41 +1000 Subject: wifi: mac80211: correctly identify S1G short beacon mac80211 identifies a short beacon by the presence of the next TBTT field, however the standard actually doesn't explicitly state that the next TBTT can't be in a long beacon or even that it is required in a short beacon - and as a result this validation does not work for all vendor implementations. The standard explicitly states that an S1G long beacon shall contain the S1G beacon compatibility element as the first element in a beacon transmitted at a TBTT that is not a TSBTT (Target Short Beacon Transmission Time) as per IEEE80211-2024 11.1.3.10.1. This is validated by 9.3.4.3 Table 9-76 which states that the S1G beacon compatibility element is only allowed in the full set and is not allowed in the minimum set of elements permitted for use within short beacons. Correctly identify short beacons by the lack of an S1G beacon compatibility element as the first element in an S1G beacon frame. Fixes: 9eaffe5078ca ("cfg80211: convert S1G beacon to scan results") Signed-off-by: Simon Wadsworth Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250701075541.162619-1-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 45 +++++++++++++++++++++++++++++++++------------ net/mac80211/mlme.c | 7 +++++-- 2 files changed, 38 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 22f39e5e2ff1..996be3c2cff0 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -662,18 +662,6 @@ static inline bool ieee80211_s1g_has_cssid(__le16 fc) (fc & cpu_to_le16(IEEE80211_S1G_BCN_CSSID)); } -/** - * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon - * @fc: frame control bytes in little-endian byteorder - * Return: whether or not the frame is an S1G short beacon, - * i.e. it is an S1G beacon with 'next TBTT' flag set - */ -static inline bool ieee80211_is_s1g_short_beacon(__le16 fc) -{ - return ieee80211_is_s1g_beacon(fc) && - (fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT)); -} - /** * ieee80211_is_atim - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ATIM * @fc: frame control bytes in little-endian byteorder @@ -4901,6 +4889,39 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb) return false; } +/** + * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon + * @fc: frame control bytes in little-endian byteorder + * @variable: pointer to the beacon frame elements + * @variable_len: length of the frame elements + * Return: whether or not the frame is an S1G short beacon. As per + * IEEE80211-2024 11.1.3.10.1, The S1G beacon compatibility element shall + * always be present as the first element in beacon frames generated at a + * TBTT (Target Beacon Transmission Time), so any frame not containing + * this element must have been generated at a TSBTT (Target Short Beacon + * Transmission Time) that is not a TBTT. Additionally, short beacons are + * prohibited from containing the S1G beacon compatibility element as per + * IEEE80211-2024 9.3.4.3 Table 9-76, so if we have an S1G beacon with + * either no elements or the first element is not the beacon compatibility + * element, we have a short beacon. + */ +static inline bool ieee80211_is_s1g_short_beacon(__le16 fc, const u8 *variable, + size_t variable_len) +{ + if (!ieee80211_is_s1g_beacon(fc)) + return false; + + /* + * If the frame does not contain at least 1 element (this is perfectly + * valid in a short beacon) and is an S1G beacon, we have a short + * beacon. + */ + if (variable_len < 2) + return true; + + return variable[0] != WLAN_EID_S1G_BCN_COMPAT; +} + struct element { u8 id; u8 datalen; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2d46d4af60d7..7ddb8e77b4c7 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7195,6 +7195,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; struct ieee80211_mgmt *mgmt = (void *) hdr; + struct ieee80211_ext *ext = NULL; size_t baselen; struct ieee802_11_elems *elems; struct ieee80211_local *local = sdata->local; @@ -7220,7 +7221,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, /* Process beacon from the current BSS */ bssid = ieee80211_get_bssid(hdr, len, sdata->vif.type); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { - struct ieee80211_ext *ext = (void *) mgmt; + ext = (void *)mgmt; variable = ext->u.s1g_beacon.variable + ieee80211_s1g_optional_len(ext->frame_control); } @@ -7407,7 +7408,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, } if ((ncrc == link->u.mgd.beacon_crc && link->u.mgd.beacon_crc_valid) || - ieee80211_is_s1g_short_beacon(mgmt->frame_control)) + (ext && ieee80211_is_s1g_short_beacon(ext->frame_control, + parse_params.start, + parse_params.len))) goto free; link->u.mgd.beacon_crc = ncrc; link->u.mgd.beacon_crc_valid = true; -- cgit v1.2.3 From 4cdf1bdd45ac78a088773722f009883af30ad318 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Fri, 4 Jul 2025 11:21:34 +0200 Subject: block: reject bs > ps block devices when THP is disabled If THP is disabled and when a block device with logical block size > page size is present, the following null ptr deref panic happens during boot: [ [13.2 mK AOSAN: null-ptr-deref in range [0x0000000000000000-0x0000000000K0 0 0[07] [ 13.017749] RIP: 0010:create_empty_buffers+0x3b/0x380 [ 13.025448] Call Trace: [ 13.025692] [ 13.025895] block_read_full_folio+0x610/0x780 [ 13.026379] ? __pfx_blkdev_get_block+0x10/0x10 [ 13.027008] ? __folio_batch_add_and_move+0x1fa/0x2b0 [ 13.027548] ? __pfx_blkdev_read_folio+0x10/0x10 [ 13.028080] filemap_read_folio+0x9b/0x200 [ 13.028526] ? __pfx_filemap_read_folio+0x10/0x10 [ 13.029030] ? __filemap_get_folio+0x43/0x620 [ 13.029497] do_read_cache_folio+0x155/0x3b0 [ 13.029962] ? __pfx_blkdev_read_folio+0x10/0x10 [ 13.030381] read_part_sector+0xb7/0x2a0 [ 13.030805] read_lba+0x174/0x2c0 [ 13.045348] nvme_scan_ns+0x684/0x850 [nvme_core] [ 13.045858] ? __pfx_nvme_scan_ns+0x10/0x10 [nvme_core] [ 13.046414] ? _raw_spin_unlock+0x15/0x40 [ 13.046843] ? __switch_to+0x523/0x10a0 [ 13.047253] ? kvm_clock_get_cycles+0x14/0x30 [ 13.047742] ? __pfx_nvme_scan_ns_async+0x10/0x10 [nvme_core] [ 13.048353] async_run_entry_fn+0x96/0x4f0 [ 13.048787] process_one_work+0x667/0x10a0 [ 13.049219] worker_thread+0x63c/0xf60 As large folio support depends on THP, only allow bs > ps block devices if THP is enabled. Fixes: 47dd67532303 ("block/bdev: lift block size restrictions to 64k") Signed-off-by: Pankaj Raghav Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20250704092134.289491-1-p.raghav@samsung.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 332b56f323d9..369a8e63c865 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -269,11 +269,16 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER) * however we constrain this to what we can validate and test. */ #define BLK_MAX_BLOCK_SIZE SZ_64K +#else +#define BLK_MAX_BLOCK_SIZE PAGE_SIZE +#endif + /* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) -- cgit v1.2.3 From fc582cd26e888b0652bc1494f252329453fd3b23 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Jul 2025 11:00:32 -0600 Subject: io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU syzbot reports that defer/local task_work adding via msg_ring can hit a request that has been freed: CPU: 1 UID: 0 PID: 19356 Comm: iou-wrk-19354 Not tainted 6.16.0-rc4-syzkaller-00108-g17bbde2e1716 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xd2/0x2b0 mm/kasan/report.c:521 kasan_report+0x118/0x150 mm/kasan/report.c:634 io_req_local_work_add io_uring/io_uring.c:1184 [inline] __io_req_task_work_add+0x589/0x950 io_uring/io_uring.c:1252 io_msg_remote_post io_uring/msg_ring.c:103 [inline] io_msg_data_remote io_uring/msg_ring.c:133 [inline] __io_msg_ring_data+0x820/0xaa0 io_uring/msg_ring.c:151 io_msg_ring_data io_uring/msg_ring.c:173 [inline] io_msg_ring+0x134/0xa00 io_uring/msg_ring.c:314 __io_issue_sqe+0x17e/0x4b0 io_uring/io_uring.c:1739 io_issue_sqe+0x165/0xfd0 io_uring/io_uring.c:1762 io_wq_submit_work+0x6e9/0xb90 io_uring/io_uring.c:1874 io_worker_handle_work+0x7cd/0x1180 io_uring/io-wq.c:642 io_wq_worker+0x42f/0xeb0 io_uring/io-wq.c:696 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 which is supposed to be safe with how requests are allocated. But msg ring requests alloc and free on their own, and hence must defer freeing to a sane time. Add an rcu_head and use kfree_rcu() in both spots where requests are freed. Only the one in io_msg_tw_complete() is strictly required as it has been visible on the other ring, but use it consistently in the other spot as well. This should not cause any other issues outside of KASAN rightfully complaining about it. Link: https://lore.kernel.org/io-uring/686cd2ea.a00a0220.338033.0007.GAE@google.com/ Reported-by: syzbot+54cbbfb4db9145d26fc2@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Fixes: 0617bb500bfa ("io_uring/msg_ring: improve handling of target CQE posting") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ io_uring/msg_ring.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 2922635986f5..a7efcec2e3d0 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -698,6 +698,8 @@ struct io_kiocb { struct hlist_node hash_node; /* For IOPOLL setup queues, with hybrid polling */ u64 iopoll_start; + /* for private io_kiocb freeing */ + struct rcu_head rcu_head; }; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 71400d6cefc8..4c2578f2efcb 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -82,7 +82,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) spin_unlock(&ctx->msg_lock); } if (req) - kmem_cache_free(req_cachep, req); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } @@ -90,7 +90,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, int res, u32 cflags, u64 user_data) { if (!READ_ONCE(ctx->submitter_task)) { - kmem_cache_free(req_cachep, req); + kfree_rcu(req, rcu_head); return -EOWNERDEAD; } req->opcode = IORING_OP_NOP; -- cgit v1.2.3 From 82241a83cd15aaaf28200a40ad1a8b480012edaf Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 5 Jun 2025 20:58:29 +0800 Subject: mm: fix the inaccurate memory statistics issue for users On some large machines with a high number of CPUs running a 64K pagesize kernel, we found that the 'RES' field is always 0 displayed by the top command for some processes, which will cause a lot of confusion for users. PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 875525 root 20 0 12480 0 0 R 0.3 0.0 0:00.08 top 1 root 20 0 172800 0 0 S 0.0 0.0 0:04.52 systemd The main reason is that the batch size of the percpu counter is quite large on these machines, caching a significant percpu value, since converting mm's rss stats into percpu_counter by commit f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter"). Intuitively, the batch number should be optimized, but on some paths, performance may take precedence over statistical accuracy. Therefore, introducing a new interface to add the percpu statistical count and display it to users, which can remove the confusion. In addition, this change is not expected to be on a performance-critical path, so the modification should be acceptable. In addition, the 'mm->rss_stat' is updated by using add_mm_counter() and dec/inc_mm_counter(), which are all wrappers around percpu_counter_add_batch(). In percpu_counter_add_batch(), there is percpu batch caching to avoid 'fbc->lock' contention. This patch changes task_mem() and task_statm() to get the accurate mm counters under the 'fbc->lock', but this should not exacerbate kernel 'mm->rss_stat' lock contention due to the percpu batch caching of the mm counters. The following test also confirm the theoretical analysis. I run the stress-ng that stresses anon page faults in 32 threads on my 32 cores machine, while simultaneously running a script that starts 32 threads to busy-loop pread each stress-ng thread's /proc/pid/status interface. From the following data, I did not observe any obvious impact of this patch on the stress-ng tests. w/o patch: stress-ng: info: [6848] 4,399,219,085,152 CPU Cycles 67.327 B/sec stress-ng: info: [6848] 1,616,524,844,832 Instructions 24.740 B/sec (0.367 instr. per cycle) stress-ng: info: [6848] 39,529,792 Page Faults Total 0.605 M/sec stress-ng: info: [6848] 39,529,792 Page Faults Minor 0.605 M/sec w/patch: stress-ng: info: [2485] 4,462,440,381,856 CPU Cycles 68.382 B/sec stress-ng: info: [2485] 1,615,101,503,296 Instructions 24.750 B/sec (0.362 instr. per cycle) stress-ng: info: [2485] 39,439,232 Page Faults Total 0.604 M/sec stress-ng: info: [2485] 39,439,232 Page Faults Minor 0.604 M/sec On comparing a very simple app which just allocates & touches some memory against v6.1 (which doesn't have f1a7941243c1) and latest Linus tree (4c06e63b9203) I can see that on latest Linus tree the values for VmRSS, RssAnon and RssFile from /proc/self/status are all zeroes while they do report values on v6.1 and a Linus tree with this patch. Link: https://lkml.kernel.org/r/f4586b17f66f97c174f7fd1f8647374fdb53de1c.1749119050.git.baolin.wang@linux.alibaba.com Fixes: f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") Signed-off-by: Baolin Wang Reviewed-by: Aboorva Devarajan Tested-by: Aboorva Devarajan Tested-by Donet Tom Acked-by: Shakeel Butt Acked-by: SeongJae Park Acked-by: Michal Hocko Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 14 +++++++------- include/linux/mm.h | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4be91eb6ea5c..751479eb128f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -36,9 +36,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; - anon = get_mm_counter(mm, MM_ANONPAGES); - file = get_mm_counter(mm, MM_FILEPAGES); - shmem = get_mm_counter(mm, MM_SHMEMPAGES); + anon = get_mm_counter_sum(mm, MM_ANONPAGES); + file = get_mm_counter_sum(mm, MM_FILEPAGES); + shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES); /* * Note: to minimize their overhead, mm maintains hiwater_vm and @@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = min(text, mm->exec_vm << PAGE_SHIFT); lib = (mm->exec_vm << PAGE_SHIFT) - text; - swap = get_mm_counter(mm, MM_SWAPENTS); + swap = get_mm_counter_sum(mm, MM_SWAPENTS); SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); @@ -92,12 +92,12 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES) + - get_mm_counter(mm, MM_SHMEMPAGES); + *shared = get_mm_counter_sum(mm, MM_FILEPAGES) + + get_mm_counter_sum(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->data_vm + mm->stack_vm; - *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); + *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES); return mm->total_vm; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ef2ba0c667a..fa538feaa8d9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2568,6 +2568,11 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) return percpu_counter_read_positive(&mm->rss_stat[member]); } +static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member) +{ + return percpu_counter_sum_positive(&mm->rss_stat[member]); +} + void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) -- cgit v1.2.3 From db6cc3f4ac2e6cdc898fc9cbc8b32ae1bf56bdad Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 4 Jul 2025 21:56:20 +0800 Subject: Revert "sched/numa: add statistics of numa balance task" This reverts commit ad6b26b6a0a79166b53209df2ca1cf8636296382. This commit introduces per-memcg/task NUMA balance statistics, but unfortunately it introduced a NULL pointer exception due to the following race condition: After a swap task candidate was chosen, its mm_struct pointer was set to NULL due to task exit. Later, when performing the actual task swapping, the p->mm caused the problem. CPU0 CPU1 : ... task_numa_migrate task_numa_find_cpu task_numa_compare # a normal task p is chosen env->best_task = p # p exit: exit_signals(p); p->flags |= PF_EXITING exit_mm p->mm = NULL; migrate_swap_stop __migrate_swap_task((arg->src_task, arg->dst_cpu) count_memcg_event_mm(p->mm, NUMA_TASK_SWAP)# p->mm is NULL task_lock() should be held and the PF_EXITING flag needs to be checked to prevent this from happening. After discussion, the conclusion was that adding a lock is not worthwhile for some statistics calculations. Revert the change and rely on the tracepoint for this purpose. Link: https://lkml.kernel.org/r/20250704135620.685752-1-yu.c.chen@intel.com Link: https://lkml.kernel.org/r/20250708064917.BBD13C4CEED@smtp.kernel.org Fixes: ad6b26b6a0a7 ("sched/numa: add statistics of numa balance task") Signed-off-by: Chen Yu Reported-by: Jirka Hladky Closes: https://lore.kernel.org/all/CAE4VaGBLJxpd=NeRJXpSCuw=REhC5LWJpC29kDy-Zh2ZDyzQZA@mail.gmail.com/ Reported-by: Srikanth Aithal Reported-by: Suneeth D Acked-by: Michal Hocko Cc: Borislav Petkov Cc: Ingo Molnar Cc: Jiri Hladky Cc: Libo Chen Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 6 ------ include/linux/sched.h | 4 ---- include/linux/vm_event_item.h | 2 -- kernel/sched/core.c | 9 ++------- kernel/sched/debug.c | 4 ---- mm/memcontrol.c | 2 -- mm/vmstat.c | 2 -- 7 files changed, 2 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0cc35a14afbe..bd98ea3175ec 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1732,12 +1732,6 @@ The following nested keys are defined. numa_hint_faults (npn) Number of NUMA hinting faults. - numa_task_migrated (npn) - Number of task migration by NUMA balancing. - - numa_task_swapped (npn) - Number of task swap by NUMA balancing. - pgdemote_kswapd Number of pages demoted by kswapd. diff --git a/include/linux/sched.h b/include/linux/sched.h index 4f78a64beb52..aa9c5be7a632 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -548,10 +548,6 @@ struct sched_statistics { u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; -#ifdef CONFIG_NUMA_BALANCING - u64 numa_task_migrated; - u64 numa_task_swapped; -#endif u64 nr_wakeups; u64 nr_wakeups_sync; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 91a3ce9a2687..9e15a088ba38 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -66,8 +66,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NUMA_HINT_FAULTS, NUMA_HINT_FAULTS_LOCAL, NUMA_PAGE_MIGRATE, - NUMA_TASK_MIGRATE, - NUMA_TASK_SWAP, #endif #ifdef CONFIG_MIGRATION PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec68fc686bd7..81c6df746df1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3362,10 +3362,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { - __schedstat_inc(p->stats.numa_task_swapped); - count_vm_numa_event(NUMA_TASK_SWAP); - count_memcg_event_mm(p->mm, NUMA_TASK_SWAP); - if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; struct rq_flags srf, drf; @@ -7939,9 +7935,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; - __schedstat_inc(p->stats.numa_task_migrated); - count_vm_numa_event(NUMA_TASK_MIGRATE); - count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE); + /* TODO: This is not properly updating schedstats */ + trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9d71baf08075..557246880a7e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1210,10 +1210,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_failed_migrations_running); P_SCHEDSTAT(nr_failed_migrations_hot); P_SCHEDSTAT(nr_forced_migrations); -#ifdef CONFIG_NUMA_BALANCING - P_SCHEDSTAT(numa_task_migrated); - P_SCHEDSTAT(numa_task_swapped); -#endif P_SCHEDSTAT(nr_wakeups); P_SCHEDSTAT(nr_wakeups_sync); P_SCHEDSTAT(nr_wakeups_migrate); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 902da8a9c643..70fdeda1120b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -474,8 +474,6 @@ static const unsigned int memcg_vm_event_stat[] = { NUMA_PAGE_MIGRATE, NUMA_PTE_UPDATES, NUMA_HINT_FAULTS, - NUMA_TASK_MIGRATE, - NUMA_TASK_SWAP, #endif }; diff --git a/mm/vmstat.c b/mm/vmstat.c index 429ae5339bfe..a78d70ddeacd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1346,8 +1346,6 @@ const char * const vmstat_text[] = { "numa_hint_faults", "numa_hint_faults_local", "numa_pages_migrated", - "numa_task_migrated", - "numa_task_swapped", #endif #ifdef CONFIG_MIGRATION "pgmigrate_success", -- cgit v1.2.3 From a8b289f0f2dcbadd8c207ad8f33cf7ba2b4eb088 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 10 Jul 2025 10:00:12 +0200 Subject: irqchip/irq-msi-lib: Fix build with PCI disabled The armada-370-xp irqchip fails in some randconfig builds because of a missing declaration: In file included from drivers/irqchip/irq-armada-370-xp.c:23: include/linux/irqchip/irq-msi-lib.h:25:39: error: 'struct msi_domain_info' declared inside parameter list will not be visible outside of this definition or declaration [-Werror] Add a forward declaration for the msi_domain_info structure. [ tglx: Fixed up the subsystem prefix. Is it really that hard to get right? ] Fixes: e51b27438a10 ("irqchip: Make irq-msi-lib.h globally available") Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Link: https://lore.kernel.org/all/20250710080021.2303640-1-arnd@kernel.org --- include/linux/irqchip/irq-msi-lib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/irq-msi-lib.h b/include/linux/irqchip/irq-msi-lib.h index dd8d1d138544..224ac28e88d7 100644 --- a/include/linux/irqchip/irq-msi-lib.h +++ b/include/linux/irqchip/irq-msi-lib.h @@ -17,6 +17,7 @@ #define MATCH_PLATFORM_MSI BIT(DOMAIN_BUS_PLATFORM_MSI) +struct msi_domain_info; int msi_lib_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); -- cgit v1.2.3