From 654b33ada4ab5e926cd9c570196fefa7bec7c1df Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 1 Mar 2025 15:06:24 +0300 Subject: proc: fix UAF in proc_get_inode() Fix race between rmmod and /proc/XXX's inode instantiation. The bug is that pde->proc_ops don't belong to /proc, it belongs to a module, therefore dereferencing it after /proc entry has been registered is a bug unless use_pde/unuse_pde() pair has been used. use_pde/unuse_pde can be avoided (2 atomic ops!) because pde->proc_ops never changes so information necessary for inode instantiation can be saved _before_ proc_register() in PDE itself and used later, avoiding pde->proc_ops->... dereference. rmmod lookup sys_delete_module proc_lookup_de pde_get(de); proc_get_inode(dir->i_sb, de); mod->exit() proc_remove remove_proc_subtree proc_entry_rundown(de); free_module(mod); if (S_ISREG(inode->i_mode)) if (de->proc_ops->proc_read_iter) --> As module is already freed, will trigger UAF BUG: unable to handle page fault for address: fffffbfff80a702b PGD 817fc4067 P4D 817fc4067 PUD 817fc0067 PMD 102ef4067 PTE 0 Oops: Oops: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 26 UID: 0 PID: 2667 Comm: ls Tainted: G Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:proc_get_inode+0x302/0x6e0 RSP: 0018:ffff88811c837998 EFLAGS: 00010a06 RAX: dffffc0000000000 RBX: ffffffffc0538140 RCX: 0000000000000007 RDX: 1ffffffff80a702b RSI: 0000000000000001 RDI: ffffffffc0538158 RBP: ffff8881299a6000 R08: 0000000067bbe1e5 R09: 1ffff11023906f20 R10: ffffffffb560ca07 R11: ffffffffb2b43a58 R12: ffff888105bb78f0 R13: ffff888100518048 R14: ffff8881299a6004 R15: 0000000000000001 FS: 00007f95b9686840(0000) GS:ffff8883af100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffbfff80a702b CR3: 0000000117dd2000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: proc_lookup_de+0x11f/0x2e0 __lookup_slow+0x188/0x350 walk_component+0x2ab/0x4f0 path_lookupat+0x120/0x660 filename_lookup+0x1ce/0x560 vfs_statx+0xac/0x150 __do_sys_newstat+0x96/0x110 do_syscall_64+0x5f/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e [adobriyan@gmail.com: don't do 2 atomic ops on the common path] Link: https://lkml.kernel.org/r/3d25ded0-1739-447e-812b-e34da7990dcf@p183 Fixes: 778f3dd5a13c ("Fix procfs compat_ioctl regression") Signed-off-by: Ye Bin Signed-off-by: Alexey Dobriyan Cc: Al Viro Cc: David S. Miller Cc: Signed-off-by: Andrew Morton --- include/linux/proc_fs.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 0b2a89854440..ea62201c74c4 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -20,10 +20,13 @@ enum { * If in doubt, ignore this flag. */ #ifdef MODULE - PROC_ENTRY_PERMANENT = 0U, + PROC_ENTRY_PERMANENT = 0U, #else - PROC_ENTRY_PERMANENT = 1U << 0, + PROC_ENTRY_PERMANENT = 1U << 0, #endif + + PROC_ENTRY_proc_read_iter = 1U << 1, + PROC_ENTRY_proc_compat_ioctl = 1U << 2, }; struct proc_ops { -- cgit v1.2.3 From 39a326e6daba5703a8e4de17cacc7ddb0dc5b07f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 28 Feb 2025 09:53:36 -0800 Subject: mm/damon: respect core layer filters' allowance decision on ops layer Filtering decisions are made in filters evaluation order. Once a decision is made by a filter, filters that scheduled to be evaluated after the decision-made filter should just respect it. This is the intended and documented behavior. Since core layer-handled filters are evaluated before operations layer-handled filters, decisions made on core layer should respected by ops layer. In case of reject filters, the decision is respected, since core layer-rejected regions are not passed to ops layer. But in case of allow filters, ops layer filters don't know if the region has passed to them because it was allowed by core filters or just because it didn't match to any core layer. The current wrong implementation assumes it was due to not matched by any core filters. As a reuslt, the decision is not respected. Pass the missing information to ops layer using a new filed in 'struct damos', and make the ops layer filters respect it. Link: https://lkml.kernel.org/r/20250228175336.42781-1-sj@kernel.org Fixes: 491fee286e56 ("mm/damon/core: support damos_filter->allow") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ mm/damon/core.c | 6 +++++- mm/damon/paddr.c | 3 +++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index af525252b853..c9074d569596 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -470,6 +470,11 @@ struct damos { unsigned long next_apply_sis; /* informs if ongoing DAMOS walk for this scheme is finished */ bool walk_completed; + /* + * If the current region in the filtering stage is allowed by core + * layer-handled filters. If true, operations layer allows it, too. + */ + bool core_filters_allowed; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; diff --git a/mm/damon/core.c b/mm/damon/core.c index c7b981308862..de4396519b83 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1429,9 +1429,13 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, { struct damos_filter *filter; + s->core_filters_allowed = false; damos_for_each_filter(filter, s) { - if (damos_filter_match(ctx, t, r, filter)) + if (damos_filter_match(ctx, t, r, filter)) { + if (filter->allow) + s->core_filters_allowed = true; return !filter->allow; + } } return false; } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 0f9ae14f884d..c834aa217835 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -236,6 +236,9 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) { struct damos_filter *filter; + if (scheme->core_filters_allowed) + return false; + damos_for_each_filter(filter, scheme) { if (damos_pa_filter_match(filter, folio)) return !filter->allow; -- cgit v1.2.3 From 73f839b6d2ed75f281bd75aeb68e81bce373bdee Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 6 Mar 2025 10:31:33 +0800 Subject: mm: memcontrol: fix swap counter leak from offline cgroup Commit 6769183166b3 removed the parameter of id from swap_cgroup_record() and get the memcg id from mem_cgroup_id(folio_memcg(folio)). However, the caller of it may update a different memcg's counter instead of folio_memcg(folio). E.g. in the caller of mem_cgroup_swapout(), @swap_memcg could be different with @memcg and update the counter of @swap_memcg, but swap_cgroup_record() records the wrong memcg's ID. When it is uncharged from __mem_cgroup_uncharge_swap(), the swap counter will leak since the wrong recorded ID. Fix it by bringing the parameter of id back. Link: https://lkml.kernel.org/r/20250306023133.44838-1-songmuchun@bytedance.com Fixes: 6769183166b3 ("mm/swap_cgroup: decouple swap cgroup recording and clearing") Signed-off-by: Muchun Song Reviewed-by: Kairui Song Cc: Chris Li Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/swap_cgroup.h | 4 ++-- mm/memcontrol.c | 4 ++-- mm/swap_cgroup.c | 7 ++++--- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index b5ec038069da..91cdf12190a0 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,7 +6,7 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent); +extern void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent); extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); @@ -15,7 +15,7 @@ extern void swap_cgroup_swapoff(int type); #else static inline -void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4de6acb9b8ec..8f9b35f80e24 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4993,7 +4993,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, entry); + swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; @@ -5055,7 +5055,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) mem_cgroup_id_get_many(memcg, nr_pages - 1); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); - swap_cgroup_record(folio, entry); + swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); return 0; } diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index be39078f255b..1007c30f12e2 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -58,9 +58,11 @@ static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, * entries must not have been charged * * @folio: the folio that the swap entry belongs to + * @id: mem_cgroup ID to be recorded * @ent: the first swap entry to be recorded */ -void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +void swap_cgroup_record(struct folio *folio, unsigned short id, + swp_entry_t ent) { unsigned int nr_ents = folio_nr_pages(folio); struct swap_cgroup *map; @@ -72,8 +74,7 @@ void swap_cgroup_record(struct folio *folio, swp_entry_t ent) map = swap_cgroup_ctrl[swp_type(ent)].map; do { - old = __swap_cgroup_id_xchg(map, offset, - mem_cgroup_id(folio_memcg(folio))); + old = __swap_cgroup_id_xchg(map, offset, id); VM_BUG_ON(old); } while (++offset != end); } -- cgit v1.2.3 From b9c0e49abfca06f1a109acea834bcfc934f33f76 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Mar 2025 14:35:24 +0000 Subject: mm: decline to manipulate the refcount on a slab page Slab pages now have a refcount of 0, so nobody should be trying to manipulate the refcount on them. Doing so has little effect; the object could be freed and reallocated to a different purpose, although the slab itself would not be until the refcount was put making it behave rather like TYPESAFE_BY_RCU. Unfortunately, __iov_iter_get_pages_alloc() does take a refcount. Fix that to not change the refcount, and make put_page() silently not change the refcount. get_page() warns so that we can fix any other callers that need to be changed. Long-term, networking needs to stop taking a refcount on the pages that it uses and rely on the caller to hold whatever references are necessary to make the memory stable. In the medium term, more page types are going to hav a zero refcount, so we'll want to move get_page() and put_page() out of line. Link: https://lkml.kernel.org/r/20250310143544.1216127-1-willy@infradead.org Fixes: 9aec2fb0fd5e (slab: allocate frozen pages) Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Hannes Reinecke Closes: https://lore.kernel.org/all/08c29e4b-2f71-4b6d-8046-27e407214d8c@suse.com/ Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 +++++++- lib/iov_iter.c | 8 ++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..fd1e85b4b48a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1458,7 +1458,10 @@ static inline void folio_get(struct folio *folio) static inline void get_page(struct page *page) { - folio_get(page_folio(page)); + struct folio *folio = page_folio(page); + if (WARN_ON_ONCE(folio_test_slab(folio))) + return; + folio_get(folio); } static inline __must_check bool try_get_page(struct page *page) @@ -1552,6 +1555,9 @@ static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); + if (folio_test_slab(folio)) + return; + /* * For some devmap managed pages we need to catch refcount transition * from 2 to 1: diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 65f550cb5081..8c7fdb7d8c8f 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1190,8 +1190,12 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, if (!n) return -ENOMEM; p = *pages; - for (int k = 0; k < n; k++) - get_page(p[k] = page + k); + for (int k = 0; k < n; k++) { + struct folio *folio = page_folio(page); + p[k] = page + k; + if (!folio_test_slab(folio)) + folio_get(folio); + } maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; -- cgit v1.2.3