From 73bd1227787bfe73eea3d04c63a89cb55db9c23e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 18 Apr 2026 09:41:21 +0800 Subject: rhashtable: Restore insecure_elasticity toggle Some users of rhashtable cannot handle insertion failures, and are happy to accept the consequences of a hash table that having very long chains. Restore the insecure_elasticity toggle for these users. In addition to disabling the chain length checks, this also removes the emergency resize that would otherwise occur when the hash table occupancy hits 100% (an async resize is still scheduled at 75%). Signed-off-by: Herbert Xu Signed-off-by: Tejun Heo --- lib/rhashtable.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 6074ed5f66f3..fb2b7bc137ba 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -538,7 +538,7 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, return NULL; } - if (elasticity <= 0) + if (elasticity <= 0 && !ht->p.insecure_elasticity) return ERR_PTR(-EAGAIN); return ERR_PTR(-ENOENT); @@ -568,7 +568,8 @@ static struct bucket_table *rhashtable_insert_one( if (unlikely(rht_grow_above_max(ht, tbl))) return ERR_PTR(-E2BIG); - if (unlikely(rht_grow_above_100(ht, tbl))) + if (unlikely(rht_grow_above_100(ht, tbl)) && + !ht->p.insecure_elasticity) return ERR_PTR(-EAGAIN); head = rht_ptr(bkt, tbl, hash); -- cgit v1.2.3 From 4fe985292709eeb6a4653c71660f893e26c2f2dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Apr 2026 20:03:26 -1000 Subject: rhashtable: Bounce deferred worker kick through irq_work Inserts past 75% load call schedule_work(&ht->run_work) to kick an async resize. If a caller holds a raw spinlock (e.g. an insecure_elasticity user), schedule_work() under that lock records caller_lock -> pool->lock -> pi_lock -> rq->__lock A cycle forms if any of these locks is acquired in the reverse direction elsewhere. sched_ext, the only current insecure_elasticity user, hits this: it holds scx_sched_lock across rhashtable inserts of sub-schedulers, while scx_bypass() takes rq->__lock -> scx_sched_lock. Exercising the resize path produces: Chain exists of: &pool->lock --> &rq->__lock --> scx_sched_lock Bounce the kick from the insert paths through irq_work so schedule_work() runs from hard IRQ context with the caller's lock no longer held. rht_deferred_worker()'s self-rearm on error stays on schedule_work(&ht->run_work) - the worker runs in process context with no caller lock held, and keeping the self-requeue on @run_work lets cancel_work_sync() in rhashtable_free_and_destroy() drain it. v3: Keep rht_deferred_worker()'s self-rearm on schedule_work(&run_work). Routing it through irq_work in v2 broke cancel_work_sync()'s self-requeue handling - an irq_work queued after irq_work_sync() returned but while cancel_work_sync() was still waiting could fire post-teardown. v2: Bounce unconditionally instead of gating on insecure_elasticity, as suggested by Herbert. Signed-off-by: Tejun Heo Acked-by: Herbert Xu --- include/linux/rhashtable-types.h | 3 +++ include/linux/rhashtable.h | 3 ++- lib/rhashtable.c | 31 ++++++++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 72082428d6c6..fc2f596a6df1 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -77,6 +78,7 @@ struct rhashtable_params { * @p: Configuration parameters * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously + * @run_irq_work: Bounces the @run_work kick through hard IRQ context. * @mutex: Mutex to protect current/future table swapping * @lock: Spin lock to protect walker list * @nelems: Number of elements in table @@ -88,6 +90,7 @@ struct rhashtable { struct rhashtable_params p; bool rhlist; struct work_struct run_work; + struct irq_work run_irq_work; struct mutex mutex; spinlock_t lock; atomic_t nelems; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 7def3f0f556b..ef5230cece36 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -847,7 +848,7 @@ slow_path: rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); data = NULL; out: diff --git a/lib/rhashtable.c b/lib/rhashtable.c index fb2b7bc137ba..7a67ef5b67b6 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -441,10 +441,33 @@ static void rht_deferred_worker(struct work_struct *work) mutex_unlock(&ht->mutex); + /* + * Re-arm via @run_work, not @run_irq_work. + * rhashtable_free_and_destroy() drains async work as irq_work_sync() + * followed by cancel_work_sync(). If this site queued irq_work while + * cancel_work_sync() was waiting for us, irq_work_sync() would already + * have returned and the stale irq_work could fire post-teardown. + * cancel_work_sync() natively handles self-requeue on @run_work. + */ if (err) schedule_work(&ht->run_work); } +/* + * Insert-path callers can run under a raw spinlock (e.g. an insecure_elasticity + * user). Calling schedule_work() under that lock records caller_lock -> + * pool->lock -> pi_lock -> rq->__lock, closing a locking cycle if any of + * these is acquired in the reverse direction elsewhere. Bounce through + * irq_work so the schedule_work() runs with the caller's lock no longer held. + */ +static void rht_deferred_irq_work(struct irq_work *irq_work) +{ + struct rhashtable *ht = container_of(irq_work, struct rhashtable, + run_irq_work); + + schedule_work(&ht->run_work); +} + static int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl) { @@ -477,7 +500,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht, if (err == -EEXIST) err = 0; } else - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); return err; @@ -488,7 +511,7 @@ fail: /* Schedule async rehash to retry allocation in process context. */ if (err == -ENOMEM) - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); return err; } @@ -630,7 +653,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, rht_unlock(tbl, bkt, flags); if (inserted && rht_grow_above_75(ht, tbl)) - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); } } while (!IS_ERR_OR_NULL(new_tbl)); @@ -1085,6 +1108,7 @@ int rhashtable_init_noprof(struct rhashtable *ht, RCU_INIT_POINTER(ht->tbl, tbl); INIT_WORK(&ht->run_work, rht_deferred_worker); + init_irq_work(&ht->run_irq_work, rht_deferred_irq_work); return 0; } @@ -1150,6 +1174,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, struct bucket_table *tbl, *next_tbl; unsigned int i; + irq_work_sync(&ht->run_irq_work); cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); -- cgit v1.2.3 From 77a50e9652ac3c669c6690088bce97d960f5fd17 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 22 Apr 2026 14:43:10 -0400 Subject: MAINTAINERS: update Liam's email address Switching to private email address. Update all contact information Add an entry to mailmap at the same time. Link: https://lore.kernel.org/20260422184310.2682901-1-liam@infradead.org Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 20 ++++++++++---------- include/linux/maple_tree.h | 2 +- lib/maple_tree.c | 2 +- lib/test_maple_tree.c | 4 ++-- tools/testing/radix-tree/maple.c | 2 +- 6 files changed, 16 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/.mailmap b/.mailmap index a8fd13adf226..f1eafd91d2c2 100644 --- a/.mailmap +++ b/.mailmap @@ -496,6 +496,7 @@ Leon Romanovsky Leon Romanovsky Leon Romanovsky Leo Yan +Liam R. Howlett Liam Mark Linas Vepstas Linus Lüssing diff --git a/MAINTAINERS b/MAINTAINERS index d6f017581d54..be2e017b3a9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15399,7 +15399,7 @@ F: include/net/netns/mctp.h F: net/mctp/ MAPLE TREE -M: Liam R. Howlett +M: Liam R. Howlett R: Alice Ryhl R: Andrew Ballance L: maple-tree@lists.infradead.org @@ -16759,7 +16759,7 @@ MEMORY MANAGEMENT - CORE M: Andrew Morton M: David Hildenbrand R: Lorenzo Stoakes -R: Liam R. Howlett +R: Liam R. Howlett R: Vlastimil Babka R: Mike Rapoport R: Suren Baghdasaryan @@ -16895,7 +16895,7 @@ MEMORY MANAGEMENT - MISC M: Andrew Morton M: David Hildenbrand R: Lorenzo Stoakes -R: Liam R. Howlett +R: Liam R. Howlett R: Vlastimil Babka R: Mike Rapoport R: Suren Baghdasaryan @@ -16997,7 +16997,7 @@ M: Andrew Morton M: David Hildenbrand M: Lorenzo Stoakes R: Rik van Riel -R: Liam R. Howlett +R: Liam R. Howlett R: Vlastimil Babka R: Harry Yoo R: Jann Horn @@ -17044,7 +17044,7 @@ M: David Hildenbrand M: Lorenzo Stoakes R: Zi Yan R: Baolin Wang -R: Liam R. Howlett +R: Liam R. Howlett R: Nico Pache R: Ryan Roberts R: Dev Jain @@ -17082,7 +17082,7 @@ F: tools/testing/selftests/mm/uffd-*.[ch] MEMORY MANAGEMENT - RUST M: Alice Ryhl R: Lorenzo Stoakes -R: Liam R. Howlett +R: Liam R. Howlett L: linux-mm@kvack.org L: rust-for-linux@vger.kernel.org S: Maintained @@ -17096,7 +17096,7 @@ F: rust/kernel/page.rs MEMORY MAPPING M: Andrew Morton -M: Liam R. Howlett +M: Liam R. Howlett M: Lorenzo Stoakes R: Vlastimil Babka R: Jann Horn @@ -17128,7 +17128,7 @@ F: tools/testing/vma/ MEMORY MAPPING - LOCKING M: Andrew Morton M: Suren Baghdasaryan -M: Liam R. Howlett +M: Liam R. Howlett M: Lorenzo Stoakes R: Vlastimil Babka R: Shakeel Butt @@ -17143,7 +17143,7 @@ F: mm/mmap_lock.c MEMORY MAPPING - MADVISE (MEMORY ADVICE) M: Andrew Morton -M: Liam R. Howlett +M: Liam R. Howlett M: Lorenzo Stoakes M: David Hildenbrand R: Vlastimil Babka @@ -23370,7 +23370,7 @@ RUST [ALLOC] M: Danilo Krummrich R: Lorenzo Stoakes R: Vlastimil Babka -R: Liam R. Howlett +R: Liam R. Howlett R: Uladzislau Rezki L: rust-for-linux@vger.kernel.org S: Maintained diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0c464eade1d6..4a5631906aff 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -4,7 +4,7 @@ /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle - * Authors: Liam R. Howlett + * Authors: Liam R. Howlett * Matthew Wilcox */ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d18d7ed9ab67..60ae5e6fc1ee 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2,7 +2,7 @@ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation - * Authors: Liam R. Howlett + * Authors: Liam R. Howlett * Matthew Wilcox * Copyright (c) 2023 ByteDance * Author: Peng Zhang diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 434d8a2fdd99..b9367c61e8b5 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2,7 +2,7 @@ /* * test_maple_tree.c: Test the maple tree API * Copyright (c) 2018-2022 Oracle Corporation - * Author: Liam R. Howlett + * Author: Liam R. Howlett * * Any tests that only require the interface of the tree. */ @@ -4021,6 +4021,6 @@ static void __exit maple_tree_harvest(void) module_init(maple_tree_seed); module_exit(maple_tree_harvest); -MODULE_AUTHOR("Liam R. Howlett "); +MODULE_AUTHOR("Liam R. Howlett "); MODULE_DESCRIPTION("maple tree API test module"); MODULE_LICENSE("GPL"); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index feedd5ab7058..0607913a3022 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -2,7 +2,7 @@ /* * maple_tree.c: Userspace testing for maple tree test-suite * Copyright (c) 2018-2022 Oracle Corporation - * Author: Liam R. Howlett + * Author: Liam R. Howlett * * Any tests that require internal knowledge of the tree or threads and other * difficult to handle in kernel tests. -- cgit v1.2.3 From d237f719b2726c0e6d62bfa1543f53b624471929 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 28 Apr 2026 10:28:43 +0200 Subject: lib/fonts: Fix bit position when rotating by 180 degrees MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the horizontal bit position when rotating a glyph by 180°. The original code in rotate_ud() rounded the value in width up to a multiple of 8, aka the bit pitch, and calculated the rotated pixel from that value. The new code stores the glyph's pitch in bit_pitch, but fails to update the rotated pixel's output accordingly. Simply replacing the variable does this. The bug can be reproduced by setting a font with an unaligned width, such as sun12x22, like this: setfont sun12x22 echo 2 > /sys/class/graphics/fbcon/rotate Without the fix, the font looks distorted. Fixes: a30e9e6b018f ("lib/fonts: Refactor glyph-rotation helpers") Closes: https://lore.gitlab.freedesktop.org/drm-ai-reviews/review-patch7-20260407092555.58816-8-tzimmermann@suse.de/ Cc: dri-devel@lists.freedesktop.org Cc: linux-fbdev@vger.kernel.org Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- lib/fonts/font_rotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/fonts/font_rotate.c b/lib/fonts/font_rotate.c index 065e0fc0667b..275406008823 100644 --- a/lib/fonts/font_rotate.c +++ b/lib/fonts/font_rotate.c @@ -106,7 +106,7 @@ static void __font_glyph_rotate_180(const unsigned char *glyph, for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { if (font_glyph_test_bit(glyph, x, y, bit_pitch)) { - font_glyph_set_bit(out, width - (1 + x + shift), height - (1 + y), + font_glyph_set_bit(out, bit_pitch - 1 - x - shift, height - 1 - y, bit_pitch); } } -- cgit v1.2.3 From 17e4c68ff35090d8cb743e3c82c09f92fda1ebda Mon Sep 17 00:00:00 2001 From: David Gow Date: Sat, 25 Apr 2026 11:41:53 +0800 Subject: kunit: config: Enable KUNIT_DEBUGFS by default The KUNIT_DEBUGFS option is currently enabled based on the value of KUNIT_ALL_TESTS, but it really doesn't have anything to do with the set of enabled tests, so just enable it by default anyway. In particular, this shouldn't be only visible if KUNIT_ALL_TESTS is set, which is quite confusing. Link: https://lore.kernel.org/r/20260425034155.53913-1-david@davidgow.net Fixes: beaed42c427d ("kunit: default KUNIT_* fragments to KUNIT_ALL_TESTS") Signed-off-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index 498cc51e493d..f80ca3aeedb0 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -16,8 +16,8 @@ menuconfig KUNIT if KUNIT config KUNIT_DEBUGFS - bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" if !KUNIT_ALL_TESTS - default KUNIT_ALL_TESTS + bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" + default y help Enable debugfs representation for kunit. Currently this consists of /sys/kernel/debug/kunit//results files for each -- cgit v1.2.3 From 8f80b5b227ef9ea422080487715c841856339aed Mon Sep 17 00:00:00 2001 From: David Gow Date: Sat, 25 Apr 2026 11:41:54 +0800 Subject: kunit: config: KUNIT_DEBUGFS should depend on DEBUG_FS CONFIG_KUNIT_DEBUGFS is totally useless without debugfs, so it should depend on CONFIG_DEBUG_FS. Link: https://lore.kernel.org/r/20260425034155.53913-2-david@davidgow.net Fixes: e2219db280e3 ("kunit: add debugfs /sys/kernel/debug/kunit//results display") Signed-off-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index f80ca3aeedb0..94ff8e4089bf 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -17,6 +17,7 @@ if KUNIT config KUNIT_DEBUGFS bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" + depends on DEBUG_FS default y help Enable debugfs representation for kunit. Currently this consists -- cgit v1.2.3 From 09ae540e1d5c02210795911bf5459282d7af04e9 Mon Sep 17 00:00:00 2001 From: Mikhail Gavrilov Date: Thu, 23 Apr 2026 02:33:49 +0500 Subject: rhashtable: drop ht->mutex in rhashtable_free_and_destroy() rhashtable_free_and_destroy() is a single-shot teardown routine: cancel_work_sync() has already quiesced the deferred rehash worker, and the function's documented contract requires the caller to guarantee no other concurrent access to the rhashtable. Under those conditions ht->mutex is not protecting anything -- taking it is a leftover from the original teardown path. That leftover is actively harmful: it closes a circular lock-class dependency with fs_reclaim. The deferred rehash worker takes ht->mutex and then allocates GFP_KERNEL memory in bucket_table_alloc(), establishing &ht->mutex -> fs_reclaim After commit b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure") introduced simple_xattr_ht_free(), which calls rhashtable_free_and_destroy(), the simple_xattrs teardown became reachable from evict() under the dcache shrinker. The subsequent per-subsystem adaptations made the reverse edge concrete in three independent code paths: * commit 52b364fed6e1 ("shmem: adapt to rhashtable-based simple_xattrs with lazy allocation") * commit 5bd97f5c5f24 ("kernfs: adapt to rhashtable-based simple_xattrs with lazy allocation") * commit 50704c391fbf ("pidfs: adapt to rhashtable-based simple_xattrs") Any of the three closes the cycle fs_reclaim -> &ht->mutex which lockdep reports as follows. This particular splat was observed organically on a workstation kernel built from vfs-7.1-rc1.xattr at ~35h uptime under normal mixed workload, with CONFIG_PROVE_LOCKING=y. The path happens to go through kernfs: WARNING: possible circular locking dependency detected 7.0.0-faeab166167f-with-fixes-v1+ #191 Tainted: G U kswapd0/243 is trying to acquire lock: ffff8882e475c0f8 (&ht->mutex){+.+.}-{4:4}, at: rhashtable_free_and_destroy+0x36/0x740 but task is already holding lock: ffffffffa8ad1d00 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x995/0x1600 the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: __lock_acquire+0x506/0xbf0 lock_acquire.part.0+0xc7/0x280 fs_reclaim_acquire+0xd9/0x130 __kvmalloc_node_noprof+0xcd/0xb40 bucket_table_alloc.isra.0+0x5a/0x440 rhashtable_rehash_alloc+0x4e/0xd0 rht_deferred_worker+0x14b/0x440 process_one_work+0x8fd/0x16a0 worker_thread+0x601/0xff0 kthread+0x36b/0x470 ret_from_fork+0x5bf/0x910 ret_from_fork_asm+0x1a/0x30 -> #0 (&ht->mutex){+.+.}-{4:4}: check_prev_add+0xdb/0xce0 validate_chain+0x554/0x780 __lock_acquire+0x506/0xbf0 lock_acquire.part.0+0xc7/0x280 __mutex_lock+0x1b2/0x2550 rhashtable_free_and_destroy+0x36/0x740 kernfs_put.part.0+0x119/0x570 evict+0x3b6/0x9c0 __dentry_kill+0x181/0x540 shrink_dentry_list+0x135/0x440 prune_dcache_sb+0xdb/0x150 super_cache_scan+0x2ff/0x520 do_shrink_slab+0x35a/0xee0 shrink_slab_memcg+0x457/0x950 shrink_slab+0x43b/0x550 shrink_one+0x31a/0x6f0 shrink_many+0x31e/0xc80 shrink_node+0xeb3/0x14a0 balance_pgdat+0x8ed/0x1600 kswapd+0x2f3/0x530 kthread+0x36b/0x470 ret_from_fork+0x5bf/0x910 ret_from_fork_asm+0x1a/0x30 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&ht->mutex); lock(fs_reclaim); lock(&ht->mutex); Note that lockdep tracks lock classes, not instances: the two &ht->mutex sites are on different rhashtable objects (the deferred worker was triggered by some unrelated rhashtable growth), but because rhashtable_init() uses a single static lockdep key for all rhashtables, this is a real class-level cycle. Once reported, lockdep disables itself for the remainder of the boot, masking any subsequent locking bugs. Drop the mutex. After cancel_work_sync() the rehash worker is quiesced and, per this function's contract, no other concurrent access is possible; the tables are therefore owned exclusively by this function and can be walked without any lock held. Switch the table walks from rht_dereference() (which requires ht->mutex to be held under CONFIG_PROVE_RCU) to rcu_dereference_raw(), which has no lockdep annotation. rht_ptr_exclusive() already uses rcu_dereference_protected(p, 1) and needs no change. This is the only place in lib/rhashtable.c where &ht->mutex is acquired from a path reachable under fs_reclaim; the deferred worker is the only other site and it is the forward edge. Removing the acquisition here therefore eliminates the class cycle for all three subsystems that use simple_xattrs, not just the one in the splat above. No locking-semantics change is introduced for correct users; incorrect users would already be racing with rehash worker completion regardless of the mutex. Synthetic reproduction of the splat within a few-minute window was unsuccessful across several attempts (tmpfs and kernfs zombies via cgroupfs with open-fd-through-rmdir, with and without swap, up to ~60k reclaim-path executions of simple_xattr_ht_free() in a single run), consistent with the rare coincidence-of-edges profile of the bug: the forward edge is already registered in /proc/lockdep on any idle system via rht_deferred_worker, but the reverse edge requires evict() to complete kernfs_put()'s final release inside the fs_reclaim critical section, which in my attempts was ordered against rather than interleaved with the worker. Fixes: b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure") Signed-off-by: Mikhail Gavrilov Signed-off-by: Herbert Xu --- lib/rhashtable.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 7a67ef5b67b6..426d4e381f13 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -1166,6 +1166,11 @@ static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, * This function will eventually sleep to wait for an async resize * to complete. The caller is responsible that no further write operations * occurs in parallel. + * + * After cancel_work_sync() has returned, the deferred rehash worker is + * quiesced and, per the contract above, no other concurrent access to the + * rhashtable is possible. The tables are therefore owned exclusively by + * this function and can be walked without ht->mutex held. */ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), @@ -1177,8 +1182,15 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, irq_work_sync(&ht->run_irq_work); cancel_work_sync(&ht->run_work); - mutex_lock(&ht->mutex); - tbl = rht_dereference(ht->tbl, ht); + /* + * Do NOT take ht->mutex here. The rehash worker establishes + * ht->mutex -> fs_reclaim via GFP_KERNEL bucket allocation under + * the mutex; callers on the reclaim path (e.g. simple_xattr_ht_free() + * from evict() under the dcache shrinker for shmem/kernfs/pidfs + * inodes) would otherwise close a circular dependency + * fs_reclaim -> ht->mutex. + */ + tbl = rcu_dereference_raw(ht->tbl); restart: if (free_fn) { for (i = 0; i < tbl->size; i++) { @@ -1187,22 +1199,21 @@ restart: cond_resched(); for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), next = !rht_is_a_nulls(pos) ? - rht_dereference(pos->next, ht) : NULL; + rcu_dereference_raw(pos->next) : NULL; !rht_is_a_nulls(pos); pos = next, next = !rht_is_a_nulls(pos) ? - rht_dereference(pos->next, ht) : NULL) + rcu_dereference_raw(pos->next) : NULL) rhashtable_free_one(ht, pos, free_fn, arg); } } - next_tbl = rht_dereference(tbl->future_tbl, ht); + next_tbl = rcu_dereference_raw(tbl->future_tbl); bucket_table_free(tbl); if (next_tbl) { tbl = next_tbl; goto restart; } - mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); -- cgit v1.2.3 From d1fa83ecac31093a550534a79a33bc7f4ba8fc10 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 28 Apr 2026 18:14:19 +0200 Subject: rhashtable: Add bucket_table_free_atomic() helper rhashtable_insert_rehash() allocates a new bucket table with GFP_ATOMIC, as it is called from an RCU read-side critical section. If rhashtable_rehash_attach() then fails, the new table is freed via kvfree(). This is unsafe, since kvfree() may fall back to vfree() for vmalloc-backed allocations, which can sleep and trigger: BUG: sleeping function called from invalid context Add bucket_table_free_atomic(), which uses kvfree_atomic() so the table can be freed safely from non-sleeping context. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Herbert Xu --- lib/rhashtable.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 426d4e381f13..04b3a808fca9 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -114,6 +114,14 @@ static void bucket_table_free(const struct bucket_table *tbl) kvfree(tbl); } +static void bucket_table_free_atomic(const struct bucket_table *tbl) +{ + if (tbl->nest) + nested_bucket_table_free(tbl); + + kvfree_atomic(tbl); +} + static void bucket_table_free_rcu(struct rcu_head *head) { bucket_table_free(container_of(head, struct bucket_table, rcu)); @@ -496,7 +504,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht, err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { - bucket_table_free(new_tbl); + bucket_table_free_atomic(new_tbl); if (err == -EEXIST) err = 0; } else -- cgit v1.2.3 From ef5581bb30efb939cc2bf093475c6cc85258e5cd Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Fri, 8 May 2026 09:56:36 +0900 Subject: test_kprobes: clear kprobes between test runs Running the kprobes sanity tests twice makes all tests fail and eventually crashes the kernel. [root@martin-riscv-1 ~]# echo 1 > /sys/kernel/debug/kunit/kprobes_test/run ... # Totals: pass:5 fail:0 skip:0 total:5 ok 1 kprobes_test [root@martin-riscv-1 ~]# echo 1 > /sys/kernel/debug/kunit/kprobes_test/run ... # test_kprobe: EXPECTATION FAILED at lib/tests/test_kprobes.c:64 Expected 0 == register_kprobe(&kp), but register_kprobe(&kp) == -22 (0xffffffffffffffea) ... Unable to handle kernel paging request ... The testsuite defines several kprobes and kretprobes as static variables that are preserved across test runs. After register_kprobe and unregister_kprobe, a kprobe contains some leftover data that must be cleared before the kprobe can be registered again. The tests are setting symbol_name to define the probe location. Address and flags must be cleared. The existing code clears some of the probes between subsequent tests, but not between two test runs. The leftover data from a previous test run makes the registrations fail in the next run. Move the cleanups for all kprobes into kprobes_test_init, this function is called before each single test (including the first test of a test run). Link: https://lore.kernel.org/all/20260507134615.1010905-1-martin@kaiser.cx/ Fixes: e44e81c5b90f ("kprobes: convert tests to kunit") Signed-off-by: Martin Kaiser Signed-off-by: Masami Hiramatsu (Google) --- lib/tests/test_kprobes.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/tests/test_kprobes.c b/lib/tests/test_kprobes.c index b7582010125c..06e729e4de05 100644 --- a/lib/tests/test_kprobes.c +++ b/lib/tests/test_kprobes.c @@ -12,6 +12,12 @@ #define div_factor 3 +#define KP_CLEAR(_kp) \ +do { \ + (_kp).addr = NULL; \ + (_kp).flags = 0; \ +} while (0) + static u32 rand1, preh_val, posth_val; static u32 (*target)(u32 value); static u32 (*recursed_target)(u32 value); @@ -125,10 +131,6 @@ static void test_kprobes(struct kunit *test) current_test = test; - /* addr and flags should be cleard for reusing kprobe. */ - kp.addr = NULL; - kp.flags = 0; - KUNIT_EXPECT_EQ(test, 0, register_kprobes(kps, 2)); preh_val = 0; posth_val = 0; @@ -226,9 +228,6 @@ static void test_kretprobes(struct kunit *test) struct kretprobe *rps[2] = {&rp, &rp2}; current_test = test; - /* addr and flags should be cleard for reusing kprobe. */ - rp.kp.addr = NULL; - rp.kp.flags = 0; KUNIT_EXPECT_EQ(test, 0, register_kretprobes(rps, 2)); krph_val = 0; @@ -290,8 +289,6 @@ static void test_stacktrace_on_kretprobe(struct kunit *test) unsigned long myretaddr = (unsigned long)__builtin_return_address(0); current_test = test; - rp3.kp.addr = NULL; - rp3.kp.flags = 0; /* * Run the stacktrace_driver() to record correct return address in @@ -352,8 +349,6 @@ static void test_stacktrace_on_nested_kretprobe(struct kunit *test) struct kretprobe *rps[2] = {&rp3, &rp4}; current_test = test; - rp3.kp.addr = NULL; - rp3.kp.flags = 0; //KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); @@ -367,6 +362,18 @@ static void test_stacktrace_on_nested_kretprobe(struct kunit *test) static int kprobes_test_init(struct kunit *test) { + KP_CLEAR(kp); + KP_CLEAR(kp2); + KP_CLEAR(kp_missed); +#ifdef CONFIG_KRETPROBES + KP_CLEAR(rp.kp); + KP_CLEAR(rp2.kp); +#ifdef CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE + KP_CLEAR(rp3.kp); + KP_CLEAR(rp4.kp); +#endif +#endif + target = kprobe_target; target2 = kprobe_target2; recursed_target = kprobe_recursed_target; -- cgit v1.2.3 From efdadbc180e53fe257a6e85f6bc706cb58088653 Mon Sep 17 00:00:00 2001 From: "Christian A. Ehrhardt" Date: Tue, 21 Apr 2026 09:07:07 +0200 Subject: lib: kunit_iov_iter: fix test fail on powerpc Increase buffer size to accommodate machines with 64K PAGE_SIZE. Link: https://lore.kernel.org/20260421070707.992873-1-lk@c--e.de Fixes: 0913b7554726 ("lib: kunit_iov_iter: add tests for extract_iter_to_sg") Signed-off-by: Christian A. Ehrhardt Reported-by: David Gow Closes: https://lore.kernel.org/34a81ec2-af84-465d-9b5e-7bb5bf01680f@davidgow.net Tested-by: David Gow Tested-by: Josh Law Reviewed-by: Josh Law Signed-off-by: Andrew Morton --- lib/tests/kunit_iov_iter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/tests/kunit_iov_iter.c b/lib/tests/kunit_iov_iter.c index 37bd6eb25896..f02f7b7aa796 100644 --- a/lib/tests/kunit_iov_iter.c +++ b/lib/tests/kunit_iov_iter.c @@ -1128,7 +1128,7 @@ static void __init iov_kunit_iter_to_sg_kvec(struct kunit *test) struct kvec kvec; size_t bufsize; - bufsize = 0x100000; + bufsize = 0x200000; iov_kunit_iter_to_sg_init(test, bufsize, false, &data); kvec.iov_base = data.buffer; @@ -1146,7 +1146,7 @@ static void __init iov_kunit_iter_to_sg_bvec(struct kunit *test) struct bio_vec *bvec; struct iov_iter iter; - bufsize = 0x100000; + bufsize = 0x200000; iov_kunit_iter_to_sg_init(test, bufsize, false, &data); bvec = kunit_kmalloc_array(test, data.npages, sizeof(*bvec), @@ -1173,7 +1173,7 @@ static void __init iov_kunit_iter_to_sg_folioq(struct kunit *test) struct iov_iter iter; size_t bufsize; - bufsize = 0x100000; + bufsize = 0x200000; iov_kunit_iter_to_sg_init(test, bufsize, false, &data); folioq = iov_kunit_create_folioq(test); @@ -1190,7 +1190,7 @@ static void __init iov_kunit_iter_to_sg_xarray(struct kunit *test) struct iov_iter iter; size_t bufsize; - bufsize = 0x100000; + bufsize = 0x200000; iov_kunit_iter_to_sg_init(test, bufsize, false, &data); xarray = iov_kunit_create_xarray(test); @@ -1206,7 +1206,7 @@ static void __init iov_kunit_iter_to_sg_ubuf(struct kunit *test) struct iov_iter iter; size_t bufsize; - bufsize = 0x100000; + bufsize = 0x200000; iov_kunit_iter_to_sg_init(test, bufsize, true, &data); iov_iter_ubuf(&iter, READ, data.ubuf, bufsize); -- cgit v1.2.3 From 602d60ebae0f10bfbc7ba90eee026fdbd0203df3 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 22 Apr 2026 11:42:32 +0200 Subject: vdso/gettimeofday: Reload sequence counter after switch to time page in do_aux() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After switching to the real data pages, the sequence counter needs to be reloaded from there. The code using vdso_read_begin_timens() assumed this worked by 'continue' jumping to the *beginning* of the do-while retry loop. However the 'continue' jumps to the *end* of said loop, evaluating the exit condition. If the data page has a sequence counter of '1' it will match the one from the time namespace page and prematurely exit the retry loop. This would result in garbage returned to the caller. Reload the sequence counter after switching the pages by using an inner while loop again, which will loop at most once. The loop generates slightly better code than an explicit reload through 'seq = vdso_read_begin()'. Fixes: ed78b7b2c5ae ("vdso/gettimeofday: Add a helper to read the sequence lock of a time namespace aware clock") Reported-by: Ricardo Ribalda Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Tested-by: Ricardo Ribalda Reviewed-by: Christophe Leroy (CS GROUP) Link: https://patch.msgid.link/20260422-vdso-aux-timens-loop-v1-1-e2dd8c7164cc@linutronix.de Closes: https://lore.kernel.org/lkml/CANiDSCsOy0P1if-gJZqOM5pTJ0RDcwVfru1B7KFbTOEMqjPKJw@mail.gmail.com/ --- lib/vdso/gettimeofday.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index a5798bd26d20..da224011fafd 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -248,11 +248,10 @@ bool do_aux(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_ti vc = &vd->aux_clock_data[idx]; do { - if (vdso_read_begin_timens(vc, &seq)) { + while (vdso_read_begin_timens(vc, &seq)) { + /* Re-read from the real time data page, reload seq by looping */ vd = __arch_get_vdso_u_timens_data(vd); vc = &vd->aux_clock_data[idx]; - /* Re-read from the real time data page */ - continue; } /* Auxclock disabled? */ -- cgit v1.2.3 From 5f41161059fd0f1bbf18c90f3180e38cc45a14eb Mon Sep 17 00:00:00 2001 From: Helen Koike Date: Mon, 11 May 2026 18:53:05 -0300 Subject: debugobjects: Do not fill_pool() if pi_blocked_on On RT enabled kernels, fill_pool() ends up calling rtlock_lock(), which asserts if current::pi_blocked_on is set, because a task can obviously only block on one lock as otherwise the priority inheritenace chain gets corrupted. Prevent this by expanding the conditional to take current::pi_blocked_on into account. Fixes: 4bedcc28469a ("debugobjects: Make them PREEMPT_RT aware") Reported-by: syzbot+b8ca586b9fc235f0c0df@syzkaller.appspotmail.com Signed-off-by: Helen Koike Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260511215359.3351259-1-koike@igalia.com Closes: https://syzkaller.appspot.com/bug?extid=b8ca586b9fc235f0c0df --- lib/debugobjects.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 12e2e42e6a31..772ddabcbe7d 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -711,6 +711,15 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket return NULL; } +static inline bool debug_objects_is_pi_blocked_on(void) +{ +#ifdef CONFIG_RT_MUTEXES + return current->pi_blocked_on != NULL; +#else + return false; +#endif +} + static void debug_objects_fill_pool(void) { if (!static_branch_likely(&obj_cache_enabled)) @@ -727,11 +736,12 @@ static void debug_objects_fill_pool(void) /* * On RT enabled kernels the pool refill must happen in preemptible - * context -- for !RT kernels we rely on the fact that spinlock_t and - * raw_spinlock_t are basically the same type and this lock-type - * inversion works just fine. + * context and not enqueued on an rt_mutex -- for !RT kernels we rely + * on the fact that spinlock_t and raw_spinlock_t are basically the + * same type and this lock-type inversion works just fine. */ - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || system_state < SYSTEM_SCHEDULING || + (preemptible() && !debug_objects_is_pi_blocked_on())) { /* * Annotate away the spinlock_t inside raw_spinlock_t warning * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching -- cgit v1.2.3