summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMykyta Yatsenko <yatsenko@meta.com>2026-06-07 23:30:41 +0300
committerAlexei Starovoitov <ast@kernel.org>2026-06-08 04:46:13 +0300
commit89edbdfc5d0308cef57b71359331de5c4ddbf763 (patch)
treeecfc5f0f51192bb2207adaa249723b527a154be6
parent1444ee886e6fedf20b9c5bc74a273c6b7d100fdc (diff)
downloadlinux-89edbdfc5d0308cef57b71359331de5c4ddbf763.tar.xz
bpf: Fix NMI/tracepoint re-entry deadlock on lru locks
NMI and tracepoint BPF programs can re-enter the per-CPU or global LRU lock that bpf_lru_pop_free()/push_free() already hold on the same CPU, AA-deadlocking. Lockdep reports "inconsistent {INITIAL USE} -> {IN-NMI}" on &l->lock (syzbot c69a0a2c816716f1e0d5) and "possible recursive locking detected" on &loc_l->lock (syzbot 18b26edb69b2e19f3b33). Prior trylock and rqspinlock based fixes (see links) were nacked because compromised on reliability. This patch converts every LRU lock site to rqspinlock_t and adds a recovery path for some failure windows to avoid node leaks. Failure recovery: - *_pop_free top-level: return NULL; prealloc_lru_pop() already treats that as no-free-element (-ENOMEM). - Cross-CPU steal: skip the victim's locked loc_l, try next CPU. - Post-steal local lock fail: publish stolen node to lockless per-CPU free_llist; next pop on this CPU picks it up. - push_free fail: mark node pending_free=1. __local_list_flush(), __local_list_pop_pending() reclaim the node from pending_list. __bpf_lru_list_shrink_inactive() reclaims the node from inactive list. Nodes from active list are reclaimed by __bpf_lru_list_shrink() or after __bpf_lru_list_rotate_active() demotes it to the inactive. Fixes: 3a08c2fd7634 ("bpf: LRU List") Reported-by: syzbot+c69a0a2c816716f1e0d5@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c69a0a2c816716f1e0d5 Reported-by: syzbot+18b26edb69b2e19f3b33@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=18b26edb69b2e19f3b33 Link: https://lore.kernel.org/bpf/CAPPBnEYO4R+m+SpVc2gNj_x31R6fo1uJvj2bK2YS1P09GWT6kQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/CAPPBnEZmFA3ab8Uc=PEm0bdojZy=7T_F5_+eyZSHyZR3MBG4Vw@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20251030030010.95352-1-dongml2@chinatelecom.cn/ Link: https://lore.kernel.org/bpf/20260119142120.28170-1-leon.hwang@linux.dev/ Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com> Link: https://lore.kernel.org/r/20260607-lru_map_spin-v3-1-bcd9332e911b@meta.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
-rw-r--r--kernel/bpf/bpf_lru_list.c165
-rw-r--r--kernel/bpf/bpf_lru_list.h25
2 files changed, 119 insertions, 71 deletions
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e7a2fc60523f..5ed7cb4b98c0 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -13,23 +13,8 @@
#define PERCPU_FREE_TARGET (4)
#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
-/* Helpers to get the local list index */
-#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET)
-#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
-#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
-/* Local list helpers */
-static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
-{
- return &loc_l->lists[LOCAL_FREE_LIST_IDX];
-}
-
-static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
-{
- return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
-}
-
/* bpf_lru_node helpers */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
@@ -72,6 +57,7 @@ static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
bpf_lru_list_count_dec(l, node->type);
node->type = tgt_free_type;
+ WRITE_ONCE(node->pending_free, 0);
list_move(&node->list, free_list);
}
@@ -87,6 +73,9 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
bpf_lru_node_clear_ref(node);
+ /* Reset pending_free only when moving to the free list */
+ if (tgt_type == BPF_LRU_LIST_T_FREE)
+ WRITE_ONCE(node->pending_free, 0);
list_move(&node->list, &l->lists[tgt_type]);
}
@@ -212,9 +201,11 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
unsigned int i = 0;
list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
- if (bpf_lru_node_is_ref(node)) {
+ if (bpf_lru_node_is_ref(node) &&
+ !READ_ONCE(node->pending_free)) {
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
- } else if (lru->del_from_htab(lru->del_arg, node)) {
+ } else if (READ_ONCE(node->pending_free) ||
+ lru->del_from_htab(lru->del_arg, node)) {
__bpf_lru_node_move_to_free(l, node, free_list,
tgt_free_type);
if (++nshrinked == tgt_nshrink)
@@ -273,7 +264,8 @@ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list,
list) {
- if (lru->del_from_htab(lru->del_arg, node)) {
+ if (READ_ONCE(node->pending_free) ||
+ lru->del_from_htab(lru->del_arg, node)) {
__bpf_lru_node_move_to_free(l, node, free_list,
tgt_free_type);
return 1;
@@ -290,8 +282,10 @@ static void __local_list_flush(struct bpf_lru_list *l,
struct bpf_lru_node *node, *tmp_node;
list_for_each_entry_safe_reverse(node, tmp_node,
- local_pending_list(loc_l), list) {
- if (bpf_lru_node_is_ref(node))
+ &loc_l->pending_list, list) {
+ if (READ_ONCE(node->pending_free))
+ __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_FREE);
+ else if (bpf_lru_node_is_ref(node))
__bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE);
else
__bpf_lru_node_move_in(l, node,
@@ -307,9 +301,12 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
return;
- raw_spin_lock_irqsave(&l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&l->lock, flags)) {
+ WRITE_ONCE(node->pending_free, 1);
+ return;
+ }
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
- raw_spin_unlock_irqrestore(&l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&l->lock, flags);
}
static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
@@ -318,8 +315,10 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
struct bpf_lru_list *l = &lru->common_lru.lru_list;
struct bpf_lru_node *node, *tmp_node;
unsigned int nfree = 0;
+ LIST_HEAD(tmp_free);
- raw_spin_lock(&l->lock);
+ if (raw_res_spin_lock(&l->lock))
+ return;
__local_list_flush(l, loc_l);
@@ -327,7 +326,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE],
list) {
- __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
+ __bpf_lru_node_move_to_free(l, node, &tmp_free,
BPF_LRU_LOCAL_LIST_T_FREE);
if (++nfree == lru->target_free)
break;
@@ -335,10 +334,19 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
if (nfree < lru->target_free)
__bpf_lru_list_shrink(lru, l, lru->target_free - nfree,
- local_free_list(loc_l),
+ &tmp_free,
BPF_LRU_LOCAL_LIST_T_FREE);
- raw_spin_unlock(&l->lock);
+ raw_res_spin_unlock(&l->lock);
+
+ /*
+ * Transfer the harvested nodes from the temporary list_head into
+ * the lockless per-CPU free llist.
+ */
+ list_for_each_entry_safe(node, tmp_node, &tmp_free, list) {
+ list_del(&node->list);
+ llist_add(&node->llist, &loc_l->free_llist);
+ }
}
static void __local_list_add_pending(struct bpf_lru *lru,
@@ -350,22 +358,21 @@ static void __local_list_add_pending(struct bpf_lru *lru,
*(u32 *)((void *)node + lru->hash_offset) = hash;
node->cpu = cpu;
node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
+ WRITE_ONCE(node->pending_free, 0);
bpf_lru_node_clear_ref(node);
- list_add(&node->list, local_pending_list(loc_l));
+ list_add(&node->list, &loc_l->pending_list);
}
static struct bpf_lru_node *
__local_list_pop_free(struct bpf_lru_locallist *loc_l)
{
- struct bpf_lru_node *node;
+ struct llist_node *llnode;
- node = list_first_entry_or_null(local_free_list(loc_l),
- struct bpf_lru_node,
- list);
- if (node)
- list_del(&node->list);
+ llnode = llist_del_first(&loc_l->free_llist);
+ if (!llnode)
+ return NULL;
- return node;
+ return container_of(llnode, struct bpf_lru_node, llist);
}
static struct bpf_lru_node *
@@ -376,10 +383,10 @@ __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
ignore_ref:
/* Get from the tail (i.e. older element) of the pending list. */
- list_for_each_entry_reverse(node, local_pending_list(loc_l),
- list) {
+ list_for_each_entry_reverse(node, &loc_l->pending_list, list) {
if ((!bpf_lru_node_is_ref(node) || force) &&
- lru->del_from_htab(lru->del_arg, node)) {
+ (READ_ONCE(node->pending_free) ||
+ lru->del_from_htab(lru->del_arg, node))) {
list_del(&node->list);
return node;
}
@@ -404,7 +411,8 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
l = per_cpu_ptr(lru->percpu_lru, cpu);
- raw_spin_lock_irqsave(&l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&l->lock, flags))
+ return NULL;
__bpf_lru_list_rotate(lru, l);
@@ -420,7 +428,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
}
- raw_spin_unlock_irqrestore(&l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&l->lock, flags);
return node;
}
@@ -437,7 +445,8 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
loc_l = per_cpu_ptr(clru->local_list, cpu);
- raw_spin_lock_irqsave(&loc_l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+ return NULL;
node = __local_list_pop_free(loc_l);
if (!node) {
@@ -448,17 +457,22 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
if (node)
__local_list_add_pending(lru, loc_l, cpu, node, hash);
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
if (node)
return node;
- /* No free nodes found from the local free list and
+ /*
+ * No free nodes found from the local free list and
* the global LRU list.
*
* Steal from the local free/pending list of the
* current CPU and remote CPU in RR. It starts
* with the loc_l->next_steal CPU.
+ *
+ * Acquire the victim's lock before touching either list. On
+ * acquisition failure (rqspinlock AA or timeout) skip the victim
+ * and try the next CPU.
*/
first_steal = loc_l->next_steal;
@@ -466,24 +480,36 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
do {
steal_loc_l = per_cpu_ptr(clru->local_list, steal);
- raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
-
- node = __local_list_pop_free(steal_loc_l);
- if (!node)
- node = __local_list_pop_pending(lru, steal_loc_l);
-
- raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+ if (!raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags)) {
+ node = __local_list_pop_free(steal_loc_l);
+ if (!node)
+ node = __local_list_pop_pending(lru, steal_loc_l);
+ raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+ }
steal = cpumask_next_wrap(steal, cpu_possible_mask);
} while (!node && steal != first_steal);
loc_l->next_steal = steal;
- if (node) {
- raw_spin_lock_irqsave(&loc_l->lock, flags);
- __local_list_add_pending(lru, loc_l, cpu, node, hash);
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ if (!node)
+ return NULL;
+
+ if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) {
+ /*
+ * The local pending lock can't be acquired (rqspinlock AA
+ * or timeout). Return the stolen node to the per-CPU
+ * free_llist instead of orphaning it; the next pop_free on
+ * this CPU will pick it up.
+ */
+ node->type = BPF_LRU_LOCAL_LIST_T_FREE;
+ bpf_lru_node_clear_ref(node);
+ WRITE_ONCE(node->pending_free, 0);
+ llist_add(&node->llist, &loc_l->free_llist);
+ return NULL;
}
+ __local_list_add_pending(lru, loc_l, cpu, node, hash);
+ raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
return node;
}
@@ -511,18 +537,24 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
- raw_spin_lock_irqsave(&loc_l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) {
+ WRITE_ONCE(node->pending_free, 1);
+ return;
+ }
if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&loc_l->lock,
+ flags);
goto check_lru_list;
}
node->type = BPF_LRU_LOCAL_LIST_T_FREE;
bpf_lru_node_clear_ref(node);
- list_move(&node->list, local_free_list(loc_l));
+ list_del(&node->list);
+
+ raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ llist_add(&node->llist, &loc_l->free_llist);
return;
}
@@ -538,11 +570,14 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
l = per_cpu_ptr(lru->percpu_lru, node->cpu);
- raw_spin_lock_irqsave(&l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&l->lock, flags)) {
+ WRITE_ONCE(node->pending_free, 1);
+ return;
+ }
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
- raw_spin_unlock_irqrestore(&l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&l->lock, flags);
}
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
@@ -565,6 +600,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
node = (struct bpf_lru_node *)(buf + node_offset);
node->type = BPF_LRU_LIST_T_FREE;
+ node->pending_free = 0;
bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
@@ -594,6 +630,7 @@ again:
node = (struct bpf_lru_node *)(buf + node_offset);
node->cpu = cpu;
node->type = BPF_LRU_LIST_T_FREE;
+ node->pending_free = 0;
bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
i++;
@@ -618,14 +655,12 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
{
- int i;
-
- for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
- INIT_LIST_HEAD(&loc_l->lists[i]);
+ INIT_LIST_HEAD(&loc_l->pending_list);
+ init_llist_head(&loc_l->free_llist);
loc_l->next_steal = cpu;
- raw_spin_lock_init(&loc_l->lock);
+ raw_res_spin_lock_init(&loc_l->lock);
}
static void bpf_lru_list_init(struct bpf_lru_list *l)
@@ -640,7 +675,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
- raw_spin_lock_init(&l->lock);
+ raw_res_spin_lock_init(&l->lock);
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index fe2661a58ea9..8d0ee61622af 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -6,11 +6,11 @@
#include <linux/cache.h>
#include <linux/list.h>
-#include <linux/spinlock_types.h>
+#include <linux/llist.h>
+#include <asm/rqspinlock.h>
#define NR_BPF_LRU_LIST_T (3)
#define NR_BPF_LRU_LIST_COUNT (2)
-#define NR_BPF_LRU_LOCAL_LIST_T (2)
#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
enum bpf_lru_list_type {
@@ -22,10 +22,22 @@ enum bpf_lru_list_type {
};
struct bpf_lru_node {
- struct list_head list;
+ /*
+ * A node is in at most one list at a time. The free path on the
+ * per-CPU locallist uses an llist, so share storage via a union.
+ */
+ union {
+ struct list_head list;
+ struct llist_node llist;
+ };
u16 cpu;
u8 type;
u8 ref;
+ /*
+ * Marks nodes whose *_push_free() lock acquire failed; reclaimed
+ * by flush/shrink which honor the flag instead of del_from_htab().
+ */
+ u8 pending_free;
};
struct bpf_lru_list {
@@ -34,13 +46,14 @@ struct bpf_lru_list {
/* The next inactive list rotation starts from here */
struct list_head *next_inactive_rotation;
- raw_spinlock_t lock ____cacheline_aligned_in_smp;
+ rqspinlock_t lock ____cacheline_aligned_in_smp;
};
struct bpf_lru_locallist {
- struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
+ struct list_head pending_list;
+ struct llist_head free_llist;
u16 next_steal;
- raw_spinlock_t lock;
+ rqspinlock_t lock;
};
struct bpf_common_lru {