From 0ccef7079ea8d5f7b896b14be7e400022ff240c6 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:28:59 -0800 Subject: bpf: Select bpf_local_storage_map_bucket based on bpf_local_storage A later bpf_local_storage refactor will acquire all locks before performing any update. To simplified the number of locks needed to take in bpf_local_storage_map_update(), determine the bucket based on the local_storage an selem belongs to instead of the selem pointer. Currently, when a new selem needs to be created to replace the old selem in bpf_local_storage_map_update(), locks of both buckets need to be acquired to prevent racing. This can be simplified if the two selem belongs to the same bucket so that only one bucket needs to be locked. Therefore, instead of hashing selem, hashing the local_storage pointer the selem belongs. Performance wise, this is slightly better as update now requires locking one bucket. It should not change the level of contention on one bucket as the pointers to local storages of selems in a map are just as unique as pointers to selems. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-2-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 66432248cd81..2638487425b8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -179,6 +179,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * -- cgit v1.2.3 From fd103ffc57c9a3b8b76bc852ffae5eb630a6ded4 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:01 -0800 Subject: bpf: Convert bpf_selem_link_map to failable To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock, convert bpf_selem_link_map() to failable. It still always succeeds and returns 0 until the change happens. No functional change. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-4-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 6 +++--- kernel/bpf/bpf_local_storage.c | 8 +++++--- net/core/bpf_sk_storage.c | 9 ++++++++- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 2638487425b8..709506e982a6 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -178,9 +178,9 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem); +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 6fa71502c7d7..2f94ca4a4475 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -365,9 +365,9 @@ static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) hlist_del_init_rcu(&selem->map_node); } -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem) +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b; unsigned long flags; @@ -376,6 +376,8 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_lock_irqsave(&b->lock, flags); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); + + return 0; } static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index e36273e4fcbd..0b85d8f2c17e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -191,7 +191,14 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - bpf_selem_link_map(smap, new_sk_storage, copy_selem); + ret = bpf_selem_link_map(smap, new_sk_storage, copy_selem); + if (ret) { + bpf_selem_free(copy_selem, true); + atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); -- cgit v1.2.3 From 403e935f915896243ff93f9a2ff44e5bb6619032 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:02 -0800 Subject: bpf: Convert bpf_selem_unlink to failable To prepare changing both bpf_local_storage_map_bucket::lock and bpf_local_storage::lock to rqspinlock, convert bpf_selem_unlink() to failable. It still always succeeds and returns 0 until the change happens. No functional change. Open code bpf_selem_unlink_storage() in the only caller, bpf_selem_unlink(), since unlink_map and unlink_storage must be done together after all the necessary locks are acquired. For bpf_local_storage_map_free(), ignore the return from bpf_selem_unlink() for now. A later patch will allow it to unlink selems even when failing to acquire locks. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-5-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/bpf_cgrp_storage.c | 3 +- kernel/bpf/bpf_inode_storage.c | 4 +-- kernel/bpf/bpf_local_storage.c | 71 +++++++++++++++++++-------------------- kernel/bpf/bpf_task_storage.c | 4 +-- net/core/bpf_sk_storage.c | 4 +-- 6 files changed, 39 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 709506e982a6..f74e0f7656a1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 0687a760974a..8fef24fcac68 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -118,8 +118,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e54cce2b9175..cedc99184dad 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -110,9 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 2f94ca4a4475..38f5c3db2957 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -308,33 +308,6 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } -static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, - bool reuse_now) -{ - struct bpf_local_storage *local_storage; - bool free_local_storage = false; - HLIST_HEAD(selem_free_list); - unsigned long flags; - - if (unlikely(!selem_linked_to_storage_lockless(selem))) - /* selem has already been unlinked from sk */ - return; - - local_storage = rcu_dereference_check(selem->local_storage, - bpf_rcu_lock_held()); - - raw_spin_lock_irqsave(&local_storage->lock, flags); - if (likely(selem_linked_to_storage(selem))) - free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &selem_free_list); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - - bpf_selem_free_list(&selem_free_list, reuse_now); - - if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); -} - void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { @@ -386,19 +359,43 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, hlist_add_head_rcu(&selem->map_node, &b->list); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { - int err; + struct bpf_local_storage *local_storage; + bool free_local_storage = false; + HLIST_HEAD(selem_free_list); + unsigned long flags; + int err = 0; - /* Always unlink from map before unlinking from local_storage - * because selem will be freed after successfully unlinked from - * the local_storage. - */ - err = bpf_selem_unlink_map(selem); - if (err) - return; + if (unlikely(!selem_linked_to_storage_lockless(selem))) + /* selem has already been unlinked from sk */ + return 0; + + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); + + raw_spin_lock_irqsave(&local_storage->lock, flags); + if (likely(selem_linked_to_storage(selem))) { + /* Always unlink from map before unlinking from local_storage + * because selem will be freed after successfully unlinked from + * the local_storage. + */ + err = bpf_selem_unlink_map(selem); + if (err) + goto out; + + free_local_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, &selem_free_list); + } +out: + raw_spin_unlock_irqrestore(&local_storage->lock, flags); - bpf_selem_unlink_storage(selem, reuse_now); + bpf_selem_free_list(&selem_free_list, reuse_now); + + if (free_local_storage) + bpf_local_storage_free(local_storage, reuse_now); + + return err; } void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index a1dc1bf0848a..ab902364ac23 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -167,9 +167,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!nobusy) return -EBUSY; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 0b85d8f2c17e..d7b5c4551997 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,9 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ -- cgit v1.2.3 From 8dabe34b9d5b1135b084268396ed2267107e64c1 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:03 -0800 Subject: bpf: Change local_storage->lock and b->lock to rqspinlock Change bpf_local_storage::lock and bpf_local_storage_map_bucket::lock from raw_spin_lock to rqspinlock. Finally, propagate errors from raw_res_spin_lock_irqsave() to syscall return or BPF helper return. In bpf_local_storage_destroy(), ignore return from raw_res_spin_lock_irqsave() for now. A later patch will correctly handle errors correctly in bpf_local_storage_destroy() so that it can unlink selems even when failing to acquire locks. For __bpf_local_storage_map_cache(), instead of handling the error, skip updating the cache. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-6-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 5 +-- kernel/bpf/bpf_local_storage.c | 64 +++++++++++++++++++++++++++------------ 2 files changed, 47 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index f74e0f7656a1..fa50b7afee18 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -15,12 +15,13 @@ #include #include #include +#include #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 struct bpf_local_storage_map_bucket { struct hlist_head list; - raw_spinlock_t lock; + rqspinlock_t lock; }; /* Thp map is not the primary owner of a bpf_local_storage_elem. @@ -94,7 +95,7 @@ struct bpf_local_storage { * bpf_local_storage_elem. */ struct rcu_head rcu; - raw_spinlock_t lock; /* Protect adding/removing from the "list" */ + rqspinlock_t lock; /* Protect adding/removing from the "list" */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 38f5c3db2957..1138e2293b50 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -321,14 +321,18 @@ static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; + int err; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, flags); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } @@ -344,11 +348,16 @@ int bpf_selem_link_map(struct bpf_local_storage_map *smap, { struct bpf_local_storage_map_bucket *b; unsigned long flags; + int err; b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, flags); + + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } @@ -365,7 +374,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; - int err = 0; + int err; if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ @@ -374,7 +383,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return err; + if (likely(selem_linked_to_storage(selem))) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from @@ -388,7 +400,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) local_storage, selem, &selem_free_list); } out: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, reuse_now); @@ -403,16 +415,20 @@ void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { unsigned long flags; + int err; /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). */ - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return; + if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, @@ -457,14 +473,17 @@ int bpf_local_storage_alloc(void *owner, RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); - raw_spin_lock_init(&storage->lock); + raw_res_spin_lock_init(&storage->lock); storage->owner = owner; storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); b = select_bucket(smap, storage); - raw_spin_lock_irqsave(&b->lock, flags); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + goto uncharge; + bpf_selem_link_map_nolock(b, first_selem); owner_storage_ptr = @@ -482,11 +501,11 @@ int bpf_local_storage_alloc(void *owner, prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { bpf_selem_unlink_map_nolock(first_selem); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; @@ -569,7 +588,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (!alloc_selem) return ERR_PTR(-ENOMEM); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + goto free_selem; /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -596,7 +617,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, b_flags); + err = raw_res_spin_lock_irqsave(&b->lock, b_flags); + if (err) + goto unlock; alloc_selem = NULL; /* First, link the new selem to the map */ @@ -612,9 +635,10 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, &old_selem_free_list); } - raw_spin_unlock_irqrestore(&b->lock, b_flags); + raw_res_spin_unlock_irqrestore(&b->lock, b_flags); unlock: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); +free_selem: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); @@ -699,7 +723,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_irqsave(&local_storage->lock, flags); + raw_res_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. @@ -714,7 +738,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) free_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, &free_selem_list); } - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&free_selem_list, true); @@ -761,7 +785,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); + raw_res_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, -- cgit v1.2.3 From 3417dffb58331d1e7e4f3e30ec95cc0f8114ff10 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:06 -0800 Subject: bpf: Remove unused percpu counter from bpf_local_storage_map_free Percpu locks have been removed from cgroup and task local storage. Now that all local storage no longer use percpu variables as locks preventing recursion, there is no need to pass them to bpf_local_storage_map_free(). Remove the argument from the function. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-9-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 3 +-- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 7 +------ kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 2 +- 6 files changed, 6 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index fa50b7afee18..fba3354988d3 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -166,8 +166,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter); + struct bpf_local_storage_cache *cache); int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 4d84611d8222..853183eead2c 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -119,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, NULL); + bpf_local_storage_map_free(map, &cgroup_cache); } /* *gfp_flags* is a hidden argument provided by the verifier */ diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index cedc99184dad..470f4b02c79e 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -184,7 +184,7 @@ static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) static void inode_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &inode_cache, NULL); + bpf_local_storage_map_free(map, &inode_cache); } const struct bpf_map_ops inode_storage_map_ops = { diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 1138e2293b50..76e812a40380 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -807,8 +807,7 @@ free_smap: } void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter) + struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; @@ -841,11 +840,7 @@ void bpf_local_storage_map_free(struct bpf_map *map, while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { - if (busy_counter) - this_cpu_inc(*busy_counter); bpf_selem_unlink(selem, true); - if (busy_counter) - this_cpu_dec(*busy_counter); cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index dd858226ada2..4d53aebe6784 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -217,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) static void task_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &task_cache, NULL); + bpf_local_storage_map_free(map, &task_cache); } BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d7b5c4551997..d2164165a994 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -60,7 +60,7 @@ out: static void bpf_sk_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &sk_cache, NULL); + bpf_local_storage_map_free(map, &sk_cache); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) -- cgit v1.2.3 From c8be3da14718f1e732afcb61e8ee5b78e8d93808 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:07 -0800 Subject: bpf: Prepare for bpf_selem_unlink_nofail() The next patch will introduce bpf_selem_unlink_nofail() to handle rqspinlock errors. bpf_selem_unlink_nofail() will allow an selem to be partially unlinked from map or local storage. Save memory allocation method in selem so that later an selem can be correctly freed even when SDATA(selem)->smap is init to NULL. In addition, keep track of memory charge to the owner in local storage so that later bpf_selem_unlink_nofail() can return the correct memory charge to the owner. Updating local_storage->mem_charge is protected by local_storage->lock. Finally, extract miscellaneous tasks performed when unlinking an selem from local_storage into bpf_selem_unlink_storage_nolock_misc(). It will be reused by bpf_selem_unlink_nofail(). This patch also takes the chance to remove local_storage->smap, which is no longer used since commit f484f4a3e058 ("bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage"). Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-10-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 5 +-- kernel/bpf/bpf_local_storage.c | 69 +++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index fba3354988d3..a34ed7fa81d8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -80,7 +80,8 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; - /* 8 bytes hole */ + bool use_kmalloc_nolock; + /* 7 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. */ @@ -89,13 +90,13 @@ struct bpf_local_storage_elem { struct bpf_local_storage { struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; - struct bpf_local_storage_map __rcu *smap; struct hlist_head list; /* List of bpf_local_storage_elem */ void *owner; /* The object that owns the above "list" of * bpf_local_storage_elem. */ struct rcu_head rcu; rqspinlock_t lock; /* Protect adding/removing from the "list" */ + u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 76e812a40380..0e9ae41a9759 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); + selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; if (value) { /* No need to call check_and_init_map_value as memory is zero init */ @@ -214,7 +215,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - if (!smap->use_kmalloc_nolock) { + if (!selem->use_kmalloc_nolock) { /* * No uptr will be unpin even when reuse_now == false since uptr * is only supported in task local storage, where @@ -251,6 +252,30 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) bpf_selem_free(selem, reuse_now); } +static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + bool free_local_storage) +{ + void *owner = local_storage->owner; + u32 uncharge = smap->elem_size; + + if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + + uncharge += free_local_storage ? sizeof(*local_storage) : 0; + mem_uncharge(smap, local_storage->owner, uncharge); + local_storage->mem_charge -= uncharge; + + if (free_local_storage) { + local_storage->owner = NULL; + + /* After this RCU_INIT, owner may be freed and cannot be used */ + RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + } +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. @@ -261,56 +286,30 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor { struct bpf_local_storage_map *smap; bool free_local_storage; - void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - owner = local_storage->owner; - - /* All uncharging on the owner must be done first. - * The owner may be freed once the last selem is unlinked - * from local_storage. - */ - mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); - if (free_local_storage) { - mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); - local_storage->owner = NULL; - /* After this RCU_INIT, owner may be freed and cannot be used */ - RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, + free_local_storage); - /* local_storage is not freed now. local_storage->lock is - * still held and raw_spin_unlock_bh(&local_storage->lock) - * will be done by the caller. - * - * Although the unlock will be done under - * rcu_read_lock(), it is more intuitive to - * read if the freeing of the storage is done - * after the raw_spin_unlock_bh(&local_storage->lock). - * - * Hence, a "bool free_local_storage" is returned - * to the caller which then calls then frees the storage after - * all the RCU grace periods have expired. - */ - } hlist_del_init_rcu(&selem->snode); - if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == - SDATA(selem)) - RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); hlist_add_head(&selem->free_node, free_selem_list); - if (rcu_access_pointer(local_storage->smap) == smap) - RCU_INIT_POINTER(local_storage->smap, NULL); - return free_local_storage; } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { + struct bpf_local_storage_map *smap; + + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + local_storage->mem_charge += smap->elem_size; + RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head_rcu(&selem->snode, &local_storage->list); } @@ -471,10 +470,10 @@ int bpf_local_storage_alloc(void *owner, goto uncharge; } - RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); raw_res_spin_lock_init(&storage->lock); storage->owner = owner; + storage->mem_charge = sizeof(*storage); storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); -- cgit v1.2.3 From 5d800f87d0a5ea1b156c47a4b9fd128479335153 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:08 -0800 Subject: bpf: Support lockless unlink when freeing map or local storage Introduce bpf_selem_unlink_nofail() to properly handle errors returned from rqspinlock in bpf_local_storage_map_free() and bpf_local_storage_destroy() where the operation must succeeds. The idea of bpf_selem_unlink_nofail() is to allow an selem to be partially linked and use atomic operation on a bit field, selem->state, to determine when and who can free the selem if any unlink under lock fails. An selem initially is fully linked to a map and a local storage. Under normal circumstances, bpf_selem_unlink_nofail() will be able to grab locks and unlink a selem from map and local storage in sequeunce, just like bpf_selem_unlink(), and then free it after an RCU grace period. However, if any of the lock attempts fails, it will only clear SDATA(selem)->smap or selem->local_storage depending on the caller and set SELEM_MAP_UNLINKED or SELEM_STORAGE_UNLINKED according to the caller. Then, after both map_free() and destroy() see the selem and the state becomes SELEM_UNLINKED, one of two racing caller can succeed in cmpxchg the state from SELEM_UNLINKED to SELEM_TOFREE, ensuring no double free or memory leak. To make sure bpf_obj_free_fields() is done only once and when map is still present, it is called when unlinking an selem from b->list under b->lock. To make sure uncharging memory is done only when the owner is still present in map_free(), block destroy() from returning until there is no pending map_free(). Since smap may not be valid in destroy(), bpf_selem_unlink_nofail() skips bpf_selem_unlink_storage_nolock_misc() when called from destroy(). This is okay as bpf_local_storage_destroy() will return the remaining amount of memory charge tracked by mem_charge to the owner to uncharge. It is also safe to skip clearing local_storage->owner and owner_storage as the owner is being freed and no users or bpf programs should be able to reference the owner and using local_storage. Finally, access of selem, SDATA(selem)->smap and selem->local_storage are racy. Callers will protect these fields with RCU. Acked-by: Alexei Starovoitov Co-developed-by: Martin KaFai Lau Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-11-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 9 ++- kernel/bpf/bpf_local_storage.c | 116 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 118 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a34ed7fa81d8..69a5d8aa765d 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -68,6 +68,11 @@ struct bpf_local_storage_data { u8 data[] __aligned(8); }; +#define SELEM_MAP_UNLINKED (1 << 0) +#define SELEM_STORAGE_UNLINKED (1 << 1) +#define SELEM_UNLINKED (SELEM_MAP_UNLINKED | SELEM_STORAGE_UNLINKED) +#define SELEM_TOFREE (1 << 2) + /* Linked to bpf_local_storage and bpf_local_storage_map */ struct bpf_local_storage_elem { struct hlist_node map_node; /* Linked to bpf_local_storage_map */ @@ -80,8 +85,9 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; + atomic_t state; bool use_kmalloc_nolock; - /* 7 bytes hole */ + /* 3 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. */ @@ -97,6 +103,7 @@ struct bpf_local_storage { struct rcu_head rcu; rqspinlock_t lock; /* Protect adding/removing from the "list" */ u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ + refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 0e9ae41a9759..294605c78271 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); + atomic_set(&selem->state, 0); selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; if (value) { @@ -194,9 +195,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); - migrate_disable(); - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - migrate_enable(); + if (smap) { + migrate_disable(); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + migrate_enable(); + } kfree_nolock(selem); } @@ -221,7 +224,8 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, * is only supported in task local storage, where * smap->use_kmalloc_nolock == true. */ - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + if (smap) + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } @@ -255,7 +259,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, - bool free_local_storage) + bool free_local_storage, bool pin_owner) { void *owner = local_storage->owner; u32 uncharge = smap->elem_size; @@ -264,6 +268,9 @@ static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem * SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt)) + return; + uncharge += free_local_storage ? sizeof(*local_storage) : 0; mem_uncharge(smap, local_storage->owner, uncharge); local_storage->mem_charge -= uncharge; @@ -274,6 +281,9 @@ static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem * /* After this RCU_INIT, owner may be freed and cannot be used */ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); } + + if (pin_owner) + refcount_dec(&local_storage->owner_refcnt); } /* local_storage->lock must be held and selem->local_storage == local_storage. @@ -293,7 +303,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor &local_storage->list); bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, - free_local_storage); + free_local_storage, false); hlist_del_init_rcu(&selem->snode); @@ -409,6 +419,94 @@ out: return err; } +/* + * Unlink an selem from map and local storage with lockless fallback if callers + * are racing or rqspinlock returns error. It should only be called by + * bpf_local_storage_destroy() or bpf_local_storage_map_free(). + */ +static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map_bucket *b) +{ + bool in_map_free = !!b, free_storage = false; + struct bpf_local_storage *local_storage; + struct bpf_local_storage_map *smap; + unsigned long flags; + int err, unlink = 0; + + local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + + if (smap) { + b = b ? : select_bucket(smap, local_storage); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (!err) { + /* + * Call bpf_obj_free_fields() under b->lock to make sure it is done + * exactly once for an selem. Safe to free special fields immediately + * as no BPF program should be referencing the selem. + */ + if (likely(selem_linked_to_map(selem))) { + hlist_del_init_rcu(&selem->map_node); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + unlink++; + } + raw_res_spin_unlock_irqrestore(&b->lock, flags); + } + /* + * Highly unlikely scenario: resource leak + * + * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing + * and both selem belong to the same bucket, if destroy(selem2) acquired + * b->lock and block for too long, neither map_free(selem1) and + * destroy(selem1) will be able to free the special field associated + * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT. + */ + WARN_ON_ONCE(err && in_map_free); + if (!err || in_map_free) + RCU_INIT_POINTER(SDATA(selem)->smap, NULL); + } + + if (local_storage) { + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (!err) { + if (likely(selem_linked_to_storage(selem))) { + free_storage = hlist_is_singular_node(&selem->snode, + &local_storage->list); + /* + * Okay to skip clearing owner_storage and storage->owner in + * destroy() since the owner is going away. No user or bpf + * programs should be able to reference it. + */ + if (smap && in_map_free) + bpf_selem_unlink_storage_nolock_misc( + selem, smap, local_storage, + free_storage, true); + hlist_del_init_rcu(&selem->snode); + unlink++; + } + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); + } + if (!err || !in_map_free) + RCU_INIT_POINTER(selem->local_storage, NULL); + } + + if (unlink != 2) + atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state); + + /* + * Normally, an selem can be unlinked under local_storage->lock and b->lock, and + * then freed after an RCU grace period. However, if destroy() and map_free() are + * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free + * the selem only after both map_free() and destroy() see the selem. + */ + if (unlink == 2 || + atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED) + bpf_selem_free(selem, true); + + if (free_storage) + bpf_local_storage_free(local_storage, true); +} + void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) @@ -475,6 +573,7 @@ int bpf_local_storage_alloc(void *owner, storage->owner = owner; storage->mem_charge = sizeof(*storage); storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; + refcount_set(&storage->owner_refcnt, 1); bpf_selem_link_storage_nolock(storage, first_selem); @@ -743,6 +842,11 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) if (free_storage) bpf_local_storage_free(local_storage, true); + + if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { + while (refcount_read(&local_storage->owner_refcnt)) + cpu_relax(); + } } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) -- cgit v1.2.3 From 0be08389c7f2f9db0194ee666d2e4870886e6ffb Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:09 -0800 Subject: bpf: Switch to bpf_selem_unlink_nofail in bpf_local_storage_{map_free, destroy} Take care of rqspinlock error in bpf_local_storage_{map_free, destroy}() properly by switching to bpf_selem_unlink_nofail(). Both functions iterate their own RCU-protected list of selems and call bpf_selem_unlink_nofail(). In map_free(), to prevent infinite loop when both map_free() and destroy() fail to remove a selem from b->list (extremely unlikely), switch to hlist_for_each_entry_rcu(). In destroy(), also switch to hlist_for_each_entry_rcu() since we no longer iterate local_storage->list under local_storage->lock. bpf_selem_unlink() now becomes dedicated to helpers and syscalls paths so reuse_now should always be false. Remove it from the argument and hardcode it. Acked-by: Alexei Starovoitov Co-developed-by: Martin KaFai Lau Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-12-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 4 +-- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 63 ++++++++++++++++++--------------------- kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 7 +++-- 6 files changed, 39 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 69a5d8aa765d..85efa9772530 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -171,7 +171,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, return SDATA(selem); } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); @@ -184,7 +184,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 853183eead2c..c2a2ead1f466 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -89,7 +89,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 470f4b02c79e..e86734609f3d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -110,7 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 294605c78271..b28f07d3a0db 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -377,7 +377,11 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, hlist_add_head_rcu(&selem->map_node, &b->list); } -int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) +/* + * Unlink an selem from map and local storage with lock held. + * This is the common path used by local storages to delete an selem. + */ +int bpf_selem_unlink(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -411,10 +415,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) out: raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); - bpf_selem_free_list(&selem_free_list, reuse_now); + bpf_selem_free_list(&selem_free_list, false); if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); + bpf_local_storage_free(local_storage, false); return err; } @@ -804,13 +808,13 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) +/* + * Destroy local storage when the owner is going away. Caller must uncharge memory + * if memory charging is used. + */ +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_elem *selem; - bool free_storage = false; - HLIST_HEAD(free_selem_list); - struct hlist_node *n; - unsigned long flags; /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. @@ -821,32 +825,20 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_res_spin_lock_irqsave(&local_storage->lock, flags); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - /* If local_storage list has only one element, the - * bpf_selem_unlink_storage_nolock() will return true. - * Otherwise, it will return false. The current loop iteration - * intends to remove all local storage. So the last iteration - * of the loop will set the free_cgroup_storage to true. - */ - free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &free_selem_list); - } - raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); - - bpf_selem_free_list(&free_selem_list, true); - - if (free_storage) - bpf_local_storage_free(local_storage, true); + hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + bpf_selem_unlink_nofail(selem, NULL); if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { while (refcount_read(&local_storage->owner_refcnt)) cpu_relax(); + /* + * Paired with refcount_dec() in bpf_selem_unlink_nofail() + * to make sure destroy() sees the correct local_storage->mem_charge. + */ + smp_mb(); } + + return local_storage->mem_charge; } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -940,11 +932,14 @@ void bpf_local_storage_map_free(struct bpf_map *map, rcu_read_lock(); /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_local_storage_elem, map_node))) { - bpf_selem_unlink(selem, true); - cond_resched_rcu(); +restart: + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + bpf_selem_unlink_nofail(selem, b); + + if (need_resched()) { + cond_resched_rcu(); + goto restart; + } } rcu_read_unlock(); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 4d53aebe6784..605506792b5b 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -134,7 +134,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d2164165a994..1eb3e060994e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,20 +40,23 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; + u32 uncharge; rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) goto out; - bpf_local_storage_destroy(sk_storage); + uncharge = bpf_local_storage_destroy(sk_storage); + if (uncharge) + atomic_sub(uncharge, &sk->sk_omem_alloc); out: rcu_read_unlock_migrate(); } -- cgit v1.2.3