From 73d2c61919e9aeffad1b826ec23b1a4a07c1e0dd Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:12 +0000 Subject: bpf, net: sock_map memory usage sockmap and sockhash have nothing in common in how they allocate memory, so introduce separate helpers to calculate their memory usage. The result is as follows, - before 28: sockmap name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524288B 29: sockhash name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524288B - after 28: sockmap name count_map flags 0x0 key 4B value 4B max_entries 65536 memlock 524608B 29: sockhash name count_map flags 0x0 <<<< no updated elements key 4B value 4B max_entries 65536 memlock 1048896B Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-16-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- net/core/sock_map.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net/core/sock_map.c') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index a68a7290a3b2..9b854e236d23 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -797,6 +797,14 @@ static void sock_map_fini_seq_private(void *priv_data) bpf_map_put_with_uref(info->map); } +static u64 sock_map_mem_usage(const struct bpf_map *map) +{ + u64 usage = sizeof(struct bpf_stab); + + usage += (u64)map->max_entries * sizeof(struct sock *); + return usage; +} + static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, @@ -816,6 +824,7 @@ const struct bpf_map_ops sock_map_ops = { .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, + .map_mem_usage = sock_map_mem_usage, .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; @@ -1397,6 +1406,16 @@ static void sock_hash_fini_seq_private(void *priv_data) bpf_map_put_with_uref(info->map); } +static u64 sock_hash_mem_usage(const struct bpf_map *map) +{ + struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); + u64 usage = sizeof(*htab); + + usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); + usage += atomic_read(&htab->count) * (u64)htab->elem_size; + return usage; +} + static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, @@ -1416,6 +1435,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, + .map_mem_usage = sock_hash_mem_usage, .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; -- cgit v1.2.3 From d7ba4cc900bf1eea2d8c807c6b1fc6bd61f41237 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 22 Mar 2023 12:47:54 -0700 Subject: bpf: return long from bpf_map_ops funcs This patch changes the return types of bpf_map_ops functions to long, where previously int was returned. Using long allows bpf programs to maintain the sign bit in the absence of sign extension in situations where inlined bpf helper funcs call the bpf_map_ops funcs and a negative error is returned. The definitions of the helper funcs are generated from comments in the bpf uapi header at `include/uapi/linux/bpf.h`. The return type of these helpers was previously changed from int to long in commit bdb7b79b4ce8.
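To make the sign-extension hazard concrete before the assembly walkthrough below, here is a minimal user-space C sketch (illustrative only, not kernel code; EEXIST is defined locally to mirror the kernel value): a callee returns -EEXIST as a 32-bit int, and if the caller consumes the 64-bit register without sign extension, only the low 32 bits survive and a 64-bit comparison against -EEXIST fails.

  #include <stdio.h>

  #define EEXIST 17

  int main(void)
  {
          int ret32 = -EEXIST;                    /* 0xffffffef as 32 bits */
          long wrong = (long)(unsigned int)ret32; /* zero-extended: 0x00000000ffffffef */
          long right = (long)ret32;               /* sign-extended: 0xffffffffffffffef */

          printf("zero-extended == -EEXIST? %d\n", wrong == -EEXIST); /* prints 0 */
          printf("sign-extended == -EEXIST? %d\n", right == -EEXIST); /* prints 1 */
          return 0;
  }

The failing case is exactly the cmp mismatch shown in the excerpt that follows.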
For any case where one of the map helpers calls a bpf_map_ops func that still returns a 32-bit int, the compiler might not include sign extension instructions to properly convert the 32-bit negative value to a 64-bit negative value. For example: bpf assembly excerpt of an inlined helper calling a kernel function and checking for a specific error: ; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST); ... 46: call 0xffffffffe103291c ; htab_map_update_elem ; if (err && err != -EEXIST) { 4b: cmp $0xffffffffffffffef,%rax ; cmp -EEXIST,%rax kernel function assembly excerpt of return value from `htab_map_update_elem` returning 32-bit int: movl $0xffffffef, %r9d ... movl %r9d, %eax ...results in the comparison: cmp $0xffffffffffffffef, $0x00000000ffffffef Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long") Tested-by: Eduard Zingerman Signed-off-by: JP Kobryn Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 +++++++------- include/linux/filter.h | 6 +++--- kernel/bpf/arraymap.c | 12 ++++++------ kernel/bpf/bloom_filter.c | 12 ++++++------ kernel/bpf/bpf_cgrp_storage.c | 6 +++--- kernel/bpf/bpf_inode_storage.c | 6 +++--- kernel/bpf/bpf_struct_ops.c | 6 +++--- kernel/bpf/bpf_task_storage.c | 6 +++--- kernel/bpf/cpumap.c | 8 ++++---- kernel/bpf/devmap.c | 24 ++++++++++++------------ kernel/bpf/hashtab.c | 36 ++++++++++++++++++------------------ kernel/bpf/local_storage.c | 6 +++--- kernel/bpf/lpm_trie.c | 6 +++--- kernel/bpf/queue_stack_maps.c | 22 +++++++++++----------- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/ringbuf.c | 6 +++--- kernel/bpf/stackmap.c | 6 +++--- kernel/bpf/verifier.c | 14 +++++++------- net/core/bpf_sk_storage.c | 6 +++--- net/core/sock_map.c | 8 ++++---- net/xdp/xskmap.c | 8 ++++---- 21 files changed, 110 insertions(+), 110 deletions(-) (limited to 'net/core/sock_map.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3ef98fb92987..ec0df059f562 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -96,11 +96,11 @@ struct bpf_map_ops { /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); - int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); - int (*map_delete_elem)(struct bpf_map *map, void *key); - int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); - int (*map_pop_elem)(struct bpf_map *map, void *value); - int (*map_peek_elem)(struct bpf_map *map, void *value); + long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); + long (*map_delete_elem)(struct bpf_map *map, void *key); + long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); + long (*map_pop_elem)(struct bpf_map *map, void *value); + long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); /* funcs called by prog_array and perf_event_array map */ @@ -139,7 +139,7 @@ struct bpf_map_ops { struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ - int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); + long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map.
It is a runtime check to ensure @@ -157,7 +157,7 @@ struct bpf_map_ops { int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); - int (*map_for_each_callback)(struct bpf_map *map, + long (*map_for_each_callback)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags); diff --git a/include/linux/filter.h b/include/linux/filter.h index efa5d4a1677e..23c08c31bea9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1504,9 +1504,9 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, - u64 flags, const u64 flag_mask, - void *lookup_elem(struct bpf_map *map, u32 key)) +static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, + u64 flags, const u64 flag_mask, + void *lookup_elem(struct bpf_map *map, u32 key)) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1588c793a715..2058e89b5ddd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -307,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key } /* Called from syscall or from eBPF program */ -static int array_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long array_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; @@ -386,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, } /* Called from syscall or from eBPF program */ -static int array_map_delete_elem(struct bpf_map *map, void *key) +static long array_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } @@ -686,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; -static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, - void *callback_ctx, u64 flags) +static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) { u32 i, key, num_elems = 0; struct bpf_array *array; @@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, return 0; } -static int fd_array_map_delete_elem(struct bpf_map *map, void *key) +static long fd_array_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 6350c5d35a9b..db19784601a7 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -41,7 +41,7 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value, return h & bloom->bitset_mask; } -static int bloom_map_peek_elem(struct bpf_map *map, void *value) +static long bloom_map_peek_elem(struct bpf_map *map, void *value) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); @@ -56,7 +56,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value) return 0; } -static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) +static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) { struct 
bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); @@ -73,12 +73,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) return 0; } -static int bloom_map_pop_elem(struct bpf_map *map, void *value) +static long bloom_map_pop_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } -static int bloom_map_delete_elem(struct bpf_map *map, void *value) +static long bloom_map_delete_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } @@ -177,8 +177,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key) return ERR_PTR(-EINVAL); } -static int bloom_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long bloom_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { /* The eBPF program should use map_push_elem instead */ return -EINVAL; diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index c975cacdd16b..f5b016a5484d 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -93,8 +93,8 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) return sdata ? sdata->data : NULL; } -static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; @@ -125,7 +125,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) return 0; } -static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) { struct cgroup *cgroup; int err, fd; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index ad2ab0187e45..9a5f05151898 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -91,8 +91,8 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) return sdata ? 
sdata->data : NULL; } -static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct file *f; @@ -127,7 +127,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) return 0; } -static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) { struct file *f; int fd, err; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 38903fb52f98..ba7a94276e3b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -349,8 +349,8 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, model, flags, tlinks, NULL); } -static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; const struct bpf_struct_ops *st_ops = st_map->st_ops; @@ -524,7 +524,7 @@ unlock: return err; } -static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) +static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) { enum bpf_struct_ops_state prev_state; struct bpf_struct_ops_map *st_map; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index c88cc04c17c1..ab5bd1ef58c4 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -120,8 +120,8 @@ out: return ERR_PTR(err); } -static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct task_struct *task; @@ -173,7 +173,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, return 0; } -static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) { struct task_struct *task; unsigned int f_flags; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 871809e71b4e..8ec18faa74ac 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -540,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, } } -static int cpu_map_delete_elem(struct bpf_map *map, void *key) +static long cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; @@ -553,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpumap_val cpumap_value = {}; @@ -667,7 +667,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) +static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) { return __bpf_xdp_redirect_map(map, index, flags, 0, __cpu_map_lookup_elem); diff --git a/kernel/bpf/devmap.c 
b/kernel/bpf/devmap.c index 19b036a228f7..802692fa3905 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -809,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu) kfree(dev); } -static int dev_map_delete_elem(struct bpf_map *map, void *key) +static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; @@ -826,7 +826,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; @@ -897,8 +897,8 @@ err_out: return ERR_PTR(-EINVAL); } -static int __dev_map_update_elem(struct net *net, struct bpf_map *map, - void *key, void *value, u64 map_flags) +static long __dev_map_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; @@ -939,15 +939,15 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, return 0; } -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { return __dev_map_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } -static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, - void *key, void *value, u64 map_flags) +static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; @@ -999,21 +999,21 @@ out_err: return err; } -static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { return __dev_map_hash_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } -static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) +static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } -static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) +static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0df4b0c10f59..96b645bba3a4 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1073,8 +1073,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, } /* Called from syscall or from eBPF program */ -static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -1175,8 +1175,8 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) bpf_lru_push_free(&htab->lru, &elem->lru_node); } -static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void 
*value, - u64 map_flags) +static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old = NULL; @@ -1242,9 +1242,9 @@ err: return ret; } -static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags, - bool onallcpus) +static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -1297,9 +1297,9 @@ err: return ret; } -static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags, - bool onallcpus) +static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -1364,21 +1364,21 @@ err: return ret; } -static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } -static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, false); } /* Called from syscall or from eBPF program */ -static int htab_map_delete_elem(struct bpf_map *map, void *key) +static long htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; @@ -1414,7 +1414,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } -static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; @@ -2134,8 +2134,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), }; -static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, - void *callback_ctx, u64 flags) +static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index a993560f200a..4c7bbec4a9e4 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -141,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) return &READ_ONCE(storage->buf)->data[0]; } -static int cgroup_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long cgroup_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; @@ -348,7 +348,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map) bpf_map_area_free(map); } -static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +static long cgroup_storage_delete_elem(struct bpf_map *map, void 
*key) { return -EINVAL; } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index dc23f2ac9cde..e0d3ddf2037a 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -300,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, } /* Called from syscall or from eBPF program */ -static int trie_update_elem(struct bpf_map *map, - void *_key, void *value, u64 flags) +static long trie_update_elem(struct bpf_map *map, + void *_key, void *value, u64 flags) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; @@ -431,7 +431,7 @@ out: } /* Called from syscall or from eBPF program */ -static int trie_delete_elem(struct bpf_map *map, void *_key) +static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 63ecbbcb349d..601609164ef3 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -95,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map) bpf_map_area_free(qs); } -static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +static long __queue_map_get(struct bpf_map *map, void *value, bool delete) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long flags; @@ -124,7 +124,7 @@ out: } -static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +static long __stack_map_get(struct bpf_map *map, void *value, bool delete) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long flags; @@ -156,32 +156,32 @@ out: } /* Called from syscall or from eBPF program */ -static int queue_map_peek_elem(struct bpf_map *map, void *value) +static long queue_map_peek_elem(struct bpf_map *map, void *value) { return __queue_map_get(map, value, false); } /* Called from syscall or from eBPF program */ -static int stack_map_peek_elem(struct bpf_map *map, void *value) +static long stack_map_peek_elem(struct bpf_map *map, void *value) { return __stack_map_get(map, value, false); } /* Called from syscall or from eBPF program */ -static int queue_map_pop_elem(struct bpf_map *map, void *value) +static long queue_map_pop_elem(struct bpf_map *map, void *value) { return __queue_map_get(map, value, true); } /* Called from syscall or from eBPF program */ -static int stack_map_pop_elem(struct bpf_map *map, void *value) +static long stack_map_pop_elem(struct bpf_map *map, void *value) { return __stack_map_get(map, value, true); } /* Called from syscall or from eBPF program */ -static int queue_stack_map_push_elem(struct bpf_map *map, void *value, - u64 flags) +static long queue_stack_map_push_elem(struct bpf_map *map, void *value, + u64 flags) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long irq_flags; @@ -227,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall or from eBPF program */ -static int queue_stack_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long queue_stack_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ -static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +static long queue_stack_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } diff --git a/kernel/bpf/reuseport_array.c 
b/kernel/bpf/reuseport_array.c index 71cb72f5b733..cbf2d8d784b8 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -59,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall only */ -static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +static long reuseport_array_delete_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 0d2a45ff83f1..875ac9b698d9 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -242,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) return ERR_PTR(-ENOTSUPP); } -static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 flags) +static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) { return -ENOTSUPP; } -static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +static long ringbuf_map_delete_elem(struct bpf_map *map, void *key) { return -ENOTSUPP; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 0f1d8dced933..b25fce425b2c 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -618,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key, return 0; } -static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ -static int stack_map_delete_elem(struct bpf_map *map, void *key) +static long stack_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *old_bucket; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5693e4a92752..50c995697f0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17692,21 +17692,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); BUILD_BUG_ON(!__same_type(ops->map_delete_elem, - (int (*)(struct bpf_map *map, void *key))NULL)); + (long (*)(struct bpf_map *map, void *key))NULL)); BUILD_BUG_ON(!__same_type(ops->map_update_elem, - (int (*)(struct bpf_map *map, void *key, void *value, + (long (*)(struct bpf_map *map, void *key, void *value, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_push_elem, - (int (*)(struct bpf_map *map, void *value, + (long (*)(struct bpf_map *map, void *value, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_pop_elem, - (int (*)(struct bpf_map *map, void *value))NULL)); + (long (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, - (int (*)(struct bpf_map *map, void *value))NULL)); + (long (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_redirect, - (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); + (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, - (int (*)(struct bpf_map *map, + (long (*)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 24c3dc0d62e5..cb0f5a105b89 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ 
-94,8 +94,8 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) return ERR_PTR(err); } -static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct socket *sock; @@ -114,7 +114,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, return err; } -static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) { struct socket *sock; int fd, err; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 9b854e236d23..7c189c2e2fbf 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -437,7 +437,7 @@ static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, __sock_map_delete(stab, sk, link_raw); } -static int sock_map_delete_elem(struct bpf_map *map, void *key) +static long sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = *(u32 *)key; @@ -587,8 +587,8 @@ out: return ret; } -static int sock_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long sock_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { struct sock *sk = (struct sock *)value; int ret; @@ -925,7 +925,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, raw_spin_unlock_bh(&bucket->lock); } -static int sock_hash_delete_elem(struct bpf_map *map, void *key) +static long sock_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 0c38d7175922..2c1427074a3b 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -162,8 +162,8 @@ static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) return ERR_PTR(-EOPNOTSUPP); } -static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); struct xdp_sock __rcu **map_entry; @@ -223,7 +223,7 @@ out: return err; } -static int xsk_map_delete_elem(struct bpf_map *map, void *key) +static long xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); struct xdp_sock __rcu **map_entry; @@ -243,7 +243,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) +static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) { return __bpf_xdp_redirect_map(map, index, flags, 0, __xsk_map_lookup_elem); -- cgit v1.2.3 From ed17aa92dc56b6d8883e4b7a8f1c6fbf5ed6cd29 Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Thu, 6 Apr 2023 20:26:22 +0800 Subject: bpf, sockmap: fix deadlocks in the sockhash and sockmap When Hsin-Wei Hung uses the sched_switch tracepoint, the attached eBPF program does only one thing: it deletes fixed elements from a sockhash ([0]). It seems that elements in a sockhash are rarely actively deleted by users or eBPF programs, so their deletion path has not received much attention. Compared with hash maps, sockhash only provides spin_lock_bh protection.
This causes it to appear to have self-locking behavior in the interrupt context. [0]:https://lore.kernel.org/all/CABcoxUayum5oOqFMMqAeWuS8+EzojquSOSyDA3J_2omY=2EeAg@mail.gmail.com/ Reported-by: Hsin-Wei Hung Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Xin Liu Acked-by: John Fastabend Link: https://lore.kernel.org/r/20230406122622.109978-1-liuxin350@huawei.com Signed-off-by: Alexei Starovoitov --- net/core/sock_map.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core/sock_map.c') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 7c189c2e2fbf..66305b7bf8b7 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -414,8 +414,9 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, { struct sock *sk; int err = 0; + unsigned long flags; - raw_spin_lock_bh(&stab->lock); + raw_spin_lock_irqsave(&stab->lock, flags); sk = *psk; if (!sk_test || sk_test == sk) sk = xchg(psk, NULL); @@ -425,7 +426,7 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, else err = -EINVAL; - raw_spin_unlock_bh(&stab->lock); + raw_spin_unlock_irqrestore(&stab->lock, flags); return err; } @@ -932,11 +933,12 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; int ret = -ENOENT; + unsigned long flags; hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); - raw_spin_lock_bh(&bucket->lock); + raw_spin_lock_irqsave(&bucket->lock, flags); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); @@ -944,7 +946,7 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) sock_hash_free_elem(htab, elem); ret = 0; } - raw_spin_unlock_bh(&bucket->lock); + raw_spin_unlock_irqrestore(&bucket->lock, flags); return ret; } -- cgit v1.2.3 From 8c5c2a4898e3d6bad86e29d471e023c8a19ba799 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Apr 2023 20:28:42 +0200 Subject: bpf, sockmap: Revert buggy deadlock fix in the sockhash and sockmap syzbot reported a splat and bisected it to recent commit ed17aa92dc56 ("bpf, sockmap: fix deadlocks in the sockhash and sockmap"): [...] WARNING: CPU: 1 PID: 9280 at kernel/softirq.c:376 __local_bh_enable_ip+0xbe/0x130 kernel/softirq.c:376 Modules linked in: CPU: 1 PID: 9280 Comm: syz-executor.1 Not tainted 6.2.0-syzkaller-13249-gd319f344561d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/30/2023 RIP: 0010:__local_bh_enable_ip+0xbe/0x130 kernel/softirq.c:376 [...] Call Trace: spin_unlock_bh include/linux/spinlock.h:395 [inline] sock_map_del_link+0x2ea/0x510 net/core/sock_map.c:165 sock_map_unref+0xb0/0x1d0 net/core/sock_map.c:184 sock_hash_delete_elem+0x1ec/0x2a0 net/core/sock_map.c:945 map_delete_elem kernel/bpf/syscall.c:1536 [inline] __sys_bpf+0x2edc/0x53e0 kernel/bpf/syscall.c:5053 __do_sys_bpf kernel/bpf/syscall.c:5166 [inline] __se_sys_bpf kernel/bpf/syscall.c:5164 [inline] __x64_sys_bpf+0x79/0xc0 kernel/bpf/syscall.c:5164 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fe8f7c8c169 [...] Revert for now until we have a proper solution. 
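To unpack the splat: the reverted patch disables hardirqs with raw_spin_lock_irqsave() in sock_hash_delete_elem(), and the nested sock_map_del_link() then drops its own lock with spin_unlock_bh(); enabling BHs while hardirqs are disabled is what fires the WARN_ON_ONCE() in __local_bh_enable_ip(). A self-contained sketch of the problematic nesting (hypothetical lock names, not the sockmap source):

  #include <linux/spinlock.h>

  static DEFINE_RAW_SPINLOCK(outer_lock); /* stands in for the bucket lock */
  static DEFINE_SPINLOCK(inner_lock);     /* stands in for the psock link lock */

  static void buggy_nested_delete(void)
  {
          unsigned long flags;

          raw_spin_lock_irqsave(&outer_lock, flags); /* hardirqs now off */
          spin_lock_bh(&inner_lock);
          /* ... unlink the element ... */
          spin_unlock_bh(&inner_lock);               /* BH enable with IRQs off -> WARN */
          raw_spin_unlock_irqrestore(&outer_lock, flags);
  }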
Fixes: ed17aa92dc56 ("bpf, sockmap: fix deadlocks in the sockhash and sockmap") Reported-by: syzbot+49f6cef45247ff249498@syzkaller.appspotmail.com Cc: Hsin-Wei Hung Cc: Xin Liu Cc: John Fastabend Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/000000000000f1db9605f939720e@google.com/ --- net/core/sock_map.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net/core/sock_map.c') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 66305b7bf8b7..7c189c2e2fbf 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -414,9 +414,8 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, { struct sock *sk; int err = 0; - unsigned long flags; - raw_spin_lock_irqsave(&stab->lock, flags); + raw_spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) sk = xchg(psk, NULL); @@ -426,7 +425,7 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, else err = -EINVAL; - raw_spin_unlock_irqrestore(&stab->lock, flags); + raw_spin_unlock_bh(&stab->lock); return err; } @@ -933,12 +932,11 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; int ret = -ENOENT; - unsigned long flags; hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); - raw_spin_lock_irqsave(&bucket->lock, flags); + raw_spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); @@ -946,7 +944,7 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key) sock_hash_free_elem(htab, elem); ret = 0; } - raw_spin_unlock_irqrestore(&bucket->lock, flags); + raw_spin_unlock_bh(&bucket->lock); return ret; } -- cgit v1.2.3 From 29173d07f79883ac94f5570294f98af3d4287382 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:06 -0700 Subject: bpf, sockmap: Convert schedule_work into delayed_work Sk_buffs are fed into sockmap verdict programs either from a strparser (when the user might want to decide how framing of skb is done by attaching another parser program) or directly through tcp_read_sock. The tcp_read_sock is the preferred method for performance when the BPF logic is a stream parser. The flow for Cilium's common use case with a stream parser is, tcp_read_sock() sk_psock_verdict_recv ret = bpf_prog_run_pin_on_cpu() sk_psock_verdict_apply(sock, skb, ret) // if system is under memory pressure or app is slow we may // need to queue skb. Do this queuing through ingress_skb and // then kick timer to wake up handler skb_queue_tail(ingress_skb, skb) schedule_work(work); The work queue is wired up to sk_psock_backlog(). This will then walk the ingress_skb skb list that holds our sk_buffs that could not be handled, but should be OK to run at some later point. However, it's possible that the workqueue doing this work still hits an error when sending the skb. When this happens the skbuff is requeued on a temporary 'state' struct kept with the workqueue. This is necessary because it's possible to partially send an skbuff before hitting an error and we need to know how and where to restart when the workqueue runs next. Now for the trouble, we don't rekick the workqueue. This can cause a stall where the skbuff we just cached on the state variable might never be sent. This happens when it's the last packet in a flow and no further packets come along that would cause the system to kick the workqueue from that side.
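The paragraphs that follow describe the fix: turn the psock work item into a delayed work so the backlog can rekick itself with a small backoff after -EAGAIN. As a standalone sketch of that pattern (hypothetical demo names, not the sockmap code itself):

  #include <linux/module.h>
  #include <linux/workqueue.h>

  struct demo_psock {
          struct delayed_work work;
          int pending;
  };

  static struct demo_psock demo;

  static void demo_backlog(struct work_struct *work)
  {
          struct delayed_work *dwork = to_delayed_work(work);
          struct demo_psock *psock = container_of(dwork, struct demo_psock, work);

          /* Pretend the send path hit -EAGAIN: keep the cached state and
           * reschedule ourselves with a 1-jiffy backoff rather than waiting
           * for another packet to kick the workqueue. */
          if (psock->pending-- > 0)
                  schedule_delayed_work(&psock->work, 1);
  }

  static int __init demo_init(void)
  {
          demo.pending = 3;
          INIT_DELAYED_WORK(&demo.work, demo_backlog);
          schedule_delayed_work(&demo.work, 0); /* delay of 0 == run asap */
          return 0;
  }

  static void __exit demo_exit(void)
  {
          cancel_delayed_work_sync(&demo.work);
  }

  module_init(demo_init);
  module_exit(demo_exit);
  MODULE_LICENSE("GPL");

A delay of 0 keeps the hot paths unchanged in behavior, while the error path trades schedule_work() for schedule_delayed_work(..., 1).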
To fix this we could do a simple schedule_work(), but under memory pressure it makes sense to back off somewhat instead of continuing to retry repeatedly. So instead, convert schedule_work() to schedule_delayed_work() and add backoff logic that reschedules from the backlog queue on errors. It's not obvious what a good backoff is, so use '1'. In testing we observed some flakes while running the NGINX compliance test with sockmap; we attributed these failed tests to this bug and the subsequent issue. From on-list discussion: commit bec217197b41 ("skmsg: Schedule psock work if the cached skb exists on the psock") was intended to address a similar race, but had a couple of cases it missed. Most obviously, it only accounted for receiving traffic on the local socket, so when redirecting into another socket we could still get an sk_buff stuck here. Next, it missed the case where copied=0 in the recv() handler, where we wouldn't kick the scheduler. Also, it's sub-optimal to require userspace to kick the internal mechanisms of sockmap to wake it up and copy data to user. It results in an extra syscall and requires the app to actually handle the EAGAIN correctly. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: William Findlay Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com --- include/linux/skmsg.h | 2 +- net/core/skmsg.c | 21 ++++++++++++++------- net/core/sock_map.c | 3 ++- 3 files changed, 17 insertions(+), 9 deletions(-) (limited to 'net/core/sock_map.c') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 84f787416a54..904ff9a32ad6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -105,7 +105,7 @@ struct sk_psock { struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; - struct work_struct work; + struct delayed_work work; struct rcu_work rwork; }; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4a3dc8d27295..0a9ee2acac0b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -482,7 +482,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } out: if (psock->work_state.skb && copied > 0) - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); @@ -640,7 +640,8 @@ static void sk_psock_skb_state(struct sk_psock *psock, static void sk_psock_backlog(struct work_struct *work) { - struct sk_psock *psock = container_of(work, struct sk_psock, work); + struct delayed_work *dwork = to_delayed_work(work); + struct sk_psock *psock = container_of(dwork, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; struct sk_buff *skb = NULL; bool ingress; @@ -680,6 +681,12 @@ start: if (ret == -EAGAIN) { sk_psock_skb_state(psock, state, skb, len, off); + + /* Delay slightly to prioritize any + * other work that might be here. + */ + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_delayed_work(&psock->work, 1); goto end; } /* Hard errors break pipe and stop xmit.
*/ @@ -734,7 +741,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) INIT_LIST_HEAD(&psock->link); spin_lock_init(&psock->link_lock); - INIT_WORK(&psock->work, sk_psock_backlog); + INIT_DELAYED_WORK(&psock->work, sk_psock_backlog); mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); @@ -823,7 +830,7 @@ static void sk_psock_destroy(struct work_struct *work) sk_psock_done_strp(psock); - cancel_work_sync(&psock->work); + cancel_delayed_work_sync(&psock->work); mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); @@ -938,7 +945,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) } skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); + schedule_delayed_work(&psock_other->work, 0); spin_unlock_bh(&psock_other->ingress_lock); return 0; } @@ -1018,7 +1025,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); err = 0; } spin_unlock_bh(&psock->ingress_lock); @@ -1049,7 +1056,7 @@ static void sk_psock_write_space(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); write_space = psock->saved_write_space; } rcu_read_unlock(); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 7c189c2e2fbf..00afb66cd095 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1644,9 +1644,10 @@ void sock_map_close(struct sock *sk, long timeout) rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); - cancel_work_sync(&psock->work); + cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } + /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. */ -- cgit v1.2.3