author    David S. Miller <davem@davemloft.net>    2020-09-01 23:05:08 +0300
committer David S. Miller <davem@davemloft.net>    2020-09-01 23:22:59 +0300
commit    150f29f5e6ea55d8a7d368b162a4e9947a95d2f5 (patch)
tree      d028c9c9a7cdddcc79feb49e79fa011af029ff8c /include
parent    8aa639e1483bbdc0615796801829c773724f6645 (diff)
parent    ebc4ecd48ca6552b223047839f66e9a9c09aea4c (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:

====================
pull-request: bpf-next 2020-09-01

The following pull-request contains BPF updates for your *net-next* tree.

There are two small conflicts when pulling, resolve as follows:

1) Merge conflict in tools/lib/bpf/libbpf.c between 88a82120282b ("libbpf:
   Factor out common ELF operations and improve logging") in bpf-next and
   1e891e513e16 ("libbpf: Fix map index used in error message") in net-next.
   Resolve by taking the hunk in bpf-next:

   [...]
   scn = elf_sec_by_idx(obj, obj->efile.btf_maps_shndx);
   data = elf_sec_data(obj, scn);
   if (!scn || !data) {
           pr_warn("elf: failed to get %s map definitions for %s\n",
                   MAPS_ELF_SEC, obj->path);
           return -EINVAL;
   }
   [...]

2) Merge conflict in drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
   between 9647c57b11e5 ("xsk: i40e: ice: ixgbe: mlx5: Test for dma_need_sync
   earlier for better performance") in bpf-next and e20f0dbf204f ("net/mlx5e:
   RX, Add a prefetch command for small L1_CACHE_BYTES") in net-next.
   Resolve the two locations by retaining net_prefetch() and taking
   xsk_buff_dma_sync_for_cpu() from bpf-next. Should look like:

   [...]
   xdp_set_data_meta_invalid(xdp);
   xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
   net_prefetch(xdp->data);
   [...]

We've added 133 non-merge commits during the last 14 day(s) which contain
a total of 246 files changed, 13832 insertions(+), 3105 deletions(-).

The main changes are:

1) Initial support for sleepable BPF programs along with bpf_copy_from_user()
   helper for tracing to reliably access user memory, from Alexei Starovoitov.

2) Add BPF infra for writing and parsing TCP header options, from
   Martin KaFai Lau.

3) bpf_d_path() helper for returning full path for given 'struct path',
   from Jiri Olsa.

4) AF_XDP support for shared umems between devices and queues, from
   Magnus Karlsson.

5) Initial prep work for full BPF-to-BPF call support in libbpf, from
   Andrii Nakryiko.

6) Generalize bpf_sk_storage map & add local storage for inodes, from
   KP Singh.

7) Implement sockmap/hash updates from BPF context, from Lorenz Bauer.

8) BPF xor verification for scalar types & add BPF link iterator, from
   Yonghong Song.

9) Use target's prog type for BPF_PROG_TYPE_EXT prog verification, from
   Udip Pant.

10) Rework BPF tracing samples to use libbpf loader, from Daniel T. Lee.

11) Fix xdpsock sample to really cycle through all buffers, from
    Weqaar Janjua.

12) Improve type safety for tun/veth XDP frame handling, from
    Maciej Żenczykowski.

13) Various smaller cleanups and improvements all over the place.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
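As a rough illustration of item 1) above, the following is a hedged sketch of
what a sleepable BPF program using the new bpf_copy_from_user() helper could
look like. It is not part of this merge; the hook, the ".s" (sleepable)
section suffix handling in libbpf, and the argument handling are assumptions
for illustration only, and the sketch assumes CONFIG_BPF_LSM is enabled.

/* Hedged sketch: sleepable BPF LSM program reading user memory with
 * bpf_copy_from_user(). All names below are illustrative.
 */
#include "vmlinux.h"              /* assumed generated via bpftool btf dump */
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("lsm.s/bprm_committed_creds")     /* ".s" marks the program sleepable */
int BPF_PROG(exec_audit, struct linux_binprm *bprm)
{
        struct task_struct *task = (void *)bpf_get_current_task();
        unsigned long arg_start;
        char arg0[64] = {};

        /* CO-RE read of the kernel pointer chain to the user arg area. */
        arg_start = BPF_CORE_READ(task, mm, arg_start);

        /* Sleepable context: the helper may fault the user page in. */
        bpf_copy_from_user(arg0, sizeof(arg0), (const void *)arg_start);
        bpf_printk("exec argv[0]: %s\n", arg0);
        return 0;
}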
Diffstat (limited to 'include')
-rw-r--r--  include/linux/bpf-cgroup.h | 25
-rw-r--r--  include/linux/bpf.h | 52
-rw-r--r--  include/linux/bpf_local_storage.h | 163
-rw-r--r--  include/linux/bpf_lsm.h | 29
-rw-r--r--  include/linux/bpf_types.h | 3
-rw-r--r--  include/linux/btf.h | 3
-rw-r--r--  include/linux/btf_ids.h | 51
-rw-r--r--  include/linux/filter.h | 8
-rw-r--r--  include/linux/if_tun.h | 19
-rw-r--r--  include/linux/netdevice.h | 10
-rw-r--r--  include/linux/rcupdate_trace.h | 9
-rw-r--r--  include/linux/skmsg.h | 17
-rw-r--r--  include/linux/tcp.h | 20
-rw-r--r--  include/net/bpf_sk_storage.h | 14
-rw-r--r--  include/net/inet_connection_sock.h | 2
-rw-r--r--  include/net/request_sock.h | 9
-rw-r--r--  include/net/sock.h | 4
-rw-r--r--  include/net/tcp.h | 59
-rw-r--r--  include/net/xdp_sock.h | 30
-rw-r--r--  include/net/xdp_sock_drv.h | 122
-rw-r--r--  include/net/xsk_buff_pool.h | 53
-rw-r--r--  include/uapi/linux/bpf.h | 398
22 files changed, 967 insertions, 133 deletions
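Before the header diffs, a hedged sketch of how the new TCP header-option
hooks (item 2 in the pull request) are meant to be driven from a
BPF_PROG_TYPE_SOCK_OPS program. The option kind/magic/payload are made up,
an up-to-date libbpf with prototypes for the new helpers is assumed, and
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG is assumed to have been enabled earlier
via bpf_sock_ops_cb_flags_set(); the helper semantics are spelled out in the
include/uapi/linux/bpf.h hunk further down.

/* Hedged sketch: reserve and write one experimental TCP header option. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("sockops")
int write_toy_hdr_opt(struct bpf_sock_ops *skops)
{
        /* kind 254 (experimental), kind-len 6, 2-byte magic 0xeB9F, 2B data */
        struct {
                __u8  kind;
                __u8  len;
                __u8  magic[2];
                __u16 data;
        } __attribute__((packed)) opt = {
                .kind = 254, .len = 6, .magic = { 0xeb, 0x9f }, .data = 1,
        };

        switch (skops->op) {
        case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
                /* Ask the stack to leave room for the whole option. */
                bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
                break;
        case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
                /* Copy kind, kind-len and payload into the outgoing header. */
                bpf_store_hdr_opt(skops, &opt, sizeof(opt), 0);
                break;
        }
        return 1;
}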
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 64f367044e25..2f98d2fce62e 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -279,6 +279,31 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL)
+/* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
+ * fullsock and its parent fullsock cannot be traced by
+ * sk_to_full_sk().
+ *
+ * e.g. sock_ops->sk is a request_sock and it is under syncookie mode.
+ * Its listener-sk is not attached to the rsk_listener.
+ * In this case, the caller holds the listener-sk (unlocked),
+ * sets its sock_ops->sk to req_sk, and calls this SOCK_OPS"_SK" with
+ * the listener-sk such that the cgroup-bpf-progs of the
+ * listener-sk will be run.
+ *
+ * Regardless of syncookie mode or not,
+ * calling bpf_setsockopt on listener-sk will not make sense anyway,
+ * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here.
+ */
+#define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \
+({ \
+ int __ret = 0; \
+ if (cgroup_bpf_enabled) \
+ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \
+ sock_ops, \
+ BPF_CGROUP_SOCK_OPS); \
+ __ret; \
+})
+
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \
({ \
int __ret = 0; \
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 55f694b63164..c6d9f2c444f4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -34,6 +34,8 @@ struct btf_type;
struct exception_table_entry;
struct seq_operations;
struct bpf_iter_aux_info;
+struct bpf_local_storage;
+struct bpf_local_storage_map;
extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
@@ -104,6 +106,25 @@ struct bpf_map_ops {
__poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
struct poll_table_struct *pts);
+ /* Functions called by bpf_local_storage maps */
+ int (*map_local_storage_charge)(struct bpf_local_storage_map *smap,
+ void *owner, u32 size);
+ void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap,
+ void *owner, u32 size);
+ struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
+
+ /* map_meta_equal must be implemented for maps that can be
+ * used as an inner map. It is a runtime check to ensure
+ * an inner map can be inserted to an outer map.
+ *
+ * Some properties of the inner map have been used during
+ * verification time. When inserting an inner map at runtime,
+ * map_meta_equal has to ensure the map being inserted has the same
+ * properties that the verifier has used earlier.
+ */
+ bool (*map_meta_equal)(const struct bpf_map *meta0,
+ const struct bpf_map *meta1);
+
/* BTF name and id of struct allocated by map_alloc */
const char * const map_btf_name;
int *map_btf_id;
@@ -227,6 +248,9 @@ int map_check_no_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type);
+bool bpf_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1);
+
extern const struct bpf_map_ops bpf_map_offload_ops;
/* function argument constraints */
@@ -309,6 +333,7 @@ struct bpf_func_proto {
* for this argument.
*/
int *ret_btf_id; /* return value btf_id */
+ bool (*allowed)(const struct bpf_prog *prog);
};
/* bpf_context is intentionally undefined structure. Pointer to bpf_context is
@@ -514,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
/* these two functions are called from generated trampoline */
u64 notrace __bpf_prog_enter(void);
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
+void notrace __bpf_prog_enter_sleepable(void);
+void notrace __bpf_prog_exit_sleepable(void);
struct bpf_ksym {
unsigned long start;
@@ -709,6 +736,7 @@ struct bpf_prog_aux {
bool offload_requested;
bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
bool func_proto_unreliable;
+ bool sleepable;
enum bpf_tramp_prog_type trampoline_prog_type;
struct bpf_trampoline *trampoline;
struct hlist_node tramp_hlist;
@@ -1218,12 +1246,18 @@ typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
union bpf_iter_link_info *linfo,
struct bpf_iter_aux_info *aux);
typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux);
+typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq);
+typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info);
#define BPF_ITER_CTX_ARG_MAX 2
struct bpf_iter_reg {
const char *target;
bpf_iter_attach_target_t attach_target;
bpf_iter_detach_target_t detach_target;
+ bpf_iter_show_fdinfo_t show_fdinfo;
+ bpf_iter_fill_link_info_t fill_link_info;
u32 ctx_arg_info_size;
struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
const struct bpf_iter_seq_info *seq_info;
@@ -1250,6 +1284,10 @@ int bpf_iter_new_fd(struct bpf_link *link);
bool bpf_link_is_iter(struct bpf_link *link);
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop);
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
+void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq);
+int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info);
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
@@ -1340,6 +1378,8 @@ int btf_struct_access(struct bpf_verifier_log *log,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype,
u32 *next_btf_id);
+bool btf_struct_ids_match(struct bpf_verifier_log *log,
+ int off, u32 id, u32 need_type_id);
int btf_resolve_helper_id(struct bpf_verifier_log *log,
const struct bpf_func_proto *fn, int);
@@ -1358,6 +1398,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
struct btf *btf, const struct btf_type *t);
struct bpf_prog *bpf_prog_by_id(u32 id);
+struct bpf_link *bpf_link_by_id(u32 id);
const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
#else /* !CONFIG_BPF_SYSCALL */
@@ -1637,6 +1678,7 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
struct bpf_prog *old, u32 which);
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
void sock_map_unhash(struct sock *sk);
void sock_map_close(struct sock *sk, long timeout);
#else
@@ -1658,6 +1700,12 @@ static inline int sock_map_prog_detach(const union bpf_attr *attr,
{
return -EOPNOTSUPP;
}
+
+static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
+ u64 flags)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_BPF_STREAM_PARSER */
#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
@@ -1736,6 +1784,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
+extern const struct bpf_func_proto bpf_copy_from_user_proto;
const struct bpf_func_proto *bpf_tracing_func_proto(
enum bpf_func_id func_id, const struct bpf_prog *prog);
@@ -1850,4 +1899,7 @@ enum bpf_text_poke_type {
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2);
+struct btf_id_set;
+bool btf_id_set_contains(struct btf_id_set *set, u32 id);
+
#endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
new file mode 100644
index 000000000000..b2c9463f36a1
--- /dev/null
+++ b/include/linux/bpf_local_storage.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2019 Facebook
+ * Copyright 2020 Google LLC.
+ */
+
+#ifndef _BPF_LOCAL_STORAGE_H
+#define _BPF_LOCAL_STORAGE_H
+
+#include <linux/bpf.h>
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <uapi/linux/btf.h>
+
+#define BPF_LOCAL_STORAGE_CACHE_SIZE 16
+
+struct bpf_local_storage_map_bucket {
+ struct hlist_head list;
+ raw_spinlock_t lock;
+};
+
+/* The map is not the primary owner of a bpf_local_storage_elem.
+ * Instead, the container object (e.g. sk->sk_bpf_storage) is.
+ *
+ * The map (bpf_local_storage_map) is for two purposes
+ * 1. Define the size of the "local storage". It is
+ * the map's value_size.
+ *
+ * 2. Maintain a list to keep track of all elems such
+ * that they can be cleaned up during the map destruction.
+ *
+ * When a bpf local storage is being looked up for a
+ * particular object, the "bpf_map" pointer is actually used
+ * as the "key" to search in the list of elem in
+ * the respective bpf_local_storage owned by the object.
+ *
+ * e.g. sk->sk_bpf_storage is the mini-map with the "bpf_map" pointer
+ * as the searching key.
+ */
+struct bpf_local_storage_map {
+ struct bpf_map map;
+ /* Lookup elem does not require accessing the map.
+ *
+ * Updating/Deleting requires a bucket lock to
+ * link/unlink the elem from the map. Multiple
+ * buckets are used to reduce contention.
+ */
+ struct bpf_local_storage_map_bucket *buckets;
+ u32 bucket_log;
+ u16 elem_size;
+ u16 cache_idx;
+};
+
+struct bpf_local_storage_data {
+ /* smap is used as the searching key when looking up
+ * from the object's bpf_local_storage.
+ *
+ * Put it in the same cacheline as the data to minimize
+ * the number of cachelines access during the cache hit case.
+ */
+ struct bpf_local_storage_map __rcu *smap;
+ u8 data[] __aligned(8);
+};
+
+/* Linked to bpf_local_storage and bpf_local_storage_map */
+struct bpf_local_storage_elem {
+ struct hlist_node map_node; /* Linked to bpf_local_storage_map */
+ struct hlist_node snode; /* Linked to bpf_local_storage */
+ struct bpf_local_storage __rcu *local_storage;
+ struct rcu_head rcu;
+ /* 8 bytes hole */
+ /* The data is stored in another cacheline to minimize
+ * the number of cachelines access during a cache hit.
+ */
+ struct bpf_local_storage_data sdata ____cacheline_aligned;
+};
+
+struct bpf_local_storage {
+ struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
+ struct hlist_head list; /* List of bpf_local_storage_elem */
+ void *owner; /* The object that owns the above "list" of
+ * bpf_local_storage_elem.
+ */
+ struct rcu_head rcu;
+ raw_spinlock_t lock; /* Protect adding/removing from the "list" */
+};
+
+/* U16_MAX is much more than enough for sk local storage
+ * considering a tcp_sock is ~2k.
+ */
+#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE \
+ min_t(u32, \
+ (KMALLOC_MAX_SIZE - MAX_BPF_STACK - \
+ sizeof(struct bpf_local_storage_elem)), \
+ (U16_MAX - sizeof(struct bpf_local_storage_elem)))
+
+#define SELEM(_SDATA) \
+ container_of((_SDATA), struct bpf_local_storage_elem, sdata)
+#define SDATA(_SELEM) (&(_SELEM)->sdata)
+
+#define BPF_LOCAL_STORAGE_CACHE_SIZE 16
+
+struct bpf_local_storage_cache {
+ spinlock_t idx_lock;
+ u64 idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE];
+};
+
+#define DEFINE_BPF_STORAGE_CACHE(name) \
+static struct bpf_local_storage_cache name = { \
+ .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \
+}
+
+u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache);
+void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+ u16 idx);
+
+/* Helper functions for bpf_local_storage */
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
+
+struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr);
+
+struct bpf_local_storage_data *
+bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ bool cacheit_lockit);
+
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap);
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type);
+
+void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem);
+
+bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ bool uncharge_omem);
+
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem);
+
+void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem);
+
+void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
+
+struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
+ bool charge_mem);
+
+int
+bpf_local_storage_alloc(void *owner,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *first_selem);
+
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+ void *value, u64 map_flags);
+
+#endif /* _BPF_LOCAL_STORAGE_H */
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index af74712af585..aaacb6aafc87 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -17,9 +17,28 @@
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
+struct bpf_storage_blob {
+ struct bpf_local_storage __rcu *storage;
+};
+
+extern struct lsm_blob_sizes bpf_lsm_blob_sizes;
+
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog);
+static inline struct bpf_storage_blob *bpf_inode(
+ const struct inode *inode)
+{
+ if (unlikely(!inode->i_security))
+ return NULL;
+
+ return inode->i_security + bpf_lsm_blob_sizes.lbs_inode;
+}
+
+extern const struct bpf_func_proto bpf_inode_storage_get_proto;
+extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
+void bpf_inode_storage_free(struct inode *inode);
+
#else /* !CONFIG_BPF_LSM */
static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
@@ -28,6 +47,16 @@ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
return -EOPNOTSUPP;
}
+static inline struct bpf_storage_blob *bpf_inode(
+ const struct inode *inode)
+{
+ return NULL;
+}
+
+static inline void bpf_inode_storage_free(struct inode *inode)
+{
+}
+
#endif /* CONFIG_BPF_LSM */
#endif /* _LINUX_BPF_LSM_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a52a5688418e..2e6f568377f1 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -107,6 +107,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
#endif
+#ifdef CONFIG_BPF_LSM
+BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops)
+#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
#if defined(CONFIG_XDP_SOCKETS)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 8b81fbb4497c..a9af5e7a7ece 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -64,8 +64,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
u32 id, u32 *res_id);
const struct btf_type *
btf_resolve_size(const struct btf *btf, const struct btf_type *type,
- u32 *type_size, const struct btf_type **elem_type,
- u32 *total_nelems);
+ u32 *type_size);
#define for_each_member(i, struct_type, member) \
for (i = 0, member = btf_type_member(struct_type); \
diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 4867d549e3c1..210b086188a3 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -3,6 +3,11 @@
#ifndef _LINUX_BTF_IDS_H
#define _LINUX_BTF_IDS_H
+struct btf_id_set {
+ u32 cnt;
+ u32 ids[];
+};
+
#ifdef CONFIG_DEBUG_INFO_BTF
#include <linux/compiler.h> /* for __PASTE */
@@ -62,7 +67,7 @@ asm( \
".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \
"." #scope " " #name "; \n" \
#name ":; \n" \
-".popsection; \n"); \
+".popsection; \n");
#define BTF_ID_LIST(name) \
__BTF_ID_LIST(name, local) \
@@ -88,12 +93,56 @@ asm( \
".zero 4 \n" \
".popsection; \n");
+/*
+ * The BTF_SET_START/END macro pair defines a sorted list of
+ * BTF IDs plus its member count, with the following layout:
+ *
+ * BTF_SET_START(list)
+ * BTF_ID(type1, name1)
+ * BTF_ID(type2, name2)
+ * BTF_SET_END(list)
+ *
+ * __BTF_ID__set__list:
+ * .zero 4
+ * list:
+ * __BTF_ID__type1__name1__3:
+ * .zero 4
+ * __BTF_ID__type2__name2__4:
+ * .zero 4
+ *
+ */
+#define __BTF_SET_START(name, scope) \
+asm( \
+".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \
+"." #scope " __BTF_ID__set__" #name "; \n" \
+"__BTF_ID__set__" #name ":; \n" \
+".zero 4 \n" \
+".popsection; \n");
+
+#define BTF_SET_START(name) \
+__BTF_ID_LIST(name, local) \
+__BTF_SET_START(name, local)
+
+#define BTF_SET_START_GLOBAL(name) \
+__BTF_ID_LIST(name, globl) \
+__BTF_SET_START(name, globl)
+
+#define BTF_SET_END(name) \
+asm( \
+".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \
+".size __BTF_ID__set__" #name ", .-" #name " \n" \
+".popsection; \n"); \
+extern struct btf_id_set name;
+
#else
#define BTF_ID_LIST(name) static u32 name[5];
#define BTF_ID(prefix, name)
#define BTF_ID_UNUSED
#define BTF_ID_LIST_GLOBAL(name) u32 name[1];
+#define BTF_SET_START(name) static struct btf_id_set name = { 0 };
+#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 };
+#define BTF_SET_END(name)
#endif /* CONFIG_DEBUG_INFO_BTF */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0a355b005bf4..995625950cc1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1236,13 +1236,17 @@ struct bpf_sock_addr_kern {
struct bpf_sock_ops_kern {
struct sock *sk;
- u32 op;
union {
u32 args[4];
u32 reply;
u32 replylong[4];
};
- u32 is_fullsock;
+ struct sk_buff *syn_skb;
+ struct sk_buff *skb;
+ void *skb_data_end;
+ u8 op;
+ u8 is_fullsock;
+ u8 remaining_opt_len;
u64 temp; /* temp and everything after is not
* initialized to 0 before calling
* the BPF program. New fields that
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 5bda8cf457b6..2a7660843444 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -27,9 +27,18 @@ struct tun_xdp_hdr {
#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
struct socket *tun_get_socket(struct file *);
struct ptr_ring *tun_get_tx_ring(struct file *file);
-bool tun_is_xdp_frame(void *ptr);
-void *tun_xdp_to_ptr(void *ptr);
-void *tun_ptr_to_xdp(void *ptr);
+static inline bool tun_is_xdp_frame(void *ptr)
+{
+ return (unsigned long)ptr & TUN_XDP_FLAG;
+}
+static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp)
+{
+ return (void *)((unsigned long)xdp | TUN_XDP_FLAG);
+}
+static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
+}
void tun_ptr_free(void *ptr);
#else
#include <linux/err.h>
@@ -48,11 +57,11 @@ static inline bool tun_is_xdp_frame(void *ptr)
{
return false;
}
-static inline void *tun_xdp_to_ptr(void *ptr)
+static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp)
{
return NULL;
}
-static inline void *tun_ptr_to_xdp(void *ptr)
+static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
{
return NULL;
}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c0b512e6a02b..7f9fcfd15942 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -618,7 +618,7 @@ struct netdev_queue {
/* Subordinate device that the queue has been assigned to */
struct net_device *sb_dev;
#ifdef CONFIG_XDP_SOCKETS
- struct xdp_umem *umem;
+ struct xsk_buff_pool *pool;
#endif
/*
* write-mostly part
@@ -755,7 +755,7 @@ struct netdev_rx_queue {
struct net_device *dev;
struct xdp_rxq_info xdp_rxq;
#ifdef CONFIG_XDP_SOCKETS
- struct xdp_umem *umem;
+ struct xsk_buff_pool *pool;
#endif
} ____cacheline_aligned_in_smp;
@@ -883,7 +883,7 @@ enum bpf_netdev_command {
/* BPF program for offload callbacks, invoked at program load time. */
BPF_OFFLOAD_MAP_ALLOC,
BPF_OFFLOAD_MAP_FREE,
- XDP_SETUP_XSK_UMEM,
+ XDP_SETUP_XSK_POOL,
};
struct bpf_prog_offload_ops;
@@ -917,9 +917,9 @@ struct netdev_bpf {
struct {
struct bpf_offloaded_map *offmap;
};
- /* XDP_SETUP_XSK_UMEM */
+ /* XDP_SETUP_XSK_POOL */
struct {
- struct xdp_umem *umem;
+ struct xsk_buff_pool *pool;
u16 queue_id;
} xsk;
};
diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
index d9015aac78c6..aaaac8ac927c 100644
--- a/include/linux/rcupdate_trace.h
+++ b/include/linux/rcupdate_trace.h
@@ -82,7 +82,14 @@ static inline void rcu_read_unlock_trace(void)
void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
void synchronize_rcu_tasks_trace(void);
void rcu_barrier_tasks_trace(void);
-
+#else
+/*
+ * The BPF JIT forms these addresses even when it doesn't call these
+ * functions, so provide definitions that result in runtime errors.
+ */
+static inline void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) { BUG(); }
+static inline void rcu_read_lock_trace(void) { BUG(); }
+static inline void rcu_read_unlock_trace(void) { BUG(); }
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
#endif /* __LINUX_RCUPDATE_TRACE_H */
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 1e9ed840b9fc..3119928fc103 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -340,23 +340,6 @@ static inline void sk_psock_update_proto(struct sock *sk,
struct sk_psock *psock,
struct proto *ops)
{
- /* Initialize saved callbacks and original proto only once, since this
- * function may be called multiple times for a psock, e.g. when
- * psock->progs.msg_parser is updated.
- *
- * Since we've not installed the new proto, psock is not yet in use and
- * we can initialize it without synchronization.
- */
- if (!psock->sk_proto) {
- struct proto *orig = READ_ONCE(sk->sk_prot);
-
- psock->saved_unhash = orig->unhash;
- psock->saved_close = orig->close;
- psock->saved_write_space = sk->sk_write_space;
-
- psock->sk_proto = orig;
- }
-
/* Pairs with lockless read in sk_clone_lock() */
WRITE_ONCE(sk->sk_prot, ops);
}
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 14b62d7df942..56ff2952edaf 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -92,6 +92,8 @@ struct tcp_options_received {
smc_ok : 1, /* SMC seen on SYN packet */
snd_wscale : 4, /* Window scaling received from sender */
rcv_wscale : 4; /* Window scaling to send to receiver */
+ u8 saw_unknown:1, /* Received unknown option */
+ unused:7;
u8 num_sacks; /* Number of SACK blocks */
u16 user_mss; /* mss requested by user in ioctl */
u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
@@ -237,14 +239,13 @@ struct tcp_sock {
repair : 1,
frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8 repair_queue;
- u8 syn_data:1, /* SYN includes data */
+ u8 save_syn:2, /* Save headers of SYN packet */
+ syn_data:1, /* SYN includes data */
syn_fastopen:1, /* SYN includes Fast Open option */
syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
syn_fastopen_ch:1, /* Active TFO re-enabling probe */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
- save_syn:1, /* Save headers of SYN packet */
- is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
- syn_smc:1; /* SYN includes SMC */
+ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
u32 tlp_high_seq; /* snd_nxt at the time of TLP */
u32 tcp_tx_delay; /* delay (in usec) added to TX packets */
@@ -391,6 +392,9 @@ struct tcp_sock {
#if IS_ENABLED(CONFIG_MPTCP)
bool is_mptcp;
#endif
+#if IS_ENABLED(CONFIG_SMC)
+ bool syn_smc; /* SYN includes SMC */
+#endif
#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
@@ -406,7 +410,7 @@ struct tcp_sock {
* socket. Used to retransmit SYNACKs etc.
*/
struct request_sock __rcu *fastopen_rsk;
- u32 *saved_syn;
+ struct saved_syn *saved_syn;
};
enum tsq_enum {
@@ -484,6 +488,12 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
tp->saved_syn = NULL;
}
+static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn)
+{
+ return saved_syn->mac_hdrlen + saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+}
+
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
const struct sk_buff *orig_skb);
diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index 5036c94c0503..119f4c9c3a9c 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -3,13 +3,27 @@
#ifndef _BPF_SK_STORAGE_H
#define _BPF_SK_STORAGE_H
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/bpf_local_storage.h>
+
struct sock;
void bpf_sk_storage_free(struct sock *sk);
extern const struct bpf_func_proto bpf_sk_storage_get_proto;
extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
+extern const struct bpf_func_proto sk_storage_get_btf_proto;
+extern const struct bpf_func_proto sk_storage_delete_btf_proto;
+struct bpf_local_storage_elem;
struct bpf_sk_storage_diag;
struct sk_buff;
struct nlattr;
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index aa8893c68c50..c738abeb3265 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -86,6 +86,8 @@ struct inet_connection_sock {
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
__u32 icsk_rto;
+ __u32 icsk_rto_min;
+ __u32 icsk_delack_max;
__u32 icsk_pmtu_cookie;
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index b2eb8b4ba697..29e41ff3ec93 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -41,6 +41,13 @@ struct request_sock_ops {
int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);
+struct saved_syn {
+ u32 mac_hdrlen;
+ u32 network_hdrlen;
+ u32 tcp_hdrlen;
+ u8 data[];
+};
+
/* struct request_sock - mini sock to represent a connection request
*/
struct request_sock {
@@ -60,7 +67,7 @@ struct request_sock {
struct timer_list rsk_timer;
const struct request_sock_ops *rsk_ops;
struct sock *sk;
- u32 *saved_syn;
+ struct saved_syn *saved_syn;
u32 secid;
u32 peer_secid;
};
diff --git a/include/net/sock.h b/include/net/sock.h
index b943731fa879..7dd3051551fb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -246,7 +246,7 @@ struct sock_common {
/* public: */
};
-struct bpf_sk_storage;
+struct bpf_local_storage;
/**
* struct sock - network layer representation of sockets
@@ -517,7 +517,7 @@ struct sock {
void (*sk_destruct)(struct sock *sk);
struct sock_reuseport __rcu *sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL
- struct bpf_sk_storage __rcu *sk_bpf_storage;
+ struct bpf_local_storage __rcu *sk_bpf_storage;
#endif
struct rcu_head sk_rcu;
};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0d11db6436c8..e85d564446c6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -394,7 +394,7 @@ void tcp_metrics_init(void);
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
-void tcp_init_transfer(struct sock *sk, int bpf_op);
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
__poll_t tcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int tcp_getsockopt(struct sock *sk, int level, int optname,
@@ -455,7 +455,8 @@ enum tcp_synack_type {
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type);
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb);
int tcp_disconnect(struct sock *sk, int flags);
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
@@ -699,7 +700,7 @@ static inline void tcp_fast_path_check(struct sock *sk)
static inline u32 tcp_rto_min(struct sock *sk)
{
const struct dst_entry *dst = __sk_dst_get(sk);
- u32 rto_min = TCP_RTO_MIN;
+ u32 rto_min = inet_csk(sk)->icsk_rto_min;
if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
@@ -2025,7 +2026,8 @@ struct tcp_request_sock_ops {
int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type);
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb);
};
extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
@@ -2223,6 +2225,55 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
struct msghdr *msg, int len, int flags);
#endif /* CONFIG_NET_SOCK_MSG */
+#ifdef CONFIG_CGROUP_BPF
+/* Copy the listen sk's HDR_OPT_CB flags to its child.
+ *
+ * During 3-Way-HandShake, the synack is usually sent from
+ * the listen sk with the HDR_OPT_CB flags set so that
+ * bpf-prog will be called to write the BPF hdr option.
+ *
+ * In fastopen, the child sk is used to send synack instead
+ * of the listen sk. Thus, inheriting the HDR_OPT_CB flags
+ * from the listen sk gives the bpf-prog a chance to write
+ * BPF hdr option in the synack pkt during fastopen.
+ *
+ * Both fastopen and non-fastopen child will inherit the
+ * HDR_OPT_CB flags to keep the bpf-prog having a consistent
+ * behavior when deciding to clear this cb flags (or not)
+ * during the PASSIVE_ESTABLISHED_CB.
+ *
+ * In the future, other cb flags could be inherited here also.
+ */
+static inline void bpf_skops_init_child(const struct sock *sk,
+ struct sock *child)
+{
+ tcp_sk(child)->bpf_sock_ops_cb_flags =
+ tcp_sk(sk)->bpf_sock_ops_cb_flags &
+ (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG |
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG);
+}
+
+static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
+ struct sk_buff *skb,
+ unsigned int end_offset)
+{
+ skops->skb = skb;
+ skops->skb_data_end = skb->data + end_offset;
+}
+#else
+static inline void bpf_skops_init_child(const struct sock *sk,
+ struct sock *child)
+{
+}
+
+static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
+ struct sk_buff *skb,
+ unsigned int end_offset)
+{
+}
+#endif
+
/* Call BPF_SOCK_OPS program that returns an int. If the return value
* is < 0, then the BPF op failed (for example if the loaded BPF
* program does not support the chosen operation or there is no BPF
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index c9d87cc40c11..1a9559c0cbdd 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -18,25 +18,19 @@ struct xsk_queue;
struct xdp_buff;
struct xdp_umem {
- struct xsk_queue *fq;
- struct xsk_queue *cq;
- struct xsk_buff_pool *pool;
+ void *addrs;
u64 size;
u32 headroom;
u32 chunk_size;
+ u32 chunks;
+ u32 npgs;
struct user_struct *user;
refcount_t users;
- struct work_struct work;
- struct page **pgs;
- u32 npgs;
- u16 queue_id;
- u8 need_wakeup;
u8 flags;
- int id;
- struct net_device *dev;
bool zc;
- spinlock_t xsk_tx_list_lock;
- struct list_head xsk_tx_list;
+ struct page **pgs;
+ int id;
+ struct list_head xsk_dma_list;
};
struct xsk_map {
@@ -48,10 +42,11 @@ struct xsk_map {
struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
- struct xsk_queue *rx;
+ struct xsk_queue *rx ____cacheline_aligned_in_smp;
struct net_device *dev;
struct xdp_umem *umem;
struct list_head flush_node;
+ struct xsk_buff_pool *pool;
u16 queue_id;
bool zc;
enum {
@@ -59,10 +54,9 @@ struct xdp_sock {
XSK_BOUND,
XSK_UNBOUND,
} state;
- /* Protects multiple processes in the control path */
- struct mutex mutex;
+
struct xsk_queue *tx ____cacheline_aligned_in_smp;
- struct list_head list;
+ struct list_head tx_list;
/* Mutual exclusion of NAPI TX thread and sendmsg error paths
* in the SKB destructor callback.
*/
@@ -77,6 +71,10 @@ struct xdp_sock {
struct list_head map_list;
/* Protects map_list */
spinlock_t map_list_lock;
+ /* Protects multiple processes in the control path */
+ struct mutex mutex;
+ struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
+ struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
};
#ifdef CONFIG_XDP_SOCKETS
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index ccf848f7efa4..5b1ee8a9976d 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -11,47 +11,50 @@
#ifdef CONFIG_XDP_SOCKETS
-void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
-bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
-void xsk_umem_consume_tx_done(struct xdp_umem *umem);
-struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
-void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
-void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
-void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
-void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
-bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
+void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
+bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
+void xsk_tx_release(struct xsk_buff_pool *pool);
+struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
+ u16 queue_id);
+void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool);
+void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool);
+void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool);
+void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool);
+bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool);
-static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
+static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool)
{
- return XDP_PACKET_HEADROOM + umem->headroom;
+ return XDP_PACKET_HEADROOM + pool->headroom;
}
-static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
+static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool)
{
- return umem->chunk_size;
+ return pool->chunk_size;
}
-static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
{
- return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem);
+ return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool);
}
-static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
+static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
struct xdp_rxq_info *rxq)
{
- xp_set_rxq_info(umem->pool, rxq);
+ xp_set_rxq_info(pool, rxq);
}
-static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
+static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool,
unsigned long attrs)
{
- xp_dma_unmap(umem->pool, attrs);
+ xp_dma_unmap(pool, attrs);
}
-static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
- unsigned long attrs)
+static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool,
+ struct device *dev, unsigned long attrs)
{
- return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs);
+ struct xdp_umem *umem = pool->umem;
+
+ return xp_dma_map(pool, dev, attrs, umem->pgs, umem->npgs);
}
static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
@@ -68,14 +71,14 @@ static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
return xp_get_frame_dma(xskb);
}
-static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
+static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool)
{
- return xp_alloc(umem->pool);
+ return xp_alloc(pool);
}
-static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
+static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count)
{
- return xp_can_alloc(umem->pool, count);
+ return xp_can_alloc(pool, count);
}
static inline void xsk_buff_free(struct xdp_buff *xdp)
@@ -85,100 +88,104 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
xp_free(xskb);
}
-static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
+static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool,
+ u64 addr)
{
- return xp_raw_get_dma(umem->pool, addr);
+ return xp_raw_get_dma(pool, addr);
}
-static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
+static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
{
- return xp_raw_get_data(umem->pool, addr);
+ return xp_raw_get_data(pool, addr);
}
-static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+ if (!pool->dma_need_sync)
+ return;
+
xp_dma_sync_for_cpu(xskb);
}
-static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
+static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool,
dma_addr_t dma,
size_t size)
{
- xp_dma_sync_for_device(umem->pool, dma, size);
+ xp_dma_sync_for_device(pool, dma, size);
}
#else
-static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+static inline void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
}
-static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
- struct xdp_desc *desc)
+static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
{
return false;
}
-static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+static inline void xsk_tx_release(struct xsk_buff_pool *pool)
{
}
-static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
- u16 queue_id)
+static inline struct xsk_buff_pool *
+xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id)
{
return NULL;
}
-static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+static inline void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
}
-static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+static inline void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
}
-static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+static inline void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
}
-static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+static inline void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
}
-static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+static inline bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
return false;
}
-static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
+static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool)
{
return 0;
}
-static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
+static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool)
{
return 0;
}
-static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
+static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
{
return 0;
}
-static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
+static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
struct xdp_rxq_info *rxq)
{
}
-static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
+static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool,
unsigned long attrs)
{
}
-static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
- unsigned long attrs)
+static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool,
+ struct device *dev, unsigned long attrs)
{
return 0;
}
@@ -193,12 +200,12 @@ static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
return 0;
}
-static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
+static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool)
{
return NULL;
}
-static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
+static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count)
{
return false;
}
@@ -207,21 +214,22 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
{
}
-static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
+static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool,
+ u64 addr)
{
return 0;
}
-static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
+static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
{
return NULL;
}
-static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
{
}
-static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
+static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool,
dma_addr_t dma,
size_t size)
{
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 6842990e2712..0140d086dc84 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -13,6 +13,8 @@ struct xsk_buff_pool;
struct xdp_rxq_info;
struct xsk_queue;
struct xdp_desc;
+struct xdp_umem;
+struct xdp_sock;
struct device;
struct page;
@@ -26,34 +28,68 @@ struct xdp_buff_xsk {
struct list_head free_list_node;
};
+struct xsk_dma_map {
+ dma_addr_t *dma_pages;
+ struct device *dev;
+ struct net_device *netdev;
+ refcount_t users;
+ struct list_head list; /* Protected by the RTNL_LOCK */
+ u32 dma_pages_cnt;
+ bool dma_need_sync;
+};
+
struct xsk_buff_pool {
- struct xsk_queue *fq;
+ /* Members only used in the control path first. */
+ struct device *dev;
+ struct net_device *netdev;
+ struct list_head xsk_tx_list;
+ /* Protects modifications to the xsk_tx_list */
+ spinlock_t xsk_tx_list_lock;
+ refcount_t users;
+ struct xdp_umem *umem;
+ struct work_struct work;
struct list_head free_list;
+ u32 heads_cnt;
+ u16 queue_id;
+
+ /* Data path members as close to free_heads at the end as possible. */
+ struct xsk_queue *fq ____cacheline_aligned_in_smp;
+ struct xsk_queue *cq;
+ /* For performance reasons, each buff pool has its own array of dma_pages
+ * even when they are identical.
+ */
dma_addr_t *dma_pages;
struct xdp_buff_xsk *heads;
u64 chunk_mask;
u64 addrs_cnt;
u32 free_list_cnt;
u32 dma_pages_cnt;
- u32 heads_cnt;
u32 free_heads_cnt;
u32 headroom;
u32 chunk_size;
u32 frame_len;
+ u8 cached_need_wakeup;
+ bool uses_need_wakeup;
bool dma_need_sync;
bool unaligned;
void *addrs;
- struct device *dev;
struct xdp_buff_xsk *free_heads[];
};
/* AF_XDP core. */
-struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
- u32 chunk_size, u32 headroom, u64 size,
- bool unaligned);
-void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq);
+struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
+ struct xdp_umem *umem);
+int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
+ u16 queue_id, u16 flags);
+int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
+ struct net_device *dev, u16 queue_id);
void xp_destroy(struct xsk_buff_pool *pool);
void xp_release(struct xdp_buff_xsk *xskb);
+void xp_get_pool(struct xsk_buff_pool *pool);
+void xp_put_pool(struct xsk_buff_pool *pool);
+void xp_clear_dev(struct xsk_buff_pool *pool);
+void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs);
+void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs);
/* AF_XDP, and XDP core. */
void xp_free(struct xdp_buff_xsk *xskb);
@@ -80,9 +116,6 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb)
void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb);
static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb)
{
- if (!xskb->pool->dma_need_sync)
- return;
-
xp_dma_sync_for_cpu_slow(xskb);
}
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b6238b2209b7..8dda13880957 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -155,6 +155,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_DEVMAP_HASH,
BPF_MAP_TYPE_STRUCT_OPS,
BPF_MAP_TYPE_RINGBUF,
+ BPF_MAP_TYPE_INODE_STORAGE,
};
/* Note that tracing related programs such as
@@ -345,6 +346,14 @@ enum bpf_link_type {
/* The verifier internal test flag. Behavior is undefined */
#define BPF_F_TEST_STATE_FREQ (1U << 3)
+/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
+ * restrict map and helper usage for such programs. Sleepable BPF programs can
+ * only be attached to hooks where kernel execution context allows sleeping.
+ * Such programs are allowed to use helpers that may sleep like
+ * bpf_copy_from_user().
+ */
+#define BPF_F_SLEEPABLE (1U << 4)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
*
@@ -2807,7 +2816,7 @@ union bpf_attr {
*
* **-ERANGE** if resulting value was out of range.
*
- * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags)
+ * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags)
* Description
* Get a bpf-local-storage from a *sk*.
*
@@ -2823,6 +2832,9 @@ union bpf_attr {
* "type". The bpf-local-storage "type" (i.e. the *map*) is
* searched against all bpf-local-storages residing at *sk*.
*
+ * *sk* is a kernel **struct sock** pointer for LSM program.
+ * *sk* is a **struct bpf_sock** pointer for other program types.
+ *
* An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be
* used such that a new bpf-local-storage will be
* created if one does not exist. *value* can be used
@@ -2835,7 +2847,7 @@ union bpf_attr {
* **NULL** if not found or there was an error in adding
* a new bpf-local-storage.
*
- * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk)
+ * long bpf_sk_storage_delete(struct bpf_map *map, void *sk)
* Description
* Delete a bpf-local-storage from a *sk*.
* Return
@@ -3395,6 +3407,175 @@ union bpf_attr {
* A non-negative value equal to or less than *size* on success,
* or a negative error in case of failure.
*
+ * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags)
+ * Description
+ * Load header option. Support reading a particular TCP header
+ * option for bpf program (BPF_PROG_TYPE_SOCK_OPS).
+ *
+ * If *flags* is 0, it will search the option from the
+ * sock_ops->skb_data. The comment in "struct bpf_sock_ops"
+ * has details on what skb_data contains under different
+ * sock_ops->op.
+ *
+ * The first byte of the *searchby_res* specifies the
+ * kind that it wants to search.
+ *
+ * If the searching kind is an experimental kind
+ * (i.e. 253 or 254 according to RFC6994). It also
+ * needs to specify the "magic" which is either
+ * 2 bytes or 4 bytes. It then also needs to
+ * specify the size of the magic by using
+ * the 2nd byte which is "kind-length" of a TCP
+ * header option and the "kind-length" also
+ * includes the first 2 bytes "kind" and "kind-length"
+ * itself as a normal TCP header option also does.
+ *
+ * For example, to search experimental kind 254 with
+ * 2 byte magic 0xeB9F, the searchby_res should be
+ * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ].
+ *
+ * To search for the standard window scale option (3),
+ * the searchby_res should be [ 3, 0, 0, .... 0 ].
+ * Note, kind-length must be 0 for regular option.
+ *
+ * Searching for No-Op (0) and End-of-Option-List (1) are
+ * not supported.
+ *
+ * *len* must be at least 2 bytes which is the minimal size
+ * of a header option.
+ *
+ * Supported flags:
+ * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the
+ * saved_syn packet or the just-received syn packet.
+ *
+ * Return
+ * >0 when found, the header option is copied to *searchby_res*.
+ * The return value is the total length copied.
+ *
+ * **-EINVAL** If param is invalid
+ *
+ * **-ENOMSG** The option is not found
+ *
+ * **-ENOENT** No syn packet available when
+ * **BPF_LOAD_HDR_OPT_TCP_SYN** is used
+ *
+ * **-ENOSPC** Not enough space. Only *len* number of
+ * bytes are copied.
+ *
+ * **-EFAULT** Cannot parse the header options in the packet
+ *
+ * **-EPERM** This helper cannot be used under the
+ * current sock_ops->op.
+ *
+ * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags)
+ * Description
+ * Store header option. The data will be copied
+ * from buffer *from* with length *len* to the TCP header.
+ *
+ * The buffer *from* should have the whole option that
+ * includes the kind, kind-length, and the actual
+ * option data. The *len* must be at least kind-length
+ * long. The kind-length does not have to be 4 byte
+ * aligned. The kernel will take care of the padding
+ * and setting the 4 bytes aligned value to th->doff.
+ *
+ * This helper will check for duplicated option
+ * by searching the same option in the outgoing skb.
+ *
+ * This helper can only be called during
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
+ *
+ * Return
+ * 0 on success, or negative error in case of failure:
+ *
+ * **-EINVAL** If param is invalid
+ *
+ * **-ENOSPC** Not enough space in the header.
+ * Nothing has been written
+ *
+ * **-EEXIST** The option already exists
+ *
+ * **-EFAULT** Cannot parse the existing header options
+ *
+ * **-EPERM** This helper cannot be used under the
+ * current sock_ops->op.
+ *
+ * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags)
+ * Description
+ * Reserve *len* bytes for the bpf header option. The
+ * space will be used by bpf_store_hdr_opt() later in
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
+ *
+ * If bpf_reserve_hdr_opt() is called multiple times,
+ * the total number of bytes will be reserved.
+ *
+ * This helper can only be called during
+ * BPF_SOCK_OPS_HDR_OPT_LEN_CB.
+ *
+ * Return
+ * 0 on success, or negative error in case of failure:
+ *
+ * **-EINVAL** if param is invalid
+ *
+ * **-ENOSPC** Not enough space in the header.
+ *
+ * **-EPERM** This helper cannot be used under the
+ * current sock_ops->op.
+ *
+ * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags)
+ * Description
+ * Get a bpf_local_storage from an *inode*.
+ *
+ * Logically, it could be thought of as getting the value from
+ * a *map* with *inode* as the **key**. From this
+ * perspective, the usage is not much different from
+ * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this
+ * helper enforces the key must be an inode and the map must also
+ * be a **BPF_MAP_TYPE_INODE_STORAGE**.
+ *
+ * Underneath, the value is stored locally at *inode* instead of
+ * the *map*. The *map* is used as the bpf-local-storage
+ * "type". The bpf-local-storage "type" (i.e. the *map*) is
+ * searched against all bpf_local_storage residing at *inode*.
+ *
+ * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be
+ * used such that a new bpf_local_storage will be
+ * created if one does not exist. *value* can be used
+ * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify
+ * the initial value of a bpf_local_storage. If *value* is
+ * **NULL**, the new bpf_local_storage will be zero initialized.
+ * Return
+ * A bpf_local_storage pointer is returned on success.
+ *
+ * **NULL** if not found or there was an error in adding
+ * a new bpf_local_storage.
+ *
+ * int bpf_inode_storage_delete(struct bpf_map *map, void *inode)
+ * Description
+ * Delete a bpf_local_storage from an *inode*.
+ * Return
+ * 0 on success.
+ *
+ * **-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * long bpf_d_path(struct path *path, char *buf, u32 sz)
+ * Description
+ * Return full path for given 'struct path' object, which
+ * needs to be the kernel BTF 'path' object. The path is
+ * returned in the provided buffer 'buf' of size 'sz' and
+ * is zero terminated.
+ *
+ * Return
+ * On success, the strictly positive length of the string,
+ * including the trailing NUL character. On error, a negative
+ * value.
+ *
+ * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
+ * Description
+ * Read *size* bytes from user space address *user_ptr* and store
+ * the data in *dst*. This is a wrapper of copy_from_user().
+ * Return
+ * 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3539,6 +3720,13 @@ union bpf_attr {
FN(skc_to_tcp_request_sock), \
FN(skc_to_udp6_sock), \
FN(get_task_stack), \
+ FN(load_hdr_opt), \
+ FN(store_hdr_opt), \
+ FN(reserve_hdr_opt), \
+ FN(inode_storage_get), \
+ FN(inode_storage_delete), \
+ FN(d_path), \
+ FN(copy_from_user), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -3648,9 +3836,13 @@ enum {
BPF_F_SYSCTL_BASE_NAME = (1ULL << 0),
};
-/* BPF_FUNC_sk_storage_get flags */
+/* BPF_FUNC_<kernel_obj>_storage_get flags */
enum {
- BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0),
+ BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0),
+ /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility
+ * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead.
+ */
+ BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE,
};
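The new inode local storage can be exercised from a BPF LSM program. Below is a minimal sketch (not from this series), assuming CONFIG_BPF_LSM; the map name, value layout and the choice of the inode_rename/inode_unlink hooks are made up for the example. Passing 0 as *value* with BPF_LOCAL_STORAGE_GET_F_CREATE zero-initializes the new storage.

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    struct {
            __uint(type, BPF_MAP_TYPE_INODE_STORAGE);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __type(key, int);
            __type(value, __u64);
    } rename_cnt SEC(".maps");

    SEC("lsm/inode_rename")
    int BPF_PROG(count_rename, struct inode *old_dir, struct dentry *old_dentry)
    {
            __u64 *cnt;

            /* Create the per-inode storage on first use and bump it. */
            cnt = bpf_inode_storage_get(&rename_cnt, old_dentry->d_inode, 0,
                                        BPF_LOCAL_STORAGE_GET_F_CREATE);
            if (cnt)
                    __sync_fetch_and_add(cnt, 1);
            return 0;
    }

    SEC("lsm/inode_unlink")
    int BPF_PROG(drop_cnt, struct inode *dir, struct dentry *victim)
    {
            /* Drop the storage when the inode is unlinked. */
            bpf_inode_storage_delete(&rename_cnt, victim->d_inode);
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";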
/* BPF_FUNC_read_branch_records flags. */
@@ -4071,6 +4263,15 @@ struct bpf_link_info {
__u64 cgroup_id;
__u32 attach_type;
} cgroup;
+ struct {
+ __aligned_u64 target_name; /* in/out: target_name buffer ptr */
+ __u32 target_name_len; /* in/out: target_name buffer len */
+ union {
+ struct {
+ __u32 map_id;
+ } map;
+ };
+ } iter;
struct {
__u32 netns_ino;
__u32 attach_type;
@@ -4158,6 +4359,36 @@ struct bpf_sock_ops {
__u64 bytes_received;
__u64 bytes_acked;
__bpf_md_ptr(struct bpf_sock *, sk);
+ /* [skb_data, skb_data_end) covers the whole TCP header.
+ *
+ * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received
+ * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the
+ * header has not been written.
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have
+ * been written so far.
+ * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes
+ * the 3WHS.
+ * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes
+ * the 3WHS.
+ *
+ * bpf_load_hdr_opt() can also be used to read a particular option.
+ */
+ __bpf_md_ptr(void *, skb_data);
+ __bpf_md_ptr(void *, skb_data_end);
+ __u32 skb_len; /* The total length of a packet.
+ * It includes the header, options,
+ * and payload.
+ */
+ __u32 skb_tcp_flags; /* tcp_flags of the header. It provides
+ * an easy way to check for tcp_flags
+ * without parsing skb_data.
+ *
+ * In particular, the skb_tcp_flags
+ * will still be available in
+ * BPF_SOCK_OPS_HDR_OPT_LEN_CB even though
+ * the outgoing header has not
+ * been written yet.
+ */
};
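A sketch (not from this series) of how a sockops program can use the new fields: skb_data/skb_data_end are bounds-checked like other packet pointers, and skb_tcp_flags spares the program from parsing the header just to look at the flags. The TCPHDR_* values mirror the kernel's constants; the counters and program name are assumptions for the example, and the PARSE_*_HDR_OPT_CB flags below must have been enabled for the callback to fire.

    #include <linux/bpf.h>
    #include <linux/tcp.h>
    #include <bpf/bpf_helpers.h>

    #define TCPHDR_FIN 0x01

    __u32 pkts_with_opts;
    __u32 fin_seen;

    SEC("sockops")
    int parse_hdr(struct bpf_sock_ops *skops)
    {
            if (skops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB) {
                    const struct tcphdr *th = skops->skb_data;

                    /* [skb_data, skb_data_end) covers the TCP header only. */
                    if ((const void *)(th + 1) > skops->skb_data_end)
                            return 1;

                    /* Count received headers that carry TCP options. */
                    if (th->doff > 5)
                            __sync_fetch_and_add(&pkts_with_opts, 1);

                    /* skb_tcp_flags lets the prog check flags without
                     * parsing skb_data, e.g. to notice the peer's FIN.
                     */
                    if (skops->skb_tcp_flags & TCPHDR_FIN)
                            fin_seen = 1;
            }
            return 1;
    }

    char LICENSE[] SEC("license") = "GPL";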
/* Definitions for bpf_sock_ops_cb_flags */
@@ -4166,8 +4397,51 @@ enum {
BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1),
BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2),
BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3),
+ /* Call bpf for all received TCP headers. The bpf prog will be
+ * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB
+ *
+ * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB
+ * for the header option related helpers that will be useful
+ * to the bpf programs.
+ *
+ * It could be used at the client/active side (i.e. connect() side)
+ * when the server tells it that the server is in syncookie
+ * mode and requires the active side to resend the bpf-written
+ * options. The active side can keep writing the bpf-options until
+ * it receives a valid packet from the server side confirming
+ * that the earlier packet (and options) have been received. A later
+ * example patch uses it like this at the active side when the
+ * server is in syncookie mode.
+ *
+ * The bpf prog will usually turn this off in the common cases.
+ */
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4),
+ /* Call bpf when kernel has received a header option that
+ * the kernel cannot handle. The bpf prog will be called under
+ * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB.
+ *
+ * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB
+ * for the header option related helpers that will be useful
+ * to the bpf programs.
+ */
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5),
+ /* Call bpf when the kernel is writing header options for the
+ * outgoing packet. The bpf prog will first be called
+ * to reserve space in a skb under
+ * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then
+ * the bpf prog will be called to write the header option(s)
+ * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
+ *
+ * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB
+ * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option
+ * related helpers that will be useful to the bpf programs.
+ *
+ * The kernel gets its chance to reserve space and write
+ * options first before the BPF program does.
+ */
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6),
/* Mask of all currently supported cb flags */
- BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF,
+ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
};
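For illustration (not from this series): the new callbacks are opt-in per socket, so a sockops program typically enables them once the connection is established, keeping whatever flags are already set. The choice of ops below is an assumption for the example.

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("sockops")
    int enable_hdr_opt(struct bpf_sock_ops *skops)
    {
            switch (skops->op) {
            case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
            case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
                    /* Keep the existing flags and add the new ones. */
                    bpf_sock_ops_cb_flags_set(skops,
                                              skops->bpf_sock_ops_cb_flags |
                                              BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
                                              BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG);
                    break;
            }
            return 1;
    }

    char LICENSE[] SEC("license") = "GPL";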
/* List of known BPF sock_ops operators.
@@ -4223,6 +4497,63 @@ enum {
*/
BPF_SOCK_OPS_RTT_CB, /* Called on every RTT.
*/
+ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option.
+ * It will be called to handle
+ * the packets received at
+ * an already established
+ * connection.
+ *
+ * sock_ops->skb_data:
+ * Referring to the received skb.
+ * It covers the TCP header only.
+ *
+ * bpf_load_hdr_opt() can also
+ * be used to search for a
+ * particular option.
+ */
+ BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the
+ * header option later in
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
+ * Arg1: bool want_cookie. (in
+ * writing SYNACK only)
+ *
+ * sock_ops->skb_data:
+ * Not available because no header has
+ * been written yet.
+ *
+ * sock_ops->skb_tcp_flags:
+ * The tcp_flags of the
+ * outgoing skb. (e.g. SYN, ACK, FIN).
+ *
+ * bpf_reserve_hdr_opt() should
+ * be used to reserve space.
+ */
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options
+ * Arg1: bool want_cookie. (in
+ * writing SYNACK only)
+ *
+ * sock_ops->skb_data:
+ * Referring to the outgoing skb.
+ * It covers the TCP header
+ * that has already been written
+ * by the kernel and the
+ * earlier bpf-progs.
+ *
+ * sock_ops->skb_tcp_flags:
+ * The tcp_flags of the outgoing
+ * skb. (e.g. SYN, ACK, FIN).
+ *
+ * bpf_store_hdr_opt() should
+ * be used to write the
+ * option.
+ *
+ * bpf_load_hdr_opt() can also
+ * be used to search for a
+ * particular option that
+ * has already been written
+ * by the kernel or the
+ * earlier bpf-progs.
+ */
};
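Putting the new sock_ops operators together, a sketch (not from this series) of the reserve-then-write flow for a made-up option kind 0xb7; it assumes BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG has been enabled as in the earlier sketch.

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("sockops")
    int write_opt(struct bpf_sock_ops *skops)
    {
            /* kind(1) + len(1) + 2 bytes of data */
            __u8 opt[4] = { 0xb7, 4, 0xaa, 0xbb };

            switch (skops->op) {
            case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
                    /* Reserve room; multiple calls accumulate. */
                    bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
                    break;
            case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
                    /* Copy the fully built option into the reserved space. */
                    bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);
                    break;
            }
            return 1;
    }

    char LICENSE[] SEC("license") = "GPL";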
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
@@ -4250,6 +4581,63 @@ enum {
enum {
TCP_BPF_IW = 1001, /* Set TCP initial congestion window */
TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */
+ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */
+ TCP_BPF_RTO_MIN = 1004, /* Min RTO in usecs */
+ /* Copy the SYN pkt to optval
+ *
+ * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the
+ * bpf_getsockopt(TCP_SAVED_SYN) but it is not limited
+ * to getting the SYN from the saved_syn. It can get the
+ * SYN packet from either:
+ *
+ * 1. the just-received SYN packet (only available when writing the
+ * SYNACK). It will be useful when it is not necessary to
+ * save the SYN packet for later use. It is also the only way
+ * to get the SYN during syncookie mode because the SYN
+ * packet cannot be saved during syncookie.
+ *
+ * OR
+ *
+ * 2. the earlier saved syn which was done by
+ * bpf_setsockopt(TCP_SAVE_SYN).
+ *
+ * The bpf_getsockopt(TCP_BPF_SYN*) option hides from the bpf-prog
+ * which of these two sources the SYN packet came from.
+ *
+ * If the bpf-prog does not need the IP[46] header, the
+ * bpf-prog can avoid parsing the IP header by using
+ * TCP_BPF_SYN. Otherwise, the bpf-prog can get both
+ * IP[46] and TCP header by using TCP_BPF_SYN_IP.
+ *
+ * >0: Total number of bytes copied
+ * -ENOSPC: Not enough space in optval. Only optlen bytes
+ * are copied.
+ * -ENOENT: The SYN skb is not available now and the earlier SYN pkt
+ * was not saved by bpf_setsockopt(TCP_SAVE_SYN).
+ */
+ TCP_BPF_SYN = 1005, /* Copy the TCP header */
+ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
+ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
+};
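A sketch (not from this series) of reading the peer's SYN via the new bpf_getsockopt() optnames while the SYNACK is being written; the buffer size is an arbitrary guess, and SOL_TCP (6) is defined locally because it is a macro that is not visible through the UAPI headers included here.

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    #ifndef SOL_TCP
    #define SOL_TCP 6
    #endif

    SEC("sockops")
    int read_syn(struct bpf_sock_ops *skops)
    {
            __u8 syn[128];  /* IP[46] + TCP header incl. options */
            int len;

            if (skops->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
                    return 1;

            /* >0: bytes copied; -ENOENT if no SYN is available and none
             * was saved with bpf_setsockopt(TCP_SAVE_SYN).
             */
            len = bpf_getsockopt(skops, SOL_TCP, TCP_BPF_SYN_IP,
                                 syn, sizeof(syn));
            if (len > 0) {
                    /* syn[] now holds the peer's IP and TCP headers. */
            }
            return 1;
    }

    char LICENSE[] SEC("license") = "GPL";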
+
+enum {
+ BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0),
+};
+
+/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
+ */
+enum {
+ BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the
+ * total option spaces
+ * required for an established
+ * sk in order to calculate the
+ * MSS. No skb is actually
+ * sent.
+ */
+ BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode
+ * when sending a SYNACK.
+ */
};
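Finally, a small sketch (not from this series) of using args[0] during BPF_SOCK_OPS_HDR_OPT_LEN_CB: when the kernel is only computing the MSS, no skb is sent, so the program reserves its worst-case option length. MY_MAX_OPT_LEN is a made-up constant for the example.

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    #define MY_MAX_OPT_LEN 8  /* hypothetical worst case for this prog */

    SEC("sockops")
    int reserve_len(struct bpf_sock_ops *skops)
    {
            if (skops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB &&
                skops->args[0] == BPF_WRITE_HDR_TCP_CURRENT_MSS)
                    bpf_reserve_hdr_opt(skops, MY_MAX_OPT_LEN, 0);
            return 1;
    }

    char LICENSE[] SEC("license") = "GPL";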
struct bpf_perf_event_value {