summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS10
-rw-r--r--include/linux/bpf.h33
-rw-r--r--include/linux/bpf_types.h2
-rw-r--r--include/linux/filter.h21
-rw-r--r--include/linux/skmsg.h410
-rw-r--r--include/net/addrconf.h5
-rw-r--r--include/net/sock.h4
-rw-r--r--include/net/tcp.h28
-rw-r--r--include/net/tls.h24
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/arraymap.c2
-rw-r--r--kernel/bpf/btf.c3
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/sockmap.c2629
-rw-r--r--kernel/bpf/stackmap.c2
-rw-r--r--kernel/bpf/syscall.c28
-rw-r--r--kernel/bpf/xskmap.c2
-rw-r--r--net/Kconfig11
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/dev.c14
-rw-r--r--net/core/filter.c290
-rw-r--r--net/core/skmsg.c802
-rw-r--r--net/core/sock.c61
-rw-r--r--net/core/sock_map.c1002
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/tcp_bpf.c655
-rw-r--r--net/ipv4/tcp_ulp.c73
-rw-r--r--net/ipv6/af_inet6.c1
-rw-r--r--net/strparser/Kconfig4
-rw-r--r--net/tls/Kconfig1
-rw-r--r--net/tls/tls_device.c2
-rw-r--r--net/tls/tls_main.c11
-rw-r--r--net/tls/tls_sw.c900
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-map.rst15
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-prog.rst11
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool.rst10
-rw-r--r--tools/bpf/bpftool/Makefile9
-rw-r--r--tools/bpf/bpftool/bash-completion/bpftool59
-rw-r--r--tools/bpf/bpftool/common.c21
-rw-r--r--tools/bpf/bpftool/main.c7
-rw-r--r--tools/bpf/bpftool/main.h4
-rw-r--r--tools/bpf/bpftool/map.c212
-rw-r--r--tools/bpf/bpftool/prog.c101
-rw-r--r--tools/lib/bpf/Makefile2
-rw-r--r--tools/lib/bpf/bpf.h3
-rw-r--r--tools/lib/bpf/libbpf.c35
-rw-r--r--tools/lib/bpf/libbpf.h2
-rw-r--r--tools/testing/selftests/bpf/Makefile8
-rw-r--r--tools/testing/selftests/bpf/bpf_helpers.h4
-rw-r--r--tools/testing/selftests/bpf/config1
-rw-r--r--tools/testing/selftests/bpf/test_sockmap.c89
-rw-r--r--tools/testing/selftests/bpf/test_verifier.c501
-rw-r--r--tools/testing/selftests/bpf/test_xdp_vlan.c292
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_vlan.sh195
54 files changed, 4962 insertions, 3659 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 031127139f3b..7f1399ac028e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8193,6 +8193,16 @@ S: Maintained
F: net/l3mdev
F: include/net/l3mdev.h
+L7 BPF FRAMEWORK
+M: John Fastabend <john.fastabend@gmail.com>
+M: Daniel Borkmann <daniel@iogearbox.net>
+L: netdev@vger.kernel.org
+S: Maintained
+F: include/linux/skmsg.h
+F: net/core/skmsg.c
+F: net/core/sock_map.c
+F: net/ipv4/tcp_bpf.c
+
LANTIQ / INTEL Ethernet drivers
M: Hauke Mehrtens <hauke@hauke-m.de>
L: netdev@vger.kernel.org
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9b558713447f..e60fff48288b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -737,33 +737,18 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
}
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
-#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
-struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
-struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
-int sockmap_get_from_fd(const union bpf_attr *attr, int type,
- struct bpf_prog *prog);
+#if defined(CONFIG_BPF_STREAM_PARSER)
+int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which);
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
#else
-static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
-{
- return NULL;
-}
-
-static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map,
- void *key)
-{
- return NULL;
-}
-
-static inline int sock_map_prog(struct bpf_map *map,
- struct bpf_prog *prog,
- u32 type)
+static inline int sock_map_prog_update(struct bpf_map *map,
+ struct bpf_prog *prog, u32 which)
{
return -EOPNOTSUPP;
}
-static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type,
- struct bpf_prog *prog)
+static inline int sock_map_get_from_fd(const union bpf_attr *attr,
+ struct bpf_prog *prog)
{
return -EINVAL;
}
@@ -839,6 +824,10 @@ extern const struct bpf_func_proto bpf_get_stack_proto;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
extern const struct bpf_func_proto bpf_sock_hash_update_proto;
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
+extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
+extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
+extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
+extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
extern const struct bpf_func_proto bpf_get_local_storage_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5432f4c9f50e..fa48343a5ea1 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -57,7 +57,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
#ifdef CONFIG_NET
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
-#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
+#if defined(CONFIG_BPF_STREAM_PARSER)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
#endif
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6791a0ac0139..5771874bc01e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -520,24 +520,6 @@ struct bpf_skb_data_end {
void *data_end;
};
-struct sk_msg_buff {
- void *data;
- void *data_end;
- __u32 apply_bytes;
- __u32 cork_bytes;
- int sg_copybreak;
- int sg_start;
- int sg_curr;
- int sg_end;
- struct scatterlist sg_data[MAX_SKB_FRAGS];
- bool sg_copy[MAX_SKB_FRAGS];
- __u32 flags;
- struct sock *sk_redir;
- struct sock *sk;
- struct sk_buff *skb;
- struct list_head list;
-};
-
struct bpf_redirect_info {
u32 ifindex;
u32 flags;
@@ -833,9 +815,6 @@ void xdp_do_flush_map(void);
void bpf_warn_invalid_xdp_action(u32 act);
-struct sock *do_sk_redirect_map(struct sk_buff *skb);
-struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
-
#ifdef CONFIG_INET
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
struct bpf_prog *prog, struct sk_buff *skb,
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
new file mode 100644
index 000000000000..0b919f0bc6d6
--- /dev/null
+++ b/include/linux/skmsg.h
@@ -0,0 +1,410 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#ifndef _LINUX_SKMSG_H
+#define _LINUX_SKMSG_H
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/strparser.h>
+
+#define MAX_MSG_FRAGS MAX_SKB_FRAGS
+
+enum __sk_action {
+ __SK_DROP = 0,
+ __SK_PASS,
+ __SK_REDIRECT,
+ __SK_NONE,
+};
+
+struct sk_msg_sg {
+ u32 start;
+ u32 curr;
+ u32 end;
+ u32 size;
+ u32 copybreak;
+ bool copy[MAX_MSG_FRAGS];
+ /* The extra element is used for chaining the front and sections when
+ * the list becomes partitioned (e.g. end < start). The crypto APIs
+ * require the chaining.
+ */
+ struct scatterlist data[MAX_MSG_FRAGS + 1];
+};
+
+struct sk_msg {
+ struct sk_msg_sg sg;
+ void *data;
+ void *data_end;
+ u32 apply_bytes;
+ u32 cork_bytes;
+ u32 flags;
+ struct sk_buff *skb;
+ struct sock *sk_redir;
+ struct sock *sk;
+ struct list_head list;
+};
+
+struct sk_psock_progs {
+ struct bpf_prog *msg_parser;
+ struct bpf_prog *skb_parser;
+ struct bpf_prog *skb_verdict;
+};
+
+enum sk_psock_state_bits {
+ SK_PSOCK_TX_ENABLED,
+};
+
+struct sk_psock_link {
+ struct list_head list;
+ struct bpf_map *map;
+ void *link_raw;
+};
+
+struct sk_psock_parser {
+ struct strparser strp;
+ bool enabled;
+ void (*saved_data_ready)(struct sock *sk);
+};
+
+struct sk_psock_work_state {
+ struct sk_buff *skb;
+ u32 len;
+ u32 off;
+};
+
+struct sk_psock {
+ struct sock *sk;
+ struct sock *sk_redir;
+ u32 apply_bytes;
+ u32 cork_bytes;
+ u32 eval;
+ struct sk_msg *cork;
+ struct sk_psock_progs progs;
+ struct sk_psock_parser parser;
+ struct sk_buff_head ingress_skb;
+ struct list_head ingress_msg;
+ unsigned long state;
+ struct list_head link;
+ spinlock_t link_lock;
+ refcount_t refcnt;
+ void (*saved_unhash)(struct sock *sk);
+ void (*saved_close)(struct sock *sk, long timeout);
+ void (*saved_write_space)(struct sock *sk);
+ struct proto *sk_proto;
+ struct sk_psock_work_state work_state;
+ struct work_struct work;
+ union {
+ struct rcu_head rcu;
+ struct work_struct gc;
+ };
+};
+
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+ int elem_first_coalesce);
+int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
+ u32 off, u32 len);
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len);
+int sk_msg_free(struct sock *sk, struct sk_msg *msg);
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg);
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes);
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+ u32 bytes);
+
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes);
+void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes);
+
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes);
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes);
+
+static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
+{
+ WARN_ON(i == msg->sg.end && bytes);
+}
+
+static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes)
+{
+ if (psock->apply_bytes) {
+ if (psock->apply_bytes < bytes)
+ psock->apply_bytes = 0;
+ else
+ psock->apply_bytes -= bytes;
+ }
+}
+
+#define sk_msg_iter_var_prev(var) \
+ do { \
+ if (var == 0) \
+ var = MAX_MSG_FRAGS - 1; \
+ else \
+ var--; \
+ } while (0)
+
+#define sk_msg_iter_var_next(var) \
+ do { \
+ var++; \
+ if (var == MAX_MSG_FRAGS) \
+ var = 0; \
+ } while (0)
+
+#define sk_msg_iter_prev(msg, which) \
+ sk_msg_iter_var_prev(msg->sg.which)
+
+#define sk_msg_iter_next(msg, which) \
+ sk_msg_iter_var_next(msg->sg.which)
+
+static inline void sk_msg_clear_meta(struct sk_msg *msg)
+{
+ memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy));
+}
+
+static inline void sk_msg_init(struct sk_msg *msg)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != MAX_MSG_FRAGS);
+ memset(msg, 0, sizeof(*msg));
+ sg_init_marker(msg->sg.data, MAX_MSG_FRAGS);
+}
+
+static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
+ int which, u32 size)
+{
+ dst->sg.data[which] = src->sg.data[which];
+ dst->sg.data[which].length = size;
+ src->sg.data[which].length -= size;
+ src->sg.data[which].offset += size;
+}
+
+static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src)
+{
+ memcpy(dst, src, sizeof(*src));
+ sk_msg_init(src);
+}
+
+static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
+{
+ return msg->sg.end >= msg->sg.start ?
+ msg->sg.end - msg->sg.start :
+ msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start);
+}
+
+static inline bool sk_msg_full(const struct sk_msg *msg)
+{
+ return (msg->sg.end == msg->sg.start) && msg->sg.size;
+}
+
+static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
+{
+ return &msg->sg.data[which];
+}
+
+static inline struct page *sk_msg_page(struct sk_msg *msg, int which)
+{
+ return sg_page(sk_msg_elem(msg, which));
+}
+
+static inline bool sk_msg_to_ingress(const struct sk_msg *msg)
+{
+ return msg->flags & BPF_F_INGRESS;
+}
+
+static inline void sk_msg_compute_data_pointers(struct sk_msg *msg)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start);
+
+ if (msg->sg.copy[msg->sg.start]) {
+ msg->data = NULL;
+ msg->data_end = NULL;
+ } else {
+ msg->data = sg_virt(sge);
+ msg->data_end = msg->data + sge->length;
+ }
+}
+
+static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page,
+ u32 len, u32 offset)
+{
+ struct scatterlist *sge;
+
+ get_page(page);
+ sge = sk_msg_elem(msg, msg->sg.end);
+ sg_set_page(sge, page, len, offset);
+ sg_unmark_end(sge);
+
+ msg->sg.copy[msg->sg.end] = true;
+ msg->sg.size += len;
+ sk_msg_iter_next(msg, end);
+}
+
+static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state)
+{
+ do {
+ msg->sg.copy[i] = copy_state;
+ sk_msg_iter_var_next(i);
+ if (i == msg->sg.end)
+ break;
+ } while (1);
+}
+
+static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start)
+{
+ sk_msg_sg_copy(msg, start, true);
+}
+
+static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start)
+{
+ sk_msg_sg_copy(msg, start, false);
+}
+
+static inline struct sk_psock *sk_psock(const struct sock *sk)
+{
+ return rcu_dereference_sk_user_data(sk);
+}
+
+static inline bool sk_has_psock(struct sock *sk)
+{
+ return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
+}
+
+static inline void sk_psock_queue_msg(struct sk_psock *psock,
+ struct sk_msg *msg)
+{
+ list_add_tail(&msg->list, &psock->ingress_msg);
+}
+
+static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
+{
+ return psock ? list_empty(&psock->ingress_msg) : true;
+}
+
+static inline void sk_psock_report_error(struct sk_psock *psock, int err)
+{
+ struct sock *sk = psock->sk;
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+}
+
+struct sk_psock *sk_psock_init(struct sock *sk, int node);
+
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
+
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg);
+
+static inline struct sk_psock_link *sk_psock_init_link(void)
+{
+ return kzalloc(sizeof(struct sk_psock_link),
+ GFP_ATOMIC | __GFP_NOWARN);
+}
+
+static inline void sk_psock_free_link(struct sk_psock_link *link)
+{
+ kfree(link);
+}
+
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
+#if defined(CONFIG_BPF_STREAM_PARSER)
+void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link);
+#else
+static inline void sk_psock_unlink(struct sock *sk,
+ struct sk_psock_link *link)
+{
+}
+#endif
+
+void __sk_psock_purge_ingress_msg(struct sk_psock *psock);
+
+static inline void sk_psock_cork_free(struct sk_psock *psock)
+{
+ if (psock->cork) {
+ sk_msg_free(psock->sk, psock->cork);
+ kfree(psock->cork);
+ psock->cork = NULL;
+ }
+}
+
+static inline void sk_psock_update_proto(struct sock *sk,
+ struct sk_psock *psock,
+ struct proto *ops)
+{
+ psock->saved_unhash = sk->sk_prot->unhash;
+ psock->saved_close = sk->sk_prot->close;
+ psock->saved_write_space = sk->sk_write_space;
+
+ psock->sk_proto = sk->sk_prot;
+ sk->sk_prot = ops;
+}
+
+static inline void sk_psock_restore_proto(struct sock *sk,
+ struct sk_psock *psock)
+{
+ if (psock->sk_proto) {
+ sk->sk_prot = psock->sk_proto;
+ psock->sk_proto = NULL;
+ }
+}
+
+static inline void sk_psock_set_state(struct sk_psock *psock,
+ enum sk_psock_state_bits bit)
+{
+ set_bit(bit, &psock->state);
+}
+
+static inline void sk_psock_clear_state(struct sk_psock *psock,
+ enum sk_psock_state_bits bit)
+{
+ clear_bit(bit, &psock->state);
+}
+
+static inline bool sk_psock_test_state(const struct sk_psock *psock,
+ enum sk_psock_state_bits bit)
+{
+ return test_bit(bit, &psock->state);
+}
+
+static inline struct sk_psock *sk_psock_get(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock && !refcount_inc_not_zero(&psock->refcnt))
+ psock = NULL;
+ rcu_read_unlock();
+ return psock;
+}
+
+void sk_psock_stop(struct sock *sk, struct sk_psock *psock);
+void sk_psock_destroy(struct rcu_head *rcu);
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
+
+static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
+{
+ if (refcount_dec_and_test(&psock->refcnt))
+ sk_psock_drop(sk, psock);
+}
+
+static inline void psock_set_prog(struct bpf_prog **pprog,
+ struct bpf_prog *prog)
+{
+ prog = xchg(pprog, prog);
+ if (prog)
+ bpf_prog_put(prog);
+}
+
+static inline void psock_progs_drop(struct sk_psock_progs *progs)
+{
+ psock_set_prog(&progs->msg_parser, NULL);
+ psock_set_prog(&progs->skb_parser, NULL);
+ psock_set_prog(&progs->skb_verdict, NULL);
+}
+
+#endif /* _LINUX_SKMSG_H */
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 6def0351bcc3..14b789a123e7 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -265,6 +265,11 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly;
struct ipv6_bpf_stub {
int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
bool force_bind_address_no_port, bool with_lock);
+ struct sock *(*udp6_lib_lookup)(struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport,
+ int dif, int sdif, struct udp_table *tbl,
+ struct sk_buff *skb);
};
extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
diff --git a/include/net/sock.h b/include/net/sock.h
index cfaf261936c8..2440f8b407eb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2214,10 +2214,6 @@ static inline struct page_frag *sk_page_frag(struct sock *sk)
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
-int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
- int sg_start, int *sg_curr, unsigned int *sg_size,
- int first_coalesce);
-
/*
* Default write policy as shown to user space via poll/select/SIGIO
*/
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0d2929223c70..3600ae0f25c3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -858,6 +858,21 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
}
+static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
+{
+ return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
+}
+
+static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
+{
+ return TCP_SKB_CB(skb)->bpf.sk_redir;
+}
+
+static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
+{
+ TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
+}
+
#if IS_ENABLED(CONFIG_IPV6)
/* This is the variant of inet6_iif() that must be used by TCP,
* as TCP moves IP6CB into a different location in skb->cb[]
@@ -2057,7 +2072,6 @@ struct tcp_ulp_ops {
int tcp_register_ulp(struct tcp_ulp_ops *type);
void tcp_unregister_ulp(struct tcp_ulp_ops *type);
int tcp_set_ulp(struct sock *sk, const char *name);
-int tcp_set_ulp_id(struct sock *sk, const int ulp);
void tcp_get_available_ulp(char *buf, size_t len);
void tcp_cleanup_ulp(struct sock *sk);
@@ -2065,6 +2079,18 @@ void tcp_cleanup_ulp(struct sock *sk);
__MODULE_INFO(alias, alias_userspace, name); \
__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
+struct sk_msg;
+struct sk_psock;
+
+int tcp_bpf_init(struct sock *sk);
+void tcp_bpf_reinit(struct sock *sk);
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
+ int flags);
+int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len);
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+ struct msghdr *msg, int len);
+
/* Call BPF_SOCK_OPS program that returns an int. If the return value
* is < 0, then the BPF op failed (for example if the loaded BPF
* program does not support the chosen operation or there is no BPF
diff --git a/include/net/tls.h b/include/net/tls.h
index 5e853835597e..bab5627ff5e3 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -39,6 +39,8 @@
#include <linux/crypto.h>
#include <linux/socket.h>
#include <linux/tcp.h>
+#include <linux/skmsg.h>
+
#include <net/tcp.h>
#include <net/strparser.h>
#include <crypto/aead.h>
@@ -103,15 +105,13 @@ struct tls_rec {
int tx_flags;
int inplace_crypto;
- /* AAD | sg_plaintext_data | sg_tag */
- struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1];
- /* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */
- struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS + 1];
+ struct sk_msg msg_plaintext;
+ struct sk_msg msg_encrypted;
- unsigned int sg_plaintext_size;
- unsigned int sg_encrypted_size;
- int sg_plaintext_num_elem;
- int sg_encrypted_num_elem;
+ /* AAD | msg_plaintext.sg.data | sg_tag */
+ struct scatterlist sg_aead_in[2];
+ /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */
+ struct scatterlist sg_aead_out[2];
char aad_space[TLS_AAD_SPACE_SIZE];
struct aead_request aead_req;
@@ -142,8 +142,7 @@ struct tls_sw_context_rx {
struct strparser strp;
void (*saved_data_ready)(struct sock *sk);
- unsigned int (*sk_poll)(struct file *file, struct socket *sock,
- struct poll_table_struct *wait);
+
struct sk_buff *recv_pkt;
u8 control;
bool decrypted;
@@ -223,8 +222,8 @@ struct tls_context {
unsigned long flags;
bool in_tcp_sendpages;
+ bool pending_open_record_frags;
- u16 pending_open_record_frags;
int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
@@ -272,8 +271,7 @@ void tls_sw_free_resources_rx(struct sock *sk);
void tls_sw_release_resources_rx(struct sock *sk);
int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len);
-unsigned int tls_sw_poll(struct file *file, struct socket *sock,
- struct poll_table_struct *wait);
+bool tls_sw_stream_read(const struct sock *sk);
ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags);
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0488b8258321..ff8262626b8f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y)
obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o
-ifeq ($(CONFIG_STREAM_PARSER),y)
-ifeq ($(CONFIG_INET),y)
-obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
-endif
-endif
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index dded84cbe814..24583da9ffd1 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -449,7 +449,7 @@ static void fd_array_map_free(struct bpf_map *map)
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
- return NULL;
+ return ERR_PTR(-EOPNOTSUPP);
}
/* only called from syscall */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 138f0302692e..378cef70341c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2114,6 +2114,9 @@ static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data,
hdr = &btf->hdr;
+ if (hdr->hdr_len != hdr_len)
+ return -EINVAL;
+
btf_verifier_log_hdr(env, btf_data_size);
if (hdr->magic != BTF_MAGIC) {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3f5bf1af0826..defcf4df6d91 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1792,8 +1792,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
-const struct bpf_func_proto bpf_sock_map_update_proto __weak;
-const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
deleted file mode 100644
index d37a1a0a6e1e..000000000000
--- a/kernel/bpf/sockmap.c
+++ /dev/null
@@ -1,2629 +0,0 @@
-/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-
-/* A BPF sock_map is used to store sock objects. This is primarly used
- * for doing socket redirect with BPF helper routines.
- *
- * A sock map may have BPF programs attached to it, currently a program
- * used to parse packets and a program to provide a verdict and redirect
- * decision on the packet are supported. Any programs attached to a sock
- * map are inherited by sock objects when they are added to the map. If
- * no BPF programs are attached the sock object may only be used for sock
- * redirect.
- *
- * A sock object may be in multiple maps, but can only inherit a single
- * parse or verdict program. If adding a sock object to a map would result
- * in having multiple parsing programs the update will return an EBUSY error.
- *
- * For reference this program is similar to devmap used in XDP context
- * reviewing these together may be useful. For an example please review
- * ./samples/bpf/sockmap/.
- */
-#include <linux/bpf.h>
-#include <net/sock.h>
-#include <linux/filter.h>
-#include <linux/errno.h>
-#include <linux/file.h>
-#include <linux/kernel.h>
-#include <linux/net.h>
-#include <linux/skbuff.h>
-#include <linux/workqueue.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <net/strparser.h>
-#include <net/tcp.h>
-#include <linux/ptr_ring.h>
-#include <net/inet_common.h>
-#include <linux/sched/signal.h>
-
-#define SOCK_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
-
-struct bpf_sock_progs {
- struct bpf_prog *bpf_tx_msg;
- struct bpf_prog *bpf_parse;
- struct bpf_prog *bpf_verdict;
-};
-
-struct bpf_stab {
- struct bpf_map map;
- struct sock **sock_map;
- struct bpf_sock_progs progs;
- raw_spinlock_t lock;
-};
-
-struct bucket {
- struct hlist_head head;
- raw_spinlock_t lock;
-};
-
-struct bpf_htab {
- struct bpf_map map;
- struct bucket *buckets;
- atomic_t count;
- u32 n_buckets;
- u32 elem_size;
- struct bpf_sock_progs progs;
- struct rcu_head rcu;
-};
-
-struct htab_elem {
- struct rcu_head rcu;
- struct hlist_node hash_node;
- u32 hash;
- struct sock *sk;
- char key[0];
-};
-
-enum smap_psock_state {
- SMAP_TX_RUNNING,
-};
-
-struct smap_psock_map_entry {
- struct list_head list;
- struct bpf_map *map;
- struct sock **entry;
- struct htab_elem __rcu *hash_link;
-};
-
-struct smap_psock {
- struct rcu_head rcu;
- refcount_t refcnt;
-
- /* datapath variables */
- struct sk_buff_head rxqueue;
- bool strp_enabled;
-
- /* datapath error path cache across tx work invocations */
- int save_rem;
- int save_off;
- struct sk_buff *save_skb;
-
- /* datapath variables for tx_msg ULP */
- struct sock *sk_redir;
- int apply_bytes;
- int cork_bytes;
- int sg_size;
- int eval;
- struct sk_msg_buff *cork;
- struct list_head ingress;
-
- struct strparser strp;
- struct bpf_prog *bpf_tx_msg;
- struct bpf_prog *bpf_parse;
- struct bpf_prog *bpf_verdict;
- struct list_head maps;
- spinlock_t maps_lock;
-
- /* Back reference used when sock callback trigger sockmap operations */
- struct sock *sock;
- unsigned long state;
-
- struct work_struct tx_work;
- struct work_struct gc_work;
-
- struct proto *sk_proto;
- void (*save_unhash)(struct sock *sk);
- void (*save_close)(struct sock *sk, long timeout);
- void (*save_data_ready)(struct sock *sk);
- void (*save_write_space)(struct sock *sk);
-};
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags, int *addr_len);
-static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
-static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
- int offset, size_t size, int flags);
-static void bpf_tcp_unhash(struct sock *sk);
-static void bpf_tcp_close(struct sock *sk, long timeout);
-
-static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
-{
- return rcu_dereference_sk_user_data(sk);
-}
-
-static bool bpf_tcp_stream_read(const struct sock *sk)
-{
- struct smap_psock *psock;
- bool empty = true;
-
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock))
- goto out;
- empty = list_empty(&psock->ingress);
-out:
- rcu_read_unlock();
- return !empty;
-}
-
-enum {
- SOCKMAP_IPV4,
- SOCKMAP_IPV6,
- SOCKMAP_NUM_PROTS,
-};
-
-enum {
- SOCKMAP_BASE,
- SOCKMAP_TX,
- SOCKMAP_NUM_CONFIGS,
-};
-
-static struct proto *saved_tcpv6_prot __read_mostly;
-static DEFINE_SPINLOCK(tcpv6_prot_lock);
-static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
-static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
- struct proto *base)
-{
- prot[SOCKMAP_BASE] = *base;
- prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash;
- prot[SOCKMAP_BASE].close = bpf_tcp_close;
- prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg;
- prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read;
-
- prot[SOCKMAP_TX] = prot[SOCKMAP_BASE];
- prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg;
- prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage;
-}
-
-static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
-{
- int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
- int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
-
- sk->sk_prot = &bpf_tcp_prots[family][conf];
-}
-
-static int bpf_tcp_init(struct sock *sk)
-{
- struct smap_psock *psock;
-
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- return -EINVAL;
- }
-
- if (unlikely(psock->sk_proto)) {
- rcu_read_unlock();
- return -EBUSY;
- }
-
- psock->save_unhash = sk->sk_prot->unhash;
- psock->save_close = sk->sk_prot->close;
- psock->sk_proto = sk->sk_prot;
-
- /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
- if (sk->sk_family == AF_INET6 &&
- unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
- spin_lock_bh(&tcpv6_prot_lock);
- if (likely(sk->sk_prot != saved_tcpv6_prot)) {
- build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
- smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
- }
- spin_unlock_bh(&tcpv6_prot_lock);
- }
- update_sk_prot(sk, psock);
- rcu_read_unlock();
- return 0;
-}
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge);
-
-static void bpf_tcp_release(struct sock *sk)
-{
- struct smap_psock *psock;
-
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock))
- goto out;
-
- if (psock->cork) {
- free_start_sg(psock->sock, psock->cork, true);
- kfree(psock->cork);
- psock->cork = NULL;
- }
-
- if (psock->sk_proto) {
- sk->sk_prot = psock->sk_proto;
- psock->sk_proto = NULL;
- }
-out:
- rcu_read_unlock();
-}
-
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
- u32 hash, void *key, u32 key_size)
-{
- struct htab_elem *l;
-
- hlist_for_each_entry_rcu(l, head, hash_node) {
- if (l->hash == hash && !memcmp(&l->key, key, key_size))
- return l;
- }
-
- return NULL;
-}
-
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
-{
- return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
-
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
- return &__select_bucket(htab, hash)->head;
-}
-
-static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
-{
- atomic_dec(&htab->count);
- kfree_rcu(l, rcu);
-}
-
-static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
- struct smap_psock *psock)
-{
- struct smap_psock_map_entry *e;
-
- spin_lock_bh(&psock->maps_lock);
- e = list_first_entry_or_null(&psock->maps,
- struct smap_psock_map_entry,
- list);
- if (e)
- list_del(&e->list);
- spin_unlock_bh(&psock->maps_lock);
- return e;
-}
-
-static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
-{
- struct smap_psock_map_entry *e;
- struct sk_msg_buff *md, *mtmp;
- struct sock *osk;
-
- if (psock->cork) {
- free_start_sg(psock->sock, psock->cork, true);
- kfree(psock->cork);
- psock->cork = NULL;
- }
-
- list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
- list_del(&md->list);
- free_start_sg(psock->sock, md, true);
- kfree(md);
- }
-
- e = psock_map_pop(sk, psock);
- while (e) {
- if (e->entry) {
- struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map);
-
- raw_spin_lock_bh(&stab->lock);
- osk = *e->entry;
- if (osk == sk) {
- *e->entry = NULL;
- smap_release_sock(psock, sk);
- }
- raw_spin_unlock_bh(&stab->lock);
- } else {
- struct htab_elem *link = rcu_dereference(e->hash_link);
- struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map);
- struct hlist_head *head;
- struct htab_elem *l;
- struct bucket *b;
-
- b = __select_bucket(htab, link->hash);
- head = &b->head;
- raw_spin_lock_bh(&b->lock);
- l = lookup_elem_raw(head,
- link->hash, link->key,
- htab->map.key_size);
- /* If another thread deleted this object skip deletion.
- * The refcnt on psock may or may not be zero.
- */
- if (l && l == link) {
- hlist_del_rcu(&link->hash_node);
- smap_release_sock(psock, link->sk);
- free_htab_elem(htab, link);
- }
- raw_spin_unlock_bh(&b->lock);
- }
- kfree(e);
- e = psock_map_pop(sk, psock);
- }
-}
-
-static void bpf_tcp_unhash(struct sock *sk)
-{
- void (*unhash_fun)(struct sock *sk);
- struct smap_psock *psock;
-
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- if (sk->sk_prot->unhash)
- sk->sk_prot->unhash(sk);
- return;
- }
- unhash_fun = psock->save_unhash;
- bpf_tcp_remove(sk, psock);
- rcu_read_unlock();
- unhash_fun(sk);
-}
-
-static void bpf_tcp_close(struct sock *sk, long timeout)
-{
- void (*close_fun)(struct sock *sk, long timeout);
- struct smap_psock *psock;
-
- lock_sock(sk);
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- release_sock(sk);
- return sk->sk_prot->close(sk, timeout);
- }
- close_fun = psock->save_close;
- bpf_tcp_remove(sk, psock);
- rcu_read_unlock();
- release_sock(sk);
- close_fun(sk, timeout);
-}
-
-enum __sk_action {
- __SK_DROP = 0,
- __SK_PASS,
- __SK_REDIRECT,
- __SK_NONE,
-};
-
-static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
- .name = "bpf_tcp",
- .uid = TCP_ULP_BPF,
- .user_visible = false,
- .owner = NULL,
- .init = bpf_tcp_init,
- .release = bpf_tcp_release,
-};
-
-static int memcopy_from_iter(struct sock *sk,
- struct sk_msg_buff *md,
- struct iov_iter *from, int bytes)
-{
- struct scatterlist *sg = md->sg_data;
- int i = md->sg_curr, rc = -ENOSPC;
-
- do {
- int copy;
- char *to;
-
- if (md->sg_copybreak >= sg[i].length) {
- md->sg_copybreak = 0;
-
- if (++i == MAX_SKB_FRAGS)
- i = 0;
-
- if (i == md->sg_end)
- break;
- }
-
- copy = sg[i].length - md->sg_copybreak;
- to = sg_virt(&sg[i]) + md->sg_copybreak;
- md->sg_copybreak += copy;
-
- if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
- rc = copy_from_iter_nocache(to, copy, from);
- else
- rc = copy_from_iter(to, copy, from);
-
- if (rc != copy) {
- rc = -EFAULT;
- goto out;
- }
-
- bytes -= copy;
- if (!bytes)
- break;
-
- md->sg_copybreak = 0;
- if (++i == MAX_SKB_FRAGS)
- i = 0;
- } while (i != md->sg_end);
-out:
- md->sg_curr = i;
- return rc;
-}
-
-static int bpf_tcp_push(struct sock *sk, int apply_bytes,
- struct sk_msg_buff *md,
- int flags, bool uncharge)
-{
- bool apply = apply_bytes;
- struct scatterlist *sg;
- int offset, ret = 0;
- struct page *p;
- size_t size;
-
- while (1) {
- sg = md->sg_data + md->sg_start;
- size = (apply && apply_bytes < sg->length) ?
- apply_bytes : sg->length;
- offset = sg->offset;
-
- tcp_rate_check_app_limited(sk);
- p = sg_page(sg);
-retry:
- ret = do_tcp_sendpages(sk, p, offset, size, flags);
- if (ret != size) {
- if (ret > 0) {
- if (apply)
- apply_bytes -= ret;
-
- sg->offset += ret;
- sg->length -= ret;
- size -= ret;
- offset += ret;
- if (uncharge)
- sk_mem_uncharge(sk, ret);
- goto retry;
- }
-
- return ret;
- }
-
- if (apply)
- apply_bytes -= ret;
- sg->offset += ret;
- sg->length -= ret;
- if (uncharge)
- sk_mem_uncharge(sk, ret);
-
- if (!sg->length) {
- put_page(p);
- md->sg_start++;
- if (md->sg_start == MAX_SKB_FRAGS)
- md->sg_start = 0;
- sg_init_table(sg, 1);
-
- if (md->sg_start == md->sg_end)
- break;
- }
-
- if (apply && !apply_bytes)
- break;
- }
- return 0;
-}
-
-static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
-{
- struct scatterlist *sg = md->sg_data + md->sg_start;
-
- if (md->sg_copy[md->sg_start]) {
- md->data = md->data_end = 0;
- } else {
- md->data = sg_virt(sg);
- md->data_end = md->data + sg->length;
- }
-}
-
-static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
-{
- struct scatterlist *sg = md->sg_data;
- int i = md->sg_start;
-
- do {
- int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;
-
- sk_mem_uncharge(sk, uncharge);
- bytes -= uncharge;
- if (!bytes)
- break;
- i++;
- if (i == MAX_SKB_FRAGS)
- i = 0;
- } while (i != md->sg_end);
-}
-
-static void free_bytes_sg(struct sock *sk, int bytes,
- struct sk_msg_buff *md, bool charge)
-{
- struct scatterlist *sg = md->sg_data;
- int i = md->sg_start, free;
-
- while (bytes && sg[i].length) {
- free = sg[i].length;
- if (bytes < free) {
- sg[i].length -= bytes;
- sg[i].offset += bytes;
- if (charge)
- sk_mem_uncharge(sk, bytes);
- break;
- }
-
- if (charge)
- sk_mem_uncharge(sk, sg[i].length);
- put_page(sg_page(&sg[i]));
- bytes -= sg[i].length;
- sg[i].length = 0;
- sg[i].page_link = 0;
- sg[i].offset = 0;
- i++;
-
- if (i == MAX_SKB_FRAGS)
- i = 0;
- }
- md->sg_start = i;
-}
-
-static int free_sg(struct sock *sk, int start,
- struct sk_msg_buff *md, bool charge)
-{
- struct scatterlist *sg = md->sg_data;
- int i = start, free = 0;
-
- while (sg[i].length) {
- free += sg[i].length;
- if (charge)
- sk_mem_uncharge(sk, sg[i].length);
- if (!md->skb)
- put_page(sg_page(&sg[i]));
- sg[i].length = 0;
- sg[i].page_link = 0;
- sg[i].offset = 0;
- i++;
-
- if (i == MAX_SKB_FRAGS)
- i = 0;
- }
- consume_skb(md->skb);
-
- return free;
-}
-
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge)
-{
- int free = free_sg(sk, md->sg_start, md, charge);
-
- md->sg_start = md->sg_end;
- return free;
-}
-
-static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
-{
- return free_sg(sk, md->sg_curr, md, true);
-}
-
-static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
-{
- return ((_rc == SK_PASS) ?
- (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
- __SK_DROP);
-}
-
-static unsigned int smap_do_tx_msg(struct sock *sk,
- struct smap_psock *psock,
- struct sk_msg_buff *md)
-{
- struct bpf_prog *prog;
- unsigned int rc, _rc;
-
- preempt_disable();
- rcu_read_lock();
-
- /* If the policy was removed mid-send then default to 'accept' */
- prog = READ_ONCE(psock->bpf_tx_msg);
- if (unlikely(!prog)) {
- _rc = SK_PASS;
- goto verdict;
- }
-
- bpf_compute_data_pointers_sg(md);
- md->sk = sk;
- rc = (*prog->bpf_func)(md, prog->insnsi);
- psock->apply_bytes = md->apply_bytes;
-
- /* Moving return codes from UAPI namespace into internal namespace */
- _rc = bpf_map_msg_verdict(rc, md);
-
- /* The psock has a refcount on the sock but not on the map and because
- * we need to drop rcu read lock here its possible the map could be
- * removed between here and when we need it to execute the sock
- * redirect. So do the map lookup now for future use.
- */
- if (_rc == __SK_REDIRECT) {
- if (psock->sk_redir)
- sock_put(psock->sk_redir);
- psock->sk_redir = do_msg_redirect_map(md);
- if (!psock->sk_redir) {
- _rc = __SK_DROP;
- goto verdict;
- }
- sock_hold(psock->sk_redir);
- }
-verdict:
- rcu_read_unlock();
- preempt_enable();
-
- return _rc;
-}
-
-static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
- struct smap_psock *psock,
- struct sk_msg_buff *md, int flags)
-{
- bool apply = apply_bytes;
- size_t size, copied = 0;
- struct sk_msg_buff *r;
- int err = 0, i;
-
- r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL);
- if (unlikely(!r))
- return -ENOMEM;
-
- lock_sock(sk);
- r->sg_start = md->sg_start;
- i = md->sg_start;
-
- do {
- size = (apply && apply_bytes < md->sg_data[i].length) ?
- apply_bytes : md->sg_data[i].length;
-
- if (!sk_wmem_schedule(sk, size)) {
- if (!copied)
- err = -ENOMEM;
- break;
- }
-
- sk_mem_charge(sk, size);
- r->sg_data[i] = md->sg_data[i];
- r->sg_data[i].length = size;
- md->sg_data[i].length -= size;
- md->sg_data[i].offset += size;
- copied += size;
-
- if (md->sg_data[i].length) {
- get_page(sg_page(&r->sg_data[i]));
- r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1;
- } else {
- i++;
- if (i == MAX_SKB_FRAGS)
- i = 0;
- r->sg_end = i;
- }
-
- if (apply) {
- apply_bytes -= size;
- if (!apply_bytes)
- break;
- }
- } while (i != md->sg_end);
-
- md->sg_start = i;
-
- if (!err) {
- list_add_tail(&r->list, &psock->ingress);
- sk->sk_data_ready(sk);
- } else {
- free_start_sg(sk, r, true);
- kfree(r);
- }
-
- release_sock(sk);
- return err;
-}
-
-static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
- struct sk_msg_buff *md,
- int flags)
-{
- bool ingress = !!(md->flags & BPF_F_INGRESS);
- struct smap_psock *psock;
- int err = 0;
-
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock))
- goto out_rcu;
-
- if (!refcount_inc_not_zero(&psock->refcnt))
- goto out_rcu;
-
- rcu_read_unlock();
-
- if (ingress) {
- err = bpf_tcp_ingress(sk, send, psock, md, flags);
- } else {
- lock_sock(sk);
- err = bpf_tcp_push(sk, send, md, flags, false);
- release_sock(sk);
- }
- smap_release_sock(psock, sk);
- return err;
-out_rcu:
- rcu_read_unlock();
- return 0;
-}
-
-static inline void bpf_md_init(struct smap_psock *psock)
-{
- if (!psock->apply_bytes) {
- psock->eval = __SK_NONE;
- if (psock->sk_redir) {
- sock_put(psock->sk_redir);
- psock->sk_redir = NULL;
- }
- }
-}
-
-static void apply_bytes_dec(struct smap_psock *psock, int i)
-{
- if (psock->apply_bytes) {
- if (psock->apply_bytes < i)
- psock->apply_bytes = 0;
- else
- psock->apply_bytes -= i;
- }
-}
-
-static int bpf_exec_tx_verdict(struct smap_psock *psock,
- struct sk_msg_buff *m,
- struct sock *sk,
- int *copied, int flags)
-{
- bool cork = false, enospc = (m->sg_start == m->sg_end);
- struct sock *redir;
- int err = 0;
- int send;
-
-more_data:
- if (psock->eval == __SK_NONE)
- psock->eval = smap_do_tx_msg(sk, psock, m);
-
- if (m->cork_bytes &&
- m->cork_bytes > psock->sg_size && !enospc) {
- psock->cork_bytes = m->cork_bytes - psock->sg_size;
- if (!psock->cork) {
- psock->cork = kcalloc(1,
- sizeof(struct sk_msg_buff),
- GFP_ATOMIC | __GFP_NOWARN);
-
- if (!psock->cork) {
- err = -ENOMEM;
- goto out_err;
- }
- }
- memcpy(psock->cork, m, sizeof(*m));
- goto out_err;
- }
-
- send = psock->sg_size;
- if (psock->apply_bytes && psock->apply_bytes < send)
- send = psock->apply_bytes;
-
- switch (psock->eval) {
- case __SK_PASS:
- err = bpf_tcp_push(sk, send, m, flags, true);
- if (unlikely(err)) {
- *copied -= free_start_sg(sk, m, true);
- break;
- }
-
- apply_bytes_dec(psock, send);
- psock->sg_size -= send;
- break;
- case __SK_REDIRECT:
- redir = psock->sk_redir;
- apply_bytes_dec(psock, send);
-
- if (psock->cork) {
- cork = true;
- psock->cork = NULL;
- }
-
- return_mem_sg(sk, send, m);
- release_sock(sk);
-
- err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
- lock_sock(sk);
-
- if (unlikely(err < 0)) {
- int free = free_start_sg(sk, m, false);
-
- psock->sg_size = 0;
- if (!cork)
- *copied -= free;
- } else {
- psock->sg_size -= send;
- }
-
- if (cork) {
- free_start_sg(sk, m, true);
- psock->sg_size = 0;
- kfree(m);
- m = NULL;
- err = 0;
- }
- break;
- case __SK_DROP:
- default:
- free_bytes_sg(sk, send, m, true);
- apply_bytes_dec(psock, send);
- *copied -= send;
- psock->sg_size -= send;
- err = -EACCES;
- break;
- }
-
- if (likely(!err)) {
- bpf_md_init(psock);
- if (m &&
- m->sg_data[m->sg_start].page_link &&
- m->sg_data[m->sg_start].length)
- goto more_data;
- }
-
-out_err:
- return err;
-}
-
-static int bpf_wait_data(struct sock *sk,
- struct smap_psock *psk, int flags,
- long timeo, int *err)
-{
- int rc;
-
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- rc = sk_wait_event(sk, &timeo,
- !list_empty(&psk->ingress) ||
- !skb_queue_empty(&sk->sk_receive_queue),
- &wait);
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
-
- return rc;
-}
-
-static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags, int *addr_len)
-{
- struct iov_iter *iter = &msg->msg_iter;
- struct smap_psock *psock;
- int copied = 0;
-
- if (unlikely(flags & MSG_ERRQUEUE))
- return inet_recv_error(sk, msg, len, addr_len);
- if (!skb_queue_empty(&sk->sk_receive_queue))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock))
- goto out;
-
- if (unlikely(!refcount_inc_not_zero(&psock->refcnt)))
- goto out;
- rcu_read_unlock();
-
- lock_sock(sk);
-bytes_ready:
- while (copied != len) {
- struct scatterlist *sg;
- struct sk_msg_buff *md;
- int i;
-
- md = list_first_entry_or_null(&psock->ingress,
- struct sk_msg_buff, list);
- if (unlikely(!md))
- break;
- i = md->sg_start;
- do {
- struct page *page;
- int n, copy;
-
- sg = &md->sg_data[i];
- copy = sg->length;
- page = sg_page(sg);
-
- if (copied + copy > len)
- copy = len - copied;
-
- n = copy_page_to_iter(page, sg->offset, copy, iter);
- if (n != copy) {
- md->sg_start = i;
- release_sock(sk);
- smap_release_sock(psock, sk);
- return -EFAULT;
- }
-
- copied += copy;
- sg->offset += copy;
- sg->length -= copy;
- sk_mem_uncharge(sk, copy);
-
- if (!sg->length) {
- i++;
- if (i == MAX_SKB_FRAGS)
- i = 0;
- if (!md->skb)
- put_page(page);
- }
- if (copied == len)
- break;
- } while (i != md->sg_end);
- md->sg_start = i;
-
- if (!sg->length && md->sg_start == md->sg_end) {
- list_del(&md->list);
- consume_skb(md->skb);
- kfree(md);
- }
- }
-
- if (!copied) {
- long timeo;
- int data;
- int err = 0;
-
- timeo = sock_rcvtimeo(sk, nonblock);
- data = bpf_wait_data(sk, psock, flags, timeo, &err);
-
- if (data) {
- if (!skb_queue_empty(&sk->sk_receive_queue)) {
- release_sock(sk);
- smap_release_sock(psock, sk);
- copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
- return copied;
- }
- goto bytes_ready;
- }
-
- if (err)
- copied = err;
- }
-
- release_sock(sk);
- smap_release_sock(psock, sk);
- return copied;
-out:
- rcu_read_unlock();
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-}
-
-
-static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
-{
- int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
- struct sk_msg_buff md = {0};
- unsigned int sg_copy = 0;
- struct smap_psock *psock;
- int copied = 0, err = 0;
- struct scatterlist *sg;
- long timeo;
-
- /* Its possible a sock event or user removed the psock _but_ the ops
- * have not been reprogrammed yet so we get here. In this case fallback
- * to tcp_sendmsg. Note this only works because we _only_ ever allow
- * a single ULP there is no hierarchy here.
- */
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- return tcp_sendmsg(sk, msg, size);
- }
-
- /* Increment the psock refcnt to ensure its not released while sending a
- * message. Required because sk lookup and bpf programs are used in
- * separate rcu critical sections. Its OK if we lose the map entry
- * but we can't lose the sock reference.
- */
- if (!refcount_inc_not_zero(&psock->refcnt)) {
- rcu_read_unlock();
- return tcp_sendmsg(sk, msg, size);
- }
-
- sg = md.sg_data;
- sg_init_marker(sg, MAX_SKB_FRAGS);
- rcu_read_unlock();
-
- lock_sock(sk);
- timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
-
- while (msg_data_left(msg)) {
- struct sk_msg_buff *m = NULL;
- bool enospc = false;
- int copy;
-
- if (sk->sk_err) {
- err = -sk->sk_err;
- goto out_err;
- }
-
- copy = msg_data_left(msg);
- if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
-
- m = psock->cork_bytes ? psock->cork : &md;
- m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end;
- err = sk_alloc_sg(sk, copy, m->sg_data,
- m->sg_start, &m->sg_end, &sg_copy,
- m->sg_end - 1);
- if (err) {
- if (err != -ENOSPC)
- goto wait_for_memory;
- enospc = true;
- copy = sg_copy;
- }
-
- err = memcopy_from_iter(sk, m, &msg->msg_iter, copy);
- if (err < 0) {
- free_curr_sg(sk, m);
- goto out_err;
- }
-
- psock->sg_size += copy;
- copied += copy;
- sg_copy = 0;
-
- /* When bytes are being corked skip running BPF program and
- * applying verdict unless there is no more buffer space. In
- * the ENOSPC case simply run BPF prorgram with currently
- * accumulated data. We don't have much choice at this point
- * we could try extending the page frags or chaining complex
- * frags but even in these cases _eventually_ we will hit an
- * OOM scenario. More complex recovery schemes may be
- * implemented in the future, but BPF programs must handle
- * the case where apply_cork requests are not honored. The
- * canonical method to verify this is to check data length.
- */
- if (psock->cork_bytes) {
- if (copy > psock->cork_bytes)
- psock->cork_bytes = 0;
- else
- psock->cork_bytes -= copy;
-
- if (psock->cork_bytes && !enospc)
- goto out_cork;
-
- /* All cork bytes accounted for re-run filter */
- psock->eval = __SK_NONE;
- psock->cork_bytes = 0;
- }
-
- err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
- if (unlikely(err < 0))
- goto out_err;
- continue;
-wait_for_sndbuf:
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
- err = sk_stream_wait_memory(sk, &timeo);
- if (err) {
- if (m && m != psock->cork)
- free_start_sg(sk, m, true);
- goto out_err;
- }
- }
-out_err:
- if (err < 0)
- err = sk_stream_error(sk, msg->msg_flags, err);
-out_cork:
- release_sock(sk);
- smap_release_sock(psock, sk);
- return copied ? copied : err;
-}
-
-static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
- int offset, size_t size, int flags)
-{
- struct sk_msg_buff md = {0}, *m = NULL;
- int err = 0, copied = 0;
- struct smap_psock *psock;
- struct scatterlist *sg;
- bool enospc = false;
-
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock))
- goto accept;
-
- if (!refcount_inc_not_zero(&psock->refcnt))
- goto accept;
- rcu_read_unlock();
-
- lock_sock(sk);
-
- if (psock->cork_bytes) {
- m = psock->cork;
- sg = &m->sg_data[m->sg_end];
- } else {
- m = &md;
- sg = m->sg_data;
- sg_init_marker(sg, MAX_SKB_FRAGS);
- }
-
- /* Catch case where ring is full and sendpage is stalled. */
- if (unlikely(m->sg_end == m->sg_start &&
- m->sg_data[m->sg_end].length))
- goto out_err;
-
- psock->sg_size += size;
- sg_set_page(sg, page, size, offset);
- get_page(page);
- m->sg_copy[m->sg_end] = true;
- sk_mem_charge(sk, size);
- m->sg_end++;
- copied = size;
-
- if (m->sg_end == MAX_SKB_FRAGS)
- m->sg_end = 0;
-
- if (m->sg_end == m->sg_start)
- enospc = true;
-
- if (psock->cork_bytes) {
- if (size > psock->cork_bytes)
- psock->cork_bytes = 0;
- else
- psock->cork_bytes -= size;
-
- if (psock->cork_bytes && !enospc)
- goto out_err;
-
- /* All cork bytes accounted for re-run filter */
- psock->eval = __SK_NONE;
- psock->cork_bytes = 0;
- }
-
- err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
-out_err:
- release_sock(sk);
- smap_release_sock(psock, sk);
- return copied ? copied : err;
-accept:
- rcu_read_unlock();
- return tcp_sendpage(sk, page, offset, size, flags);
-}
-
-static void bpf_tcp_msg_add(struct smap_psock *psock,
- struct sock *sk,
- struct bpf_prog *tx_msg)
-{
- struct bpf_prog *orig_tx_msg;
-
- orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
- if (orig_tx_msg)
- bpf_prog_put(orig_tx_msg);
-}
-
-static int bpf_tcp_ulp_register(void)
-{
- build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
- /* Once BPF TX ULP is registered it is never unregistered. It
- * will be in the ULP list for the lifetime of the system. Doing
- * duplicate registers is not a problem.
- */
- return tcp_register_ulp(&bpf_tcp_ulp_ops);
-}
-
-static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
-{
- struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
- int rc;
-
- if (unlikely(!prog))
- return __SK_DROP;
-
- skb_orphan(skb);
- /* We need to ensure that BPF metadata for maps is also cleared
- * when we orphan the skb so that we don't have the possibility
- * to reference a stale map.
- */
- TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
- skb->sk = psock->sock;
- bpf_compute_data_end_sk_skb(skb);
- preempt_disable();
- rc = (*prog->bpf_func)(skb, prog->insnsi);
- preempt_enable();
- skb->sk = NULL;
-
- /* Moving return codes from UAPI namespace into internal namespace */
- return rc == SK_PASS ?
- (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
- __SK_DROP;
-}
-
-static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
-{
- struct sock *sk = psock->sock;
- int copied = 0, num_sg;
- struct sk_msg_buff *r;
-
- r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
- if (unlikely(!r))
- return -EAGAIN;
-
- if (!sk_rmem_schedule(sk, skb, skb->len)) {
- kfree(r);
- return -EAGAIN;
- }
-
- sg_init_table(r->sg_data, MAX_SKB_FRAGS);
- num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
- if (unlikely(num_sg < 0)) {
- kfree(r);
- return num_sg;
- }
- sk_mem_charge(sk, skb->len);
- copied = skb->len;
- r->sg_start = 0;
- r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
- r->skb = skb;
- list_add_tail(&r->list, &psock->ingress);
- sk->sk_data_ready(sk);
- return copied;
-}
-
-static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
-{
- struct smap_psock *peer;
- struct sock *sk;
- __u32 in;
- int rc;
-
- rc = smap_verdict_func(psock, skb);
- switch (rc) {
- case __SK_REDIRECT:
- sk = do_sk_redirect_map(skb);
- if (!sk) {
- kfree_skb(skb);
- break;
- }
-
- peer = smap_psock_sk(sk);
- in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
-
- if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) ||
- !test_bit(SMAP_TX_RUNNING, &peer->state))) {
- kfree_skb(skb);
- break;
- }
-
- if (!in && sock_writeable(sk)) {
- skb_set_owner_w(skb, sk);
- skb_queue_tail(&peer->rxqueue, skb);
- schedule_work(&peer->tx_work);
- break;
- } else if (in &&
- atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
- skb_queue_tail(&peer->rxqueue, skb);
- schedule_work(&peer->tx_work);
- break;
- }
- /* Fall through and free skb otherwise */
- case __SK_DROP:
- default:
- kfree_skb(skb);
- }
-}
-
-static void smap_report_sk_error(struct smap_psock *psock, int err)
-{
- struct sock *sk = psock->sock;
-
- sk->sk_err = err;
- sk->sk_error_report(sk);
-}
-
-static void smap_read_sock_strparser(struct strparser *strp,
- struct sk_buff *skb)
-{
- struct smap_psock *psock;
-
- rcu_read_lock();
- psock = container_of(strp, struct smap_psock, strp);
- smap_do_verdict(psock, skb);
- rcu_read_unlock();
-}
-
-/* Called with lock held on socket */
-static void smap_data_ready(struct sock *sk)
-{
- struct smap_psock *psock;
-
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (likely(psock)) {
- write_lock_bh(&sk->sk_callback_lock);
- strp_data_ready(&psock->strp);
- write_unlock_bh(&sk->sk_callback_lock);
- }
- rcu_read_unlock();
-}
-
-static void smap_tx_work(struct work_struct *w)
-{
- struct smap_psock *psock;
- struct sk_buff *skb;
- int rem, off, n;
-
- psock = container_of(w, struct smap_psock, tx_work);
-
- /* lock sock to avoid losing sk_socket at some point during loop */
- lock_sock(psock->sock);
- if (psock->save_skb) {
- skb = psock->save_skb;
- rem = psock->save_rem;
- off = psock->save_off;
- psock->save_skb = NULL;
- goto start;
- }
-
- while ((skb = skb_dequeue(&psock->rxqueue))) {
- __u32 flags;
-
- rem = skb->len;
- off = 0;
-start:
- flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
- do {
- if (likely(psock->sock->sk_socket)) {
- if (flags)
- n = smap_do_ingress(psock, skb);
- else
- n = skb_send_sock_locked(psock->sock,
- skb, off, rem);
- } else {
- n = -EINVAL;
- }
-
- if (n <= 0) {
- if (n == -EAGAIN) {
- /* Retry when space is available */
- psock->save_skb = skb;
- psock->save_rem = rem;
- psock->save_off = off;
- goto out;
- }
- /* Hard errors break pipe and stop xmit */
- smap_report_sk_error(psock, n ? -n : EPIPE);
- clear_bit(SMAP_TX_RUNNING, &psock->state);
- kfree_skb(skb);
- goto out;
- }
- rem -= n;
- off += n;
- } while (rem);
-
- if (!flags)
- kfree_skb(skb);
- }
-out:
- release_sock(psock->sock);
-}
-
-static void smap_write_space(struct sock *sk)
-{
- struct smap_psock *psock;
- void (*write_space)(struct sock *sk);
-
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
- schedule_work(&psock->tx_work);
- write_space = psock->save_write_space;
- rcu_read_unlock();
- write_space(sk);
-}
-
-static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
-{
- if (!psock->strp_enabled)
- return;
- sk->sk_data_ready = psock->save_data_ready;
- sk->sk_write_space = psock->save_write_space;
- psock->save_data_ready = NULL;
- psock->save_write_space = NULL;
- strp_stop(&psock->strp);
- psock->strp_enabled = false;
-}
-
-static void smap_destroy_psock(struct rcu_head *rcu)
-{
- struct smap_psock *psock = container_of(rcu,
- struct smap_psock, rcu);
-
- /* Now that a grace period has passed there is no longer
- * any reference to this sock in the sockmap so we can
- * destroy the psock, strparser, and bpf programs. But,
- * because we use workqueue sync operations we can not
- * do it in rcu context
- */
- schedule_work(&psock->gc_work);
-}
-
-static bool psock_is_smap_sk(struct sock *sk)
-{
- return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops;
-}
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
-{
- if (refcount_dec_and_test(&psock->refcnt)) {
- if (psock_is_smap_sk(sock))
- tcp_cleanup_ulp(sock);
- write_lock_bh(&sock->sk_callback_lock);
- smap_stop_sock(psock, sock);
- write_unlock_bh(&sock->sk_callback_lock);
- clear_bit(SMAP_TX_RUNNING, &psock->state);
- rcu_assign_sk_user_data(sock, NULL);
- call_rcu_sched(&psock->rcu, smap_destroy_psock);
- }
-}
-
-static int smap_parse_func_strparser(struct strparser *strp,
- struct sk_buff *skb)
-{
- struct smap_psock *psock;
- struct bpf_prog *prog;
- int rc;
-
- rcu_read_lock();
- psock = container_of(strp, struct smap_psock, strp);
- prog = READ_ONCE(psock->bpf_parse);
-
- if (unlikely(!prog)) {
- rcu_read_unlock();
- return skb->len;
- }
-
- /* Attach socket for bpf program to use if needed we can do this
- * because strparser clones the skb before handing it to a upper
- * layer, meaning skb_orphan has been called. We NULL sk on the
- * way out to ensure we don't trigger a BUG_ON in skb/sk operations
- * later and because we are not charging the memory of this skb to
- * any socket yet.
- */
- skb->sk = psock->sock;
- bpf_compute_data_end_sk_skb(skb);
- rc = (*prog->bpf_func)(skb, prog->insnsi);
- skb->sk = NULL;
- rcu_read_unlock();
- return rc;
-}
-
-static int smap_read_sock_done(struct strparser *strp, int err)
-{
- return err;
-}
-
-static int smap_init_sock(struct smap_psock *psock,
- struct sock *sk)
-{
- static const struct strp_callbacks cb = {
- .rcv_msg = smap_read_sock_strparser,
- .parse_msg = smap_parse_func_strparser,
- .read_sock_done = smap_read_sock_done,
- };
-
- return strp_init(&psock->strp, sk, &cb);
-}
-
-static void smap_init_progs(struct smap_psock *psock,
- struct bpf_prog *verdict,
- struct bpf_prog *parse)
-{
- struct bpf_prog *orig_parse, *orig_verdict;
-
- orig_parse = xchg(&psock->bpf_parse, parse);
- orig_verdict = xchg(&psock->bpf_verdict, verdict);
-
- if (orig_verdict)
- bpf_prog_put(orig_verdict);
- if (orig_parse)
- bpf_prog_put(orig_parse);
-}
-
-static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
-{
- if (sk->sk_data_ready == smap_data_ready)
- return;
- psock->save_data_ready = sk->sk_data_ready;
- psock->save_write_space = sk->sk_write_space;
- sk->sk_data_ready = smap_data_ready;
- sk->sk_write_space = smap_write_space;
- psock->strp_enabled = true;
-}
-
-static void sock_map_remove_complete(struct bpf_stab *stab)
-{
- bpf_map_area_free(stab->sock_map);
- kfree(stab);
-}
-
-static void smap_gc_work(struct work_struct *w)
-{
- struct smap_psock_map_entry *e, *tmp;
- struct sk_msg_buff *md, *mtmp;
- struct smap_psock *psock;
-
- psock = container_of(w, struct smap_psock, gc_work);
-
- /* no callback lock needed because we already detached sockmap ops */
- if (psock->strp_enabled)
- strp_done(&psock->strp);
-
- cancel_work_sync(&psock->tx_work);
- __skb_queue_purge(&psock->rxqueue);
-
- /* At this point all strparser and xmit work must be complete */
- if (psock->bpf_parse)
- bpf_prog_put(psock->bpf_parse);
- if (psock->bpf_verdict)
- bpf_prog_put(psock->bpf_verdict);
- if (psock->bpf_tx_msg)
- bpf_prog_put(psock->bpf_tx_msg);
-
- if (psock->cork) {
- free_start_sg(psock->sock, psock->cork, true);
- kfree(psock->cork);
- }
-
- list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
- list_del(&md->list);
- free_start_sg(psock->sock, md, true);
- kfree(md);
- }
-
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- list_del(&e->list);
- kfree(e);
- }
-
- if (psock->sk_redir)
- sock_put(psock->sk_redir);
-
- sock_put(psock->sock);
- kfree(psock);
-}
-
-static struct smap_psock *smap_init_psock(struct sock *sock, int node)
-{
- struct smap_psock *psock;
-
- psock = kzalloc_node(sizeof(struct smap_psock),
- GFP_ATOMIC | __GFP_NOWARN,
- node);
- if (!psock)
- return ERR_PTR(-ENOMEM);
-
- psock->eval = __SK_NONE;
- psock->sock = sock;
- skb_queue_head_init(&psock->rxqueue);
- INIT_WORK(&psock->tx_work, smap_tx_work);
- INIT_WORK(&psock->gc_work, smap_gc_work);
- INIT_LIST_HEAD(&psock->maps);
- INIT_LIST_HEAD(&psock->ingress);
- refcount_set(&psock->refcnt, 1);
- spin_lock_init(&psock->maps_lock);
-
- rcu_assign_sk_user_data(sock, psock);
- sock_hold(sock);
- return psock;
-}
-
-static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
-{
- struct bpf_stab *stab;
- u64 cost;
- int err;
-
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
- /* check sanity of attributes */
- if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
- return ERR_PTR(-EINVAL);
-
- err = bpf_tcp_ulp_register();
- if (err && err != -EEXIST)
- return ERR_PTR(err);
-
- stab = kzalloc(sizeof(*stab), GFP_USER);
- if (!stab)
- return ERR_PTR(-ENOMEM);
-
- bpf_map_init_from_attr(&stab->map, attr);
- raw_spin_lock_init(&stab->lock);
-
- /* make sure page count doesn't overflow */
- cost = (u64) stab->map.max_entries * sizeof(struct sock *);
- err = -EINVAL;
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_stab;
-
- stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* if map size is larger than memlock limit, reject it early */
- err = bpf_map_precharge_memlock(stab->map.pages);
- if (err)
- goto free_stab;
-
- err = -ENOMEM;
- stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
- sizeof(struct sock *),
- stab->map.numa_node);
- if (!stab->sock_map)
- goto free_stab;
-
- return &stab->map;
-free_stab:
- kfree(stab);
- return ERR_PTR(err);
-}
-
-static void smap_list_map_remove(struct smap_psock *psock,
- struct sock **entry)
-{
- struct smap_psock_map_entry *e, *tmp;
-
- spin_lock_bh(&psock->maps_lock);
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- if (e->entry == entry) {
- list_del(&e->list);
- kfree(e);
- }
- }
- spin_unlock_bh(&psock->maps_lock);
-}
-
-static void smap_list_hash_remove(struct smap_psock *psock,
- struct htab_elem *hash_link)
-{
- struct smap_psock_map_entry *e, *tmp;
-
- spin_lock_bh(&psock->maps_lock);
- list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- struct htab_elem *c = rcu_dereference(e->hash_link);
-
- if (c == hash_link) {
- list_del(&e->list);
- kfree(e);
- }
- }
- spin_unlock_bh(&psock->maps_lock);
-}
-
-static void sock_map_free(struct bpf_map *map)
-{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- int i;
-
- synchronize_rcu();
-
- /* At this point no update, lookup or delete operations can happen.
- * However, be aware we can still get a socket state event updates,
- * and data ready callabacks that reference the psock from sk_user_data
- * Also psock worker threads are still in-flight. So smap_release_sock
- * will only free the psock after cancel_sync on the worker threads
- * and a grace period expire to ensure psock is really safe to remove.
- */
- rcu_read_lock();
- raw_spin_lock_bh(&stab->lock);
- for (i = 0; i < stab->map.max_entries; i++) {
- struct smap_psock *psock;
- struct sock *sock;
-
- sock = stab->sock_map[i];
- if (!sock)
- continue;
- stab->sock_map[i] = NULL;
- psock = smap_psock_sk(sock);
- /* This check handles a racing sock event that can get the
- * sk_callback_lock before this case but after xchg happens
- * causing the refcnt to hit zero and sock user data (psock)
- * to be null and queued for garbage collection.
- */
- if (likely(psock)) {
- smap_list_map_remove(psock, &stab->sock_map[i]);
- smap_release_sock(psock, sock);
- }
- }
- raw_spin_unlock_bh(&stab->lock);
- rcu_read_unlock();
-
- sock_map_remove_complete(stab);
-}
-
-static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
-{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- u32 i = key ? *(u32 *)key : U32_MAX;
- u32 *next = (u32 *)next_key;
-
- if (i >= stab->map.max_entries) {
- *next = 0;
- return 0;
- }
-
- if (i == stab->map.max_entries - 1)
- return -ENOENT;
-
- *next = i + 1;
- return 0;
-}
-
-struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
-{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
- if (key >= map->max_entries)
- return NULL;
-
- return READ_ONCE(stab->sock_map[key]);
-}
-
-static int sock_map_delete_elem(struct bpf_map *map, void *key)
-{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- struct smap_psock *psock;
- int k = *(u32 *)key;
- struct sock *sock;
-
- if (k >= map->max_entries)
- return -EINVAL;
-
- raw_spin_lock_bh(&stab->lock);
- sock = stab->sock_map[k];
- stab->sock_map[k] = NULL;
- raw_spin_unlock_bh(&stab->lock);
- if (!sock)
- return -EINVAL;
-
- psock = smap_psock_sk(sock);
- if (!psock)
- return 0;
- if (psock->bpf_parse) {
- write_lock_bh(&sock->sk_callback_lock);
- smap_stop_sock(psock, sock);
- write_unlock_bh(&sock->sk_callback_lock);
- }
- smap_list_map_remove(psock, &stab->sock_map[k]);
- smap_release_sock(psock, sock);
- return 0;
-}
-
-/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
- * done inside rcu critical sections. This ensures on updates that the psock
- * will not be released via smap_release_sock() until concurrent updates/deletes
- * complete. All operations operate on sock_map using cmpxchg and xchg
- * operations to ensure we do not get stale references. Any reads into the
- * map must be done with READ_ONCE() because of this.
- *
- * A psock is destroyed via call_rcu and after any worker threads are cancelled
- * and syncd so we are certain all references from the update/lookup/delete
- * operations as well as references in the data path are no longer in use.
- *
- * Psocks may exist in multiple maps, but only a single set of parse/verdict
- * programs may be inherited from the maps it belongs to. A reference count
- * is kept with the total number of references to the psock from all maps. The
- * psock will not be released until this reaches zero. The psock and sock
- * user data data use the sk_callback_lock to protect critical data structures
- * from concurrent access. This allows us to avoid two updates from modifying
- * the user data in sock and the lock is required anyways for modifying
- * callbacks, we simply increase its scope slightly.
- *
- * Rules to follow,
- * - psock must always be read inside RCU critical section
- * - sk_user_data must only be modified inside sk_callback_lock and read
- * inside RCU critical section.
- * - psock->maps list must only be read & modified inside sk_callback_lock
- * - sock_map must use READ_ONCE and (cmp)xchg operations
- * - BPF verdict/parse programs must use READ_ONCE and xchg operations
- */
-
-static int __sock_map_ctx_update_elem(struct bpf_map *map,
- struct bpf_sock_progs *progs,
- struct sock *sock,
- void *key)
-{
- struct bpf_prog *verdict, *parse, *tx_msg;
- struct smap_psock *psock;
- bool new = false;
- int err = 0;
-
- /* 1. If sock map has BPF programs those will be inherited by the
- * sock being added. If the sock is already attached to BPF programs
- * this results in an error.
- */
- verdict = READ_ONCE(progs->bpf_verdict);
- parse = READ_ONCE(progs->bpf_parse);
- tx_msg = READ_ONCE(progs->bpf_tx_msg);
-
- if (parse && verdict) {
- /* bpf prog refcnt may be zero if a concurrent attach operation
- * removes the program after the above READ_ONCE() but before
- * we increment the refcnt. If this is the case abort with an
- * error.
- */
- verdict = bpf_prog_inc_not_zero(verdict);
- if (IS_ERR(verdict))
- return PTR_ERR(verdict);
-
- parse = bpf_prog_inc_not_zero(parse);
- if (IS_ERR(parse)) {
- bpf_prog_put(verdict);
- return PTR_ERR(parse);
- }
- }
-
- if (tx_msg) {
- tx_msg = bpf_prog_inc_not_zero(tx_msg);
- if (IS_ERR(tx_msg)) {
- if (parse && verdict) {
- bpf_prog_put(parse);
- bpf_prog_put(verdict);
- }
- return PTR_ERR(tx_msg);
- }
- }
-
- psock = smap_psock_sk(sock);
-
- /* 2. Do not allow inheriting programs if psock exists and has
- * already inherited programs. This would create confusion on
- * which parser/verdict program is running. If no psock exists
- * create one. Inside sk_callback_lock to ensure concurrent create
- * doesn't update user data.
- */
- if (psock) {
- if (!psock_is_smap_sk(sock)) {
- err = -EBUSY;
- goto out_progs;
- }
- if (READ_ONCE(psock->bpf_parse) && parse) {
- err = -EBUSY;
- goto out_progs;
- }
- if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) {
- err = -EBUSY;
- goto out_progs;
- }
- if (!refcount_inc_not_zero(&psock->refcnt)) {
- err = -EAGAIN;
- goto out_progs;
- }
- } else {
- psock = smap_init_psock(sock, map->numa_node);
- if (IS_ERR(psock)) {
- err = PTR_ERR(psock);
- goto out_progs;
- }
-
- set_bit(SMAP_TX_RUNNING, &psock->state);
- new = true;
- }
-
- /* 3. At this point we have a reference to a valid psock that is
- * running. Attach any BPF programs needed.
- */
- if (tx_msg)
- bpf_tcp_msg_add(psock, sock, tx_msg);
- if (new) {
- err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
- if (err)
- goto out_free;
- }
-
- if (parse && verdict && !psock->strp_enabled) {
- err = smap_init_sock(psock, sock);
- if (err)
- goto out_free;
- smap_init_progs(psock, verdict, parse);
- write_lock_bh(&sock->sk_callback_lock);
- smap_start_sock(psock, sock);
- write_unlock_bh(&sock->sk_callback_lock);
- }
-
- return err;
-out_free:
- smap_release_sock(psock, sock);
-out_progs:
- if (parse && verdict) {
- bpf_prog_put(parse);
- bpf_prog_put(verdict);
- }
- if (tx_msg)
- bpf_prog_put(tx_msg);
- return err;
-}
-
-static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
- struct bpf_map *map,
- void *key, u64 flags)
-{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- struct bpf_sock_progs *progs = &stab->progs;
- struct sock *osock, *sock = skops->sk;
- struct smap_psock_map_entry *e;
- struct smap_psock *psock;
- u32 i = *(u32 *)key;
- int err;
-
- if (unlikely(flags > BPF_EXIST))
- return -EINVAL;
- if (unlikely(i >= stab->map.max_entries))
- return -E2BIG;
-
- e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
- if (!e)
- return -ENOMEM;
-
- err = __sock_map_ctx_update_elem(map, progs, sock, key);
- if (err)
- goto out;
-
- /* psock guaranteed to be present. */
- psock = smap_psock_sk(sock);
- raw_spin_lock_bh(&stab->lock);
- osock = stab->sock_map[i];
- if (osock && flags == BPF_NOEXIST) {
- err = -EEXIST;
- goto out_unlock;
- }
- if (!osock && flags == BPF_EXIST) {
- err = -ENOENT;
- goto out_unlock;
- }
-
- e->entry = &stab->sock_map[i];
- e->map = map;
- spin_lock_bh(&psock->maps_lock);
- list_add_tail(&e->list, &psock->maps);
- spin_unlock_bh(&psock->maps_lock);
-
- stab->sock_map[i] = sock;
- if (osock) {
- psock = smap_psock_sk(osock);
- smap_list_map_remove(psock, &stab->sock_map[i]);
- smap_release_sock(psock, osock);
- }
- raw_spin_unlock_bh(&stab->lock);
- return 0;
-out_unlock:
- smap_release_sock(psock, sock);
- raw_spin_unlock_bh(&stab->lock);
-out:
- kfree(e);
- return err;
-}
-
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
-{
- struct bpf_sock_progs *progs;
- struct bpf_prog *orig;
-
- if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
- progs = &stab->progs;
- } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-
- progs = &htab->progs;
- } else {
- return -EINVAL;
- }
-
- switch (type) {
- case BPF_SK_MSG_VERDICT:
- orig = xchg(&progs->bpf_tx_msg, prog);
- break;
- case BPF_SK_SKB_STREAM_PARSER:
- orig = xchg(&progs->bpf_parse, prog);
- break;
- case BPF_SK_SKB_STREAM_VERDICT:
- orig = xchg(&progs->bpf_verdict, prog);
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- if (orig)
- bpf_prog_put(orig);
-
- return 0;
-}
-
-int sockmap_get_from_fd(const union bpf_attr *attr, int type,
- struct bpf_prog *prog)
-{
- int ufd = attr->target_fd;
- struct bpf_map *map;
- struct fd f;
- int err;
-
- f = fdget(ufd);
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return PTR_ERR(map);
-
- err = sock_map_prog(map, prog, attr->attach_type);
- fdput(f);
- return err;
-}
-
-static void *sock_map_lookup(struct bpf_map *map, void *key)
-{
- return NULL;
-}
-
-static int sock_map_update_elem(struct bpf_map *map,
- void *key, void *value, u64 flags)
-{
- struct bpf_sock_ops_kern skops;
- u32 fd = *(u32 *)value;
- struct socket *socket;
- int err;
-
- socket = sockfd_lookup(fd, &err);
- if (!socket)
- return err;
-
- skops.sk = socket->sk;
- if (!skops.sk) {
- fput(socket->file);
- return -EINVAL;
- }
-
- /* ULPs are currently supported only for TCP sockets in ESTABLISHED
- * state.
- */
- if (skops.sk->sk_type != SOCK_STREAM ||
- skops.sk->sk_protocol != IPPROTO_TCP ||
- skops.sk->sk_state != TCP_ESTABLISHED) {
- fput(socket->file);
- return -EOPNOTSUPP;
- }
-
- lock_sock(skops.sk);
- preempt_disable();
- rcu_read_lock();
- err = sock_map_ctx_update_elem(&skops, map, key, flags);
- rcu_read_unlock();
- preempt_enable();
- release_sock(skops.sk);
- fput(socket->file);
- return err;
-}
-
-static void sock_map_release(struct bpf_map *map)
-{
- struct bpf_sock_progs *progs;
- struct bpf_prog *orig;
-
- if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
- progs = &stab->progs;
- } else {
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-
- progs = &htab->progs;
- }
-
- orig = xchg(&progs->bpf_parse, NULL);
- if (orig)
- bpf_prog_put(orig);
- orig = xchg(&progs->bpf_verdict, NULL);
- if (orig)
- bpf_prog_put(orig);
-
- orig = xchg(&progs->bpf_tx_msg, NULL);
- if (orig)
- bpf_prog_put(orig);
-}
-
-static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
-{
- struct bpf_htab *htab;
- int i, err;
- u64 cost;
-
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
- /* check sanity of attributes */
- if (attr->max_entries == 0 ||
- attr->key_size == 0 ||
- attr->value_size != 4 ||
- attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
- return ERR_PTR(-EINVAL);
-
- if (attr->key_size > MAX_BPF_STACK)
- /* eBPF programs initialize keys on stack, so they cannot be
- * larger than max stack size
- */
- return ERR_PTR(-E2BIG);
-
- err = bpf_tcp_ulp_register();
- if (err && err != -EEXIST)
- return ERR_PTR(err);
-
- htab = kzalloc(sizeof(*htab), GFP_USER);
- if (!htab)
- return ERR_PTR(-ENOMEM);
-
- bpf_map_init_from_attr(&htab->map, attr);
-
- htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
- htab->elem_size = sizeof(struct htab_elem) +
- round_up(htab->map.key_size, 8);
- err = -EINVAL;
- if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct bucket))
- goto free_htab;
-
- cost = (u64) htab->n_buckets * sizeof(struct bucket) +
- (u64) htab->elem_size * htab->map.max_entries;
-
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_htab;
-
- htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- err = bpf_map_precharge_memlock(htab->map.pages);
- if (err)
- goto free_htab;
-
- err = -ENOMEM;
- htab->buckets = bpf_map_area_alloc(
- htab->n_buckets * sizeof(struct bucket),
- htab->map.numa_node);
- if (!htab->buckets)
- goto free_htab;
-
- for (i = 0; i < htab->n_buckets; i++) {
- INIT_HLIST_HEAD(&htab->buckets[i].head);
- raw_spin_lock_init(&htab->buckets[i].lock);
- }
-
- return &htab->map;
-free_htab:
- kfree(htab);
- return ERR_PTR(err);
-}
-
-static void __bpf_htab_free(struct rcu_head *rcu)
-{
- struct bpf_htab *htab;
-
- htab = container_of(rcu, struct bpf_htab, rcu);
- bpf_map_area_free(htab->buckets);
- kfree(htab);
-}
-
-static void sock_hash_free(struct bpf_map *map)
-{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- int i;
-
- synchronize_rcu();
-
- /* At this point no update, lookup or delete operations can happen.
- * However, be aware we can still get a socket state event updates,
- * and data ready callabacks that reference the psock from sk_user_data
- * Also psock worker threads are still in-flight. So smap_release_sock
- * will only free the psock after cancel_sync on the worker threads
- * and a grace period expire to ensure psock is really safe to remove.
- */
- rcu_read_lock();
- for (i = 0; i < htab->n_buckets; i++) {
- struct bucket *b = __select_bucket(htab, i);
- struct hlist_head *head;
- struct hlist_node *n;
- struct htab_elem *l;
-
- raw_spin_lock_bh(&b->lock);
- head = &b->head;
- hlist_for_each_entry_safe(l, n, head, hash_node) {
- struct sock *sock = l->sk;
- struct smap_psock *psock;
-
- hlist_del_rcu(&l->hash_node);
- psock = smap_psock_sk(sock);
- /* This check handles a racing sock event that can get
- * the sk_callback_lock before this case but after xchg
- * causing the refcnt to hit zero and sock user data
- * (psock) to be null and queued for garbage collection.
- */
- if (likely(psock)) {
- smap_list_hash_remove(psock, l);
- smap_release_sock(psock, sock);
- }
- free_htab_elem(htab, l);
- }
- raw_spin_unlock_bh(&b->lock);
- }
- rcu_read_unlock();
- call_rcu(&htab->rcu, __bpf_htab_free);
-}
-
-static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
- void *key, u32 key_size, u32 hash,
- struct sock *sk,
- struct htab_elem *old_elem)
-{
- struct htab_elem *l_new;
-
- if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
- if (!old_elem) {
- atomic_dec(&htab->count);
- return ERR_PTR(-E2BIG);
- }
- }
- l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
- htab->map.numa_node);
- if (!l_new) {
- atomic_dec(&htab->count);
- return ERR_PTR(-ENOMEM);
- }
-
- memcpy(l_new->key, key, key_size);
- l_new->sk = sk;
- l_new->hash = hash;
- return l_new;
-}
-
-static inline u32 htab_map_hash(const void *key, u32 key_len)
-{
- return jhash(key, key_len, 0);
-}
-
-static int sock_hash_get_next_key(struct bpf_map *map,
- void *key, void *next_key)
-{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l, *next_l;
- struct hlist_head *h;
- u32 hash, key_size;
- int i = 0;
-
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- key_size = map->key_size;
- if (!key)
- goto find_first_elem;
- hash = htab_map_hash(key, key_size);
- h = select_bucket(htab, hash);
-
- l = lookup_elem_raw(h, hash, key, key_size);
- if (!l)
- goto find_first_elem;
- next_l = hlist_entry_safe(
- rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
- struct htab_elem, hash_node);
- if (next_l) {
- memcpy(next_key, next_l->key, key_size);
- return 0;
- }
-
- /* no more elements in this hash list, go to the next bucket */
- i = hash & (htab->n_buckets - 1);
- i++;
-
-find_first_elem:
- /* iterate over buckets */
- for (; i < htab->n_buckets; i++) {
- h = select_bucket(htab, i);
-
- /* pick first element in the bucket */
- next_l = hlist_entry_safe(
- rcu_dereference_raw(hlist_first_rcu(h)),
- struct htab_elem, hash_node);
- if (next_l) {
- /* if it's not empty, just return it */
- memcpy(next_key, next_l->key, key_size);
- return 0;
- }
- }
-
- /* iterated over all buckets and all elements */
- return -ENOENT;
-}
-
-static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
- struct bpf_map *map,
- void *key, u64 map_flags)
-{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct bpf_sock_progs *progs = &htab->progs;
- struct htab_elem *l_new = NULL, *l_old;
- struct smap_psock_map_entry *e = NULL;
- struct hlist_head *head;
- struct smap_psock *psock;
- u32 key_size, hash;
- struct sock *sock;
- struct bucket *b;
- int err;
-
- sock = skops->sk;
-
- if (sock->sk_type != SOCK_STREAM ||
- sock->sk_protocol != IPPROTO_TCP)
- return -EOPNOTSUPP;
-
- if (unlikely(map_flags > BPF_EXIST))
- return -EINVAL;
-
- e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
- if (!e)
- return -ENOMEM;
-
- WARN_ON_ONCE(!rcu_read_lock_held());
- key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
- b = __select_bucket(htab, hash);
- head = &b->head;
-
- err = __sock_map_ctx_update_elem(map, progs, sock, key);
- if (err)
- goto err;
-
- /* psock is valid here because otherwise above *ctx_update_elem would
- * have thrown an error. It is safe to skip error check.
- */
- psock = smap_psock_sk(sock);
- raw_spin_lock_bh(&b->lock);
- l_old = lookup_elem_raw(head, hash, key, key_size);
- if (l_old && map_flags == BPF_NOEXIST) {
- err = -EEXIST;
- goto bucket_err;
- }
- if (!l_old && map_flags == BPF_EXIST) {
- err = -ENOENT;
- goto bucket_err;
- }
-
- l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
- if (IS_ERR(l_new)) {
- err = PTR_ERR(l_new);
- goto bucket_err;
- }
-
- rcu_assign_pointer(e->hash_link, l_new);
- e->map = map;
- spin_lock_bh(&psock->maps_lock);
- list_add_tail(&e->list, &psock->maps);
- spin_unlock_bh(&psock->maps_lock);
-
- /* add new element to the head of the list, so that
- * concurrent search will find it before old elem
- */
- hlist_add_head_rcu(&l_new->hash_node, head);
- if (l_old) {
- psock = smap_psock_sk(l_old->sk);
-
- hlist_del_rcu(&l_old->hash_node);
- smap_list_hash_remove(psock, l_old);
- smap_release_sock(psock, l_old->sk);
- free_htab_elem(htab, l_old);
- }
- raw_spin_unlock_bh(&b->lock);
- return 0;
-bucket_err:
- smap_release_sock(psock, sock);
- raw_spin_unlock_bh(&b->lock);
-err:
- kfree(e);
- return err;
-}
-
-static int sock_hash_update_elem(struct bpf_map *map,
- void *key, void *value, u64 flags)
-{
- struct bpf_sock_ops_kern skops;
- u32 fd = *(u32 *)value;
- struct socket *socket;
- int err;
-
- socket = sockfd_lookup(fd, &err);
- if (!socket)
- return err;
-
- skops.sk = socket->sk;
- if (!skops.sk) {
- fput(socket->file);
- return -EINVAL;
- }
-
- /* ULPs are currently supported only for TCP sockets in ESTABLISHED
- * state.
- */
- if (skops.sk->sk_type != SOCK_STREAM ||
- skops.sk->sk_protocol != IPPROTO_TCP ||
- skops.sk->sk_state != TCP_ESTABLISHED) {
- fput(socket->file);
- return -EOPNOTSUPP;
- }
-
- lock_sock(skops.sk);
- preempt_disable();
- rcu_read_lock();
- err = sock_hash_ctx_update_elem(&skops, map, key, flags);
- rcu_read_unlock();
- preempt_enable();
- release_sock(skops.sk);
- fput(socket->file);
- return err;
-}
-
-static int sock_hash_delete_elem(struct bpf_map *map, void *key)
-{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
- struct bucket *b;
- struct htab_elem *l;
- u32 hash, key_size;
- int ret = -ENOENT;
-
- key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
- b = __select_bucket(htab, hash);
- head = &b->head;
-
- raw_spin_lock_bh(&b->lock);
- l = lookup_elem_raw(head, hash, key, key_size);
- if (l) {
- struct sock *sock = l->sk;
- struct smap_psock *psock;
-
- hlist_del_rcu(&l->hash_node);
- psock = smap_psock_sk(sock);
- /* This check handles a racing sock event that can get the
- * sk_callback_lock before this case but after xchg happens
- * causing the refcnt to hit zero and sock user data (psock)
- * to be null and queued for garbage collection.
- */
- if (likely(psock)) {
- smap_list_hash_remove(psock, l);
- smap_release_sock(psock, sock);
- }
- free_htab_elem(htab, l);
- ret = 0;
- }
- raw_spin_unlock_bh(&b->lock);
- return ret;
-}
-
-struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
-{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
- struct htab_elem *l;
- u32 key_size, hash;
- struct bucket *b;
- struct sock *sk;
-
- key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
- b = __select_bucket(htab, hash);
- head = &b->head;
-
- l = lookup_elem_raw(head, hash, key, key_size);
- sk = l ? l->sk : NULL;
- return sk;
-}
-
-const struct bpf_map_ops sock_map_ops = {
- .map_alloc = sock_map_alloc,
- .map_free = sock_map_free,
- .map_lookup_elem = sock_map_lookup,
- .map_get_next_key = sock_map_get_next_key,
- .map_update_elem = sock_map_update_elem,
- .map_delete_elem = sock_map_delete_elem,
- .map_release_uref = sock_map_release,
- .map_check_btf = map_check_no_btf,
-};
-
-const struct bpf_map_ops sock_hash_ops = {
- .map_alloc = sock_hash_alloc,
- .map_free = sock_hash_free,
- .map_lookup_elem = sock_map_lookup,
- .map_get_next_key = sock_hash_get_next_key,
- .map_update_elem = sock_hash_update_elem,
- .map_delete_elem = sock_hash_delete_elem,
- .map_release_uref = sock_map_release,
- .map_check_btf = map_check_no_btf,
-};
-
-static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops)
-{
- return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
- ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
-}
-BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
- struct bpf_map *, map, void *, key, u64, flags)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- /* ULPs are currently supported only for TCP sockets in ESTABLISHED
- * state. This checks that the sock ops triggering the update is
- * one indicating we are (or will be soon) in an ESTABLISHED state.
- */
- if (!bpf_is_valid_sock_op(bpf_sock))
- return -EOPNOTSUPP;
- return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
-}
-
-const struct bpf_func_proto bpf_sock_map_update_proto = {
- .func = bpf_sock_map_update,
- .gpl_only = false,
- .pkt_access = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_PTR_TO_MAP_KEY,
- .arg4_type = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
- struct bpf_map *, map, void *, key, u64, flags)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- if (!bpf_is_valid_sock_op(bpf_sock))
- return -EOPNOTSUPP;
- return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
-}
-
-const struct bpf_func_proto bpf_sock_hash_update_proto = {
- .func = bpf_sock_hash_update,
- .gpl_only = false,
- .pkt_access = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_PTR_TO_MAP_KEY,
- .arg4_type = ARG_ANYTHING,
-};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 8061a439ef18..b2ade10f7ec3 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -505,7 +505,7 @@ const struct bpf_func_proto bpf_get_stack_proto = {
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
- return NULL;
+ return ERR_PTR(-EOPNOTSUPP);
}
/* Called from syscall */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5742df21598c..f4ecd6ed2252 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -719,10 +719,15 @@ static int map_lookup_elem(union bpf_attr *attr)
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
- if (ptr)
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ } else if (!ptr) {
+ err = -ENOENT;
+ } else {
+ err = 0;
memcpy(value, ptr, value_size);
+ }
rcu_read_unlock();
- err = ptr ? 0 : -ENOENT;
}
if (err)
@@ -743,6 +748,17 @@ err_put:
return err;
}
+static void maybe_wait_bpf_programs(struct bpf_map *map)
+{
+ /* Wait for any running BPF programs to complete so that
+ * userspace, when we return to it, knows that all programs
+ * that could be running use the new map value.
+ */
+ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
+ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+ synchronize_rcu();
+}
+
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
static int map_update_elem(union bpf_attr *attr)
@@ -837,6 +853,7 @@ static int map_update_elem(union bpf_attr *attr)
}
__this_cpu_dec(bpf_prog_active);
preempt_enable();
+ maybe_wait_bpf_programs(map);
out:
free_value:
kfree(value);
@@ -889,6 +906,7 @@ static int map_delete_elem(union bpf_attr *attr)
rcu_read_unlock();
__this_cpu_dec(bpf_prog_active);
preempt_enable();
+ maybe_wait_bpf_programs(map);
out:
kfree(key);
err_put:
@@ -1646,7 +1664,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
switch (ptype) {
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
- ret = sockmap_get_from_fd(attr, ptype, prog);
+ ret = sock_map_get_from_fd(attr, prog);
break;
case BPF_PROG_TYPE_LIRC_MODE2:
ret = lirc_prog_attach(attr, prog);
@@ -1700,10 +1718,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
break;
case BPF_SK_MSG_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
+ return sock_map_get_from_fd(attr, NULL);
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
+ return sock_map_get_from_fd(attr, NULL);
case BPF_LIRC_MODE2:
return lirc_prog_detach(attr);
case BPF_FLOW_DISSECTOR:
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 9f8463afda9c..ef0b7b6ef8a5 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -154,7 +154,7 @@ void __xsk_map_flush(struct bpf_map *map)
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
{
- return NULL;
+ return ERR_PTR(-EOPNOTSUPP);
}
static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
diff --git a/net/Kconfig b/net/Kconfig
index 228dfa382eec..f235edb593ba 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -300,8 +300,11 @@ config BPF_JIT
config BPF_STREAM_PARSER
bool "enable BPF STREAM_PARSER"
+ depends on INET
depends on BPF_SYSCALL
+ depends on CGROUP_BPF
select STREAM_PARSER
+ select NET_SOCK_MSG
---help---
Enabling this allows a stream parser to be used with
BPF_MAP_TYPE_SOCKMAP.
@@ -413,6 +416,14 @@ config GRO_CELLS
config SOCK_VALIDATE_XMIT
bool
+config NET_SOCK_MSG
+ bool
+ default n
+ help
+ The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
+ ULPs (upper layer modules, e.g. TLS) to process L7 application data
+ with the help of BPF programs.
+
config NET_DEVLINK
tristate "Network physical/parent device Netlink interface"
help
diff --git a/net/core/Makefile b/net/core/Makefile
index 80175e6a2eb8..fccd31e0e7f7 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,6 +16,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
obj-y += net-sysfs.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
+obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 8497feea8fb5..022ad73d6253 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4291,6 +4291,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
struct netdev_rx_queue *rxqueue;
void *orig_data, *orig_data_end;
u32 metalen, act = XDP_DROP;
+ __be16 orig_eth_type;
+ struct ethhdr *eth;
+ bool orig_bcast;
int hlen, off;
u32 mac_len;
@@ -4331,6 +4334,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
xdp->data_hard_start = skb->data - skb_headroom(skb);
orig_data_end = xdp->data_end;
orig_data = xdp->data;
+ eth = (struct ethhdr *)xdp->data;
+ orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
+ orig_eth_type = eth->h_proto;
rxqueue = netif_get_rxqueue(skb);
xdp->rxq = &rxqueue->xdp_rxq;
@@ -4354,6 +4360,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
}
+ /* check if XDP changed eth hdr such SKB needs update */
+ eth = (struct ethhdr *)xdp->data;
+ if ((orig_eth_type != eth->h_proto) ||
+ (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ }
+
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
diff --git a/net/core/filter.c b/net/core/filter.c
index 80da21b097b8..1a3ac6c46873 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -38,6 +38,7 @@
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
+#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
@@ -2142,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
- struct bpf_map *, map, void *, key, u64, flags)
-{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
-
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
- return SK_DROP;
-
- return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
- .func = bpf_sk_redirect_hash,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_PTR_TO_MAP_KEY,
- .arg4_type = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
- struct bpf_map *, map, u32, key, u64, flags)
-{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
-
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
- return SK_DROP;
-
- return SK_PASS;
-}
-
-struct sock *do_sk_redirect_map(struct sk_buff *skb)
-{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
- return tcb->bpf.sk_redir;
-}
-
-static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
- .func = bpf_sk_redirect_map,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
- struct bpf_map *, map, void *, key, u64, flags)
-{
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
-
- msg->flags = flags;
- msg->sk_redir = __sock_hash_lookup_elem(map, key);
- if (!msg->sk_redir)
- return SK_DROP;
-
- return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
- .func = bpf_msg_redirect_hash,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_PTR_TO_MAP_KEY,
- .arg4_type = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
- struct bpf_map *, map, u32, key, u64, flags)
-{
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
-
- msg->flags = flags;
- msg->sk_redir = __sock_map_lookup_elem(map, key);
- if (!msg->sk_redir)
- return SK_DROP;
-
- return SK_PASS;
-}
-
-struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
-{
- return msg->sk_redir;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
- .func = bpf_msg_redirect_map,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_ANYTHING,
-};
-
-BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
{
msg->apply_bytes = bytes;
return 0;
@@ -2272,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
{
msg->cork_bytes = bytes;
return 0;
@@ -2286,45 +2171,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
.arg2_type = ARG_ANYTHING,
};
-#define sk_msg_iter_var(var) \
- do { \
- var++; \
- if (var == MAX_SKB_FRAGS) \
- var = 0; \
- } while (0)
-
-BPF_CALL_4(bpf_msg_pull_data,
- struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
+ u32, end, u64, flags)
{
- unsigned int len = 0, offset = 0, copy = 0, poffset = 0;
- int bytes = end - start, bytes_sg_total;
- struct scatterlist *sg = msg->sg_data;
- int first_sg, last_sg, i, shift;
- unsigned char *p, *to, *from;
+ u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
+ u32 first_sge, last_sge, i, shift, bytes_sg_total;
+ struct scatterlist *sge;
+ u8 *raw, *to, *from;
struct page *page;
if (unlikely(flags || end <= start))
return -EINVAL;
/* First find the starting scatterlist element */
- i = msg->sg_start;
+ i = msg->sg.start;
do {
- len = sg[i].length;
+ len = sk_msg_elem(msg, i)->length;
if (start < offset + len)
break;
offset += len;
- sk_msg_iter_var(i);
- } while (i != msg->sg_end);
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
if (unlikely(start >= offset + len))
return -EINVAL;
- first_sg = i;
+ first_sge = i;
/* The start may point into the sg element so we need to also
* account for the headroom.
*/
bytes_sg_total = start - offset + bytes;
- if (!msg->sg_copy[i] && bytes_sg_total <= len)
+ if (!msg->sg.copy[i] && bytes_sg_total <= len)
goto out;
/* At this point we need to linearize multiple scatterlist
@@ -2338,12 +2215,12 @@ BPF_CALL_4(bpf_msg_pull_data,
* will copy the entire sg entry.
*/
do {
- copy += sg[i].length;
- sk_msg_iter_var(i);
+ copy += sk_msg_elem(msg, i)->length;
+ sk_msg_iter_var_next(i);
if (bytes_sg_total <= copy)
break;
- } while (i != msg->sg_end);
- last_sg = i;
+ } while (i != msg->sg.end);
+ last_sge = i;
if (unlikely(bytes_sg_total > copy))
return -EINVAL;
@@ -2352,63 +2229,61 @@ BPF_CALL_4(bpf_msg_pull_data,
get_order(copy));
if (unlikely(!page))
return -ENOMEM;
- p = page_address(page);
- i = first_sg;
+ raw = page_address(page);
+ i = first_sge;
do {
- from = sg_virt(&sg[i]);
- len = sg[i].length;
- to = p + poffset;
+ sge = sk_msg_elem(msg, i);
+ from = sg_virt(sge);
+ len = sge->length;
+ to = raw + poffset;
memcpy(to, from, len);
poffset += len;
- sg[i].length = 0;
- put_page(sg_page(&sg[i]));
+ sge->length = 0;
+ put_page(sg_page(sge));
- sk_msg_iter_var(i);
- } while (i != last_sg);
+ sk_msg_iter_var_next(i);
+ } while (i != last_sge);
- sg[first_sg].length = copy;
- sg_set_page(&sg[first_sg], page, copy, 0);
+ sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
/* To repair sg ring we need to shift entries. If we only
* had a single entry though we can just replace it and
* be done. Otherwise walk the ring and shift the entries.
*/
- WARN_ON_ONCE(last_sg == first_sg);
- shift = last_sg > first_sg ?
- last_sg - first_sg - 1 :
- MAX_SKB_FRAGS - first_sg + last_sg - 1;
+ WARN_ON_ONCE(last_sge == first_sge);
+ shift = last_sge > first_sge ?
+ last_sge - first_sge - 1 :
+ MAX_SKB_FRAGS - first_sge + last_sge - 1;
if (!shift)
goto out;
- i = first_sg;
- sk_msg_iter_var(i);
+ i = first_sge;
+ sk_msg_iter_var_next(i);
do {
- int move_from;
+ u32 move_from;
- if (i + shift >= MAX_SKB_FRAGS)
- move_from = i + shift - MAX_SKB_FRAGS;
+ if (i + shift >= MAX_MSG_FRAGS)
+ move_from = i + shift - MAX_MSG_FRAGS;
else
move_from = i + shift;
-
- if (move_from == msg->sg_end)
+ if (move_from == msg->sg.end)
break;
- sg[i] = sg[move_from];
- sg[move_from].length = 0;
- sg[move_from].page_link = 0;
- sg[move_from].offset = 0;
-
- sk_msg_iter_var(i);
+ msg->sg.data[i] = msg->sg.data[move_from];
+ msg->sg.data[move_from].length = 0;
+ msg->sg.data[move_from].page_link = 0;
+ msg->sg.data[move_from].offset = 0;
+ sk_msg_iter_var_next(i);
} while (1);
- msg->sg_end -= shift;
- if (msg->sg_end < 0)
- msg->sg_end += MAX_SKB_FRAGS;
+
+ msg->sg.end = msg->sg.end - shift > msg->sg.end ?
+ msg->sg.end - shift + MAX_MSG_FRAGS :
+ msg->sg.end - shift;
out:
- msg->data = sg_virt(&sg[first_sg]) + start - offset;
+ msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
msg->data_end = msg->data + bytes;
-
return 0;
}
@@ -4821,9 +4696,12 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
struct sk_buff *skb, u8 family, u8 proto)
{
- int dif = skb->dev->ifindex;
bool refcounted = false;
struct sock *sk = NULL;
+ int dif = 0;
+
+ if (skb->dev)
+ dif = skb->dev->ifindex;
if (family == AF_INET) {
__be32 src4 = tuple->ipv4.saddr;
@@ -4839,21 +4717,24 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
dif, sdif, &udp_table, skb);
-#if IS_REACHABLE(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
} else {
struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
+ u16 hnum = ntohs(tuple->ipv6.dport);
int sdif = inet6_sdif(skb);
if (proto == IPPROTO_TCP)
sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
src6, tuple->ipv6.sport,
- dst6, tuple->ipv6.dport,
+ dst6, hnum,
dif, sdif, &refcounted);
- else
- sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport,
- dst6, tuple->ipv6.dport,
- dif, sdif, &udp_table, skb);
+ else if (likely(ipv6_bpf_stub))
+ sk = ipv6_bpf_stub->udp6_lib_lookup(net,
+ src6, tuple->ipv6.sport,
+ dst6, hnum,
+ dif, sdif,
+ &udp_table, skb);
#endif
}
@@ -5200,6 +5081,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+
static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -5223,6 +5107,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
+const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
+
static const struct bpf_func_proto *
sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -5244,6 +5131,9 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
+const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
+
static const struct bpf_func_proto *
sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -6998,22 +6888,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct sk_msg_md, data):
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, data));
+ offsetof(struct sk_msg, data));
break;
case offsetof(struct sk_msg_md, data_end):
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, data_end));
+ offsetof(struct sk_msg, data_end));
break;
case offsetof(struct sk_msg_md, family):
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_family));
break;
@@ -7022,9 +6912,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_daddr));
break;
@@ -7034,9 +6924,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common,
skc_rcv_saddr));
@@ -7051,9 +6941,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
off = si->off;
off -= offsetof(struct sk_msg_md, remote_ip6[0]);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common,
skc_v6_daddr.s6_addr32[0]) +
@@ -7072,9 +6962,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
off = si->off;
off -= offsetof(struct sk_msg_md, local_ip6[0]);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]) +
@@ -7088,9 +6978,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
@@ -7102,9 +6992,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_num));
break;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
new file mode 100644
index 000000000000..56a99d0c9aa0
--- /dev/null
+++ b/net/core/skmsg.c
@@ -0,0 +1,802 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
+{
+ if (msg->sg.end > msg->sg.start &&
+ elem_first_coalesce < msg->sg.end)
+ return true;
+
+ if (msg->sg.end < msg->sg.start &&
+ (elem_first_coalesce > msg->sg.start ||
+ elem_first_coalesce < msg->sg.end))
+ return true;
+
+ return false;
+}
+
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+ int elem_first_coalesce)
+{
+ struct page_frag *pfrag = sk_page_frag(sk);
+ int ret = 0;
+
+ len -= msg->sg.size;
+ while (len > 0) {
+ struct scatterlist *sge;
+ u32 orig_offset;
+ int use, i;
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ return -ENOMEM;
+
+ orig_offset = pfrag->offset;
+ use = min_t(int, len, pfrag->size - orig_offset);
+ if (!sk_wmem_schedule(sk, use))
+ return -ENOMEM;
+
+ i = msg->sg.end;
+ sk_msg_iter_var_prev(i);
+ sge = &msg->sg.data[i];
+
+ if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
+ sg_page(sge) == pfrag->page &&
+ sge->offset + sge->length == orig_offset) {
+ sge->length += use;
+ } else {
+ if (sk_msg_full(msg)) {
+ ret = -ENOSPC;
+ break;
+ }
+
+ sge = &msg->sg.data[msg->sg.end];
+ sg_unmark_end(sge);
+ sg_set_page(sge, pfrag->page, use, orig_offset);
+ get_page(pfrag->page);
+ sk_msg_iter_next(msg, end);
+ }
+
+ sk_mem_charge(sk, use);
+ msg->sg.size += use;
+ pfrag->offset += use;
+ len -= use;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_alloc);
+
+int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
+ u32 off, u32 len)
+{
+ int i = src->sg.start;
+ struct scatterlist *sge = sk_msg_elem(src, i);
+ u32 sge_len, sge_off;
+
+ if (sk_msg_full(dst))
+ return -ENOSPC;
+
+ while (off) {
+ if (sge->length > off)
+ break;
+ off -= sge->length;
+ sk_msg_iter_var_next(i);
+ if (i == src->sg.end && off)
+ return -ENOSPC;
+ sge = sk_msg_elem(src, i);
+ }
+
+ while (len) {
+ sge_len = sge->length - off;
+ sge_off = sge->offset + off;
+ if (sge_len > len)
+ sge_len = len;
+ off = 0;
+ len -= sge_len;
+ sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);
+ sk_mem_charge(sk, sge_len);
+ sk_msg_iter_var_next(i);
+ if (i == src->sg.end && len)
+ return -ENOSPC;
+ sge = sk_msg_elem(src, i);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_msg_clone);
+
+void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+ int i = msg->sg.start;
+
+ do {
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+
+ if (bytes < sge->length) {
+ sge->length -= bytes;
+ sge->offset += bytes;
+ sk_mem_uncharge(sk, bytes);
+ break;
+ }
+
+ sk_mem_uncharge(sk, sge->length);
+ bytes -= sge->length;
+ sge->length = 0;
+ sge->offset = 0;
+ sk_msg_iter_var_next(i);
+ } while (bytes && i != msg->sg.end);
+ msg->sg.start = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_return_zero);
+
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+ int i = msg->sg.start;
+
+ do {
+ struct scatterlist *sge = &msg->sg.data[i];
+ int uncharge = (bytes < sge->length) ? bytes : sge->length;
+
+ sk_mem_uncharge(sk, uncharge);
+ bytes -= uncharge;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+}
+EXPORT_SYMBOL_GPL(sk_msg_return);
+
+static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
+ bool charge)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ u32 len = sge->length;
+
+ if (charge)
+ sk_mem_uncharge(sk, len);
+ if (!msg->skb)
+ put_page(sg_page(sge));
+ memset(sge, 0, sizeof(*sge));
+ return len;
+}
+
+static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
+ bool charge)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ int freed = 0;
+
+ while (msg->sg.size) {
+ msg->sg.size -= sge->length;
+ freed += sk_msg_free_elem(sk, msg, i, charge);
+ sk_msg_iter_var_next(i);
+ sk_msg_check_to_free(msg, i, msg->sg.size);
+ sge = sk_msg_elem(msg, i);
+ }
+ if (msg->skb)
+ consume_skb(msg->skb);
+ sk_msg_init(msg);
+ return freed;
+}
+
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
+{
+ return __sk_msg_free(sk, msg, msg->sg.start, false);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
+
+int sk_msg_free(struct sock *sk, struct sk_msg *msg)
+{
+ return __sk_msg_free(sk, msg, msg->sg.start, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free);
+
+static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
+ u32 bytes, bool charge)
+{
+ struct scatterlist *sge;
+ u32 i = msg->sg.start;
+
+ while (bytes) {
+ sge = sk_msg_elem(msg, i);
+ if (!sge->length)
+ break;
+ if (bytes < sge->length) {
+ if (charge)
+ sk_mem_uncharge(sk, bytes);
+ sge->length -= bytes;
+ sge->offset += bytes;
+ msg->sg.size -= bytes;
+ break;
+ }
+
+ msg->sg.size -= sge->length;
+ bytes -= sge->length;
+ sk_msg_free_elem(sk, msg, i, charge);
+ sk_msg_iter_var_next(i);
+ sk_msg_check_to_free(msg, i, bytes);
+ }
+ msg->sg.start = i;
+}
+
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
+{
+ __sk_msg_free_partial(sk, msg, bytes, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_partial);
+
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+ u32 bytes)
+{
+ __sk_msg_free_partial(sk, msg, bytes, false);
+}
+
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
+{
+ int trim = msg->sg.size - len;
+ u32 i = msg->sg.end;
+
+ if (trim <= 0) {
+ WARN_ON(trim < 0);
+ return;
+ }
+
+ sk_msg_iter_var_prev(i);
+ msg->sg.size = len;
+ while (msg->sg.data[i].length &&
+ trim >= msg->sg.data[i].length) {
+ trim -= msg->sg.data[i].length;
+ sk_msg_free_elem(sk, msg, i, true);
+ sk_msg_iter_var_prev(i);
+ if (!trim)
+ goto out;
+ }
+
+ msg->sg.data[i].length -= trim;
+ sk_mem_uncharge(sk, trim);
+out:
+ /* If we trim data before curr pointer update copybreak and current
+ * so that any future copy operations start at new copy location.
+ * However trimed data that has not yet been used in a copy op
+ * does not require an update.
+ */
+ if (msg->sg.curr >= i) {
+ msg->sg.curr = i;
+ msg->sg.copybreak = msg->sg.data[i].length;
+ }
+ sk_msg_iter_var_next(i);
+ msg->sg.end = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_trim);
+
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes)
+{
+ int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
+ const int to_max_pages = MAX_MSG_FRAGS;
+ struct page *pages[MAX_MSG_FRAGS];
+ ssize_t orig, copied, use, offset;
+
+ orig = msg->sg.size;
+ while (bytes > 0) {
+ i = 0;
+ maxpages = to_max_pages - num_elems;
+ if (maxpages == 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ copied = iov_iter_get_pages(from, pages, bytes, maxpages,
+ &offset);
+ if (copied <= 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ iov_iter_advance(from, copied);
+ bytes -= copied;
+ msg->sg.size += copied;
+
+ while (copied) {
+ use = min_t(int, copied, PAGE_SIZE - offset);
+ sg_set_page(&msg->sg.data[msg->sg.end],
+ pages[i], use, offset);
+ sg_unmark_end(&msg->sg.data[msg->sg.end]);
+ sk_mem_charge(sk, use);
+
+ offset = 0;
+ copied -= use;
+ sk_msg_iter_next(msg, end);
+ num_elems++;
+ i++;
+ }
+ /* When zerocopy is mixed with sk_msg_*copy* operations we
+ * may have a copybreak set in this case clear and prefer
+ * zerocopy remainder when possible.
+ */
+ msg->sg.copybreak = 0;
+ msg->sg.curr = msg->sg.end;
+ }
+out:
+ /* Revert iov_iter updates, msg will need to use 'trim' later if it
+ * also needs to be cleared.
+ */
+ if (ret)
+ iov_iter_revert(from, msg->sg.size - orig);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
+
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes)
+{
+ int ret = -ENOSPC, i = msg->sg.curr;
+ struct scatterlist *sge;
+ u32 copy, buf_size;
+ void *to;
+
+ do {
+ sge = sk_msg_elem(msg, i);
+ /* This is possible if a trim operation shrunk the buffer */
+ if (msg->sg.copybreak >= sge->length) {
+ msg->sg.copybreak = 0;
+ sk_msg_iter_var_next(i);
+ if (i == msg->sg.end)
+ break;
+ sge = sk_msg_elem(msg, i);
+ }
+
+ buf_size = sge->length - msg->sg.copybreak;
+ copy = (buf_size > bytes) ? bytes : buf_size;
+ to = sg_virt(sge) + msg->sg.copybreak;
+ msg->sg.copybreak += copy;
+ if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+ ret = copy_from_iter_nocache(to, copy, from);
+ else
+ ret = copy_from_iter(to, copy, from);
+ if (ret != copy) {
+ ret = -EFAULT;
+ goto out;
+ }
+ bytes -= copy;
+ if (!bytes)
+ break;
+ msg->sg.copybreak = 0;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+out:
+ msg->sg.curr = i;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+ struct sock *sk = psock->sk;
+ int copied = 0, num_sge;
+ struct sk_msg *msg;
+
+ msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+ if (unlikely(!msg))
+ return -EAGAIN;
+ if (!sk_rmem_schedule(sk, skb, skb->len)) {
+ kfree(msg);
+ return -EAGAIN;
+ }
+
+ sk_msg_init(msg);
+ num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+ if (unlikely(num_sge < 0)) {
+ kfree(msg);
+ return num_sge;
+ }
+
+ sk_mem_charge(sk, skb->len);
+ copied = skb->len;
+ msg->sg.start = 0;
+ msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
+ msg->skb = skb;
+
+ sk_psock_queue_msg(psock, msg);
+ sk->sk_data_ready(sk);
+ return copied;
+}
+
+static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len, bool ingress)
+{
+ if (ingress)
+ return sk_psock_skb_ingress(psock, skb);
+ else
+ return skb_send_sock_locked(psock->sk, skb, off, len);
+}
+
+static void sk_psock_backlog(struct work_struct *work)
+{
+ struct sk_psock *psock = container_of(work, struct sk_psock, work);
+ struct sk_psock_work_state *state = &psock->work_state;
+ struct sk_buff *skb;
+ bool ingress;
+ u32 len, off;
+ int ret;
+
+ /* Lock sock to avoid losing sk_socket during loop. */
+ lock_sock(psock->sk);
+ if (state->skb) {
+ skb = state->skb;
+ len = state->len;
+ off = state->off;
+ state->skb = NULL;
+ goto start;
+ }
+
+ while ((skb = skb_dequeue(&psock->ingress_skb))) {
+ len = skb->len;
+ off = 0;
+start:
+ ingress = tcp_skb_bpf_ingress(skb);
+ do {
+ ret = -EIO;
+ if (likely(psock->sk->sk_socket))
+ ret = sk_psock_handle_skb(psock, skb, off,
+ len, ingress);
+ if (ret <= 0) {
+ if (ret == -EAGAIN) {
+ state->skb = skb;
+ state->len = len;
+ state->off = off;
+ goto end;
+ }
+ /* Hard errors break pipe and stop xmit. */
+ sk_psock_report_error(psock, ret ? -ret : EPIPE);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ kfree_skb(skb);
+ goto end;
+ }
+ off += ret;
+ len -= ret;
+ } while (len);
+
+ if (!ingress)
+ kfree_skb(skb);
+ }
+end:
+ release_sock(psock->sk);
+}
+
+struct sk_psock *sk_psock_init(struct sock *sk, int node)
+{
+ struct sk_psock *psock = kzalloc_node(sizeof(*psock),
+ GFP_ATOMIC | __GFP_NOWARN,
+ node);
+ if (!psock)
+ return NULL;
+
+ psock->sk = sk;
+ psock->eval = __SK_NONE;
+
+ INIT_LIST_HEAD(&psock->link);
+ spin_lock_init(&psock->link_lock);
+
+ INIT_WORK(&psock->work, sk_psock_backlog);
+ INIT_LIST_HEAD(&psock->ingress_msg);
+ skb_queue_head_init(&psock->ingress_skb);
+
+ sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
+ refcount_set(&psock->refcnt, 1);
+
+ rcu_assign_sk_user_data(sk, psock);
+ sock_hold(sk);
+
+ return psock;
+}
+EXPORT_SYMBOL_GPL(sk_psock_init);
+
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ spin_lock_bh(&psock->link_lock);
+ link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
+ list);
+ if (link)
+ list_del(&link->list);
+ spin_unlock_bh(&psock->link_lock);
+ return link;
+}
+
+void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+{
+ struct sk_msg *msg, *tmp;
+
+ list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
+ list_del(&msg->list);
+ sk_msg_free(psock->sk, msg);
+ kfree(msg);
+ }
+}
+
+static void sk_psock_zap_ingress(struct sk_psock *psock)
+{
+ __skb_queue_purge(&psock->ingress_skb);
+ __sk_psock_purge_ingress_msg(psock);
+}
+
+static void sk_psock_link_destroy(struct sk_psock *psock)
+{
+ struct sk_psock_link *link, *tmp;
+
+ list_for_each_entry_safe(link, tmp, &psock->link, list) {
+ list_del(&link->list);
+ sk_psock_free_link(link);
+ }
+}
+
+static void sk_psock_destroy_deferred(struct work_struct *gc)
+{
+ struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
+
+ /* No sk_callback_lock since already detached. */
+ if (psock->parser.enabled)
+ strp_done(&psock->parser.strp);
+
+ cancel_work_sync(&psock->work);
+
+ psock_progs_drop(&psock->progs);
+
+ sk_psock_link_destroy(psock);
+ sk_psock_cork_free(psock);
+ sk_psock_zap_ingress(psock);
+
+ if (psock->sk_redir)
+ sock_put(psock->sk_redir);
+ sock_put(psock->sk);
+ kfree(psock);
+}
+
+void sk_psock_destroy(struct rcu_head *rcu)
+{
+ struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
+
+ INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
+ schedule_work(&psock->gc);
+}
+EXPORT_SYMBOL_GPL(sk_psock_destroy);
+
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
+{
+ rcu_assign_sk_user_data(sk, NULL);
+ sk_psock_cork_free(psock);
+ sk_psock_restore_proto(sk, psock);
+
+ write_lock_bh(&sk->sk_callback_lock);
+ if (psock->progs.skb_parser)
+ sk_psock_stop_strp(sk, psock);
+ write_unlock_bh(&sk->sk_callback_lock);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+
+ call_rcu_sched(&psock->rcu, sk_psock_destroy);
+}
+EXPORT_SYMBOL_GPL(sk_psock_drop);
+
+static int sk_psock_map_verd(int verdict, bool redir)
+{
+ switch (verdict) {
+ case SK_PASS:
+ return redir ? __SK_REDIRECT : __SK_PASS;
+ case SK_DROP:
+ default:
+ break;
+ }
+
+ return __SK_DROP;
+}
+
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg)
+{
+ struct bpf_prog *prog;
+ int ret;
+
+ preempt_disable();
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.msg_parser);
+ if (unlikely(!prog)) {
+ ret = __SK_PASS;
+ goto out;
+ }
+
+ sk_msg_compute_data_pointers(msg);
+ msg->sk = sk;
+ ret = BPF_PROG_RUN(prog, msg);
+ ret = sk_psock_map_verd(ret, msg->sk_redir);
+ psock->apply_bytes = msg->apply_bytes;
+ if (ret == __SK_REDIRECT) {
+ if (psock->sk_redir)
+ sock_put(psock->sk_redir);
+ psock->sk_redir = msg->sk_redir;
+ if (!psock->sk_redir) {
+ ret = __SK_DROP;
+ goto out;
+ }
+ sock_hold(psock->sk_redir);
+ }
+out:
+ rcu_read_unlock();
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
+
+static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ skb->sk = psock->sk;
+ bpf_compute_data_end_sk_skb(skb);
+ preempt_disable();
+ ret = BPF_PROG_RUN(prog, skb);
+ preempt_enable();
+ /* strparser clones the skb before handing it to a upper layer,
+ * meaning skb_orphan has been called. We NULL sk on the way out
+ * to ensure we don't trigger a BUG_ON() in skb/sk operations
+ * later and because we are not charging the memory of this skb
+ * to any socket yet.
+ */
+ skb->sk = NULL;
+ return ret;
+}
+
+static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
+{
+ struct sk_psock_parser *parser;
+
+ parser = container_of(strp, struct sk_psock_parser, strp);
+ return container_of(parser, struct sk_psock, parser);
+}
+
+static void sk_psock_verdict_apply(struct sk_psock *psock,
+ struct sk_buff *skb, int verdict)
+{
+ struct sk_psock *psock_other;
+ struct sock *sk_other;
+ bool ingress;
+
+ switch (verdict) {
+ case __SK_REDIRECT:
+ sk_other = tcp_skb_bpf_redirect_fetch(skb);
+ if (unlikely(!sk_other))
+ goto out_free;
+ psock_other = sk_psock(sk_other);
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
+ !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
+ goto out_free;
+ ingress = tcp_skb_bpf_ingress(skb);
+ if ((!ingress && sock_writeable(sk_other)) ||
+ (ingress &&
+ atomic_read(&sk_other->sk_rmem_alloc) <=
+ sk_other->sk_rcvbuf)) {
+ if (!ingress)
+ skb_set_owner_w(skb, sk_other);
+ skb_queue_tail(&psock_other->ingress_skb, skb);
+ schedule_work(&psock_other->work);
+ break;
+ }
+ /* fall-through */
+ case __SK_DROP:
+ /* fall-through */
+ default:
+out_free:
+ kfree_skb(skb);
+ }
+}
+
+static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
+{
+ struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct bpf_prog *prog;
+ int ret = __SK_DROP;
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.skb_verdict);
+ if (likely(prog)) {
+ skb_orphan(skb);
+ tcp_skb_bpf_redirect_clear(skb);
+ ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ }
+ rcu_read_unlock();
+ sk_psock_verdict_apply(psock, skb, ret);
+}
+
+static int sk_psock_strp_read_done(struct strparser *strp, int err)
+{
+ return err;
+}
+
+static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
+{
+ struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct bpf_prog *prog;
+ int ret = skb->len;
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.skb_parser);
+ if (likely(prog))
+ ret = sk_psock_bpf_run(psock, prog, skb);
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Called with socket lock held. */
+static void sk_psock_data_ready(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ write_lock_bh(&sk->sk_callback_lock);
+ strp_data_ready(&psock->parser.strp);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+ rcu_read_unlock();
+}
+
+static void sk_psock_write_space(struct sock *sk)
+{
+ struct sk_psock *psock;
+ void (*write_space)(struct sock *sk);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)))
+ schedule_work(&psock->work);
+ write_space = psock->saved_write_space;
+ rcu_read_unlock();
+ write_space(sk);
+}
+
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+ static const struct strp_callbacks cb = {
+ .rcv_msg = sk_psock_strp_read,
+ .read_sock_done = sk_psock_strp_read_done,
+ .parse_msg = sk_psock_strp_parse,
+ };
+
+ psock->parser.enabled = false;
+ return strp_init(&psock->parser.strp, sk, &cb);
+}
+
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_parser *parser = &psock->parser;
+
+ if (parser->enabled)
+ return;
+
+ parser->saved_data_ready = sk->sk_data_ready;
+ sk->sk_data_ready = sk_psock_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+ parser->enabled = true;
+}
+
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_parser *parser = &psock->parser;
+
+ if (!parser->enabled)
+ return;
+
+ sk->sk_data_ready = parser->saved_data_ready;
+ parser->saved_data_ready = NULL;
+ strp_stop(&parser->strp);
+ parser->enabled = false;
+}
diff --git a/net/core/sock.c b/net/core/sock.c
index fdf9fc7d3f98..6fcc4bc07d19 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2239,67 +2239,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
}
EXPORT_SYMBOL(sk_page_frag_refill);
-int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
- int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
- int first_coalesce)
-{
- int sg_curr = *sg_curr_index, use = 0, rc = 0;
- unsigned int size = *sg_curr_size;
- struct page_frag *pfrag;
- struct scatterlist *sge;
-
- len -= size;
- pfrag = sk_page_frag(sk);
-
- while (len > 0) {
- unsigned int orig_offset;
-
- if (!sk_page_frag_refill(sk, pfrag)) {
- rc = -ENOMEM;
- goto out;
- }
-
- use = min_t(int, len, pfrag->size - pfrag->offset);
-
- if (!sk_wmem_schedule(sk, use)) {
- rc = -ENOMEM;
- goto out;
- }
-
- sk_mem_charge(sk, use);
- size += use;
- orig_offset = pfrag->offset;
- pfrag->offset += use;
-
- sge = sg + sg_curr - 1;
- if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
- sge->offset + sge->length == orig_offset) {
- sge->length += use;
- } else {
- sge = sg + sg_curr;
- sg_unmark_end(sge);
- sg_set_page(sge, pfrag->page, use, orig_offset);
- get_page(pfrag->page);
- sg_curr++;
-
- if (sg_curr == MAX_SKB_FRAGS)
- sg_curr = 0;
-
- if (sg_curr == sg_start) {
- rc = -ENOSPC;
- break;
- }
- }
-
- len -= use;
- }
-out:
- *sg_curr_size = size;
- *sg_curr_index = sg_curr;
- return rc;
-}
-EXPORT_SYMBOL(sk_alloc_sg);
-
static void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
new file mode 100644
index 000000000000..3c0e44cb811a
--- /dev/null
+++ b/net/core/sock_map.c
@@ -0,0 +1,1002 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <linux/skmsg.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+
+struct bpf_stab {
+ struct bpf_map map;
+ struct sock **sks;
+ struct sk_psock_progs progs;
+ raw_spinlock_t lock;
+};
+
+#define SOCK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_stab *stab;
+ u64 cost;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (attr->max_entries == 0 ||
+ attr->key_size != 4 ||
+ attr->value_size != 4 ||
+ attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ stab = kzalloc(sizeof(*stab), GFP_USER);
+ if (!stab)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&stab->map, attr);
+ raw_spin_lock_init(&stab->lock);
+
+ /* Make sure page count doesn't overflow. */
+ cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+ if (cost >= U32_MAX - PAGE_SIZE) {
+ err = -EINVAL;
+ goto free_stab;
+ }
+
+ stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ err = bpf_map_precharge_memlock(stab->map.pages);
+ if (err)
+ goto free_stab;
+
+ stab->sks = bpf_map_area_alloc(stab->map.max_entries *
+ sizeof(struct sock *),
+ stab->map.numa_node);
+ if (stab->sks)
+ return &stab->map;
+ err = -ENOMEM;
+free_stab:
+ kfree(stab);
+ return ERR_PTR(err);
+}
+
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ u32 ufd = attr->target_fd;
+ struct bpf_map *map;
+ struct fd f;
+ int ret;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ ret = sock_map_prog_update(map, prog, attr->attach_type);
+ fdput(f);
+ return ret;
+}
+
+static void sock_map_sk_acquire(struct sock *sk)
+ __acquires(&sk->sk_lock.slock)
+{
+ lock_sock(sk);
+ preempt_disable();
+ rcu_read_lock();
+}
+
+static void sock_map_sk_release(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+{
+ rcu_read_unlock();
+ preempt_enable();
+ release_sock(sk);
+}
+
+static void sock_map_add_link(struct sk_psock *psock,
+ struct sk_psock_link *link,
+ struct bpf_map *map, void *link_raw)
+{
+ link->link_raw = link_raw;
+ link->map = map;
+ spin_lock_bh(&psock->link_lock);
+ list_add_tail(&link->list, &psock->link);
+ spin_unlock_bh(&psock->link_lock);
+}
+
+static void sock_map_del_link(struct sock *sk,
+ struct sk_psock *psock, void *link_raw)
+{
+ struct sk_psock_link *link, *tmp;
+ bool strp_stop = false;
+
+ spin_lock_bh(&psock->link_lock);
+ list_for_each_entry_safe(link, tmp, &psock->link, list) {
+ if (link->link_raw == link_raw) {
+ struct bpf_map *map = link->map;
+ struct bpf_stab *stab = container_of(map, struct bpf_stab,
+ map);
+ if (psock->parser.enabled && stab->progs.skb_parser)
+ strp_stop = true;
+ list_del(&link->list);
+ sk_psock_free_link(link);
+ }
+ }
+ spin_unlock_bh(&psock->link_lock);
+ if (strp_stop) {
+ write_lock_bh(&sk->sk_callback_lock);
+ sk_psock_stop_strp(sk, psock);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+}
+
+static void sock_map_unref(struct sock *sk, void *link_raw)
+{
+ struct sk_psock *psock = sk_psock(sk);
+
+ if (likely(psock)) {
+ sock_map_del_link(sk, psock, link_raw);
+ sk_psock_put(sk, psock);
+ }
+}
+
+static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
+ struct sock *sk)
+{
+ struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
+ bool skb_progs, sk_psock_is_new = false;
+ struct sk_psock *psock;
+ int ret;
+
+ skb_verdict = READ_ONCE(progs->skb_verdict);
+ skb_parser = READ_ONCE(progs->skb_parser);
+ skb_progs = skb_parser && skb_verdict;
+ if (skb_progs) {
+ skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+ if (IS_ERR(skb_verdict))
+ return PTR_ERR(skb_verdict);
+ skb_parser = bpf_prog_inc_not_zero(skb_parser);
+ if (IS_ERR(skb_parser)) {
+ bpf_prog_put(skb_verdict);
+ return PTR_ERR(skb_parser);
+ }
+ }
+
+ msg_parser = READ_ONCE(progs->msg_parser);
+ if (msg_parser) {
+ msg_parser = bpf_prog_inc_not_zero(msg_parser);
+ if (IS_ERR(msg_parser)) {
+ ret = PTR_ERR(msg_parser);
+ goto out;
+ }
+ }
+
+ psock = sk_psock_get(sk);
+ if (psock) {
+ if (!sk_has_psock(sk)) {
+ ret = -EBUSY;
+ goto out_progs;
+ }
+ if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
+ (skb_progs && READ_ONCE(psock->progs.skb_parser))) {
+ sk_psock_put(sk, psock);
+ ret = -EBUSY;
+ goto out_progs;
+ }
+ } else {
+ psock = sk_psock_init(sk, map->numa_node);
+ if (!psock) {
+ ret = -ENOMEM;
+ goto out_progs;
+ }
+ sk_psock_is_new = true;
+ }
+
+ if (msg_parser)
+ psock_set_prog(&psock->progs.msg_parser, msg_parser);
+ if (sk_psock_is_new) {
+ ret = tcp_bpf_init(sk);
+ if (ret < 0)
+ goto out_drop;
+ } else {
+ tcp_bpf_reinit(sk);
+ }
+
+ write_lock_bh(&sk->sk_callback_lock);
+ if (skb_progs && !psock->parser.enabled) {
+ ret = sk_psock_init_strp(sk, psock);
+ if (ret) {
+ write_unlock_bh(&sk->sk_callback_lock);
+ goto out_drop;
+ }
+ psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+ psock_set_prog(&psock->progs.skb_parser, skb_parser);
+ sk_psock_start_strp(sk, psock);
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ return 0;
+out_drop:
+ sk_psock_put(sk, psock);
+out_progs:
+ if (msg_parser)
+ bpf_prog_put(msg_parser);
+out:
+ if (skb_progs) {
+ bpf_prog_put(skb_verdict);
+ bpf_prog_put(skb_parser);
+ }
+ return ret;
+}
+
+static void sock_map_free(struct bpf_map *map)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ int i;
+
+ synchronize_rcu();
+ rcu_read_lock();
+ raw_spin_lock_bh(&stab->lock);
+ for (i = 0; i < stab->map.max_entries; i++) {
+ struct sock **psk = &stab->sks[i];
+ struct sock *sk;
+
+ sk = xchg(psk, NULL);
+ if (sk)
+ sock_map_unref(sk, psk);
+ }
+ raw_spin_unlock_bh(&stab->lock);
+ rcu_read_unlock();
+
+ bpf_map_area_free(stab->sks);
+ kfree(stab);
+}
+
+static void sock_map_release_progs(struct bpf_map *map)
+{
+ psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs);
+}
+
+static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (unlikely(key >= map->max_entries))
+ return NULL;
+ return READ_ONCE(stab->sks[key]);
+}
+
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
+ struct sock **psk)
+{
+ struct sock *sk;
+
+ raw_spin_lock_bh(&stab->lock);
+ sk = *psk;
+ if (!sk_test || sk_test == sk)
+ *psk = NULL;
+ raw_spin_unlock_bh(&stab->lock);
+ if (unlikely(!sk))
+ return -EINVAL;
+ sock_map_unref(sk, psk);
+ return 0;
+}
+
+static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
+ void *link_raw)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ __sock_map_delete(stab, sk, link_raw);
+}
+
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ u32 i = *(u32 *)key;
+ struct sock **psk;
+
+ if (unlikely(i >= map->max_entries))
+ return -EINVAL;
+
+ psk = &stab->sks[i];
+ return __sock_map_delete(stab, NULL, psk);
+}
+
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ u32 i = key ? *(u32 *)key : U32_MAX;
+ u32 *key_next = next;
+
+ if (i == stab->map.max_entries - 1)
+ return -ENOENT;
+ if (i >= stab->map.max_entries)
+ *key_next = 0;
+ else
+ *key_next = i + 1;
+ return 0;
+}
+
+static int sock_map_update_common(struct bpf_map *map, u32 idx,
+ struct sock *sk, u64 flags)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct sk_psock_link *link;
+ struct sk_psock *psock;
+ struct sock *osk;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(idx >= map->max_entries))
+ return -E2BIG;
+
+ link = sk_psock_init_link();
+ if (!link)
+ return -ENOMEM;
+
+ ret = sock_map_link(map, &stab->progs, sk);
+ if (ret < 0)
+ goto out_free;
+
+ psock = sk_psock(sk);
+ WARN_ON_ONCE(!psock);
+
+ raw_spin_lock_bh(&stab->lock);
+ osk = stab->sks[idx];
+ if (osk && flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out_unlock;
+ } else if (!osk && flags == BPF_EXIST) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ sock_map_add_link(psock, link, map, &stab->sks[idx]);
+ stab->sks[idx] = sk;
+ if (osk)
+ sock_map_unref(osk, &stab->sks[idx]);
+ raw_spin_unlock_bh(&stab->lock);
+ return 0;
+out_unlock:
+ raw_spin_unlock_bh(&stab->lock);
+ if (psock)
+ sk_psock_put(sk, psock);
+out_free:
+ sk_psock_free_link(link);
+ return ret;
+}
+
+static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
+{
+ return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
+ ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+}
+
+static bool sock_map_sk_is_suitable(const struct sock *sk)
+{
+ return sk->sk_type == SOCK_STREAM &&
+ sk->sk_protocol == IPPROTO_TCP;
+}
+
+static int sock_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ u32 ufd = *(u32 *)value;
+ u32 idx = *(u32 *)key;
+ struct socket *sock;
+ struct sock *sk;
+ int ret;
+
+ sock = sockfd_lookup(ufd, &ret);
+ if (!sock)
+ return ret;
+ sk = sock->sk;
+ if (!sk) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!sock_map_sk_is_suitable(sk) ||
+ sk->sk_state != TCP_ESTABLISHED) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ sock_map_sk_acquire(sk);
+ ret = sock_map_update_common(map, idx, sk, flags);
+ sock_map_sk_release(sk);
+out:
+ fput(sock->file);
+ return ret;
+}
+
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (likely(sock_map_sk_is_suitable(sops->sk) &&
+ sock_map_op_okay(sops)))
+ return sock_map_update_common(map, *(u32 *)key, sops->sk,
+ flags);
+ return -EOPNOTSUPP;
+}
+
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+ .func = bpf_sock_map_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+ .func = bpf_sk_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+ msg->flags = flags;
+ msg->sk_redir = __sock_map_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+ .func = bpf_msg_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_map_ops sock_map_ops = {
+ .map_alloc = sock_map_alloc,
+ .map_free = sock_map_free,
+ .map_get_next_key = sock_map_get_next_key,
+ .map_update_elem = sock_map_update_elem,
+ .map_delete_elem = sock_map_delete_elem,
+ .map_lookup_elem = sock_map_lookup,
+ .map_release_uref = sock_map_release_progs,
+ .map_check_btf = map_check_no_btf,
+};
+
+struct bpf_htab_elem {
+ struct rcu_head rcu;
+ u32 hash;
+ struct sock *sk;
+ struct hlist_node node;
+ u8 key[0];
+};
+
+struct bpf_htab_bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct bpf_htab_bucket *buckets;
+ u32 buckets_num;
+ u32 elem_size;
+ struct sk_psock_progs progs;
+ atomic_t count;
+};
+
+static inline u32 sock_hash_bucket_hash(const void *key, u32 len)
+{
+ return jhash(key, len, 0);
+}
+
+static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab,
+ u32 hash)
+{
+ return &htab->buckets[hash & (htab->buckets_num - 1)];
+}
+
+static struct bpf_htab_elem *
+sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
+ u32 key_size)
+{
+ struct bpf_htab_elem *elem;
+
+ hlist_for_each_entry_rcu(elem, head, node) {
+ if (elem->hash == hash &&
+ !memcmp(&elem->key, key, key_size))
+ return elem;
+ }
+
+ return NULL;
+}
+
+static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 key_size = map->key_size, hash;
+ struct bpf_htab_bucket *bucket;
+ struct bpf_htab_elem *elem;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+
+ return elem ? elem->sk : NULL;
+}
+
+static void sock_hash_free_elem(struct bpf_htab *htab,
+ struct bpf_htab_elem *elem)
+{
+ atomic_dec(&htab->count);
+ kfree_rcu(elem, rcu);
+}
+
+static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
+ void *link_raw)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_htab_elem *elem_probe, *elem = link_raw;
+ struct bpf_htab_bucket *bucket;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ bucket = sock_hash_select_bucket(htab, elem->hash);
+
+ /* elem may be deleted in parallel from the map, but access here
+ * is okay since it's going away only after RCU grace period.
+ * However, we need to check whether it's still present.
+ */
+ raw_spin_lock_bh(&bucket->lock);
+ elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
+ elem->key, map->key_size);
+ if (elem_probe && elem_probe == elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ }
+ raw_spin_unlock_bh(&bucket->lock);
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 hash, key_size = map->key_size;
+ struct bpf_htab_bucket *bucket;
+ struct bpf_htab_elem *elem;
+ int ret = -ENOENT;
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+
+ raw_spin_lock_bh(&bucket->lock);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+ if (elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ ret = 0;
+ }
+ raw_spin_unlock_bh(&bucket->lock);
+ return ret;
+}
+
+static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab,
+ void *key, u32 key_size,
+ u32 hash, struct sock *sk,
+ struct bpf_htab_elem *old)
+{
+ struct bpf_htab_elem *new;
+
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+ if (!old) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ }
+
+ new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
+ if (!new) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(new->key, key, key_size);
+ new->sk = sk;
+ new->hash = hash;
+ return new;
+}
+
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+ struct sock *sk, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 key_size = map->key_size, hash;
+ struct bpf_htab_elem *elem, *elem_new;
+ struct bpf_htab_bucket *bucket;
+ struct sk_psock_link *link;
+ struct sk_psock *psock;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ link = sk_psock_init_link();
+ if (!link)
+ return -ENOMEM;
+
+ ret = sock_map_link(map, &htab->progs, sk);
+ if (ret < 0)
+ goto out_free;
+
+ psock = sk_psock(sk);
+ WARN_ON_ONCE(!psock);
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+
+ raw_spin_lock_bh(&bucket->lock);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+ if (elem && flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out_unlock;
+ } else if (!elem && flags == BPF_EXIST) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
+ if (IS_ERR(elem_new)) {
+ ret = PTR_ERR(elem_new);
+ goto out_unlock;
+ }
+
+ sock_map_add_link(psock, link, map, elem_new);
+ /* Add new element to the head of the list, so that
+ * concurrent search will find it before old elem.
+ */
+ hlist_add_head_rcu(&elem_new->node, &bucket->head);
+ if (elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ }
+ raw_spin_unlock_bh(&bucket->lock);
+ return 0;
+out_unlock:
+ raw_spin_unlock_bh(&bucket->lock);
+ sk_psock_put(sk, psock);
+out_free:
+ sk_psock_free_link(link);
+ return ret;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ u32 ufd = *(u32 *)value;
+ struct socket *sock;
+ struct sock *sk;
+ int ret;
+
+ sock = sockfd_lookup(ufd, &ret);
+ if (!sock)
+ return ret;
+ sk = sock->sk;
+ if (!sk) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!sock_map_sk_is_suitable(sk) ||
+ sk->sk_state != TCP_ESTABLISHED) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ sock_map_sk_acquire(sk);
+ ret = sock_hash_update_common(map, key, sk, flags);
+ sock_map_sk_release(sk);
+out:
+ fput(sock->file);
+ return ret;
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map, void *key,
+ void *key_next)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_htab_elem *elem, *elem_next;
+ u32 hash, key_size = map->key_size;
+ struct hlist_head *head;
+ int i = 0;
+
+ if (!key)
+ goto find_first_elem;
+ hash = sock_hash_bucket_hash(key, key_size);
+ head = &sock_hash_select_bucket(htab, hash)->head;
+ elem = sock_hash_lookup_elem_raw(head, hash, key, key_size);
+ if (!elem)
+ goto find_first_elem;
+
+ elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)),
+ struct bpf_htab_elem, node);
+ if (elem_next) {
+ memcpy(key_next, elem_next->key, key_size);
+ return 0;
+ }
+
+ i = hash & (htab->buckets_num - 1);
+ i++;
+find_first_elem:
+ for (; i < htab->buckets_num; i++) {
+ head = &sock_hash_select_bucket(htab, i)->head;
+ elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct bpf_htab_elem, node);
+ if (elem_next) {
+ memcpy(key_next, elem_next->key, key_size);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int i, err;
+ u64 cost;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (attr->max_entries == 0 ||
+ attr->key_size == 0 ||
+ attr->value_size != 4 ||
+ attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+ if (attr->key_size > MAX_BPF_STACK)
+ return ERR_PTR(-E2BIG);
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&htab->map, attr);
+
+ htab->buckets_num = roundup_pow_of_two(htab->map.max_entries);
+ htab->elem_size = sizeof(struct bpf_htab_elem) +
+ round_up(htab->map.key_size, 8);
+ if (htab->buckets_num == 0 ||
+ htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) {
+ err = -EINVAL;
+ goto free_htab;
+ }
+
+ cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) +
+ (u64) htab->elem_size * htab->map.max_entries;
+ if (cost >= U32_MAX - PAGE_SIZE) {
+ err = -EINVAL;
+ goto free_htab;
+ }
+
+ htab->buckets = bpf_map_area_alloc(htab->buckets_num *
+ sizeof(struct bpf_htab_bucket),
+ htab->map.numa_node);
+ if (!htab->buckets) {
+ err = -ENOMEM;
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->buckets_num; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }
+
+ return &htab->map;
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_htab_bucket *bucket;
+ struct bpf_htab_elem *elem;
+ struct hlist_node *node;
+ int i;
+
+ synchronize_rcu();
+ rcu_read_lock();
+ for (i = 0; i < htab->buckets_num; i++) {
+ bucket = sock_hash_select_bucket(htab, i);
+ raw_spin_lock_bh(&bucket->lock);
+ hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ }
+ raw_spin_unlock_bh(&bucket->lock);
+ }
+ rcu_read_unlock();
+
+ bpf_map_area_free(htab->buckets);
+ kfree(htab);
+}
+
+static void sock_hash_release_progs(struct bpf_map *map)
+{
+ psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs);
+}
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (likely(sock_map_sk_is_suitable(sops->sk) &&
+ sock_map_op_okay(sops)))
+ return sock_hash_update_common(map, key, sops->sk, flags);
+ return -EOPNOTSUPP;
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+ .func = bpf_sock_hash_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+ .func = bpf_sk_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+ msg->flags = flags;
+ msg->sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+ .func = bpf_msg_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_map_ops sock_hash_ops = {
+ .map_alloc = sock_hash_alloc,
+ .map_free = sock_hash_free,
+ .map_get_next_key = sock_hash_get_next_key,
+ .map_update_elem = sock_hash_update_elem,
+ .map_delete_elem = sock_hash_delete_elem,
+ .map_lookup_elem = sock_map_lookup,
+ .map_release_uref = sock_hash_release_progs,
+ .map_check_btf = map_check_no_btf,
+};
+
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
+{
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ return &container_of(map, struct bpf_stab, map)->progs;
+ case BPF_MAP_TYPE_SOCKHASH:
+ return &container_of(map, struct bpf_htab, map)->progs;
+ default:
+ break;
+ }
+
+ return NULL;
+}
+
+int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ u32 which)
+{
+ struct sk_psock_progs *progs = sock_map_progs(map);
+
+ if (!progs)
+ return -EOPNOTSUPP;
+
+ switch (which) {
+ case BPF_SK_MSG_VERDICT:
+ psock_set_prog(&progs->msg_parser, prog);
+ break;
+ case BPF_SK_SKB_STREAM_PARSER:
+ psock_set_prog(&progs->skb_parser, prog);
+ break;
+ case BPF_SK_SKB_STREAM_VERDICT:
+ psock_set_prog(&progs->skb_verdict, prog);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
+{
+ switch (link->map->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ return sock_map_delete_from_link(link->map, sk,
+ link->link_raw);
+ case BPF_MAP_TYPE_SOCKHASH:
+ return sock_hash_delete_from_link(link->map, sk,
+ link->link_raw);
+ default:
+ break;
+ }
+}
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7446b98661d8..58629314eae9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
new file mode 100644
index 000000000000..80debb0daf37
--- /dev/null
+++ b/net/ipv4/tcp_bpf.c
@@ -0,0 +1,655 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+
+#include <net/inet_common.h>
+
+static bool tcp_bpf_stream_read(const struct sock *sk)
+{
+ struct sk_psock *psock;
+ bool empty = true;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock))
+ empty = list_empty(&psock->ingress_msg);
+ rcu_read_unlock();
+ return !empty;
+}
+
+static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
+ int flags, long timeo, int *err)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = sk_wait_event(sk, &timeo,
+ !list_empty(&psock->ingress_msg) ||
+ !skb_queue_empty(&sk->sk_receive_queue), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+ struct msghdr *msg, int len)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ int i, ret, copied = 0;
+
+ while (copied != len) {
+ struct scatterlist *sge;
+ struct sk_msg *msg_rx;
+
+ msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+ struct sk_msg, list);
+ if (unlikely(!msg_rx))
+ break;
+
+ i = msg_rx->sg.start;
+ do {
+ struct page *page;
+ int copy;
+
+ sge = sk_msg_elem(msg_rx, i);
+ copy = sge->length;
+ page = sg_page(sge);
+ if (copied + copy > len)
+ copy = len - copied;
+ ret = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (ret != copy) {
+ msg_rx->sg.start = i;
+ return -EFAULT;
+ }
+
+ copied += copy;
+ sge->offset += copy;
+ sge->length -= copy;
+ sk_mem_uncharge(sk, copy);
+ if (!sge->length) {
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ if (!msg_rx->skb)
+ put_page(page);
+ }
+
+ if (copied == len)
+ break;
+ } while (i != msg_rx->sg.end);
+
+ msg_rx->sg.start = i;
+ if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+ list_del(&msg_rx->list);
+ if (msg_rx->skb)
+ consume_skb(msg_rx->skb);
+ kfree(msg_rx);
+ }
+ }
+
+ return copied;
+}
+EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
+
+int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ lock_sock(sk);
+msg_bytes_ready:
+ copied = __tcp_bpf_recvmsg(sk, psock, msg, len);
+ if (!copied) {
+ int data, err = 0;
+ long timeo;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+ if (data) {
+ if (skb_queue_empty(&sk->sk_receive_queue))
+ goto msg_bytes_ready;
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ }
+ if (err) {
+ ret = err;
+ goto out;
+ }
+ }
+ ret = copied;
+out:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
+static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg, u32 apply_bytes, int flags)
+{
+ bool apply = apply_bytes;
+ struct scatterlist *sge;
+ u32 size, copied = 0;
+ struct sk_msg *tmp;
+ int i, ret = 0;
+
+ tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
+ if (unlikely(!tmp))
+ return -ENOMEM;
+
+ lock_sock(sk);
+ tmp->sg.start = msg->sg.start;
+ i = msg->sg.start;
+ do {
+ sge = sk_msg_elem(msg, i);
+ size = (apply && apply_bytes < sge->length) ?
+ apply_bytes : sge->length;
+ if (!sk_wmem_schedule(sk, size)) {
+ if (!copied)
+ ret = -ENOMEM;
+ break;
+ }
+
+ sk_mem_charge(sk, size);
+ sk_msg_xfer(tmp, msg, i, size);
+ copied += size;
+ if (sge->length)
+ get_page(sk_msg_page(tmp, i));
+ sk_msg_iter_var_next(i);
+ tmp->sg.end = i;
+ if (apply) {
+ apply_bytes -= size;
+ if (!apply_bytes)
+ break;
+ }
+ } while (i != msg->sg.end);
+
+ if (!ret) {
+ msg->sg.start = i;
+ msg->sg.size -= apply_bytes;
+ sk_psock_queue_msg(psock, tmp);
+ sk->sk_data_ready(sk);
+ } else {
+ sk_msg_free(sk, tmp);
+ kfree(tmp);
+ }
+
+ release_sock(sk);
+ return ret;
+}
+
+static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
+ int flags, bool uncharge)
+{
+ bool apply = apply_bytes;
+ struct scatterlist *sge;
+ struct page *page;
+ int size, ret = 0;
+ u32 off;
+
+ while (1) {
+ sge = sk_msg_elem(msg, msg->sg.start);
+ size = (apply && apply_bytes < sge->length) ?
+ apply_bytes : sge->length;
+ off = sge->offset;
+ page = sg_page(sge);
+
+ tcp_rate_check_app_limited(sk);
+retry:
+ ret = do_tcp_sendpages(sk, page, off, size, flags);
+ if (ret <= 0)
+ return ret;
+ if (apply)
+ apply_bytes -= ret;
+ msg->sg.size -= ret;
+ sge->offset += ret;
+ sge->length -= ret;
+ if (uncharge)
+ sk_mem_uncharge(sk, ret);
+ if (ret != size) {
+ size -= ret;
+ off += ret;
+ goto retry;
+ }
+ if (!sge->length) {
+ put_page(page);
+ sk_msg_iter_next(msg, start);
+ sg_init_table(sge, 1);
+ if (msg->sg.start == msg->sg.end)
+ break;
+ }
+ if (apply && !apply_bytes)
+ break;
+ }
+
+ return 0;
+}
+
+static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
+ u32 apply_bytes, int flags, bool uncharge)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
+ release_sock(sk);
+ return ret;
+}
+
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
+ u32 bytes, int flags)
+{
+ bool ingress = sk_msg_to_ingress(msg);
+ struct sk_psock *psock = sk_psock_get(sk);
+ int ret;
+
+ if (unlikely(!psock)) {
+ sk_msg_free(sk, msg);
+ return 0;
+ }
+ ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
+ tcp_bpf_push_locked(sk, msg, bytes, flags, false);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+
+static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg, int *copied, int flags)
+{
+ bool cork = false, enospc = msg->sg.start == msg->sg.end;
+ struct sock *sk_redir;
+ u32 tosend;
+ int ret;
+
+more_data:
+ if (psock->eval == __SK_NONE)
+ psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+
+ if (msg->cork_bytes &&
+ msg->cork_bytes > msg->sg.size && !enospc) {
+ psock->cork_bytes = msg->cork_bytes - msg->sg.size;
+ if (!psock->cork) {
+ psock->cork = kzalloc(sizeof(*psock->cork),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!psock->cork)
+ return -ENOMEM;
+ }
+ memcpy(psock->cork, msg, sizeof(*msg));
+ return 0;
+ }
+
+ tosend = msg->sg.size;
+ if (psock->apply_bytes && psock->apply_bytes < tosend)
+ tosend = psock->apply_bytes;
+
+ switch (psock->eval) {
+ case __SK_PASS:
+ ret = tcp_bpf_push(sk, msg, tosend, flags, true);
+ if (unlikely(ret)) {
+ *copied -= sk_msg_free(sk, msg);
+ break;
+ }
+ sk_msg_apply_bytes(psock, tosend);
+ break;
+ case __SK_REDIRECT:
+ sk_redir = psock->sk_redir;
+ sk_msg_apply_bytes(psock, tosend);
+ if (psock->cork) {
+ cork = true;
+ psock->cork = NULL;
+ }
+ sk_msg_return(sk, msg, tosend);
+ release_sock(sk);
+ ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+ lock_sock(sk);
+ if (unlikely(ret < 0)) {
+ int free = sk_msg_free_nocharge(sk, msg);
+
+ if (!cork)
+ *copied -= free;
+ }
+ if (cork) {
+ sk_msg_free(sk, msg);
+ kfree(msg);
+ msg = NULL;
+ ret = 0;
+ }
+ break;
+ case __SK_DROP:
+ default:
+ sk_msg_free_partial(sk, msg, tosend);
+ sk_msg_apply_bytes(psock, tosend);
+ *copied -= tosend;
+ return -EACCES;
+ }
+
+ if (likely(!ret)) {
+ if (!psock->apply_bytes) {
+ psock->eval = __SK_NONE;
+ if (psock->sk_redir) {
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ }
+ if (msg &&
+ msg->sg.data[msg->sg.start].page_link &&
+ msg->sg.data[msg->sg.start].length)
+ goto more_data;
+ }
+ return ret;
+}
+
+static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_msg tmp, *msg_tx = NULL;
+ int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
+ int copied = 0, err = 0;
+ struct sk_psock *psock;
+ long timeo;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_sendmsg(sk, msg, size);
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ while (msg_data_left(msg)) {
+ bool enospc = false;
+ u32 copy, osize;
+
+ if (sk->sk_err) {
+ err = -sk->sk_err;
+ goto out_err;
+ }
+
+ copy = msg_data_left(msg);
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+ if (psock->cork) {
+ msg_tx = psock->cork;
+ } else {
+ msg_tx = &tmp;
+ sk_msg_init(msg_tx);
+ }
+
+ osize = msg_tx->sg.size;
+ err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
+ if (err) {
+ if (err != -ENOSPC)
+ goto wait_for_memory;
+ enospc = true;
+ copy = msg_tx->sg.size - osize;
+ }
+
+ err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
+ copy);
+ if (err < 0) {
+ sk_msg_trim(sk, msg_tx, osize);
+ goto out_err;
+ }
+
+ copied += copy;
+ if (psock->cork_bytes) {
+ if (size > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= size;
+ if (psock->cork_bytes && !enospc)
+ goto out_err;
+ /* All cork bytes are accounted, rerun the prog. */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
+ if (unlikely(err < 0))
+ goto out_err;
+ continue;
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err) {
+ if (msg_tx && msg_tx != psock->cork)
+ sk_msg_free(sk, msg_tx);
+ goto out_err;
+ }
+ }
+out_err:
+ if (err < 0)
+ err = sk_stream_error(sk, msg->msg_flags, err);
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied ? copied : err;
+}
+
+static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct sk_msg tmp, *msg = NULL;
+ int err = 0, copied = 0;
+ struct sk_psock *psock;
+ bool enospc = false;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_sendpage(sk, page, offset, size, flags);
+
+ lock_sock(sk);
+ if (psock->cork) {
+ msg = psock->cork;
+ } else {
+ msg = &tmp;
+ sk_msg_init(msg);
+ }
+
+ /* Catch case where ring is full and sendpage is stalled. */
+ if (unlikely(sk_msg_full(msg)))
+ goto out_err;
+
+ sk_msg_page_add(msg, page, size, offset);
+ sk_mem_charge(sk, size);
+ copied = size;
+ if (sk_msg_full(msg))
+ enospc = true;
+ if (psock->cork_bytes) {
+ if (size > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= size;
+ if (psock->cork_bytes && !enospc)
+ goto out_err;
+ /* All cork bytes are accounted, rerun the prog. */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
+out_err:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied ? copied : err;
+}
+
+static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ sk_psock_cork_free(psock);
+ __sk_psock_purge_ingress_msg(psock);
+ while ((link = sk_psock_link_pop(psock))) {
+ sk_psock_unlink(sk, link);
+ sk_psock_free_link(link);
+ }
+}
+
+static void tcp_bpf_unhash(struct sock *sk)
+{
+ void (*saved_unhash)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ if (sk->sk_prot->unhash)
+ sk->sk_prot->unhash(sk);
+ return;
+ }
+
+ saved_unhash = psock->saved_unhash;
+ tcp_bpf_remove(sk, psock);
+ rcu_read_unlock();
+ saved_unhash(sk);
+}
+
+static void tcp_bpf_close(struct sock *sk, long timeout)
+{
+ void (*saved_close)(struct sock *sk, long timeout);
+ struct sk_psock *psock;
+
+ lock_sock(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ release_sock(sk);
+ return sk->sk_prot->close(sk, timeout);
+ }
+
+ saved_close = psock->saved_close;
+ tcp_bpf_remove(sk, psock);
+ rcu_read_unlock();
+ release_sock(sk);
+ saved_close(sk, timeout);
+}
+
+enum {
+ TCP_BPF_IPV4,
+ TCP_BPF_IPV6,
+ TCP_BPF_NUM_PROTS,
+};
+
+enum {
+ TCP_BPF_BASE,
+ TCP_BPF_TX,
+ TCP_BPF_NUM_CFGS,
+};
+
+static struct proto *tcpv6_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
+
+static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
+ struct proto *base)
+{
+ prot[TCP_BPF_BASE] = *base;
+ prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash;
+ prot[TCP_BPF_BASE].close = tcp_bpf_close;
+ prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
+ prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read;
+
+ prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
+ prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
+ prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage;
+}
+
+static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+{
+ if (sk->sk_family == AF_INET6 &&
+ unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+ spin_lock_bh(&tcpv6_prot_lock);
+ if (likely(ops != tcpv6_prot_saved)) {
+ tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
+ smp_store_release(&tcpv6_prot_saved, ops);
+ }
+ spin_unlock_bh(&tcpv6_prot_lock);
+ }
+}
+
+static int __init tcp_bpf_v4_build_proto(void)
+{
+ tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
+ return 0;
+}
+core_initcall(tcp_bpf_v4_build_proto);
+
+static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+
+ sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
+}
+
+static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+
+ /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed
+ * or added requiring sk_prot hook updates. We keep original saved
+ * hooks in this case.
+ */
+ sk->sk_prot = &tcp_bpf_prots[family][config];
+}
+
+static int tcp_bpf_assert_proto_ops(struct proto *ops)
+{
+ /* In order to avoid retpoline, we make assumptions when we call
+ * into ops if e.g. a psock is not present. Make sure they are
+ * indeed valid assumptions.
+ */
+ return ops->recvmsg == tcp_recvmsg &&
+ ops->sendmsg == tcp_sendmsg &&
+ ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
+}
+
+void tcp_bpf_reinit(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ sock_owned_by_me(sk);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ tcp_bpf_reinit_sk_prot(sk, psock);
+ rcu_read_unlock();
+}
+
+int tcp_bpf_init(struct sock *sk)
+{
+ struct proto *ops = READ_ONCE(sk->sk_prot);
+ struct sk_psock *psock;
+
+ sock_owned_by_me(sk);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock || psock->sk_proto ||
+ tcp_bpf_assert_proto_ops(ops))) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ tcp_bpf_check_v6_needs_rebuild(sk, ops);
+ tcp_bpf_update_sk_prot(sk, psock);
+ rcu_read_unlock();
+ return 0;
+}
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index a5995bb2eaca..a9162aa11af9 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -6,7 +6,7 @@
*
*/
-#include<linux/module.h>
+#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
@@ -29,18 +29,6 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
return NULL;
}
-static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp)
-{
- struct tcp_ulp_ops *e;
-
- list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
- if (e->uid == ulp)
- return e;
- }
-
- return NULL;
-}
-
static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
{
const struct tcp_ulp_ops *ulp = NULL;
@@ -63,18 +51,6 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
return ulp;
}
-static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid)
-{
- const struct tcp_ulp_ops *ulp;
-
- rcu_read_lock();
- ulp = tcp_ulp_find_id(uid);
- if (!ulp || !try_module_get(ulp->owner))
- ulp = NULL;
- rcu_read_unlock();
- return ulp;
-}
-
/* Attach new upper layer protocol to the list
* of available protocols.
*/
@@ -123,6 +99,8 @@ void tcp_cleanup_ulp(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ sock_owned_by_me(sk);
+
if (!icsk->icsk_ulp_ops)
return;
@@ -133,54 +111,35 @@ void tcp_cleanup_ulp(struct sock *sk)
icsk->icsk_ulp_ops = NULL;
}
-/* Change upper layer protocol for socket */
-int tcp_set_ulp(struct sock *sk, const char *name)
+static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_ulp_ops *ulp_ops;
- int err = 0;
+ int err;
+ err = -EEXIST;
if (icsk->icsk_ulp_ops)
- return -EEXIST;
-
- ulp_ops = __tcp_ulp_find_autoload(name);
- if (!ulp_ops)
- return -ENOENT;
-
- if (!ulp_ops->user_visible) {
- module_put(ulp_ops->owner);
- return -ENOENT;
- }
+ goto out_err;
err = ulp_ops->init(sk);
- if (err) {
- module_put(ulp_ops->owner);
- return err;
- }
+ if (err)
+ goto out_err;
icsk->icsk_ulp_ops = ulp_ops;
return 0;
+out_err:
+ module_put(ulp_ops->owner);
+ return err;
}
-int tcp_set_ulp_id(struct sock *sk, int ulp)
+int tcp_set_ulp(struct sock *sk, const char *name)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_ulp_ops *ulp_ops;
- int err;
- if (icsk->icsk_ulp_ops)
- return -EEXIST;
+ sock_owned_by_me(sk);
- ulp_ops = __tcp_ulp_lookup(ulp);
+ ulp_ops = __tcp_ulp_find_autoload(name);
if (!ulp_ops)
return -ENOENT;
- err = ulp_ops->init(sk);
- if (err) {
- module_put(ulp_ops->owner);
- return err;
- }
-
- icsk->icsk_ulp_ops = ulp_ops;
- return 0;
+ return __tcp_set_ulp(sk, ulp_ops);
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e9c8cfdf4b4c..3f4d61017a69 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -901,6 +901,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
.inet6_bind = __inet6_bind,
+ .udp6_lib_lookup = __udp6_lib_lookup,
};
static int __init inet6_init(void)
diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig
index 6cff3f6d0c3a..94da19a2a220 100644
--- a/net/strparser/Kconfig
+++ b/net/strparser/Kconfig
@@ -1,4 +1,2 @@
-
config STREAM_PARSER
- tristate
- default n
+ def_bool n
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index 73f05ece53d0..99c1a19c17b1 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -8,6 +8,7 @@ config TLS
select CRYPTO_AES
select CRYPTO_GCM
select STREAM_PARSER
+ select NET_SOCK_MSG
default n
---help---
Enable kernel support for TLS protocol. This allows symmetric
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 961b07d4d41c..276edbc04f38 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -421,7 +421,7 @@ last_record:
tls_push_record_flags = flags;
if (more) {
tls_ctx->pending_open_record_frags =
- record->num_frags;
+ !!record->num_frags;
break;
}
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index b428069a1b05..e90b6d537077 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -620,12 +620,14 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage;
prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
- prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
- prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
+ prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
+ prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read;
+ prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
- prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
- prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
+ prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
+ prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read;
+ prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
#ifdef CONFIG_TLS_DEVICE
prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
@@ -724,7 +726,6 @@ static int __init tls_register(void)
build_protos(tls_prots[TLSV4], &tcp_prot);
tls_sw_proto_ops = inet_stream_ops;
- tls_sw_proto_ops.poll = tls_sw_poll;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
#ifdef CONFIG_TLS_DEVICE
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index aa9fdce272b6..a525fc4c2a4b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -4,6 +4,7 @@
* Copyright (c) 2016-2017, Lance Chao <lancerchao@fb.com>. All rights reserved.
* Copyright (c) 2016, Fridolin Pokorny <fridolin.pokorny@gmail.com>. All rights reserved.
* Copyright (c) 2016, Nikos Mavrogiannopoulos <nmav@gnutls.org>. All rights reserved.
+ * Copyright (c) 2018, Covalent IO, Inc. http://covalent.io
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -213,153 +214,89 @@ static int tls_do_decryption(struct sock *sk,
return ret;
}
-static void trim_sg(struct sock *sk, struct scatterlist *sg,
- int *sg_num_elem, unsigned int *sg_size, int target_size)
-{
- int i = *sg_num_elem - 1;
- int trim = *sg_size - target_size;
-
- if (trim <= 0) {
- WARN_ON(trim < 0);
- return;
- }
-
- *sg_size = target_size;
- while (trim >= sg[i].length) {
- trim -= sg[i].length;
- sk_mem_uncharge(sk, sg[i].length);
- put_page(sg_page(&sg[i]));
- i--;
-
- if (i < 0)
- goto out;
- }
-
- sg[i].length -= trim;
- sk_mem_uncharge(sk, trim);
-
-out:
- *sg_num_elem = i + 1;
-}
-
-static void trim_both_sgl(struct sock *sk, int target_size)
+static void tls_trim_both_msgs(struct sock *sk, int target_size)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct tls_rec *rec = ctx->open_rec;
- trim_sg(sk, &rec->sg_plaintext_data[1],
- &rec->sg_plaintext_num_elem,
- &rec->sg_plaintext_size,
- target_size);
-
+ sk_msg_trim(sk, &rec->msg_plaintext, target_size);
if (target_size > 0)
target_size += tls_ctx->tx.overhead_size;
-
- trim_sg(sk, &rec->sg_encrypted_data[1],
- &rec->sg_encrypted_num_elem,
- &rec->sg_encrypted_size,
- target_size);
+ sk_msg_trim(sk, &rec->msg_encrypted, target_size);
}
-static int alloc_encrypted_sg(struct sock *sk, int len)
+static int tls_alloc_encrypted_msg(struct sock *sk, int len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct tls_rec *rec = ctx->open_rec;
- int rc = 0;
-
- rc = sk_alloc_sg(sk, len,
- &rec->sg_encrypted_data[1], 0,
- &rec->sg_encrypted_num_elem,
- &rec->sg_encrypted_size, 0);
-
- if (rc == -ENOSPC)
- rec->sg_encrypted_num_elem =
- ARRAY_SIZE(rec->sg_encrypted_data) - 1;
+ struct sk_msg *msg_en = &rec->msg_encrypted;
- return rc;
+ return sk_msg_alloc(sk, msg_en, len, 0);
}
-static int move_to_plaintext_sg(struct sock *sk, int required_size)
+static int tls_clone_plaintext_msg(struct sock *sk, int required)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct tls_rec *rec = ctx->open_rec;
- struct scatterlist *plain_sg = &rec->sg_plaintext_data[1];
- struct scatterlist *enc_sg = &rec->sg_encrypted_data[1];
- int enc_sg_idx = 0;
+ struct sk_msg *msg_pl = &rec->msg_plaintext;
+ struct sk_msg *msg_en = &rec->msg_encrypted;
int skip, len;
- if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS)
- return -ENOSPC;
-
- /* We add page references worth len bytes from enc_sg at the
- * end of plain_sg. It is guaranteed that sg_encrypted_data
+ /* We add page references worth len bytes from encrypted sg
+ * at the end of plaintext sg. It is guaranteed that msg_en
* has enough required room (ensured by caller).
*/
- len = required_size - rec->sg_plaintext_size;
+ len = required - msg_pl->sg.size;
- /* Skip initial bytes in sg_encrypted_data to be able
- * to use same offset of both plain and encrypted data.
+ /* Skip initial bytes in msg_en's data to be able to use
+ * same offset of both plain and encrypted data.
*/
- skip = tls_ctx->tx.prepend_size + rec->sg_plaintext_size;
+ skip = tls_ctx->tx.prepend_size + msg_pl->sg.size;
- while (enc_sg_idx < rec->sg_encrypted_num_elem) {
- if (enc_sg[enc_sg_idx].length > skip)
- break;
-
- skip -= enc_sg[enc_sg_idx].length;
- enc_sg_idx++;
- }
-
- /* unmark the end of plain_sg*/
- sg_unmark_end(plain_sg + rec->sg_plaintext_num_elem - 1);
-
- while (len) {
- struct page *page = sg_page(&enc_sg[enc_sg_idx]);
- int bytes = enc_sg[enc_sg_idx].length - skip;
- int offset = enc_sg[enc_sg_idx].offset + skip;
-
- if (bytes > len)
- bytes = len;
- else
- enc_sg_idx++;
+ return sk_msg_clone(sk, msg_pl, msg_en, skip, len);
+}
- /* Skipping is required only one time */
- skip = 0;
+static struct tls_rec *tls_get_rec(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct sk_msg *msg_pl, *msg_en;
+ struct tls_rec *rec;
+ int mem_size;
- /* Increment page reference */
- get_page(page);
+ mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send);
- sg_set_page(&plain_sg[rec->sg_plaintext_num_elem], page,
- bytes, offset);
+ rec = kzalloc(mem_size, sk->sk_allocation);
+ if (!rec)
+ return NULL;
- sk_mem_charge(sk, bytes);
+ msg_pl = &rec->msg_plaintext;
+ msg_en = &rec->msg_encrypted;
- len -= bytes;
- rec->sg_plaintext_size += bytes;
+ sk_msg_init(msg_pl);
+ sk_msg_init(msg_en);
- rec->sg_plaintext_num_elem++;
+ sg_init_table(rec->sg_aead_in, 2);
+ sg_set_buf(&rec->sg_aead_in[0], rec->aad_space,
+ sizeof(rec->aad_space));
+ sg_unmark_end(&rec->sg_aead_in[1]);
- if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS)
- return -ENOSPC;
- }
+ sg_init_table(rec->sg_aead_out, 2);
+ sg_set_buf(&rec->sg_aead_out[0], rec->aad_space,
+ sizeof(rec->aad_space));
+ sg_unmark_end(&rec->sg_aead_out[1]);
- return 0;
+ return rec;
}
-static void free_sg(struct sock *sk, struct scatterlist *sg,
- int *sg_num_elem, unsigned int *sg_size)
+static void tls_free_rec(struct sock *sk, struct tls_rec *rec)
{
- int i, n = *sg_num_elem;
-
- for (i = 0; i < n; ++i) {
- sk_mem_uncharge(sk, sg[i].length);
- put_page(sg_page(&sg[i]));
- }
- *sg_num_elem = 0;
- *sg_size = 0;
+ sk_msg_free(sk, &rec->msg_encrypted);
+ sk_msg_free(sk, &rec->msg_plaintext);
+ kfree(rec);
}
static void tls_free_open_rec(struct sock *sk)
@@ -368,19 +305,10 @@ static void tls_free_open_rec(struct sock *sk)
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct tls_rec *rec = ctx->open_rec;
- /* Return if there is no open record */
- if (!rec)
- return;
-
- free_sg(sk, &rec->sg_encrypted_data[1],
- &rec->sg_encrypted_num_elem,
- &rec->sg_encrypted_size);
-
- free_sg(sk, &rec->sg_plaintext_data[1],
- &rec->sg_plaintext_num_elem,
- &rec->sg_plaintext_size);
-
- kfree(rec);
+ if (rec) {
+ tls_free_rec(sk, rec);
+ ctx->open_rec = NULL;
+ }
}
int tls_tx_records(struct sock *sk, int flags)
@@ -388,6 +316,7 @@ int tls_tx_records(struct sock *sk, int flags)
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct tls_rec *rec, *tmp;
+ struct sk_msg *msg_en;
int tx_flags, rc = 0;
if (tls_is_partially_sent_record(tls_ctx)) {
@@ -407,9 +336,7 @@ int tls_tx_records(struct sock *sk, int flags)
* Remove the head of tx_list
*/
list_del(&rec->list);
- free_sg(sk, &rec->sg_plaintext_data[1],
- &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size);
-
+ sk_msg_free(sk, &rec->msg_plaintext);
kfree(rec);
}
@@ -421,17 +348,15 @@ int tls_tx_records(struct sock *sk, int flags)
else
tx_flags = flags;
+ msg_en = &rec->msg_encrypted;
rc = tls_push_sg(sk, tls_ctx,
- &rec->sg_encrypted_data[1],
+ &msg_en->sg.data[msg_en->sg.curr],
0, tx_flags);
if (rc)
goto tx_err;
list_del(&rec->list);
- free_sg(sk, &rec->sg_plaintext_data[1],
- &rec->sg_plaintext_num_elem,
- &rec->sg_plaintext_size);
-
+ sk_msg_free(sk, &rec->msg_plaintext);
kfree(rec);
} else {
break;
@@ -451,15 +376,18 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
struct sock *sk = req->data;
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct scatterlist *sge;
+ struct sk_msg *msg_en;
struct tls_rec *rec;
bool ready = false;
int pending;
rec = container_of(aead_req, struct tls_rec, aead_req);
+ msg_en = &rec->msg_encrypted;
- rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size;
- rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size;
-
+ sge = sk_msg_elem(msg_en, msg_en->sg.curr);
+ sge->offset -= tls_ctx->tx.prepend_size;
+ sge->length += tls_ctx->tx.prepend_size;
/* Check if error is previously set on socket */
if (err || sk->sk_err) {
@@ -497,31 +425,29 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
/* Schedule the transmission */
if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
- schedule_delayed_work(&ctx->tx_work.work, 2);
+ schedule_delayed_work(&ctx->tx_work.work, 1);
}
static int tls_do_encryption(struct sock *sk,
struct tls_context *tls_ctx,
struct tls_sw_context_tx *ctx,
struct aead_request *aead_req,
- size_t data_len)
+ size_t data_len, u32 start)
{
struct tls_rec *rec = ctx->open_rec;
- struct scatterlist *plain_sg = rec->sg_plaintext_data;
- struct scatterlist *enc_sg = rec->sg_encrypted_data;
+ struct sk_msg *msg_en = &rec->msg_encrypted;
+ struct scatterlist *sge = sk_msg_elem(msg_en, start);
int rc;
- /* Skip the first index as it contains AAD data */
- rec->sg_encrypted_data[1].offset += tls_ctx->tx.prepend_size;
- rec->sg_encrypted_data[1].length -= tls_ctx->tx.prepend_size;
+ sge->offset += tls_ctx->tx.prepend_size;
+ sge->length -= tls_ctx->tx.prepend_size;
- /* If it is inplace crypto, then pass same SG list as both src, dst */
- if (rec->inplace_crypto)
- plain_sg = enc_sg;
+ msg_en->sg.curr = start;
aead_request_set_tfm(aead_req, ctx->aead_send);
aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
- aead_request_set_crypt(aead_req, plain_sg, enc_sg,
+ aead_request_set_crypt(aead_req, rec->sg_aead_in,
+ rec->sg_aead_out,
data_len, tls_ctx->tx.iv);
aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
@@ -534,8 +460,8 @@ static int tls_do_encryption(struct sock *sk,
rc = crypto_aead_encrypt(aead_req);
if (!rc || rc != -EINPROGRESS) {
atomic_dec(&ctx->encrypt_pending);
- rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size;
- rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size;
+ sge->offset -= tls_ctx->tx.prepend_size;
+ sge->length += tls_ctx->tx.prepend_size;
}
if (!rc) {
@@ -551,177 +477,318 @@ static int tls_do_encryption(struct sock *sk,
return rc;
}
+static int tls_split_open_record(struct sock *sk, struct tls_rec *from,
+ struct tls_rec **to, struct sk_msg *msg_opl,
+ struct sk_msg *msg_oen, u32 split_point,
+ u32 tx_overhead_size, u32 *orig_end)
+{
+ u32 i, j, bytes = 0, apply = msg_opl->apply_bytes;
+ struct scatterlist *sge, *osge, *nsge;
+ u32 orig_size = msg_opl->sg.size;
+ struct scatterlist tmp = { };
+ struct sk_msg *msg_npl;
+ struct tls_rec *new;
+ int ret;
+
+ new = tls_get_rec(sk);
+ if (!new)
+ return -ENOMEM;
+ ret = sk_msg_alloc(sk, &new->msg_encrypted, msg_opl->sg.size +
+ tx_overhead_size, 0);
+ if (ret < 0) {
+ tls_free_rec(sk, new);
+ return ret;
+ }
+
+ *orig_end = msg_opl->sg.end;
+ i = msg_opl->sg.start;
+ sge = sk_msg_elem(msg_opl, i);
+ while (apply && sge->length) {
+ if (sge->length > apply) {
+ u32 len = sge->length - apply;
+
+ get_page(sg_page(sge));
+ sg_set_page(&tmp, sg_page(sge), len,
+ sge->offset + apply);
+ sge->length = apply;
+ bytes += apply;
+ apply = 0;
+ } else {
+ apply -= sge->length;
+ bytes += sge->length;
+ }
+
+ sk_msg_iter_var_next(i);
+ if (i == msg_opl->sg.end)
+ break;
+ sge = sk_msg_elem(msg_opl, i);
+ }
+
+ msg_opl->sg.end = i;
+ msg_opl->sg.curr = i;
+ msg_opl->sg.copybreak = 0;
+ msg_opl->apply_bytes = 0;
+ msg_opl->sg.size = bytes;
+
+ msg_npl = &new->msg_plaintext;
+ msg_npl->apply_bytes = apply;
+ msg_npl->sg.size = orig_size - bytes;
+
+ j = msg_npl->sg.start;
+ nsge = sk_msg_elem(msg_npl, j);
+ if (tmp.length) {
+ memcpy(nsge, &tmp, sizeof(*nsge));
+ sk_msg_iter_var_next(j);
+ nsge = sk_msg_elem(msg_npl, j);
+ }
+
+ osge = sk_msg_elem(msg_opl, i);
+ while (osge->length) {
+ memcpy(nsge, osge, sizeof(*nsge));
+ sg_unmark_end(nsge);
+ sk_msg_iter_var_next(i);
+ sk_msg_iter_var_next(j);
+ if (i == *orig_end)
+ break;
+ osge = sk_msg_elem(msg_opl, i);
+ nsge = sk_msg_elem(msg_npl, j);
+ }
+
+ msg_npl->sg.end = j;
+ msg_npl->sg.curr = j;
+ msg_npl->sg.copybreak = 0;
+
+ *to = new;
+ return 0;
+}
+
+static void tls_merge_open_record(struct sock *sk, struct tls_rec *to,
+ struct tls_rec *from, u32 orig_end)
+{
+ struct sk_msg *msg_npl = &from->msg_plaintext;
+ struct sk_msg *msg_opl = &to->msg_plaintext;
+ struct scatterlist *osge, *nsge;
+ u32 i, j;
+
+ i = msg_opl->sg.end;
+ sk_msg_iter_var_prev(i);
+ j = msg_npl->sg.start;
+
+ osge = sk_msg_elem(msg_opl, i);
+ nsge = sk_msg_elem(msg_npl, j);
+
+ if (sg_page(osge) == sg_page(nsge) &&
+ osge->offset + osge->length == nsge->offset) {
+ osge->length += nsge->length;
+ put_page(sg_page(nsge));
+ }
+
+ msg_opl->sg.end = orig_end;
+ msg_opl->sg.curr = orig_end;
+ msg_opl->sg.copybreak = 0;
+ msg_opl->apply_bytes = msg_opl->sg.size + msg_npl->sg.size;
+ msg_opl->sg.size += msg_npl->sg.size;
+
+ sk_msg_free(sk, &to->msg_encrypted);
+ sk_msg_xfer_full(&to->msg_encrypted, &from->msg_encrypted);
+
+ kfree(from);
+}
+
static int tls_push_record(struct sock *sk, int flags,
unsigned char record_type)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
- struct tls_rec *rec = ctx->open_rec;
+ struct tls_rec *rec = ctx->open_rec, *tmp = NULL;
+ u32 i, split_point, uninitialized_var(orig_end);
+ struct sk_msg *msg_pl, *msg_en;
struct aead_request *req;
+ bool split;
int rc;
if (!rec)
return 0;
+ msg_pl = &rec->msg_plaintext;
+ msg_en = &rec->msg_encrypted;
+
+ split_point = msg_pl->apply_bytes;
+ split = split_point && split_point < msg_pl->sg.size;
+ if (split) {
+ rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en,
+ split_point, tls_ctx->tx.overhead_size,
+ &orig_end);
+ if (rc < 0)
+ return rc;
+ sk_msg_trim(sk, msg_en, msg_pl->sg.size +
+ tls_ctx->tx.overhead_size);
+ }
+
rec->tx_flags = flags;
req = &rec->aead_req;
- sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem);
- sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem);
+ i = msg_pl->sg.end;
+ sk_msg_iter_var_prev(i);
+ sg_mark_end(sk_msg_elem(msg_pl, i));
+
+ i = msg_pl->sg.start;
+ sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ?
+ &msg_en->sg.data[i] : &msg_pl->sg.data[i]);
+
+ i = msg_en->sg.end;
+ sk_msg_iter_var_prev(i);
+ sg_mark_end(sk_msg_elem(msg_en, i));
- tls_make_aad(rec->aad_space, rec->sg_plaintext_size,
+ i = msg_en->sg.start;
+ sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]);
+
+ tls_make_aad(rec->aad_space, msg_pl->sg.size,
tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size,
record_type);
tls_fill_prepend(tls_ctx,
- page_address(sg_page(&rec->sg_encrypted_data[1])) +
- rec->sg_encrypted_data[1].offset,
- rec->sg_plaintext_size, record_type);
-
- tls_ctx->pending_open_record_frags = 0;
+ page_address(sg_page(&msg_en->sg.data[i])) +
+ msg_en->sg.data[i].offset, msg_pl->sg.size,
+ record_type);
- rc = tls_do_encryption(sk, tls_ctx, ctx, req, rec->sg_plaintext_size);
- if (rc == -EINPROGRESS)
- return -EINPROGRESS;
+ tls_ctx->pending_open_record_frags = false;
+ rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i);
if (rc < 0) {
- tls_err_abort(sk, EBADMSG);
+ if (rc != -EINPROGRESS) {
+ tls_err_abort(sk, EBADMSG);
+ if (split) {
+ tls_ctx->pending_open_record_frags = true;
+ tls_merge_open_record(sk, rec, tmp, orig_end);
+ }
+ }
return rc;
+ } else if (split) {
+ msg_pl = &tmp->msg_plaintext;
+ msg_en = &tmp->msg_encrypted;
+ sk_msg_trim(sk, msg_en, msg_pl->sg.size +
+ tls_ctx->tx.overhead_size);
+ tls_ctx->pending_open_record_frags = true;
+ ctx->open_rec = tmp;
}
return tls_tx_records(sk, flags);
}
-static int tls_sw_push_pending_record(struct sock *sk, int flags)
-{
- return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA);
-}
-
-static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
- int length, int *pages_used,
- unsigned int *size_used,
- struct scatterlist *to, int to_max_pages,
- bool charge)
+static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
+ bool full_record, u8 record_type,
+ size_t *copied, int flags)
{
- struct page *pages[MAX_SKB_FRAGS];
-
- size_t offset;
- ssize_t copied, use;
- int i = 0;
- unsigned int size = *size_used;
- int num_elem = *pages_used;
- int rc = 0;
- int maxpages;
-
- while (length > 0) {
- i = 0;
- maxpages = to_max_pages - num_elem;
- if (maxpages == 0) {
- rc = -EFAULT;
- goto out;
- }
- copied = iov_iter_get_pages(from, pages,
- length,
- maxpages, &offset);
- if (copied <= 0) {
- rc = -EFAULT;
- goto out;
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct sk_msg msg_redir = { };
+ struct sk_psock *psock;
+ struct sock *sk_redir;
+ struct tls_rec *rec;
+ int err = 0, send;
+ bool enospc;
+
+ psock = sk_psock_get(sk);
+ if (!psock)
+ return tls_push_record(sk, flags, record_type);
+more_data:
+ enospc = sk_msg_full(msg);
+ if (psock->eval == __SK_NONE)
+ psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+ if (msg->cork_bytes && msg->cork_bytes > msg->sg.size &&
+ !enospc && !full_record) {
+ err = -ENOSPC;
+ goto out_err;
+ }
+ msg->cork_bytes = 0;
+ send = msg->sg.size;
+ if (msg->apply_bytes && msg->apply_bytes < send)
+ send = msg->apply_bytes;
+
+ switch (psock->eval) {
+ case __SK_PASS:
+ err = tls_push_record(sk, flags, record_type);
+ if (err < 0) {
+ *copied -= sk_msg_free(sk, msg);
+ tls_free_open_rec(sk);
+ goto out_err;
}
-
- iov_iter_advance(from, copied);
-
- length -= copied;
- size += copied;
- while (copied) {
- use = min_t(int, copied, PAGE_SIZE - offset);
-
- sg_set_page(&to[num_elem],
- pages[i], use, offset);
- sg_unmark_end(&to[num_elem]);
- if (charge)
- sk_mem_charge(sk, use);
-
- offset = 0;
- copied -= use;
-
- ++i;
- ++num_elem;
+ break;
+ case __SK_REDIRECT:
+ sk_redir = psock->sk_redir;
+ memcpy(&msg_redir, msg, sizeof(*msg));
+ if (msg->apply_bytes < send)
+ msg->apply_bytes = 0;
+ else
+ msg->apply_bytes -= send;
+ sk_msg_return_zero(sk, msg, send);
+ msg->sg.size -= send;
+ release_sock(sk);
+ err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags);
+ lock_sock(sk);
+ if (err < 0) {
+ *copied -= sk_msg_free_nocharge(sk, &msg_redir);
+ msg->sg.size = 0;
}
+ if (msg->sg.size == 0)
+ tls_free_open_rec(sk);
+ break;
+ case __SK_DROP:
+ default:
+ sk_msg_free_partial(sk, msg, send);
+ if (msg->apply_bytes < send)
+ msg->apply_bytes = 0;
+ else
+ msg->apply_bytes -= send;
+ if (msg->sg.size == 0)
+ tls_free_open_rec(sk);
+ *copied -= send;
+ err = -EACCES;
}
- /* Mark the end in the last sg entry if newly added */
- if (num_elem > *pages_used)
- sg_mark_end(&to[num_elem - 1]);
-out:
- if (rc)
- iov_iter_revert(from, size - *size_used);
- *size_used = size;
- *pages_used = num_elem;
+ if (likely(!err)) {
+ bool reset_eval = !ctx->open_rec;
- return rc;
-}
-
-static int memcopy_from_iter(struct sock *sk, struct iov_iter *from,
- int bytes)
-{
- struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
- struct tls_rec *rec = ctx->open_rec;
- struct scatterlist *sg = &rec->sg_plaintext_data[1];
- int copy, i, rc = 0;
-
- for (i = tls_ctx->pending_open_record_frags;
- i < rec->sg_plaintext_num_elem; ++i) {
- copy = sg[i].length;
- if (copy_from_iter(
- page_address(sg_page(&sg[i])) + sg[i].offset,
- copy, from) != copy) {
- rc = -EFAULT;
- goto out;
+ rec = ctx->open_rec;
+ if (rec) {
+ msg = &rec->msg_plaintext;
+ if (!msg->apply_bytes)
+ reset_eval = true;
}
- bytes -= copy;
-
- ++tls_ctx->pending_open_record_frags;
-
- if (!bytes)
- break;
+ if (reset_eval) {
+ psock->eval = __SK_NONE;
+ if (psock->sk_redir) {
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ }
+ if (rec)
+ goto more_data;
}
-
-out:
- return rc;
+ out_err:
+ sk_psock_put(sk, psock);
+ return err;
}
-static struct tls_rec *get_rec(struct sock *sk)
+static int tls_sw_push_pending_record(struct sock *sk, int flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
- struct tls_rec *rec;
- int mem_size;
-
- /* Return if we already have an open record */
- if (ctx->open_rec)
- return ctx->open_rec;
-
- mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send);
+ struct tls_rec *rec = ctx->open_rec;
+ struct sk_msg *msg_pl;
+ size_t copied;
- rec = kzalloc(mem_size, sk->sk_allocation);
if (!rec)
- return NULL;
-
- sg_init_table(&rec->sg_plaintext_data[0],
- ARRAY_SIZE(rec->sg_plaintext_data));
- sg_init_table(&rec->sg_encrypted_data[0],
- ARRAY_SIZE(rec->sg_encrypted_data));
-
- sg_set_buf(&rec->sg_plaintext_data[0], rec->aad_space,
- sizeof(rec->aad_space));
- sg_set_buf(&rec->sg_encrypted_data[0], rec->aad_space,
- sizeof(rec->aad_space));
+ return 0;
- ctx->open_rec = rec;
- rec->inplace_crypto = 1;
+ msg_pl = &rec->msg_plaintext;
+ copied = msg_pl->sg.size;
+ if (!copied)
+ return 0;
- return rec;
+ return bpf_exec_tx_verdict(msg_pl, sk, true, TLS_RECORD_TYPE_DATA,
+ &copied, flags);
}
int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
@@ -735,6 +802,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
bool is_kvec = msg->msg_iter.type & ITER_KVEC;
bool eor = !(msg->msg_flags & MSG_MORE);
size_t try_to_copy, copied = 0;
+ struct sk_msg *msg_pl, *msg_en;
struct tls_rec *rec;
int required_size;
int num_async = 0;
@@ -772,29 +840,35 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
goto send_end;
}
- rec = get_rec(sk);
+ if (ctx->open_rec)
+ rec = ctx->open_rec;
+ else
+ rec = ctx->open_rec = tls_get_rec(sk);
if (!rec) {
ret = -ENOMEM;
goto send_end;
}
- orig_size = rec->sg_plaintext_size;
+ msg_pl = &rec->msg_plaintext;
+ msg_en = &rec->msg_encrypted;
+
+ orig_size = msg_pl->sg.size;
full_record = false;
try_to_copy = msg_data_left(msg);
- record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size;
+ record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
if (try_to_copy >= record_room) {
try_to_copy = record_room;
full_record = true;
}
- required_size = rec->sg_plaintext_size + try_to_copy +
+ required_size = msg_pl->sg.size + try_to_copy +
tls_ctx->tx.overhead_size;
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
alloc_encrypted:
- ret = alloc_encrypted_sg(sk, required_size);
+ ret = tls_alloc_encrypted_msg(sk, required_size);
if (ret) {
if (ret != -ENOSPC)
goto wait_for_memory;
@@ -803,17 +877,15 @@ alloc_encrypted:
* actually allocated. The difference is due
* to max sg elements limit
*/
- try_to_copy -= required_size - rec->sg_encrypted_size;
+ try_to_copy -= required_size - msg_en->sg.size;
full_record = true;
}
if (!is_kvec && (full_record || eor) && !async_capable) {
- ret = zerocopy_from_iter(sk, &msg->msg_iter,
- try_to_copy, &rec->sg_plaintext_num_elem,
- &rec->sg_plaintext_size,
- &rec->sg_plaintext_data[1],
- ARRAY_SIZE(rec->sg_plaintext_data) - 1,
- true);
+ u32 first = msg_pl->sg.end;
+
+ ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter,
+ msg_pl, try_to_copy);
if (ret)
goto fallback_to_reg_send;
@@ -821,25 +893,34 @@ alloc_encrypted:
num_zc++;
copied += try_to_copy;
- ret = tls_push_record(sk, msg->msg_flags, record_type);
+
+ sk_msg_sg_copy_set(msg_pl, first);
+ ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
+ record_type, &copied,
+ msg->msg_flags);
if (ret) {
if (ret == -EINPROGRESS)
num_async++;
+ else if (ret == -ENOMEM)
+ goto wait_for_memory;
+ else if (ret == -ENOSPC)
+ goto rollback_iter;
else if (ret != -EAGAIN)
goto send_end;
}
continue;
-
+rollback_iter:
+ copied -= try_to_copy;
+ sk_msg_sg_copy_clear(msg_pl, first);
+ iov_iter_revert(&msg->msg_iter,
+ msg_pl->sg.size - orig_size);
fallback_to_reg_send:
- trim_sg(sk, &rec->sg_plaintext_data[1],
- &rec->sg_plaintext_num_elem,
- &rec->sg_plaintext_size,
- orig_size);
+ sk_msg_trim(sk, msg_pl, orig_size);
}
- required_size = rec->sg_plaintext_size + try_to_copy;
+ required_size = msg_pl->sg.size + try_to_copy;
- ret = move_to_plaintext_sg(sk, required_size);
+ ret = tls_clone_plaintext_msg(sk, required_size);
if (ret) {
if (ret != -ENOSPC)
goto send_end;
@@ -848,28 +929,36 @@ fallback_to_reg_send:
* actually allocated. The difference is due
* to max sg elements limit
*/
- try_to_copy -= required_size - rec->sg_plaintext_size;
+ try_to_copy -= required_size - msg_pl->sg.size;
full_record = true;
-
- trim_sg(sk, &rec->sg_encrypted_data[1],
- &rec->sg_encrypted_num_elem,
- &rec->sg_encrypted_size,
- rec->sg_plaintext_size +
- tls_ctx->tx.overhead_size);
+ sk_msg_trim(sk, msg_en, msg_pl->sg.size +
+ tls_ctx->tx.overhead_size);
}
- ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy);
- if (ret)
+ ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl,
+ try_to_copy);
+ if (ret < 0)
goto trim_sgl;
+ /* Open records defined only if successfully copied, otherwise
+ * we would trim the sg but not reset the open record frags.
+ */
+ tls_ctx->pending_open_record_frags = true;
copied += try_to_copy;
if (full_record || eor) {
- ret = tls_push_record(sk, msg->msg_flags, record_type);
+ ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
+ record_type, &copied,
+ msg->msg_flags);
if (ret) {
if (ret == -EINPROGRESS)
num_async++;
- else if (ret != -EAGAIN)
+ else if (ret == -ENOMEM)
+ goto wait_for_memory;
+ else if (ret != -EAGAIN) {
+ if (ret == -ENOSPC)
+ ret = 0;
goto send_end;
+ }
}
}
@@ -881,11 +970,11 @@ wait_for_memory:
ret = sk_stream_wait_memory(sk, &timeo);
if (ret) {
trim_sgl:
- trim_both_sgl(sk, orig_size);
+ tls_trim_both_msgs(sk, orig_size);
goto send_end;
}
- if (rec->sg_encrypted_size < required_size)
+ if (msg_en->sg.size < required_size)
goto alloc_encrypted;
}
@@ -928,10 +1017,10 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
unsigned char record_type = TLS_RECORD_TYPE_DATA;
- size_t orig_size = size;
- struct scatterlist *sg;
+ struct sk_msg *msg_pl;
struct tls_rec *rec;
int num_async = 0;
+ size_t copied = 0;
bool full_record;
int record_room;
int ret = 0;
@@ -964,26 +1053,33 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
goto sendpage_end;
}
- rec = get_rec(sk);
+ if (ctx->open_rec)
+ rec = ctx->open_rec;
+ else
+ rec = ctx->open_rec = tls_get_rec(sk);
if (!rec) {
ret = -ENOMEM;
goto sendpage_end;
}
+ msg_pl = &rec->msg_plaintext;
+
full_record = false;
- record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size;
+ record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
+ copied = 0;
copy = size;
if (copy >= record_room) {
copy = record_room;
full_record = true;
}
- required_size = rec->sg_plaintext_size + copy +
- tls_ctx->tx.overhead_size;
+
+ required_size = msg_pl->sg.size + copy +
+ tls_ctx->tx.overhead_size;
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
alloc_payload:
- ret = alloc_encrypted_sg(sk, required_size);
+ ret = tls_alloc_encrypted_msg(sk, required_size);
if (ret) {
if (ret != -ENOSPC)
goto wait_for_memory;
@@ -992,33 +1088,32 @@ alloc_payload:
* actually allocated. The difference is due
* to max sg elements limit
*/
- copy -= required_size - rec->sg_plaintext_size;
+ copy -= required_size - msg_pl->sg.size;
full_record = true;
}
- get_page(page);
- sg = &rec->sg_plaintext_data[1] + rec->sg_plaintext_num_elem;
- sg_set_page(sg, page, copy, offset);
- sg_unmark_end(sg);
-
- rec->sg_plaintext_num_elem++;
-
+ sk_msg_page_add(msg_pl, page, copy, offset);
sk_mem_charge(sk, copy);
+
offset += copy;
size -= copy;
- rec->sg_plaintext_size += copy;
- tls_ctx->pending_open_record_frags = rec->sg_plaintext_num_elem;
+ copied += copy;
- if (full_record || eor ||
- rec->sg_plaintext_num_elem ==
- ARRAY_SIZE(rec->sg_plaintext_data) - 1) {
+ tls_ctx->pending_open_record_frags = true;
+ if (full_record || eor || sk_msg_full(msg_pl)) {
rec->inplace_crypto = 0;
- ret = tls_push_record(sk, flags, record_type);
+ ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
+ record_type, &copied, flags);
if (ret) {
if (ret == -EINPROGRESS)
num_async++;
- else if (ret != -EAGAIN)
+ else if (ret == -ENOMEM)
+ goto wait_for_memory;
+ else if (ret != -EAGAIN) {
+ if (ret == -ENOSPC)
+ ret = 0;
goto sendpage_end;
+ }
}
}
continue;
@@ -1027,7 +1122,7 @@ wait_for_sndbuf:
wait_for_memory:
ret = sk_stream_wait_memory(sk, &timeo);
if (ret) {
- trim_both_sgl(sk, rec->sg_plaintext_size);
+ tls_trim_both_msgs(sk, msg_pl->sg.size);
goto sendpage_end;
}
@@ -1042,24 +1137,20 @@ wait_for_memory:
}
}
sendpage_end:
- if (orig_size > size)
- ret = orig_size - size;
- else
- ret = sk_stream_error(sk, flags, ret);
-
+ ret = sk_stream_error(sk, flags, ret);
release_sock(sk);
- return ret;
+ return copied ? copied : ret;
}
-static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
- long timeo, int *err)
+static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock,
+ int flags, long timeo, int *err)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct sk_buff *skb;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
- while (!(skb = ctx->recv_pkt)) {
+ while (!(skb = ctx->recv_pkt) && sk_psock_queue_empty(psock)) {
if (sk->sk_err) {
*err = sock_error(sk);
return NULL;
@@ -1078,7 +1169,10 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait);
+ sk_wait_event(sk, &timeo,
+ ctx->recv_pkt != skb ||
+ !sk_psock_queue_empty(psock),
+ &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
remove_wait_queue(sk_sleep(sk), &wait);
@@ -1092,6 +1186,64 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
return skb;
}
+static int tls_setup_from_iter(struct sock *sk, struct iov_iter *from,
+ int length, int *pages_used,
+ unsigned int *size_used,
+ struct scatterlist *to,
+ int to_max_pages)
+{
+ int rc = 0, i = 0, num_elem = *pages_used, maxpages;
+ struct page *pages[MAX_SKB_FRAGS];
+ unsigned int size = *size_used;
+ ssize_t copied, use;
+ size_t offset;
+
+ while (length > 0) {
+ i = 0;
+ maxpages = to_max_pages - num_elem;
+ if (maxpages == 0) {
+ rc = -EFAULT;
+ goto out;
+ }
+ copied = iov_iter_get_pages(from, pages,
+ length,
+ maxpages, &offset);
+ if (copied <= 0) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ iov_iter_advance(from, copied);
+
+ length -= copied;
+ size += copied;
+ while (copied) {
+ use = min_t(int, copied, PAGE_SIZE - offset);
+
+ sg_set_page(&to[num_elem],
+ pages[i], use, offset);
+ sg_unmark_end(&to[num_elem]);
+ /* We do not uncharge memory from this API */
+
+ offset = 0;
+ copied -= use;
+
+ i++;
+ num_elem++;
+ }
+ }
+ /* Mark the end in the last sg entry if newly added */
+ if (num_elem > *pages_used)
+ sg_mark_end(&to[num_elem - 1]);
+out:
+ if (rc)
+ iov_iter_revert(from, size - *size_used);
+ *size_used = size;
+ *pages_used = num_elem;
+
+ return rc;
+}
+
/* This function decrypts the input skb into either out_iov or in out_sg
* or in skb buffers itself. The input parameter 'zc' indicates if
* zero-copy mode needs to be tried or not. With zero-copy mode, either
@@ -1189,9 +1341,9 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE);
*chunk = 0;
- err = zerocopy_from_iter(sk, out_iov, data_len, &pages,
- chunk, &sgout[1],
- (n_sgout - 1), false);
+ err = tls_setup_from_iter(sk, out_iov, data_len,
+ &pages, chunk, &sgout[1],
+ (n_sgout - 1));
if (err < 0)
goto fallback_to_reg_recv;
} else if (out_sg) {
@@ -1297,6 +1449,7 @@ int tls_sw_recvmsg(struct sock *sk,
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct sk_psock *psock;
unsigned char control;
struct strp_msg *rxm;
struct sk_buff *skb;
@@ -1312,6 +1465,7 @@ int tls_sw_recvmsg(struct sock *sk,
if (unlikely(flags & MSG_ERRQUEUE))
return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
+ psock = sk_psock_get(sk);
lock_sock(sk);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
@@ -1321,9 +1475,19 @@ int tls_sw_recvmsg(struct sock *sk,
bool async = false;
int chunk = 0;
- skb = tls_wait_data(sk, flags, timeo, &err);
- if (!skb)
+ skb = tls_wait_data(sk, psock, flags, timeo, &err);
+ if (!skb) {
+ if (psock) {
+ int ret = __tcp_bpf_recvmsg(sk, psock, msg, len);
+
+ if (ret > 0) {
+ copied += ret;
+ len -= ret;
+ continue;
+ }
+ }
goto recv_end;
+ }
rxm = strp_msg(skb);
@@ -1429,6 +1593,8 @@ recv_end:
}
release_sock(sk);
+ if (psock)
+ sk_psock_put(sk, psock);
return copied ? : err;
}
@@ -1451,7 +1617,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
- skb = tls_wait_data(sk, flags, timeo, &err);
+ skb = tls_wait_data(sk, NULL, flags, timeo, &err);
if (!skb)
goto splice_read_end;
@@ -1485,23 +1651,20 @@ splice_read_end:
return copied ? : err;
}
-unsigned int tls_sw_poll(struct file *file, struct socket *sock,
- struct poll_table_struct *wait)
+bool tls_sw_stream_read(const struct sock *sk)
{
- unsigned int ret;
- struct sock *sk = sock->sk;
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ bool ingress_empty = true;
+ struct sk_psock *psock;
- /* Grab POLLOUT and POLLHUP from the underlying socket */
- ret = ctx->sk_poll(file, sock, wait);
-
- /* Clear POLLIN bits, and set based on recv_pkt */
- ret &= ~(POLLIN | POLLRDNORM);
- if (ctx->recv_pkt)
- ret |= POLLIN | POLLRDNORM;
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock)
+ ingress_empty = list_empty(&psock->ingress_msg);
+ rcu_read_unlock();
- return ret;
+ return !ingress_empty || ctx->recv_pkt;
}
static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
@@ -1580,8 +1743,15 @@ static void tls_data_ready(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct sk_psock *psock;
strp_data_ready(&ctx->strp);
+
+ psock = sk_psock_get(sk);
+ if (psock && !list_empty(&psock->ingress_msg)) {
+ ctx->saved_data_ready(sk);
+ sk_psock_put(sk, psock);
+ }
}
void tls_sw_free_resources_tx(struct sock *sk)
@@ -1619,25 +1789,15 @@ void tls_sw_free_resources_tx(struct sock *sk)
rec = list_first_entry(&ctx->tx_list,
struct tls_rec, list);
-
- free_sg(sk, &rec->sg_plaintext_data[1],
- &rec->sg_plaintext_num_elem,
- &rec->sg_plaintext_size);
-
list_del(&rec->list);
+ sk_msg_free(sk, &rec->msg_plaintext);
kfree(rec);
}
list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
- free_sg(sk, &rec->sg_encrypted_data[1],
- &rec->sg_encrypted_num_elem,
- &rec->sg_encrypted_size);
-
- free_sg(sk, &rec->sg_plaintext_data[1],
- &rec->sg_plaintext_num_elem,
- &rec->sg_plaintext_size);
-
list_del(&rec->list);
+ sk_msg_free(sk, &rec->msg_encrypted);
+ sk_msg_free(sk, &rec->msg_plaintext);
kfree(rec);
}
@@ -1829,8 +1989,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
sk->sk_data_ready = tls_data_ready;
write_unlock_bh(&sk->sk_callback_lock);
- sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
-
strp_check_rcv(&sw_ctx_rx->strp);
}
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index a6258bc8ec4f..3497f2d80328 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -15,13 +15,15 @@ SYNOPSIS
*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } }
*COMMANDS* :=
- { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
- | **pin** | **help** }
+ { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext**
+ | **delete** | **pin** | **help** }
MAP COMMANDS
=============
| **bpftool** **map { show | list }** [*MAP*]
+| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \
+| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*]
| **bpftool** **map dump** *MAP*
| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*]
| **bpftool** **map lookup** *MAP* **key** *DATA*
@@ -36,6 +38,11 @@ MAP COMMANDS
| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
| *VALUE* := { *DATA* | *MAP* | *PROG* }
| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** }
+| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash**
+| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash**
+| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps**
+| | **devmap** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
+| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** }
DESCRIPTION
===========
@@ -47,6 +54,10 @@ DESCRIPTION
Output will start with map ID followed by map type and
zero or more named attributes (depending on kernel version).
+ **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*]
+ Create a new map with given parameters and pin it to *bpffs*
+ as *FILE*.
+
**bpftool map dump** *MAP*
Dump all entries in a given *MAP*.
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 64156a16d530..12c803003ab2 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -25,6 +25,8 @@ MAP COMMANDS
| **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes**}]
| **bpftool** **prog pin** *PROG* *FILE*
| **bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
+| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* *MAP*
+| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* *MAP*
| **bpftool** **prog help**
|
| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
@@ -37,6 +39,7 @@ MAP COMMANDS
| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
| **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6**
| }
+| *ATTACH_TYPE* := { **msg_verdict** | **skb_verdict** | **skb_parse** }
DESCRIPTION
@@ -90,6 +93,14 @@ DESCRIPTION
Note: *FILE* must be located in *bpffs* mount.
+ **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP*
+ Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
+ to the map *MAP*.
+
+ **bpftool prog detach** *PROG* *ATTACH_TYPE* *MAP*
+ Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
+ from the map *MAP*.
+
**bpftool prog help**
Print short help message.
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 8dda77daeda9..04cd4f92ab89 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -22,11 +22,11 @@ SYNOPSIS
| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
*MAP-COMMANDS* :=
- { **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
- | **pin** | **event_pipe** | **help** }
+ { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext**
+ | **delete** | **pin** | **event_pipe** | **help** }
*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin**
- | **load** | **help** }
+ | **load** | **attach** | **detach** | **help** }
*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
@@ -57,6 +57,10 @@ OPTIONS
-p, --pretty
Generate human-readable JSON output. Implies **-j**.
+ -m, --mapcompat
+ Allow loading maps with unknown map definitions.
+
+
SEE ALSO
========
**bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 74288a2197ab..dac7eff4c7e5 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -46,6 +46,13 @@ CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \
-I$(srctree)/tools/lib/bpf \
-I$(srctree)/tools/perf
CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"'
+ifneq ($(EXTRA_CFLAGS),)
+CFLAGS += $(EXTRA_CFLAGS)
+endif
+ifneq ($(EXTRA_LDFLAGS),)
+LDFLAGS += $(EXTRA_LDFLAGS)
+endif
+
LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
INSTALL ?= install
@@ -90,7 +97,7 @@ $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
$(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
- $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
+ $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
$(OUTPUT)%.o: %.c
$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index df1060b852c1..c56545e87b0d 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -184,7 +184,7 @@ _bpftool()
# Deal with options
if [[ ${words[cword]} == -* ]]; then
- local c='--version --json --pretty --bpffs'
+ local c='--version --json --pretty --bpffs --mapcompat'
COMPREPLY=( $( compgen -W "$c" -- "$cur" ) )
return 0
fi
@@ -292,6 +292,23 @@ _bpftool()
fi
return 0
;;
+ attach|detach)
+ if [[ ${#words[@]} == 7 ]]; then
+ COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) )
+ return 0
+ fi
+
+ if [[ ${#words[@]} == 6 ]]; then
+ COMPREPLY=( $( compgen -W "msg_verdict skb_verdict skb_parse" -- "$cur" ) )
+ return 0
+ fi
+
+ if [[ $prev == "$command" ]]; then
+ COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) )
+ return 0
+ fi
+ return 0
+ ;;
load)
local obj
@@ -347,7 +364,7 @@ _bpftool()
;;
*)
[[ $prev == $object ]] && \
- COMPREPLY=( $( compgen -W 'dump help pin load \
+ COMPREPLY=( $( compgen -W 'dump help pin attach detach load \
show list' -- "$cur" ) )
;;
esac
@@ -370,6 +387,42 @@ _bpftool()
;;
esac
;;
+ create)
+ case $prev in
+ $command)
+ _filedir
+ return 0
+ ;;
+ type)
+ COMPREPLY=( $( compgen -W 'hash array prog_array \
+ perf_event_array percpu_hash percpu_array \
+ stack_trace cgroup_array lru_hash \
+ lru_percpu_hash lpm_trie array_of_maps \
+ hash_of_maps devmap sockmap cpumap xskmap \
+ sockhash cgroup_storage reuseport_sockarray \
+ percpu_cgroup_storage' -- \
+ "$cur" ) )
+ return 0
+ ;;
+ key|value|flags|name|entries)
+ return 0
+ ;;
+ dev)
+ _sysfs_get_netdevs
+ return 0
+ ;;
+ *)
+ _bpftool_once_attr 'type'
+ _bpftool_once_attr 'key'
+ _bpftool_once_attr 'value'
+ _bpftool_once_attr 'entries'
+ _bpftool_once_attr 'name'
+ _bpftool_once_attr 'flags'
+ _bpftool_once_attr 'dev'
+ return 0
+ ;;
+ esac
+ ;;
lookup|getnext|delete)
case $prev in
$command)
@@ -483,7 +536,7 @@ _bpftool()
*)
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'delete dump getnext help \
- lookup pin event_pipe show list update' -- \
+ lookup pin event_pipe show list update create' -- \
"$cur" ) )
;;
esac
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index b3a0709ea7ed..3318da8060bd 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -618,3 +618,24 @@ void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode)
jsonw_string_field(json_wtr, "ifname", name);
jsonw_end_object(json_wtr);
}
+
+int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what)
+{
+ char *endptr;
+
+ NEXT_ARGP();
+
+ if (*val) {
+ p_err("%s already specified", what);
+ return -1;
+ }
+
+ *val = strtoul(**argv, &endptr, 0);
+ if (*endptr) {
+ p_err("can't parse %s as %s", **argv, what);
+ return -1;
+ }
+ NEXT_ARGP();
+
+ return 0;
+}
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 79dc3f193547..828dde30e9ec 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -55,6 +55,7 @@ json_writer_t *json_wtr;
bool pretty_output;
bool json_output;
bool show_pinned;
+int bpf_flags;
struct pinned_obj_table prog_table;
struct pinned_obj_table map_table;
@@ -341,6 +342,7 @@ int main(int argc, char **argv)
{ "pretty", no_argument, NULL, 'p' },
{ "version", no_argument, NULL, 'V' },
{ "bpffs", no_argument, NULL, 'f' },
+ { "mapcompat", no_argument, NULL, 'm' },
{ 0 }
};
int opt, ret;
@@ -355,7 +357,7 @@ int main(int argc, char **argv)
hash_init(map_table.table);
opterr = 0;
- while ((opt = getopt_long(argc, argv, "Vhpjf",
+ while ((opt = getopt_long(argc, argv, "Vhpjfm",
options, NULL)) >= 0) {
switch (opt) {
case 'V':
@@ -379,6 +381,9 @@ int main(int argc, char **argv)
case 'f':
show_pinned = true;
break;
+ case 'm':
+ bpf_flags = MAPS_RELAX_COMPAT;
+ break;
default:
p_err("unrecognized option '%s'", argv[optind - 1]);
if (json_output)
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 40492cdc4e53..28ee769bd11b 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -74,7 +74,7 @@
#define HELP_SPEC_PROGRAM \
"PROG := { id PROG_ID | pinned FILE | tag PROG_TAG }"
#define HELP_SPEC_OPTIONS \
- "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} }"
+ "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} | {-m|--mapcompat}"
#define HELP_SPEC_MAP \
"MAP := { id MAP_ID | pinned FILE }"
@@ -89,6 +89,7 @@ extern const char *bin_name;
extern json_writer_t *json_wtr;
extern bool json_output;
extern bool show_pinned;
+extern int bpf_flags;
extern struct pinned_obj_table prog_table;
extern struct pinned_obj_table map_table;
@@ -138,6 +139,7 @@ int do_cgroup(int argc, char **arg);
int do_perf(int argc, char **arg);
int do_net(int argc, char **arg);
+int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what);
int prog_parse_fd(int *argc, char ***argv);
int map_parse_fd(int *argc, char ***argv);
int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 6003e9598973..7bf38f0e152e 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -36,6 +36,7 @@
#include <fcntl.h>
#include <linux/err.h>
#include <linux/kernel.h>
+#include <net/if.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
@@ -94,6 +95,17 @@ static bool map_is_map_of_progs(__u32 type)
return type == BPF_MAP_TYPE_PROG_ARRAY;
}
+static int map_type_from_str(const char *type)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(map_type_name); i++)
+ /* Don't allow prefixing in case of possible future shadowing */
+ if (map_type_name[i] && !strcmp(map_type_name[i], type))
+ return i;
+ return -1;
+}
+
static void *alloc_value(struct bpf_map_info *info)
{
if (map_is_per_cpu(info->type))
@@ -336,6 +348,25 @@ static void print_entry_json(struct bpf_map_info *info, unsigned char *key,
jsonw_end_object(json_wtr);
}
+static void print_entry_error(struct bpf_map_info *info, unsigned char *key,
+ const char *value)
+{
+ int value_size = strlen(value);
+ bool single_line, break_names;
+
+ break_names = info->key_size > 16 || value_size > 16;
+ single_line = info->key_size + value_size <= 24 && !break_names;
+
+ printf("key:%c", break_names ? '\n' : ' ');
+ fprint_hex(stdout, key, info->key_size, " ");
+
+ printf(single_line ? " " : "\n");
+
+ printf("value:%c%s", break_names ? '\n' : ' ', value);
+
+ printf("\n");
+}
+
static void print_entry_plain(struct bpf_map_info *info, unsigned char *key,
unsigned char *value)
{
@@ -658,6 +689,54 @@ static int do_show(int argc, char **argv)
return errno == ENOENT ? 0 : -1;
}
+static int dump_map_elem(int fd, void *key, void *value,
+ struct bpf_map_info *map_info, struct btf *btf,
+ json_writer_t *btf_wtr)
+{
+ int num_elems = 0;
+ int lookup_errno;
+
+ if (!bpf_map_lookup_elem(fd, key, value)) {
+ if (json_output) {
+ print_entry_json(map_info, key, value, btf);
+ } else {
+ if (btf) {
+ struct btf_dumper d = {
+ .btf = btf,
+ .jw = btf_wtr,
+ .is_plain_text = true,
+ };
+
+ do_dump_btf(&d, map_info, key, value);
+ } else {
+ print_entry_plain(map_info, key, value);
+ }
+ num_elems++;
+ }
+ return num_elems;
+ }
+
+ /* lookup error handling */
+ lookup_errno = errno;
+
+ if (map_is_map_of_maps(map_info->type) ||
+ map_is_map_of_progs(map_info->type))
+ return 0;
+
+ if (json_output) {
+ jsonw_name(json_wtr, "key");
+ print_hex_data_json(key, map_info->key_size);
+ jsonw_name(json_wtr, "value");
+ jsonw_start_object(json_wtr);
+ jsonw_string_field(json_wtr, "error", strerror(lookup_errno));
+ jsonw_end_object(json_wtr);
+ } else {
+ print_entry_error(map_info, key, strerror(lookup_errno));
+ }
+
+ return 0;
+}
+
static int do_dump(int argc, char **argv)
{
struct bpf_map_info info = {};
@@ -713,40 +792,7 @@ static int do_dump(int argc, char **argv)
err = 0;
break;
}
-
- if (!bpf_map_lookup_elem(fd, key, value)) {
- if (json_output)
- print_entry_json(&info, key, value, btf);
- else
- if (btf) {
- struct btf_dumper d = {
- .btf = btf,
- .jw = btf_wtr,
- .is_plain_text = true,
- };
-
- do_dump_btf(&d, &info, key, value);
- } else {
- print_entry_plain(&info, key, value);
- }
- num_elems++;
- } else if (!map_is_map_of_maps(info.type) &&
- !map_is_map_of_progs(info.type)) {
- if (json_output) {
- jsonw_name(json_wtr, "key");
- print_hex_data_json(key, info.key_size);
- jsonw_name(json_wtr, "value");
- jsonw_start_object(json_wtr);
- jsonw_string_field(json_wtr, "error",
- "can't lookup element");
- jsonw_end_object(json_wtr);
- } else {
- p_info("can't lookup element with key: ");
- fprint_hex(stderr, key, info.key_size, " ");
- fprintf(stderr, "\n");
- }
- }
-
+ num_elems += dump_map_elem(fd, key, value, &info, btf, btf_wtr);
prev_key = key;
}
@@ -1024,6 +1070,92 @@ static int do_pin(int argc, char **argv)
return err;
}
+static int do_create(int argc, char **argv)
+{
+ struct bpf_create_map_attr attr = { NULL, };
+ const char *pinfile;
+ int err, fd;
+
+ if (!REQ_ARGS(7))
+ return -1;
+ pinfile = GET_ARG();
+
+ while (argc) {
+ if (!REQ_ARGS(2))
+ return -1;
+
+ if (is_prefix(*argv, "type")) {
+ NEXT_ARG();
+
+ if (attr.map_type) {
+ p_err("map type already specified");
+ return -1;
+ }
+
+ attr.map_type = map_type_from_str(*argv);
+ if ((int)attr.map_type < 0) {
+ p_err("unrecognized map type: %s", *argv);
+ return -1;
+ }
+ NEXT_ARG();
+ } else if (is_prefix(*argv, "name")) {
+ NEXT_ARG();
+ attr.name = GET_ARG();
+ } else if (is_prefix(*argv, "key")) {
+ if (parse_u32_arg(&argc, &argv, &attr.key_size,
+ "key size"))
+ return -1;
+ } else if (is_prefix(*argv, "value")) {
+ if (parse_u32_arg(&argc, &argv, &attr.value_size,
+ "value size"))
+ return -1;
+ } else if (is_prefix(*argv, "entries")) {
+ if (parse_u32_arg(&argc, &argv, &attr.max_entries,
+ "max entries"))
+ return -1;
+ } else if (is_prefix(*argv, "flags")) {
+ if (parse_u32_arg(&argc, &argv, &attr.map_flags,
+ "flags"))
+ return -1;
+ } else if (is_prefix(*argv, "dev")) {
+ NEXT_ARG();
+
+ if (attr.map_ifindex) {
+ p_err("offload device already specified");
+ return -1;
+ }
+
+ attr.map_ifindex = if_nametoindex(*argv);
+ if (!attr.map_ifindex) {
+ p_err("unrecognized netdevice '%s': %s",
+ *argv, strerror(errno));
+ return -1;
+ }
+ NEXT_ARG();
+ }
+ }
+
+ if (!attr.name) {
+ p_err("map name not specified");
+ return -1;
+ }
+
+ fd = bpf_create_map_xattr(&attr);
+ if (fd < 0) {
+ p_err("map create failed: %s", strerror(errno));
+ return -1;
+ }
+
+ err = do_pin_fd(fd, pinfile);
+ close(fd);
+ if (err)
+ return err;
+
+ if (json_output)
+ jsonw_null(json_wtr);
+ return 0;
+}
+
static int do_help(int argc, char **argv)
{
if (json_output) {
@@ -1033,6 +1165,9 @@ static int do_help(int argc, char **argv)
fprintf(stderr,
"Usage: %s %s { show | list } [MAP]\n"
+ " %s %s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n"
+ " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n"
+ " [dev NAME]\n"
" %s %s dump MAP\n"
" %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n"
" %s %s lookup MAP key DATA\n"
@@ -1047,11 +1182,17 @@ static int do_help(int argc, char **argv)
" " HELP_SPEC_PROGRAM "\n"
" VALUE := { DATA | MAP | PROG }\n"
" UPDATE_FLAGS := { any | exist | noexist }\n"
+ " TYPE := { hash | array | prog_array | perf_event_array | percpu_hash |\n"
+ " percpu_array | stack_trace | cgroup_array | lru_hash |\n"
+ " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n"
+ " devmap | sockmap | cpumap | xskmap | sockhash |\n"
+ " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage }\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
- bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
+ bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+ bin_name, argv[-2]);
return 0;
}
@@ -1067,6 +1208,7 @@ static const struct cmd cmds[] = {
{ "delete", do_delete },
{ "pin", do_pin },
{ "event_pipe", do_event_pipe },
+ { "create", do_create },
{ 0 }
};
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index b1cd3bc8db70..335028968dfb 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -77,6 +77,26 @@ static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector",
};
+static const char * const attach_type_strings[] = {
+ [BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
+ [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
+ [BPF_SK_MSG_VERDICT] = "msg_verdict",
+ [__MAX_BPF_ATTACH_TYPE] = NULL,
+};
+
+enum bpf_attach_type parse_attach_type(const char *str)
+{
+ enum bpf_attach_type type;
+
+ for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) {
+ if (attach_type_strings[type] &&
+ is_prefix(str, attach_type_strings[type]))
+ return type;
+ }
+
+ return __MAX_BPF_ATTACH_TYPE;
+}
+
static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
{
struct timespec real_time_ts, boot_time_ts;
@@ -697,6 +717,77 @@ int map_replace_compar(const void *p1, const void *p2)
return a->idx - b->idx;
}
+static int do_attach(int argc, char **argv)
+{
+ enum bpf_attach_type attach_type;
+ int err, mapfd, progfd;
+
+ if (!REQ_ARGS(5)) {
+ p_err("too few parameters for map attach");
+ return -EINVAL;
+ }
+
+ progfd = prog_parse_fd(&argc, &argv);
+ if (progfd < 0)
+ return progfd;
+
+ attach_type = parse_attach_type(*argv);
+ if (attach_type == __MAX_BPF_ATTACH_TYPE) {
+ p_err("invalid attach type");
+ return -EINVAL;
+ }
+ NEXT_ARG();
+
+ mapfd = map_parse_fd(&argc, &argv);
+ if (mapfd < 0)
+ return mapfd;
+
+ err = bpf_prog_attach(progfd, mapfd, attach_type, 0);
+ if (err) {
+ p_err("failed prog attach to map");
+ return -EINVAL;
+ }
+
+ if (json_output)
+ jsonw_null(json_wtr);
+ return 0;
+}
+
+static int do_detach(int argc, char **argv)
+{
+ enum bpf_attach_type attach_type;
+ int err, mapfd, progfd;
+
+ if (!REQ_ARGS(5)) {
+ p_err("too few parameters for map detach");
+ return -EINVAL;
+ }
+
+ progfd = prog_parse_fd(&argc, &argv);
+ if (progfd < 0)
+ return progfd;
+
+ attach_type = parse_attach_type(*argv);
+ if (attach_type == __MAX_BPF_ATTACH_TYPE) {
+ p_err("invalid attach type");
+ return -EINVAL;
+ }
+ NEXT_ARG();
+
+ mapfd = map_parse_fd(&argc, &argv);
+ if (mapfd < 0)
+ return mapfd;
+
+ err = bpf_prog_detach2(progfd, mapfd, attach_type);
+ if (err) {
+ p_err("failed prog detach from map");
+ return -EINVAL;
+ }
+
+ if (json_output)
+ jsonw_null(json_wtr);
+ return 0;
+}
static int do_load(int argc, char **argv)
{
enum bpf_attach_type expected_attach_type;
@@ -817,7 +908,7 @@ static int do_load(int argc, char **argv)
}
}
- obj = bpf_object__open_xattr(&attr);
+ obj = __bpf_object__open_xattr(&attr, bpf_flags);
if (IS_ERR_OR_NULL(obj)) {
p_err("failed to open object file");
goto err_free_reuse_maps;
@@ -942,6 +1033,8 @@ static int do_help(int argc, char **argv)
" %s %s pin PROG FILE\n"
" %s %s load OBJ FILE [type TYPE] [dev NAME] \\\n"
" [map { idx IDX | name NAME } MAP]\n"
+ " %s %s attach PROG ATTACH_TYPE MAP\n"
+ " %s %s detach PROG ATTACH_TYPE MAP\n"
" %s %s help\n"
"\n"
" " HELP_SPEC_MAP "\n"
@@ -953,10 +1046,12 @@ static int do_help(int argc, char **argv)
" cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
" cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
" cgroup/sendmsg4 | cgroup/sendmsg6 }\n"
+ " ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse }\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
- bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
+ bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+ bin_name, argv[-2], bin_name, argv[-2]);
return 0;
}
@@ -968,6 +1063,8 @@ static const struct cmd cmds[] = {
{ "dump", do_dump },
{ "pin", do_pin },
{ "load", do_load },
+ { "attach", do_attach },
+ { "detach", do_detach },
{ 0 }
};
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 6ad27257fd67..79d84413ddf2 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -69,7 +69,7 @@ FEATURE_USER = .libbpf
FEATURE_TESTS = libelf libelf-mmap bpf reallocarray
FEATURE_DISPLAY = libelf bpf
-INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi -I$(srctree)/tools/perf
+INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi
FEATURE_CHECK_CFLAGS-bpf = $(INCLUDES)
check_feat := 1
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 87520a87a75f..69a4d40c4227 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -69,6 +69,9 @@ struct bpf_load_program_attr {
__u32 prog_ifindex;
};
+/* Flags to direct loading requirements */
+#define MAPS_RELAX_COMPAT 0x01
+
/* Recommend log buffer size */
#define BPF_LOG_BUF_SIZE (256 * 1024)
int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ceb918c14d80..bd71efcc53be 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -19,7 +19,6 @@
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
-#include <perf-sys.h>
#include <asm/unistd.h>
#include <linux/err.h>
#include <linux/kernel.h>
@@ -27,6 +26,7 @@
#include <linux/btf.h>
#include <linux/list.h>
#include <linux/limits.h>
+#include <linux/perf_event.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/vfs.h>
@@ -169,7 +169,7 @@ static LIST_HEAD(bpf_objects_list);
struct bpf_object {
char license[64];
- u32 kern_version;
+ __u32 kern_version;
struct bpf_program *programs;
size_t nr_programs;
@@ -540,7 +540,7 @@ static int
bpf_object__init_kversion(struct bpf_object *obj,
void *data, size_t size)
{
- u32 kver;
+ __u32 kver;
if (size != sizeof(kver)) {
pr_warning("invalid kver section in %s\n", obj->path);
@@ -562,8 +562,9 @@ static int compare_bpf_map(const void *_a, const void *_b)
}
static int
-bpf_object__init_maps(struct bpf_object *obj)
+bpf_object__init_maps(struct bpf_object *obj, int flags)
{
+ bool strict = !(flags & MAPS_RELAX_COMPAT);
int i, map_idx, map_def_sz, nr_maps = 0;
Elf_Scn *scn;
Elf_Data *data;
@@ -685,7 +686,8 @@ bpf_object__init_maps(struct bpf_object *obj)
"has unrecognized, non-zero "
"options\n",
obj->path, map_name);
- return -EINVAL;
+ if (strict)
+ return -EINVAL;
}
}
memcpy(&obj->maps[map_idx].def, def,
@@ -716,7 +718,7 @@ static bool section_have_execinstr(struct bpf_object *obj, int idx)
return false;
}
-static int bpf_object__elf_collect(struct bpf_object *obj)
+static int bpf_object__elf_collect(struct bpf_object *obj, int flags)
{
Elf *elf = obj->efile.elf;
GElf_Ehdr *ep = &obj->efile.ehdr;
@@ -843,7 +845,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
return LIBBPF_ERRNO__FORMAT;
}
if (obj->efile.maps_shndx >= 0) {
- err = bpf_object__init_maps(obj);
+ err = bpf_object__init_maps(obj, flags);
if (err)
goto out;
}
@@ -1295,7 +1297,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
static int
load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type,
const char *name, struct bpf_insn *insns, int insns_cnt,
- char *license, u32 kern_version, int *pfd, int prog_ifindex)
+ char *license, __u32 kern_version, int *pfd, int prog_ifindex)
{
struct bpf_load_program_attr load_attr;
char *cp, errmsg[STRERR_BUFSIZE];
@@ -1515,7 +1517,7 @@ static int bpf_object__validate(struct bpf_object *obj, bool needs_kver)
static struct bpf_object *
__bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz,
- bool needs_kver)
+ bool needs_kver, int flags)
{
struct bpf_object *obj;
int err;
@@ -1531,7 +1533,7 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz,
CHECK_ERR(bpf_object__elf_init(obj), err, out);
CHECK_ERR(bpf_object__check_endianness(obj), err, out);
- CHECK_ERR(bpf_object__elf_collect(obj), err, out);
+ CHECK_ERR(bpf_object__elf_collect(obj, flags), err, out);
CHECK_ERR(bpf_object__collect_reloc(obj), err, out);
CHECK_ERR(bpf_object__validate(obj, needs_kver), err, out);
@@ -1542,7 +1544,8 @@ out:
return ERR_PTR(err);
}
-struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr)
+struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr,
+ int flags)
{
/* param validation */
if (!attr->file)
@@ -1551,7 +1554,13 @@ struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr)
pr_debug("loading %s\n", attr->file);
return __bpf_object__open(attr->file, NULL, 0,
- bpf_prog_type__needs_kver(attr->prog_type));
+ bpf_prog_type__needs_kver(attr->prog_type),
+ flags);
+}
+
+struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr)
+{
+ return __bpf_object__open_xattr(attr, 0);
}
struct bpf_object *bpf_object__open(const char *path)
@@ -1584,7 +1593,7 @@ struct bpf_object *bpf_object__open_buffer(void *obj_buf,
pr_debug("loading object '%s' from buffer\n",
name);
- return __bpf_object__open(name, obj_buf, obj_buf_sz, true);
+ return __bpf_object__open(name, obj_buf, obj_buf_sz, true, true);
}
int bpf_object__unload(struct bpf_object *obj)
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 8af8d3663991..7e9c801a9fdd 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -61,6 +61,8 @@ struct bpf_object_open_attr {
struct bpf_object *bpf_object__open(const char *path);
struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr);
+struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr,
+ int flags);
struct bpf_object *bpf_object__open_buffer(void *obj_buf,
size_t obj_buf_sz,
const char *name);
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 1381ab81099c..d99dd6fc3fbe 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -36,7 +36,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
- test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_sk_lookup_kern.o
+ test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o \
+ test_sk_lookup_kern.o test_xdp_vlan.o
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
@@ -49,7 +50,10 @@ TEST_PROGS := test_kmod.sh \
test_lwt_seg6local.sh \
test_lirc_mode2.sh \
test_skb_cgroup_id.sh \
- test_flow_dissector.sh
+ test_flow_dissector.sh \
+ test_xdp_vlan.sh
+
+TEST_PROGS_EXTENDED := with_addr.sh
# Compile but not part of 'make run_tests'
TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 1d407b3494f9..fda8c162d0df 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -155,6 +155,10 @@ static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
(void *) BPF_FUNC_sk_lookup_udp;
static int (*bpf_sk_release)(struct bpf_sock *sk) =
(void *) BPF_FUNC_sk_release;
+static int (*bpf_skb_vlan_push)(void *ctx, __be16 vlan_proto, __u16 vlan_tci) =
+ (void *) BPF_FUNC_skb_vlan_push;
+static int (*bpf_skb_vlan_pop)(void *ctx) =
+ (void *) BPF_FUNC_skb_vlan_pop;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 3655508f95fd..dd49df5e2df4 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -19,3 +19,4 @@ CONFIG_CRYPTO_SHA256=m
CONFIG_VXLAN=y
CONFIG_GENEVE=y
CONFIG_NET_CLS_FLOWER=m
+CONFIG_LWTUNNEL=y
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index ac7de38e5c63..10a5fa83c75a 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -71,6 +71,7 @@ int txmsg_start;
int txmsg_end;
int txmsg_ingress;
int txmsg_skb;
+int ktls;
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
@@ -92,6 +93,7 @@ static const struct option long_options[] = {
{"txmsg_end", required_argument, NULL, 'e'},
{"txmsg_ingress", no_argument, &txmsg_ingress, 1 },
{"txmsg_skb", no_argument, &txmsg_skb, 1 },
+ {"ktls", no_argument, &ktls, 1 },
{0, 0, NULL, 0 }
};
@@ -112,6 +114,76 @@ static void usage(char *argv[])
printf("\n");
}
+#define TCP_ULP 31
+#define TLS_TX 1
+#define TLS_RX 2
+#include <linux/tls.h>
+
+char *sock_to_string(int s)
+{
+ if (s == c1)
+ return "client1";
+ else if (s == c2)
+ return "client2";
+ else if (s == s1)
+ return "server1";
+ else if (s == s2)
+ return "server2";
+ else if (s == p1)
+ return "peer1";
+ else if (s == p2)
+ return "peer2";
+ else
+ return "unknown";
+}
+
+static int sockmap_init_ktls(int verbose, int s)
+{
+ struct tls12_crypto_info_aes_gcm_128 tls_tx = {
+ .info = {
+ .version = TLS_1_2_VERSION,
+ .cipher_type = TLS_CIPHER_AES_GCM_128,
+ },
+ };
+ struct tls12_crypto_info_aes_gcm_128 tls_rx = {
+ .info = {
+ .version = TLS_1_2_VERSION,
+ .cipher_type = TLS_CIPHER_AES_GCM_128,
+ },
+ };
+ int so_buf = 6553500;
+ int err;
+
+ err = setsockopt(s, 6, TCP_ULP, "tls", sizeof("tls"));
+ if (err) {
+ fprintf(stderr, "setsockopt: TCP_ULP(%s) failed with error %i\n", sock_to_string(s), err);
+ return -EINVAL;
+ }
+ err = setsockopt(s, SOL_TLS, TLS_TX, (void *)&tls_tx, sizeof(tls_tx));
+ if (err) {
+ fprintf(stderr, "setsockopt: TLS_TX(%s) failed with error %i\n", sock_to_string(s), err);
+ return -EINVAL;
+ }
+ err = setsockopt(s, SOL_TLS, TLS_RX, (void *)&tls_rx, sizeof(tls_rx));
+ if (err) {
+ fprintf(stderr, "setsockopt: TLS_RX(%s) failed with error %i\n", sock_to_string(s), err);
+ return -EINVAL;
+ }
+ err = setsockopt(s, SOL_SOCKET, SO_SNDBUF, &so_buf, sizeof(so_buf));
+ if (err) {
+ fprintf(stderr, "setsockopt: (%s) failed sndbuf with error %i\n", sock_to_string(s), err);
+ return -EINVAL;
+ }
+ err = setsockopt(s, SOL_SOCKET, SO_RCVBUF, &so_buf, sizeof(so_buf));
+ if (err) {
+ fprintf(stderr, "setsockopt: (%s) failed rcvbuf with error %i\n", sock_to_string(s), err);
+ return -EINVAL;
+ }
+
+ if (verbose)
+ fprintf(stdout, "socket(%s) kTLS enabled\n", sock_to_string(s));
+ return 0;
+}
static int sockmap_init_sockets(int verbose)
{
int i, err, one = 1;
@@ -456,6 +528,21 @@ static int sendmsg_test(struct sockmap_options *opt)
else
rx_fd = p2;
+ if (ktls) {
+ /* Redirecting into non-TLS socket which sends into a TLS
+ * socket is not a valid test. So in this case lets not
+ * enable kTLS but still run the test.
+ */
+ if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) {
+ err = sockmap_init_ktls(opt->verbose, rx_fd);
+ if (err)
+ return err;
+ }
+ err = sockmap_init_ktls(opt->verbose, c1);
+ if (err)
+ return err;
+ }
+
rxpid = fork();
if (rxpid == 0) {
if (opt->drop_expected)
@@ -907,6 +994,8 @@ static void test_options(char *options)
strncat(options, "ingress,", OPTSTRING);
if (txmsg_skb)
strncat(options, "skb,", OPTSTRING);
+ if (ktls)
+ strncat(options, "ktls,", OPTSTRING);
}
static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index bc9cd8537467..cf4cd32b6772 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -48,7 +48,7 @@
#define MAX_INSNS BPF_MAXINSNS
#define MAX_FIXUPS 8
-#define MAX_NR_MAPS 8
+#define MAX_NR_MAPS 13
#define POINTER_VALUE 0xcafe4all
#define TEST_DATA_LEN 64
@@ -61,10 +61,14 @@ static bool unpriv_disabled = false;
struct bpf_test {
const char *descr;
struct bpf_insn insns[MAX_INSNS];
- int fixup_map1[MAX_FIXUPS];
- int fixup_map2[MAX_FIXUPS];
- int fixup_map3[MAX_FIXUPS];
- int fixup_map4[MAX_FIXUPS];
+ int fixup_map_hash_8b[MAX_FIXUPS];
+ int fixup_map_hash_48b[MAX_FIXUPS];
+ int fixup_map_hash_16b[MAX_FIXUPS];
+ int fixup_map_array_48b[MAX_FIXUPS];
+ int fixup_map_sockmap[MAX_FIXUPS];
+ int fixup_map_sockhash[MAX_FIXUPS];
+ int fixup_map_xskmap[MAX_FIXUPS];
+ int fixup_map_stacktrace[MAX_FIXUPS];
int fixup_prog1[MAX_FIXUPS];
int fixup_prog2[MAX_FIXUPS];
int fixup_map_in_map[MAX_FIXUPS];
@@ -876,7 +880,7 @@ static struct bpf_test tests[] = {
BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 2 },
+ .fixup_map_hash_8b = { 2 },
.errstr = "invalid indirect read from stack",
.result = REJECT,
},
@@ -1110,7 +1114,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "R0 invalid mem access 'map_value_or_null'",
.result = REJECT,
},
@@ -1127,7 +1131,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "misaligned value access",
.result = REJECT,
.flags = F_LOAD_WITH_STRICT_ALIGNMENT,
@@ -1147,7 +1151,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "R0 invalid mem access",
.errstr_unpriv = "R0 leaks addr",
.result = REJECT,
@@ -1237,7 +1241,7 @@ static struct bpf_test tests[] = {
BPF_FUNC_map_delete_elem),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 24 },
+ .fixup_map_hash_8b = { 24 },
.errstr_unpriv = "R1 pointer comparison",
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -1391,7 +1395,7 @@ static struct bpf_test tests[] = {
offsetof(struct __sk_buff, pkt_type)),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.errstr = "different pointers",
.errstr_unpriv = "R1 pointer comparison",
.result = REJECT,
@@ -1414,7 +1418,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JA, 0, 0, -12),
},
- .fixup_map1 = { 6 },
+ .fixup_map_hash_8b = { 6 },
.errstr = "different pointers",
.errstr_unpriv = "R1 pointer comparison",
.result = REJECT,
@@ -1438,7 +1442,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_JMP_IMM(BPF_JA, 0, 0, -13),
},
- .fixup_map1 = { 7 },
+ .fixup_map_hash_8b = { 7 },
.errstr = "different pointers",
.errstr_unpriv = "R1 pointer comparison",
.result = REJECT,
@@ -2575,7 +2579,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr_unpriv = "R4 leaks addr",
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -2592,7 +2596,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "invalid indirect read from stack off -8+0 size 8",
.result = REJECT,
},
@@ -2894,7 +2898,7 @@ static struct bpf_test tests[] = {
BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -2934,7 +2938,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 1 },
+ .fixup_map_hash_8b = { 1 },
.errstr_unpriv = "R1 pointer comparison",
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -4073,7 +4077,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 5 },
+ .fixup_map_hash_8b = { 5 },
.result_unpriv = ACCEPT,
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_XDP,
@@ -4089,7 +4093,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 1 },
+ .fixup_map_hash_8b = { 1 },
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_XDP,
@@ -4117,7 +4121,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 11 },
+ .fixup_map_hash_8b = { 11 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_XDP,
},
@@ -4139,7 +4143,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 7 },
+ .fixup_map_hash_8b = { 7 },
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_XDP,
@@ -4161,7 +4165,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 6 },
+ .fixup_map_hash_8b = { 6 },
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_XDP,
@@ -4184,7 +4188,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 5 },
+ .fixup_map_hash_8b = { 5 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
},
@@ -4199,7 +4203,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 1 },
+ .fixup_map_hash_8b = { 1 },
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
@@ -4227,7 +4231,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 11 },
+ .fixup_map_hash_8b = { 11 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
},
@@ -4249,7 +4253,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 7 },
+ .fixup_map_hash_8b = { 7 },
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
@@ -4271,7 +4275,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 6 },
+ .fixup_map_hash_8b = { 6 },
.result = REJECT,
.errstr = "invalid access to packet",
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
@@ -4542,6 +4546,85 @@ static struct bpf_test tests[] = {
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
},
{
+ "prevent map lookup in sockmap",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockmap = { 3 },
+ .result = REJECT,
+ .errstr = "cannot pass map_type 15 into func bpf_map_lookup_elem",
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+ },
+ {
+ "prevent map lookup in sockhash",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockhash = { 3 },
+ .result = REJECT,
+ .errstr = "cannot pass map_type 18 into func bpf_map_lookup_elem",
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+ },
+ {
+ "prevent map lookup in xskmap",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_xskmap = { 3 },
+ .result = REJECT,
+ .errstr = "cannot pass map_type 17 into func bpf_map_lookup_elem",
+ .prog_type = BPF_PROG_TYPE_XDP,
+ },
+ {
+ "prevent map lookup in stack trace",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_stacktrace = { 3 },
+ .result = REJECT,
+ .errstr = "cannot pass map_type 7 into func bpf_map_lookup_elem",
+ .prog_type = BPF_PROG_TYPE_PERF_EVENT,
+ },
+ {
+ "prevent map lookup in prog array",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog2 = { 3 },
+ .result = REJECT,
+ .errstr = "cannot pass map_type 3 into func bpf_map_lookup_elem",
+ },
+ {
"valid map access into an array with a constant",
.insns = {
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
@@ -4555,7 +4638,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -4577,7 +4660,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -4601,7 +4684,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -4629,7 +4712,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -4649,7 +4732,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "invalid access to map value, value_size=48 off=48 size=8",
.result = REJECT,
},
@@ -4670,7 +4753,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R0 min value is outside of the array range",
.result = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -4692,7 +4775,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map",
.result = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -4717,7 +4800,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.errstr = "R0 unbounded memory access",
.result_unpriv = REJECT,
@@ -4744,7 +4827,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.errstr = "invalid access to map value, value_size=48 off=44 size=8",
.result_unpriv = REJECT,
@@ -4774,7 +4857,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3, 11 },
+ .fixup_map_hash_48b = { 3, 11 },
.errstr = "R0 pointer += pointer",
.result = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -4807,7 +4890,7 @@ static struct bpf_test tests[] = {
BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 1 },
+ .fixup_map_hash_8b = { 1 },
.result = REJECT,
.errstr = "cannot pass map_type 1 into func bpf_get_local_storage",
.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
@@ -4922,7 +5005,7 @@ static struct bpf_test tests[] = {
BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 1 },
+ .fixup_map_hash_8b = { 1 },
.result = REJECT,
.errstr = "cannot pass map_type 1 into func bpf_get_local_storage",
.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
@@ -5024,7 +5107,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS
},
@@ -5045,7 +5128,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.errstr = "R4 pointer arithmetic on map_value_or_null",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS
@@ -5066,7 +5149,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.errstr = "R4 pointer arithmetic on map_value_or_null",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS
@@ -5087,7 +5170,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.errstr = "R4 pointer arithmetic on map_value_or_null",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS
@@ -5113,7 +5196,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.result = REJECT,
.errstr = "R4 !read_ok",
.prog_type = BPF_PROG_TYPE_SCHED_CLS
@@ -5141,7 +5224,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS
},
@@ -5162,7 +5245,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R0 unbounded memory access",
.result = REJECT,
.errstr_unpriv = "R0 leaks addr",
@@ -5412,7 +5495,7 @@ static struct bpf_test tests[] = {
offsetof(struct __sk_buff, cb[0])),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 2 },
+ .fixup_map_hash_8b = { 2 },
.errstr_unpriv = "R2 leaks addr into mem",
.result_unpriv = REJECT,
.result = REJECT,
@@ -5442,7 +5525,7 @@ static struct bpf_test tests[] = {
offsetof(struct __sk_buff, cb[0])),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 1 },
+ .fixup_map_hash_8b = { 1 },
.errstr_unpriv = "R2 leaks addr into ctx",
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -5464,7 +5547,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.errstr_unpriv = "R6 leaks addr into mem",
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -5484,7 +5567,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -5503,7 +5586,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -5521,7 +5604,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_trace_printk),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "invalid access to map value, value_size=48 off=0 size=0",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5541,7 +5624,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "invalid access to map value, value_size=48 off=0 size=56",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5561,7 +5644,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R2 min value is negative",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5585,7 +5668,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -5606,7 +5689,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -5626,7 +5709,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_trace_printk),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "invalid access to map value, value_size=48 off=4 size=0",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5650,7 +5733,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "invalid access to map value, value_size=48 off=4 size=52",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5672,7 +5755,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R2 min value is negative",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5694,7 +5777,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R2 min value is negative",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5719,7 +5802,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -5741,7 +5824,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -5761,7 +5844,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_trace_printk),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R1 min value is outside of the array range",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5786,7 +5869,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "invalid access to map value, value_size=48 off=4 size=52",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5809,7 +5892,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R2 min value is negative",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5832,7 +5915,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R2 min value is negative",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5858,7 +5941,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -5881,7 +5964,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -5903,7 +5986,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_trace_printk),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R1 min value is outside of the array range",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5925,7 +6008,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R1 unbounded memory access",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5951,7 +6034,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "invalid access to map value, value_size=48 off=4 size=45",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -5975,7 +6058,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -5998,7 +6081,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = REJECT,
.errstr = "R1 unbounded memory access",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6022,7 +6105,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6045,7 +6128,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = REJECT,
.errstr = "R1 unbounded memory access",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6070,7 +6153,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6094,7 +6177,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6118,7 +6201,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = REJECT,
.errstr = "R1 min value is negative",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6143,7 +6226,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6167,7 +6250,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6191,7 +6274,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = REJECT,
.errstr = "R1 min value is negative",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6210,7 +6293,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 8 },
+ .fixup_map_hash_16b = { 3, 8 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6230,7 +6313,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_update_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 10 },
+ .fixup_map_hash_16b = { 3, 10 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6250,8 +6333,8 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_update_elem),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
- .fixup_map3 = { 10 },
+ .fixup_map_hash_8b = { 3 },
+ .fixup_map_hash_16b = { 10 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=8 off=0 size=16",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6272,7 +6355,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 9 },
+ .fixup_map_hash_16b = { 3, 9 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6292,7 +6375,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 9 },
+ .fixup_map_hash_16b = { 3, 9 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=16 off=12 size=8",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6312,7 +6395,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 9 },
+ .fixup_map_hash_16b = { 3, 9 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=16 off=-4 size=8",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6334,7 +6417,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 10 },
+ .fixup_map_hash_16b = { 3, 10 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6355,7 +6438,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 10 },
+ .fixup_map_hash_16b = { 3, 10 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=16 off=12 size=8",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6376,7 +6459,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 10 },
+ .fixup_map_hash_16b = { 3, 10 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=16 off=-4 size=8",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6399,7 +6482,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 11 },
+ .fixup_map_hash_16b = { 3, 11 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6419,7 +6502,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 10 },
+ .fixup_map_hash_16b = { 3, 10 },
.result = REJECT,
.errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6442,7 +6525,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map3 = { 3, 11 },
+ .fixup_map_hash_16b = { 3, 11 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=16 off=9 size=8",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6464,7 +6547,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.result = ACCEPT,
.result_unpriv = REJECT,
@@ -6485,7 +6568,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.result = ACCEPT,
.result_unpriv = REJECT,
@@ -6502,7 +6585,7 @@ static struct bpf_test tests[] = {
BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R1 !read_ok",
.errstr = "R1 !read_ok",
.result = REJECT,
@@ -6536,7 +6619,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_7, -4, 24),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.result = ACCEPT,
.result_unpriv = REJECT,
@@ -6564,7 +6647,7 @@ static struct bpf_test tests[] = {
BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 4),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.result = ACCEPT,
.result_unpriv = REJECT,
@@ -6583,7 +6666,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R0 bitwise operator &= on pointer",
.result = REJECT,
},
@@ -6600,7 +6683,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R0 32-bit pointer arithmetic prohibited",
.result = REJECT,
},
@@ -6617,7 +6700,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R0 pointer arithmetic with /= operator",
.result = REJECT,
},
@@ -6634,7 +6717,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 pointer arithmetic prohibited",
.errstr = "invalid mem access 'inv'",
.result = REJECT,
@@ -6658,7 +6741,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R0 invalid mem access 'inv'",
.result = REJECT,
},
@@ -6681,7 +6764,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.result = ACCEPT,
.result_unpriv = REJECT,
@@ -6927,7 +7010,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -6953,7 +7036,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "invalid access to map value, value_size=48 off=0 size=49",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -6981,7 +7064,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -7008,7 +7091,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R1 min value is outside of the array range",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -7080,7 +7163,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_csum_diff),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
},
@@ -7105,7 +7188,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_csum_diff),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
},
@@ -7128,7 +7211,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_csum_diff),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
},
@@ -7209,7 +7292,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -7230,7 +7313,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -7250,7 +7333,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_probe_read),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -7325,7 +7408,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R0 max value is outside of the array range",
.result = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -7355,7 +7438,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_REG(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr = "R0 max value is outside of the array range",
.result = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -7708,7 +7791,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
},
@@ -7732,7 +7815,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
},
@@ -7758,7 +7841,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
},
@@ -7783,7 +7866,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
},
@@ -7807,7 +7890,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
},
{
@@ -7831,7 +7914,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
},
@@ -7877,7 +7960,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
},
{
@@ -7902,7 +7985,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
},
@@ -7928,7 +8011,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
},
{
@@ -7953,7 +8036,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
},
@@ -7980,7 +8063,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
},
@@ -8006,7 +8089,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
},
@@ -8035,7 +8118,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
},
@@ -8065,7 +8148,7 @@ static struct bpf_test tests[] = {
BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, -3),
BPF_JMP_IMM(BPF_JA, 0, 0, -7),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.errstr = "R0 invalid mem access 'inv'",
.result = REJECT,
},
@@ -8093,7 +8176,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "unbounded min value",
.result = REJECT,
.result_unpriv = REJECT,
@@ -8120,7 +8203,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "R0 max value is outside of the array range",
.result = REJECT,
},
@@ -8145,7 +8228,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.",
.result = REJECT,
},
@@ -8171,7 +8254,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT
},
{
@@ -8196,7 +8279,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "map_value pointer and 4294967295",
.result = REJECT
},
@@ -8222,7 +8305,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "R0 min value is outside of the array range",
.result = REJECT
},
@@ -8246,7 +8329,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.errstr = "value_size=8 off=1073741825",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
@@ -8271,7 +8354,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 4 },
+ .fixup_map_hash_8b = { 4 },
.errstr = "value 1073741823",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
@@ -8307,7 +8390,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT
},
{
@@ -8346,7 +8429,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
/* not actually fully unbounded, but the bound is very high */
.errstr = "R0 unbounded memory access",
.result = REJECT
@@ -8389,7 +8472,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
/* not actually fully unbounded, but the bound is very high */
.errstr = "R0 unbounded memory access",
.result = REJECT
@@ -8418,7 +8501,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT
},
{
@@ -8445,7 +8528,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "R0 max value is outside of the array range",
.result = REJECT
},
@@ -8475,7 +8558,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "R0 unbounded memory access",
.result = REJECT
},
@@ -8495,7 +8578,7 @@ static struct bpf_test tests[] = {
BPF_JMP_A(0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "map_value pointer and 2147483646",
.result = REJECT
},
@@ -8517,7 +8600,7 @@ static struct bpf_test tests[] = {
BPF_JMP_A(0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "pointer offset 1073741822",
.result = REJECT
},
@@ -8538,7 +8621,7 @@ static struct bpf_test tests[] = {
BPF_JMP_A(0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "pointer offset -1073741822",
.result = REJECT
},
@@ -8560,7 +8643,7 @@ static struct bpf_test tests[] = {
BPF_JMP_A(0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "map_value pointer and 1000000000000",
.result = REJECT
},
@@ -8580,7 +8663,7 @@ static struct bpf_test tests[] = {
BPF_JMP_A(0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
.retval = POINTER_VALUE,
.result_unpriv = REJECT,
@@ -8601,7 +8684,7 @@ static struct bpf_test tests[] = {
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = ACCEPT,
.retval = POINTER_VALUE,
.result_unpriv = REJECT,
@@ -8669,7 +8752,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 5 },
+ .fixup_map_hash_8b = { 5 },
.errstr = "variable stack read R2",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_LWT_IN,
@@ -8750,7 +8833,7 @@ static struct bpf_test tests[] = {
offsetof(struct test_val, foo)),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 3 },
+ .fixup_map_hash_48b = { 3 },
.errstr_unpriv = "R0 leaks addr",
.errstr = "R0 unbounded memory access",
.result_unpriv = REJECT,
@@ -10284,7 +10367,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .fixup_map1 = { 16 },
+ .fixup_map_hash_8b = { 16 },
.result = REJECT,
.errstr = "R0 min value is outside of the array range",
},
@@ -11235,7 +11318,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(), /* return 0 */
},
.prog_type = BPF_PROG_TYPE_XDP,
- .fixup_map1 = { 23 },
+ .fixup_map_hash_8b = { 23 },
.result = ACCEPT,
},
{
@@ -11290,7 +11373,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(), /* return 1 */
},
.prog_type = BPF_PROG_TYPE_XDP,
- .fixup_map1 = { 23 },
+ .fixup_map_hash_8b = { 23 },
.result = ACCEPT,
},
{
@@ -11345,7 +11428,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(), /* return 1 */
},
.prog_type = BPF_PROG_TYPE_XDP,
- .fixup_map1 = { 23 },
+ .fixup_map_hash_8b = { 23 },
.result = REJECT,
.errstr = "invalid read from stack off -16+0 size 8",
},
@@ -11417,7 +11500,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .fixup_map1 = { 12, 22 },
+ .fixup_map_hash_8b = { 12, 22 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=8 off=2 size=8",
},
@@ -11489,7 +11572,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .fixup_map1 = { 12, 22 },
+ .fixup_map_hash_8b = { 12, 22 },
.result = ACCEPT,
},
{
@@ -11560,7 +11643,7 @@ static struct bpf_test tests[] = {
BPF_JMP_IMM(BPF_JA, 0, 0, -8),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .fixup_map1 = { 12, 22 },
+ .fixup_map_hash_8b = { 12, 22 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=8 off=2 size=8",
},
@@ -11632,7 +11715,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .fixup_map1 = { 12, 22 },
+ .fixup_map_hash_8b = { 12, 22 },
.result = ACCEPT,
},
{
@@ -11703,7 +11786,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .fixup_map1 = { 12, 22 },
+ .fixup_map_hash_8b = { 12, 22 },
.result = REJECT,
.errstr = "R0 invalid mem access 'inv'",
},
@@ -12048,7 +12131,7 @@ static struct bpf_test tests[] = {
BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 13 },
+ .fixup_map_hash_8b = { 13 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_XDP,
},
@@ -12075,7 +12158,7 @@ static struct bpf_test tests[] = {
BPF_FUNC_map_lookup_elem),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 6 },
+ .fixup_map_hash_48b = { 6 },
.errstr = "invalid indirect read from stack off -8+0 size 8",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_XDP,
@@ -12107,8 +12190,8 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
- .fixup_map2 = { 13 },
- .fixup_map4 = { 16 },
+ .fixup_map_hash_48b = { 13 },
+ .fixup_map_array_48b = { 16 },
.result = ACCEPT,
.retval = 1,
},
@@ -12140,7 +12223,7 @@ static struct bpf_test tests[] = {
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.fixup_map_in_map = { 16 },
- .fixup_map4 = { 13 },
+ .fixup_map_array_48b = { 13 },
.result = REJECT,
.errstr = "R0 invalid mem access 'map_ptr'",
},
@@ -12208,7 +12291,7 @@ static struct bpf_test tests[] = {
BPF_ST_MEM(BPF_DW, BPF_REG_6, 0, 0xdead),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "R6 invalid mem access 'inv'",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -12232,7 +12315,7 @@ static struct bpf_test tests[] = {
BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -16),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.errstr = "invalid read from stack off -16+0 size 8",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
@@ -12354,7 +12437,7 @@ static struct bpf_test tests[] = {
BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 3),
BPF_EXIT_INSN(),
},
- .fixup_map1 = { 3 },
+ .fixup_map_hash_8b = { 3 },
.result = REJECT,
.errstr = "misaligned value access off",
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
@@ -12464,7 +12547,7 @@ static struct bpf_test tests[] = {
BPF_EMIT_CALL(BPF_FUNC_get_stack),
BPF_EXIT_INSN(),
},
- .fixup_map2 = { 4 },
+ .fixup_map_hash_48b = { 4 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
@@ -13511,10 +13594,14 @@ static char bpf_vlog[UINT_MAX >> 8];
static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type,
struct bpf_insn *prog, int *map_fds)
{
- int *fixup_map1 = test->fixup_map1;
- int *fixup_map2 = test->fixup_map2;
- int *fixup_map3 = test->fixup_map3;
- int *fixup_map4 = test->fixup_map4;
+ int *fixup_map_hash_8b = test->fixup_map_hash_8b;
+ int *fixup_map_hash_48b = test->fixup_map_hash_48b;
+ int *fixup_map_hash_16b = test->fixup_map_hash_16b;
+ int *fixup_map_array_48b = test->fixup_map_array_48b;
+ int *fixup_map_sockmap = test->fixup_map_sockmap;
+ int *fixup_map_sockhash = test->fixup_map_sockhash;
+ int *fixup_map_xskmap = test->fixup_map_xskmap;
+ int *fixup_map_stacktrace = test->fixup_map_stacktrace;
int *fixup_prog1 = test->fixup_prog1;
int *fixup_prog2 = test->fixup_prog2;
int *fixup_map_in_map = test->fixup_map_in_map;
@@ -13528,40 +13615,40 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type,
* for verifier and not do a runtime lookup, so the only thing
* that really matters is value size in this case.
*/
- if (*fixup_map1) {
+ if (*fixup_map_hash_8b) {
map_fds[0] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
sizeof(long long), 1);
do {
- prog[*fixup_map1].imm = map_fds[0];
- fixup_map1++;
- } while (*fixup_map1);
+ prog[*fixup_map_hash_8b].imm = map_fds[0];
+ fixup_map_hash_8b++;
+ } while (*fixup_map_hash_8b);
}
- if (*fixup_map2) {
+ if (*fixup_map_hash_48b) {
map_fds[1] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
sizeof(struct test_val), 1);
do {
- prog[*fixup_map2].imm = map_fds[1];
- fixup_map2++;
- } while (*fixup_map2);
+ prog[*fixup_map_hash_48b].imm = map_fds[1];
+ fixup_map_hash_48b++;
+ } while (*fixup_map_hash_48b);
}
- if (*fixup_map3) {
+ if (*fixup_map_hash_16b) {
map_fds[2] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
sizeof(struct other_val), 1);
do {
- prog[*fixup_map3].imm = map_fds[2];
- fixup_map3++;
- } while (*fixup_map3);
+ prog[*fixup_map_hash_16b].imm = map_fds[2];
+ fixup_map_hash_16b++;
+ } while (*fixup_map_hash_16b);
}
- if (*fixup_map4) {
+ if (*fixup_map_array_48b) {
map_fds[3] = create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
sizeof(struct test_val), 1);
do {
- prog[*fixup_map4].imm = map_fds[3];
- fixup_map4++;
- } while (*fixup_map4);
+ prog[*fixup_map_array_48b].imm = map_fds[3];
+ fixup_map_array_48b++;
+ } while (*fixup_map_array_48b);
}
if (*fixup_prog1) {
@@ -13603,6 +13690,38 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type,
fixup_percpu_cgroup_storage++;
} while (*fixup_percpu_cgroup_storage);
}
+ if (*fixup_map_sockmap) {
+ map_fds[9] = create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int),
+ sizeof(int), 1);
+ do {
+ prog[*fixup_map_sockmap].imm = map_fds[9];
+ fixup_map_sockmap++;
+ } while (*fixup_map_sockmap);
+ }
+ if (*fixup_map_sockhash) {
+ map_fds[10] = create_map(BPF_MAP_TYPE_SOCKHASH, sizeof(int),
+ sizeof(int), 1);
+ do {
+ prog[*fixup_map_sockhash].imm = map_fds[10];
+ fixup_map_sockhash++;
+ } while (*fixup_map_sockhash);
+ }
+ if (*fixup_map_xskmap) {
+ map_fds[11] = create_map(BPF_MAP_TYPE_XSKMAP, sizeof(int),
+ sizeof(int), 1);
+ do {
+ prog[*fixup_map_xskmap].imm = map_fds[11];
+ fixup_map_xskmap++;
+ } while (*fixup_map_xskmap);
+ }
+ if (*fixup_map_stacktrace) {
+ map_fds[12] = create_map(BPF_MAP_TYPE_STACK_TRACE, sizeof(u32),
+ sizeof(u64), 1);
+ do {
+ prog[*fixup_map_stacktrace].imm = map_fds[12];
+ fixup_map_stacktrace++;
+ } while (fixup_map_stacktrace);
+ }
}
static void do_test_single(struct bpf_test *test, bool unpriv,
diff --git a/tools/testing/selftests/bpf/test_xdp_vlan.c b/tools/testing/selftests/bpf/test_xdp_vlan.c
new file mode 100644
index 000000000000..365a7d2d9f5c
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_vlan.c
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2018 Jesper Dangaard Brouer.
+ *
+ * XDP/TC VLAN manipulation example
+ *
+ * GOTCHA: Remember to disable NIC hardware offloading of VLANs,
+ * else the VLAN tags are NOT inlined in the packet payload:
+ *
+ * # ethtool -K ixgbe2 rxvlan off
+ *
+ * Verify setting:
+ * # ethtool -k ixgbe2 | grep rx-vlan-offload
+ * rx-vlan-offload: off
+ *
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/pkt_cls.h>
+
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+/* linux/if_vlan.h have not exposed this as UAPI, thus mirror some here
+ *
+ * struct vlan_hdr - vlan header
+ * @h_vlan_TCI: priority and VLAN ID
+ * @h_vlan_encapsulated_proto: packet type ID or len
+ */
+struct _vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+#define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */
+#define VLAN_PRIO_SHIFT 13
+#define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */
+#define VLAN_TAG_PRESENT VLAN_CFI_MASK
+#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */
+#define VLAN_N_VID 4096
+
+struct parse_pkt {
+ __u16 l3_proto;
+ __u16 l3_offset;
+ __u16 vlan_outer;
+ __u16 vlan_inner;
+ __u8 vlan_outer_offset;
+ __u8 vlan_inner_offset;
+};
+
+char _license[] SEC("license") = "GPL";
+
+static __always_inline
+bool parse_eth_frame(struct ethhdr *eth, void *data_end, struct parse_pkt *pkt)
+{
+ __u16 eth_type;
+ __u8 offset;
+
+ offset = sizeof(*eth);
+ /* Make sure packet is large enough for parsing eth + 2 VLAN headers */
+ if ((void *)eth + offset + (2*sizeof(struct _vlan_hdr)) > data_end)
+ return false;
+
+ eth_type = eth->h_proto;
+
+ /* Handle outer VLAN tag */
+ if (eth_type == bpf_htons(ETH_P_8021Q)
+ || eth_type == bpf_htons(ETH_P_8021AD)) {
+ struct _vlan_hdr *vlan_hdr;
+
+ vlan_hdr = (void *)eth + offset;
+ pkt->vlan_outer_offset = offset;
+ pkt->vlan_outer = bpf_ntohs(vlan_hdr->h_vlan_TCI)
+ & VLAN_VID_MASK;
+ eth_type = vlan_hdr->h_vlan_encapsulated_proto;
+ offset += sizeof(*vlan_hdr);
+ }
+
+ /* Handle inner (double) VLAN tag */
+ if (eth_type == bpf_htons(ETH_P_8021Q)
+ || eth_type == bpf_htons(ETH_P_8021AD)) {
+ struct _vlan_hdr *vlan_hdr;
+
+ vlan_hdr = (void *)eth + offset;
+ pkt->vlan_inner_offset = offset;
+ pkt->vlan_inner = bpf_ntohs(vlan_hdr->h_vlan_TCI)
+ & VLAN_VID_MASK;
+ eth_type = vlan_hdr->h_vlan_encapsulated_proto;
+ offset += sizeof(*vlan_hdr);
+ }
+
+ pkt->l3_proto = bpf_ntohs(eth_type); /* Convert to host-byte-order */
+ pkt->l3_offset = offset;
+
+ return true;
+}
+
+/* Hint, VLANs are choosen to hit network-byte-order issues */
+#define TESTVLAN 4011 /* 0xFAB */
+// #define TO_VLAN 4000 /* 0xFA0 (hint 0xOA0 = 160) */
+
+SEC("xdp_drop_vlan_4011")
+int xdp_prognum0(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct parse_pkt pkt = { 0 };
+
+ if (!parse_eth_frame(data, data_end, &pkt))
+ return XDP_ABORTED;
+
+ /* Drop specific VLAN ID example */
+ if (pkt.vlan_outer == TESTVLAN)
+ return XDP_ABORTED;
+ /*
+ * Using XDP_ABORTED makes it possible to record this event,
+ * via tracepoint xdp:xdp_exception like:
+ * # perf record -a -e xdp:xdp_exception
+ * # perf script
+ */
+ return XDP_PASS;
+}
+/*
+Commands to setup VLAN on Linux to test packets gets dropped:
+
+ export ROOTDEV=ixgbe2
+ export VLANID=4011
+ ip link add link $ROOTDEV name $ROOTDEV.$VLANID type vlan id $VLANID
+ ip link set dev $ROOTDEV.$VLANID up
+
+ ip link set dev $ROOTDEV mtu 1508
+ ip addr add 100.64.40.11/24 dev $ROOTDEV.$VLANID
+
+Load prog with ip tool:
+
+ ip link set $ROOTDEV xdp off
+ ip link set $ROOTDEV xdp object xdp_vlan01_kern.o section xdp_drop_vlan_4011
+
+*/
+
+/* Changing VLAN to zero, have same practical effect as removing the VLAN. */
+#define TO_VLAN 0
+
+SEC("xdp_vlan_change")
+int xdp_prognum1(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct parse_pkt pkt = { 0 };
+
+ if (!parse_eth_frame(data, data_end, &pkt))
+ return XDP_ABORTED;
+
+ /* Change specific VLAN ID */
+ if (pkt.vlan_outer == TESTVLAN) {
+ struct _vlan_hdr *vlan_hdr = data + pkt.vlan_outer_offset;
+
+ /* Modifying VLAN, preserve top 4 bits */
+ vlan_hdr->h_vlan_TCI =
+ bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000)
+ | TO_VLAN);
+ }
+
+ return XDP_PASS;
+}
+
+/*
+ * Show XDP+TC can cooperate, on creating a VLAN rewriter.
+ * 1. Create a XDP prog that can "pop"/remove a VLAN header.
+ * 2. Create a TC-bpf prog that egress can add a VLAN header.
+ */
+
+#ifndef ETH_ALEN /* Ethernet MAC address length */
+#define ETH_ALEN 6 /* bytes */
+#endif
+#define VLAN_HDR_SZ 4 /* bytes */
+
+SEC("xdp_vlan_remove_outer")
+int xdp_prognum2(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct parse_pkt pkt = { 0 };
+ char *dest;
+
+ if (!parse_eth_frame(data, data_end, &pkt))
+ return XDP_ABORTED;
+
+ /* Skip packet if no outer VLAN was detected */
+ if (pkt.vlan_outer_offset == 0)
+ return XDP_PASS;
+
+ /* Moving Ethernet header, dest overlap with src, memmove handle this */
+ dest = data;
+ dest+= VLAN_HDR_SZ;
+ /*
+ * Notice: Taking over vlan_hdr->h_vlan_encapsulated_proto, by
+ * only moving two MAC addrs (12 bytes), not overwriting last 2 bytes
+ */
+ __builtin_memmove(dest, data, ETH_ALEN * 2);
+ /* Note: LLVM built-in memmove inlining require size to be constant */
+
+ /* Move start of packet header seen by Linux kernel stack */
+ bpf_xdp_adjust_head(ctx, VLAN_HDR_SZ);
+
+ return XDP_PASS;
+}
+
+static __always_inline
+void shift_mac_4bytes_16bit(void *data)
+{
+ __u16 *p = data;
+
+ p[7] = p[5]; /* delete p[7] was vlan_hdr->h_vlan_TCI */
+ p[6] = p[4]; /* delete p[6] was ethhdr->h_proto */
+ p[5] = p[3];
+ p[4] = p[2];
+ p[3] = p[1];
+ p[2] = p[0];
+}
+
+static __always_inline
+void shift_mac_4bytes_32bit(void *data)
+{
+ __u32 *p = data;
+
+ /* Assuming VLAN hdr present. The 4 bytes in p[3] that gets
+ * overwritten, is ethhdr->h_proto and vlan_hdr->h_vlan_TCI.
+ * The vlan_hdr->h_vlan_encapsulated_proto take over role as
+ * ethhdr->h_proto.
+ */
+ p[3] = p[2];
+ p[2] = p[1];
+ p[1] = p[0];
+}
+
+SEC("xdp_vlan_remove_outer2")
+int xdp_prognum3(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *orig_eth = data;
+ struct parse_pkt pkt = { 0 };
+
+ if (!parse_eth_frame(orig_eth, data_end, &pkt))
+ return XDP_ABORTED;
+
+ /* Skip packet if no outer VLAN was detected */
+ if (pkt.vlan_outer_offset == 0)
+ return XDP_PASS;
+
+ /* Simply shift down MAC addrs 4 bytes, overwrite h_proto + TCI */
+ shift_mac_4bytes_32bit(data);
+
+ /* Move start of packet header seen by Linux kernel stack */
+ bpf_xdp_adjust_head(ctx, VLAN_HDR_SZ);
+
+ return XDP_PASS;
+}
+
+/*=====================================
+ * BELOW: TC-hook based ebpf programs
+ * ====================================
+ * The TC-clsact eBPF programs (currently) need to be attach via TC commands
+ */
+
+SEC("tc_vlan_push")
+int _tc_progA(struct __sk_buff *ctx)
+{
+ bpf_skb_vlan_push(ctx, bpf_htons(ETH_P_8021Q), TESTVLAN);
+
+ return TC_ACT_OK;
+}
+/*
+Commands to setup TC to use above bpf prog:
+
+export ROOTDEV=ixgbe2
+export FILE=xdp_vlan01_kern.o
+
+# Re-attach clsact to clear/flush existing role
+tc qdisc del dev $ROOTDEV clsact 2> /dev/null ;\
+tc qdisc add dev $ROOTDEV clsact
+
+# Attach BPF prog EGRESS
+tc filter add dev $ROOTDEV egress \
+ prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push
+
+tc filter show dev $ROOTDEV egress
+*/
diff --git a/tools/testing/selftests/bpf/test_xdp_vlan.sh b/tools/testing/selftests/bpf/test_xdp_vlan.sh
new file mode 100755
index 000000000000..51a3a31d1aac
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_vlan.sh
@@ -0,0 +1,195 @@
+#!/bin/bash
+
+TESTNAME=xdp_vlan
+
+usage() {
+ echo "Testing XDP + TC eBPF VLAN manipulations: $TESTNAME"
+ echo ""
+ echo "Usage: $0 [-vfh]"
+ echo " -v | --verbose : Verbose"
+ echo " --flush : Flush before starting (e.g. after --interactive)"
+ echo " --interactive : Keep netns setup running after test-run"
+ echo ""
+}
+
+cleanup()
+{
+ local status=$?
+
+ if [ "$status" = "0" ]; then
+ echo "selftests: $TESTNAME [PASS]";
+ else
+ echo "selftests: $TESTNAME [FAILED]";
+ fi
+
+ if [ -n "$INTERACTIVE" ]; then
+ echo "Namespace setup still active explore with:"
+ echo " ip netns exec ns1 bash"
+ echo " ip netns exec ns2 bash"
+ exit $status
+ fi
+
+ set +e
+ ip link del veth1 2> /dev/null
+ ip netns del ns1 2> /dev/null
+ ip netns del ns2 2> /dev/null
+}
+
+# Using external program "getopt" to get --long-options
+OPTIONS=$(getopt -o hvfi: \
+ --long verbose,flush,help,interactive,debug -- "$@")
+if (( $? != 0 )); then
+ usage
+ echo "selftests: $TESTNAME [FAILED] Error calling getopt, unknown option?"
+ exit 2
+fi
+eval set -- "$OPTIONS"
+
+## --- Parse command line arguments / parameters ---
+while true; do
+ case "$1" in
+ -v | --verbose)
+ export VERBOSE=yes
+ shift
+ ;;
+ -i | --interactive | --debug )
+ INTERACTIVE=yes
+ shift
+ ;;
+ -f | --flush )
+ cleanup
+ shift
+ ;;
+ -- )
+ shift
+ break
+ ;;
+ -h | --help )
+ usage;
+ echo "selftests: $TESTNAME [SKIP] usage help info requested"
+ exit 0
+ ;;
+ * )
+ shift
+ break
+ ;;
+ esac
+done
+
+if [ "$EUID" -ne 0 ]; then
+ echo "selftests: $TESTNAME [FAILED] need root privileges"
+ exit 1
+fi
+
+ip link set dev lo xdp off 2>/dev/null > /dev/null
+if [ $? -ne 0 ];then
+ echo "selftests: $TESTNAME [SKIP] need ip xdp support"
+ exit 0
+fi
+
+# Interactive mode likely require us to cleanup netns
+if [ -n "$INTERACTIVE" ]; then
+ ip link del veth1 2> /dev/null
+ ip netns del ns1 2> /dev/null
+ ip netns del ns2 2> /dev/null
+fi
+
+# Exit on failure
+set -e
+
+# Some shell-tools dependencies
+which ip > /dev/null
+which tc > /dev/null
+which ethtool > /dev/null
+
+# Make rest of shell verbose, showing comments as doc/info
+if [ -n "$VERBOSE" ]; then
+ set -v
+fi
+
+# Create two namespaces
+ip netns add ns1
+ip netns add ns2
+
+# Run cleanup if failing or on kill
+trap cleanup 0 2 3 6 9
+
+# Create veth pair
+ip link add veth1 type veth peer name veth2
+
+# Move veth1 and veth2 into the respective namespaces
+ip link set veth1 netns ns1
+ip link set veth2 netns ns2
+
+# NOTICE: XDP require VLAN header inside packet payload
+# - Thus, disable VLAN offloading driver features
+# - For veth REMEMBER TX side VLAN-offload
+#
+# Disable rx-vlan-offload (mostly needed on ns1)
+ip netns exec ns1 ethtool -K veth1 rxvlan off
+ip netns exec ns2 ethtool -K veth2 rxvlan off
+#
+# Disable tx-vlan-offload (mostly needed on ns2)
+ip netns exec ns2 ethtool -K veth2 txvlan off
+ip netns exec ns1 ethtool -K veth1 txvlan off
+
+export IPADDR1=100.64.41.1
+export IPADDR2=100.64.41.2
+
+# In ns1/veth1 add IP-addr on plain net_device
+ip netns exec ns1 ip addr add ${IPADDR1}/24 dev veth1
+ip netns exec ns1 ip link set veth1 up
+
+# In ns2/veth2 create VLAN device
+export VLAN=4011
+export DEVNS2=veth2
+ip netns exec ns2 ip link add link $DEVNS2 name $DEVNS2.$VLAN type vlan id $VLAN
+ip netns exec ns2 ip addr add ${IPADDR2}/24 dev $DEVNS2.$VLAN
+ip netns exec ns2 ip link set $DEVNS2 up
+ip netns exec ns2 ip link set $DEVNS2.$VLAN up
+
+# Bringup lo in netns (to avoids confusing people using --interactive)
+ip netns exec ns1 ip link set lo up
+ip netns exec ns2 ip link set lo up
+
+# At this point, the hosts cannot reach each-other,
+# because ns2 are using VLAN tags on the packets.
+
+ip netns exec ns2 sh -c 'ping -W 1 -c 1 100.64.41.1 || echo "Okay ping fails"'
+
+
+# Now we can use the test_xdp_vlan.c program to pop/push these VLAN tags
+# ----------------------------------------------------------------------
+# In ns1: ingress use XDP to remove VLAN tags
+export DEVNS1=veth1
+export FILE=test_xdp_vlan.o
+
+# First test: Remove VLAN by setting VLAN ID 0, using "xdp_vlan_change"
+export XDP_PROG=xdp_vlan_change
+ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG
+
+# In ns1: egress use TC to add back VLAN tag 4011
+# (del cmd)
+# tc qdisc del dev $DEVNS1 clsact 2> /dev/null
+#
+ip netns exec ns1 tc qdisc add dev $DEVNS1 clsact
+ip netns exec ns1 tc filter add dev $DEVNS1 egress \
+ prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push
+
+# Now the namespaces can reach each-other, test with ping:
+ip netns exec ns2 ping -W 2 -c 3 $IPADDR1
+ip netns exec ns1 ping -W 2 -c 3 $IPADDR2
+
+# Second test: Replace xdp prog, that fully remove vlan header
+#
+# Catch kernel bug for generic-XDP, that does didn't allow us to
+# remove a VLAN header, because skb->protocol still contain VLAN
+# ETH_P_8021Q indication, and this cause overwriting of our changes.
+#
+export XDP_PROG=xdp_vlan_remove_outer2
+ip netns exec ns1 ip link set $DEVNS1 xdp off
+ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG
+
+# Now the namespaces should still be able reach each-other, test with ping:
+ip netns exec ns2 ping -W 2 -c 3 $IPADDR1
+ip netns exec ns1 ping -W 2 -c 3 $IPADDR2