author    Jakub Kicinski <kuba@kernel.org>  2025-11-11 03:43:51 +0300
committer Jakub Kicinski <kuba@kernel.org>  2025-11-11 03:43:51 +0300
commit    7fc2bf8d30beebceac5841ad2227e0bd41abdf27 (patch)
tree      2c15b4330839599ba6867a144102f9c0164f6cf4
parent    38f073a71e85c726b09f935e7886de72bc57b15b (diff)
parent    67f4cfb530150387dedc13bac7e2ab7f1a525d7f (diff)
Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Martin KaFai Lau says:

====================
pull-request: bpf-next 2025-11-10

We've added 19 non-merge commits during the last 3 day(s) which contain
a total of 22 files changed, 1345 insertions(+), 197 deletions(-).

The main changes are:

1) Preserve skb metadata after a TC BPF program has changed the skb,
   from Jakub Sitnicki.

   This allows a TC program at the end of a TC filter chain to still see
   the skb metadata, even if another TC program at the front of the chain
   has changed the skb using BPF helpers.

2) Initial af_smc bpf_struct_ops support to control the smc specific
   syn/synack options, from D. Wythe.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
  bpf/selftests: Add selftest for bpf_smc_hs_ctrl
  net/smc: bpf: Introduce generic hook for handshake flow
  bpf: Export necessary symbols for modules with struct_ops
  selftests/bpf: Cover skb metadata access after bpf_skb_change_proto
  selftests/bpf: Cover skb metadata access after change_head/tail helper
  selftests/bpf: Cover skb metadata access after bpf_skb_adjust_room
  selftests/bpf: Cover skb metadata access after vlan push/pop helper
  selftests/bpf: Expect unclone to preserve skb metadata
  selftests/bpf: Dump skb metadata on verification failure
  selftests/bpf: Verify skb metadata in BPF instead of userspace
  bpf: Make bpf_skb_change_head helper metadata-safe
  bpf: Make bpf_skb_change_proto helper metadata-safe
  bpf: Make bpf_skb_adjust_room metadata-safe
  bpf: Make bpf_skb_vlan_push helper metadata-safe
  bpf: Make bpf_skb_vlan_pop helper metadata-safe
  vlan: Make vlan_remove_tag return nothing
  bpf: Unclone skb head on bpf_dynptr_write to skb metadata
  net: Preserve metadata on pskb_expand_head
  net: Helper to move packet data and metadata after skb_push/pull
====================

Link: https://patch.msgid.link/20251110232427.3929291-1-martin.lau@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--  include/linux/filter.h                                          |   9
-rw-r--r--  include/linux/if_vlan.h                                         |  13
-rw-r--r--  include/linux/skbuff.h                                          |  75
-rw-r--r--  include/net/netns/smc.h                                         |   3
-rw-r--r--  include/net/smc.h                                               |  53
-rw-r--r--  kernel/bpf/bpf_struct_ops.c                                     |   2
-rw-r--r--  kernel/bpf/helpers.c                                            |   6
-rw-r--r--  kernel/bpf/syscall.c                                            |   1
-rw-r--r--  net/core/filter.c                                               |  34
-rw-r--r--  net/core/skbuff.c                                               |   6
-rw-r--r--  net/ipv4/tcp_output.c                                           |  31
-rw-r--r--  net/smc/Kconfig                                                 |  10
-rw-r--r--  net/smc/Makefile                                                |   1
-rw-r--r--  net/smc/af_smc.c                                                |   9
-rw-r--r--  net/smc/smc_hs_bpf.c                                            | 140
-rw-r--r--  net/smc/smc_hs_bpf.h                                            |  31
-rw-r--r--  net/smc/smc_sysctl.c                                            |  91
-rw-r--r--  tools/testing/selftests/bpf/config                              |   5
-rw-r--r--  tools/testing/selftests/bpf/prog_tests/test_bpf_smc.c           | 390
-rw-r--r--  tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c   | 129
-rw-r--r--  tools/testing/selftests/bpf/progs/bpf_smc.c                     | 117
-rw-r--r--  tools/testing/selftests/bpf/progs/test_xdp_meta.c               | 386
22 files changed, 1345 insertions(+), 197 deletions(-)
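To make change 1) concrete: a TC program late in the filter chain can keep reading skb metadata through the bpf_dynptr_from_skb_meta() kfunc even after an earlier program has reshaped the packet. A minimal hedged sketch (the program name and the 4-byte tag layout are hypothetical, not part of the patch):

/* read_meta.bpf.c */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
				    struct bpf_dynptr *ptr__uninit) __ksym;

SEC("tc")
int read_meta(struct __sk_buff *skb)
{
	struct bpf_dynptr meta;
	__u32 tag;

	/* With this series, metadata set by XDP (or an earlier TC prog)
	 * is still intact here, even after helpers such as
	 * bpf_skb_adjust_room() ran in front of us.
	 */
	bpf_dynptr_from_skb_meta(skb, 0, &meta);
	if (bpf_dynptr_read(&tag, sizeof(tag), &meta, 0, 0))
		return TC_ACT_OK;	/* no or too-short metadata */

	return tag ? TC_ACT_SHOT : TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";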
diff --git a/include/linux/filter.h b/include/linux/filter.h
index e116de7edc58..a104b3994230 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1781,6 +1781,8 @@ int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len);
void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
void *buf, unsigned long len, bool flush);
+int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset,
+ const void *from, u32 len, u64 flags);
void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset);
#else /* CONFIG_NET */
static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
@@ -1817,6 +1819,13 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi
{
}
+static inline int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset,
+ const void *from, u32 len,
+ u64 flags)
+{
+ return -EOPNOTSUPP;
+}
+
static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
{
return ERR_PTR(-EOPNOTSUPP);
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 15e01935d3fa..f7f34eb15e06 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -355,16 +355,17 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
__be16 vlan_proto, u16 vlan_tci,
unsigned int mac_len)
{
+ const u8 meta_len = mac_len > ETH_TLEN ? skb_metadata_len(skb) : 0;
struct vlan_ethhdr *veth;
- if (skb_cow_head(skb, VLAN_HLEN) < 0)
+ if (skb_cow_head(skb, meta_len + VLAN_HLEN) < 0)
return -ENOMEM;
skb_push(skb, VLAN_HLEN);
/* Move the mac header sans proto to the beginning of the new header. */
if (likely(mac_len > ETH_TLEN))
- memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
+ skb_postpush_data_move(skb, VLAN_HLEN, mac_len - ETH_TLEN);
if (skb_mac_header_was_set(skb))
skb->mac_header -= VLAN_HLEN;
@@ -731,18 +732,16 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb,
*
* Expects the skb to contain a VLAN tag in the payload, and to have skb->data
* pointing at the MAC header.
- *
- * Returns: a new pointer to skb->data, or NULL on failure to pull.
*/
-static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
+static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
{
struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
*vlan_tci = ntohs(vhdr->h_vlan_TCI);
- memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
vlan_set_encap_proto(skb, vhdr);
- return __skb_pull(skb, VLAN_HLEN);
+ __skb_pull(skb, VLAN_HLEN);
+ skb_postpull_data_move(skb, VLAN_HLEN, 2 * ETH_ALEN);
}
/**
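Since vlan_remove_tag() no longer returns the new skb->data, callers consume only the TCI out-parameter. A hedged caller-side sketch modeled on the skb_vlan_untag() pattern (not taken verbatim from this patch):

	__be16 vlan_proto = skb->protocol;
	u16 vlan_tci;

	vlan_remove_tag(skb, &vlan_tci);
	__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);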
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a7cc3d1f4fd1..ff90281ddf90 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4564,6 +4564,81 @@ static inline void skb_metadata_clear(struct sk_buff *skb)
skb_metadata_set(skb, 0);
}
+/**
+ * skb_data_move - Move packet data and metadata after skb_push() or skb_pull().
+ * @skb: packet to operate on
+ * @len: number of bytes pushed or pulled from &sk_buff->data
+ * @n: number of bytes to memmove() from pre-push/pull &sk_buff->data
+ *
+ * Moves @n bytes of packet data (which can be zero) and all bytes of skb metadata.
+ *
+ * Assumes metadata is located immediately before &sk_buff->data prior to the
+ * push/pull, and that sufficient headroom exists to hold it after an
+ * skb_push(). Otherwise, metadata is cleared and a one-time warning is issued.
+ *
+ * Prefer skb_postpull_data_move() or skb_postpush_data_move() to calling this
+ * helper directly.
+ */
+static inline void skb_data_move(struct sk_buff *skb, const int len,
+ const unsigned int n)
+{
+ const u8 meta_len = skb_metadata_len(skb);
+ u8 *meta, *meta_end;
+
+ if (!len || (!n && !meta_len))
+ return;
+
+ if (!meta_len)
+ goto no_metadata;
+
+ meta_end = skb_metadata_end(skb);
+ meta = meta_end - meta_len;
+
+ if (WARN_ON_ONCE(meta_end + len != skb->data ||
+ meta_len > skb_headroom(skb))) {
+ skb_metadata_clear(skb);
+ goto no_metadata;
+ }
+
+ memmove(meta + len, meta, meta_len + n);
+ return;
+
+no_metadata:
+ memmove(skb->data, skb->data - len, n);
+}
+
+/**
+ * skb_postpull_data_move - Move packet data and metadata after skb_pull().
+ * @skb: packet to operate on
+ * @len: number of bytes pulled from &sk_buff->data
+ * @n: number of bytes to memmove() from pre-pull &sk_buff->data
+ *
+ * See skb_data_move() for details.
+ */
+static inline void skb_postpull_data_move(struct sk_buff *skb,
+ const unsigned int len,
+ const unsigned int n)
+{
+ DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
+ skb_data_move(skb, len, n);
+}
+
+/**
+ * skb_postpush_data_move - Move packet data and metadata after skb_push().
+ * @skb: packet to operate on
+ * @len: number of bytes pushed onto &sk_buff->data
+ * @n: number of bytes to memmove() from pre-push &sk_buff->data
+ *
+ * See skb_data_move() for details.
+ */
+static inline void skb_postpush_data_move(struct sk_buff *skb,
+ const unsigned int len,
+ const unsigned int n)
+{
+ DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
+ skb_data_move(skb, -len, n);
+}
+
struct sk_buff *skb_clone_sk(struct sk_buff *skb);
#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
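To illustrate the calling convention the new helpers expect, here is a hedged sketch of a hypothetical header-push path (example_push_header() is not in the tree; it mirrors bpf_skb_generic_push() in net/core/filter.c below). Headroom is grown by meta_len + hdr_len before the push, then skb_postpush_data_move() shifts the metadata and the leading packet bytes into place:

static int example_push_header(struct sk_buff *skb, u32 hdr_len, u32 mac_len)
{
	/* reserve room for the metadata as well as the new header */
	if (skb_cow_head(skb, skb_metadata_len(skb) + hdr_len))
		return -ENOMEM;

	skb_push(skb, hdr_len);
	/* move metadata plus mac_len bytes of packet data down by hdr_len */
	skb_postpush_data_move(skb, hdr_len, mac_len);
	memset(skb->data + mac_len, 0, hdr_len);	/* new header space */
	return 0;
}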
diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index 6ceb12baec24..ed24c9f638ee 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -17,6 +17,9 @@ struct netns_smc {
#ifdef CONFIG_SYSCTL
struct ctl_table_header *smc_hdr;
#endif
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+ struct smc_hs_ctrl __rcu *hs_ctrl;
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
unsigned int sysctl_autocorking_size;
unsigned int sysctl_smcr_buf_type;
int sysctl_smcr_testlink_time;
diff --git a/include/net/smc.h b/include/net/smc.h
index 08bee529ed8d..bfdc4c41f019 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -17,6 +17,8 @@
#include <linux/wait.h>
#include <linux/dibs.h>
+struct tcp_sock;
+struct inet_request_sock;
struct sock;
#define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */
@@ -50,4 +52,55 @@ struct smcd_dev {
u8 going_away : 1;
};
+#define SMC_HS_CTRL_NAME_MAX 16
+
+enum {
+ /* ctrl can be inherited from init_net */
+ SMC_HS_CTRL_FLAG_INHERITABLE = 0x1,
+
+ SMC_HS_CTRL_ALL_FLAGS = SMC_HS_CTRL_FLAG_INHERITABLE,
+};
+
+struct smc_hs_ctrl {
+ /* private */
+
+ struct list_head list;
+ struct module *owner;
+
+ /* public */
+
+ /* unique name */
+ char name[SMC_HS_CTRL_NAME_MAX];
+ int flags;
+
+ /* Invoked before computing the SMC option for SYN packets.
+ * The return value controls whether the SMC option is set:
+ * return 0 to disable SMC, or any other value to enable it.
+ */
+ int (*syn_option)(struct tcp_sock *tp);
+
+ /* Invoked before setting up the SMC option for SYN-ACK packets.
+ * The return value controls whether the SMC option is sent in the
+ * response: return 0 to disable SMC, or any other value to enable it.
+ */
+ int (*synack_option)(const struct tcp_sock *tp,
+ struct inet_request_sock *ireq);
+};
+
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+#define smc_call_hsbpf(init_val, tp, func, ...) ({ \
+ typeof(init_val) __ret = (init_val); \
+ struct smc_hs_ctrl *ctrl; \
+ rcu_read_lock(); \
+ ctrl = rcu_dereference(sock_net((struct sock *)(tp))->smc.hs_ctrl); \
+ if (ctrl && ctrl->func) \
+ __ret = ctrl->func(tp, ##__VA_ARGS__); \
+ rcu_read_unlock(); \
+ __ret; \
+})
+#else
+#define smc_call_hsbpf(init_val, tp, ...) ({ (void)(tp); (init_val); })
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
#endif /* _SMC_H */
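Usage of the macro matches the call sites added to net/ipv4/tcp_output.c later in this diff: the first argument is the result to fall back to when no ctrl is installed (1 keeps SMC enabled), and any extra arguments are forwarded to the op:

	tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option);
	ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq);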
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index a41e6730edcf..278490683d28 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1162,6 +1162,7 @@ bool bpf_struct_ops_get(const void *kdata)
map = __bpf_map_inc_not_zero(&st_map->map, false);
return !IS_ERR(map);
}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_get);
void bpf_struct_ops_put(const void *kdata)
{
@@ -1173,6 +1174,7 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_put);
u32 bpf_struct_ops_id(const void *kdata)
{
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index eb25e70e0bdc..3e830fd31f5f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1842,10 +1842,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
return -EINVAL;
return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
case BPF_DYNPTR_TYPE_SKB_META:
- if (flags)
- return -EINVAL;
- memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len);
- return 0;
+ return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src,
+ len, flags);
default:
WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
return -EFAULT;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8a129746bd6c..80b86e9d3c39 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1234,6 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
return src - orig_src;
}
+EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
int map_check_no_btf(const struct bpf_map *map,
const struct btf *btf,
diff --git a/net/core/filter.c b/net/core/filter.c
index 52721efba332..4124becf8604 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3253,11 +3253,11 @@ static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
- /* Caller already did skb_cow() with len as headroom,
+ /* Caller already did skb_cow() with meta_len+len as headroom,
* so no need to do it here.
*/
skb_push(skb, len);
- memmove(skb->data, skb->data + len, off);
+ skb_postpush_data_move(skb, len, off);
memset(skb->data + off, 0, len);
/* No skb_postpush_rcsum(skb, skb->data + off, len)
@@ -3281,7 +3281,7 @@ static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
old_data = skb->data;
__skb_pull(skb, len);
skb_postpull_rcsum(skb, old_data + off, len);
- memmove(skb->data, old_data, off);
+ skb_postpull_data_move(skb, len, off);
return 0;
}
@@ -3326,10 +3326,11 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ const u8 meta_len = skb_metadata_len(skb);
u32 off = skb_mac_header_len(skb);
int ret;
- ret = skb_cow(skb, len_diff);
+ ret = skb_cow(skb, meta_len + len_diff);
if (unlikely(ret < 0))
return ret;
@@ -3489,6 +3490,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
u16 mac_len = 0, inner_net = 0, inner_trans = 0;
+ const u8 meta_len = skb_metadata_len(skb);
unsigned int gso_type = SKB_GSO_DODGY;
int ret;
@@ -3499,7 +3501,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
return -ENOTSUPP;
}
- ret = skb_cow_head(skb, len_diff);
+ ret = skb_cow_head(skb, meta_len + len_diff);
if (unlikely(ret < 0))
return ret;
@@ -3873,6 +3875,7 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = {
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
u64 flags)
{
+ const u8 meta_len = skb_metadata_len(skb);
u32 max_len = BPF_SKB_MAX_LEN;
u32 new_len = skb->len + head_room;
int ret;
@@ -3882,7 +3885,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
new_len < skb->len))
return -EINVAL;
- ret = skb_cow(skb, head_room);
+ ret = skb_cow(skb, meta_len + head_room);
if (likely(!ret)) {
/* Idea for this helper is that we currently only
* allow to expand on mac header. This means that
@@ -3894,6 +3897,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
* for redirection into L2 device.
*/
__skb_push(skb, head_room);
+ skb_postpush_data_move(skb, head_room, 0);
memset(skb->data, 0, head_room);
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
@@ -12102,6 +12106,18 @@ void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
return skb_metadata_end(skb) - skb_metadata_len(skb) + offset;
}
+int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset,
+ const void *from, u32 len, u64 flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ if (unlikely(bpf_try_make_writable(skb, 0)))
+ return -EFAULT;
+
+ memmove(bpf_skb_meta_pointer(skb, offset), from, len);
+ return 0;
+}
+
__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
struct bpf_dynptr *ptr__uninit)
@@ -12129,9 +12145,6 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
* XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to
* &__sk_buff->data_meta.
*
- * If passed @skb_ is a clone which shares the data with the original, the
- * dynptr will be read-only. This limitation may be lifted in the future.
- *
* Return:
* * %0 - dynptr ready to use
* * %-EINVAL - invalid flags, dynptr set to null
@@ -12149,9 +12162,6 @@ __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags,
bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb));
- if (skb_cloned(skb))
- bpf_dynptr_set_rdonly(ptr);
-
return 0;
}
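On the BPF side, a dynptr write to skb metadata now goes through __bpf_skb_meta_store_bytes(), which unclones the head instead of the dynptr being forced read-only on clones. A hedged sketch (program name and tag value are hypothetical):

/* write_meta.bpf.c */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
				    struct bpf_dynptr *ptr__uninit) __ksym;

SEC("tc")
int write_meta(struct __sk_buff *skb)
{
	__u32 tag = 0xdeadbeef;
	struct bpf_dynptr meta;

	bpf_dynptr_from_skb_meta(skb, 0, &meta);
	/* Now works on cloned skbs too; flags must be zero */
	if (bpf_dynptr_write(&meta, 0, &tag, sizeof(tag), 0))
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";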
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7ac5f8aa1235..d95658b738d1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2234,6 +2234,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
*
* All the pointers pointing into skb header may change and must be
* reloaded after call to this function.
+ *
+ * Note: If you skb_push() the start of the buffer after reallocating the
+ * header, call skb_postpush_data_move() first to move the metadata out of
+ * the way before writing to &sk_buff->data.
*/
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
@@ -2305,8 +2309,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
- skb_metadata_clear(skb);
-
/* It is not generally safe to change skb->truesize.
* For the moment, we really care of rx path, or
* when skb is orphaned (not attached to a socket).
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f5df7a71f62..479afb714bdf 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -40,6 +40,7 @@
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/mptcp.h>
+#include <net/smc.h>
#include <net/proto_memory.h>
#include <net/psp.h>
@@ -802,34 +803,36 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
mptcp_options_write(th, ptr, tp, opts);
}
-static void smc_set_option(const struct tcp_sock *tp,
+static void smc_set_option(struct tcp_sock *tp,
struct tcp_out_options *opts,
unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (tp->syn_smc) {
- if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
- opts->options |= OPTION_SMC;
- *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
- }
+ if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) {
+ tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option);
+ /* re-check syn_smc */
+ if (tp->syn_smc &&
+ *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+ opts->options |= OPTION_SMC;
+ *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
}
#endif
}
static void smc_set_option_cond(const struct tcp_sock *tp,
- const struct inet_request_sock *ireq,
+ struct inet_request_sock *ireq,
struct tcp_out_options *opts,
unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (tp->syn_smc && ireq->smc_ok) {
- if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
- opts->options |= OPTION_SMC;
- *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
- }
+ if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) {
+ ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq);
+ /* re-check smc_ok */
+ if (ireq->smc_ok &&
+ *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+ opts->options |= OPTION_SMC;
+ *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
}
#endif
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index 99ecd59d1f4b..325addf83cc6 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -19,3 +19,13 @@ config SMC_DIAG
smcss.
if unsure, say Y.
+
+config SMC_HS_CTRL_BPF
+ bool "Generic eBPF hook for SMC handshake flow"
+ depends on SMC && BPF_SYSCALL
+ default y
+ help
+ SMC_HS_CTRL_BPF enables support for registering a generic eBPF hook for
+ the SMC handshake flow, which offers much greater flexibility in modifying
+ the behavior of the SMC protocol stack than a purely kernel-based approach.
+ Select this option if you want to filter the handshake process via eBPF
+ programs.
\ No newline at end of file
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 0e754cbc38f9..5368634c5dd6 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -6,3 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
smc-y += smc_tracepoint.o smc_inet.o
smc-$(CONFIG_SYSCTL) += smc_sysctl.o
+smc-$(CONFIG_SMC_HS_CTRL_BPF) += smc_hs_bpf.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 0ef3e16a8517..e388de8dca09 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -58,6 +58,7 @@
#include "smc_tracepoint.h"
#include "smc_sysctl.h"
#include "smc_inet.h"
+#include "smc_hs_bpf.h"
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
* creation on server
@@ -3600,8 +3601,16 @@ static int __init smc_init(void)
pr_err("%s: smc_inet_init fails with %d\n", __func__, rc);
goto out_ulp;
}
+ rc = bpf_smc_hs_ctrl_init();
+ if (rc) {
+ pr_err("%s: bpf_smc_hs_ctrl_init fails with %d\n", __func__,
+ rc);
+ goto out_inet;
+ }
static_branch_enable(&tcp_have_smc);
return 0;
+out_inet:
+ smc_inet_exit();
out_ulp:
tcp_unregister_ulp(&smc_ulp_ops);
out_ib:
diff --git a/net/smc/smc_hs_bpf.c b/net/smc/smc_hs_bpf.c
new file mode 100644
index 000000000000..063d23d85850
--- /dev/null
+++ b/net/smc/smc_hs_bpf.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Generic hook for SMC handshake flow.
+ *
+ * Copyright IBM Corp. 2016
+ * Copyright (c) 2025, Alibaba Inc.
+ *
+ * Author: D. Wythe <alibuda@linux.alibaba.com>
+ */
+
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/rculist.h>
+
+#include "smc_hs_bpf.h"
+
+static DEFINE_SPINLOCK(smc_hs_ctrl_list_lock);
+static LIST_HEAD(smc_hs_ctrl_list);
+
+static int smc_hs_ctrl_reg(struct smc_hs_ctrl *ctrl)
+{
+ int ret = 0;
+
+ spin_lock(&smc_hs_ctrl_list_lock);
+ /* already exist or duplicate name */
+ if (smc_hs_ctrl_find_by_name(ctrl->name))
+ ret = -EEXIST;
+ else
+ list_add_tail_rcu(&ctrl->list, &smc_hs_ctrl_list);
+ spin_unlock(&smc_hs_ctrl_list_lock);
+ return ret;
+}
+
+static void smc_hs_ctrl_unreg(struct smc_hs_ctrl *ctrl)
+{
+ spin_lock(&smc_hs_ctrl_list_lock);
+ list_del_rcu(&ctrl->list);
+ spin_unlock(&smc_hs_ctrl_list_lock);
+
+ /* Ensure that all readers have completed */
+ synchronize_rcu();
+}
+
+struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name)
+{
+ struct smc_hs_ctrl *ctrl;
+
+ list_for_each_entry_rcu(ctrl, &smc_hs_ctrl_list, list) {
+ if (strcmp(ctrl->name, name) == 0)
+ return ctrl;
+ }
+ return NULL;
+}
+
+static int __smc_bpf_stub_set_tcp_option(struct tcp_sock *tp) { return 1; }
+static int __smc_bpf_stub_set_tcp_option_cond(const struct tcp_sock *tp,
+ struct inet_request_sock *ireq)
+{
+ return 1;
+}
+
+static struct smc_hs_ctrl __smc_bpf_hs_ctrl = {
+ .syn_option = __smc_bpf_stub_set_tcp_option,
+ .synack_option = __smc_bpf_stub_set_tcp_option_cond,
+};
+
+static int smc_bpf_hs_ctrl_init(struct btf *btf) { return 0; }
+
+static int smc_bpf_hs_ctrl_reg(void *kdata, struct bpf_link *link)
+{
+ if (link)
+ return -EOPNOTSUPP;
+
+ return smc_hs_ctrl_reg(kdata);
+}
+
+static void smc_bpf_hs_ctrl_unreg(void *kdata, struct bpf_link *link)
+{
+ smc_hs_ctrl_unreg(kdata);
+}
+
+static int smc_bpf_hs_ctrl_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct smc_hs_ctrl *u_ctrl;
+ struct smc_hs_ctrl *k_ctrl;
+ u32 moff;
+
+ u_ctrl = (const struct smc_hs_ctrl *)udata;
+ k_ctrl = (struct smc_hs_ctrl *)kdata;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ switch (moff) {
+ case offsetof(struct smc_hs_ctrl, name):
+ if (bpf_obj_name_cpy(k_ctrl->name, u_ctrl->name,
+ sizeof(u_ctrl->name)) <= 0)
+ return -EINVAL;
+ return 1;
+ case offsetof(struct smc_hs_ctrl, flags):
+ if (u_ctrl->flags & ~SMC_HS_CTRL_ALL_FLAGS)
+ return -EINVAL;
+ k_ctrl->flags = u_ctrl->flags;
+ return 1;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto *
+bpf_smc_hs_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id, prog);
+}
+
+static const struct bpf_verifier_ops smc_bpf_verifier_ops = {
+ .get_func_proto = bpf_smc_hs_func_proto,
+ .is_valid_access = bpf_tracing_btf_ctx_access,
+};
+
+static struct bpf_struct_ops bpf_smc_hs_ctrl_ops = {
+ .name = "smc_hs_ctrl",
+ .init = smc_bpf_hs_ctrl_init,
+ .reg = smc_bpf_hs_ctrl_reg,
+ .unreg = smc_bpf_hs_ctrl_unreg,
+ .cfi_stubs = &__smc_bpf_hs_ctrl,
+ .verifier_ops = &smc_bpf_verifier_ops,
+ .init_member = smc_bpf_hs_ctrl_init_member,
+ .owner = THIS_MODULE,
+};
+
+int bpf_smc_hs_ctrl_init(void)
+{
+ return register_bpf_struct_ops(&bpf_smc_hs_ctrl_ops, smc_hs_ctrl);
+}
diff --git a/net/smc/smc_hs_bpf.h b/net/smc/smc_hs_bpf.h
new file mode 100644
index 000000000000..f5f1807c079e
--- /dev/null
+++ b/net/smc/smc_hs_bpf.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Generic hook for SMC handshake flow.
+ *
+ * Copyright IBM Corp. 2016
+ * Copyright (c) 2025, Alibaba Inc.
+ *
+ * Author: D. Wythe <alibuda@linux.alibaba.com>
+ */
+
+#ifndef __SMC_HS_CTRL
+#define __SMC_HS_CTRL
+
+#include <net/smc.h>
+
+/* Find hs_ctrl by the target name, which is required to be a C string.
+ * Returns NULL if no such ctrl was found; otherwise, returns a valid ctrl.
+ *
+ * Note: Caller MUST invoke this under rcu_read_lock.
+ */
+struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name);
+
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+int bpf_smc_hs_ctrl_init(void);
+#else
+static inline int bpf_smc_hs_ctrl_init(void) { return 0; }
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
+#endif /* __SMC_HS_CTRL */
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
index 7b2471904d04..b1efed546243 100644
--- a/net/smc/smc_sysctl.c
+++ b/net/smc/smc_sysctl.c
@@ -12,12 +12,14 @@
#include <linux/init.h>
#include <linux/sysctl.h>
+#include <linux/bpf.h>
#include <net/net_namespace.h>
#include "smc.h"
#include "smc_core.h"
#include "smc_llc.h"
#include "smc_sysctl.h"
+#include "smc_hs_bpf.h"
static int min_sndbuf = SMC_BUF_MIN_SIZE;
static int min_rcvbuf = SMC_BUF_MIN_SIZE;
@@ -32,6 +34,69 @@ static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX;
static unsigned int smcr_max_wr_min = 2;
static unsigned int smcr_max_wr_max = 2048;
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+static int smc_net_replace_smc_hs_ctrl(struct net *net, const char *name)
+{
+ struct smc_hs_ctrl *ctrl = NULL;
+
+ rcu_read_lock();
+ /* A NULL or empty name asks to clear the current ctrl */
+ if (name && name[0]) {
+ ctrl = smc_hs_ctrl_find_by_name(name);
+ if (!ctrl) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ /* no change, just return */
+ if (ctrl == rcu_dereference(net->smc.hs_ctrl)) {
+ rcu_read_unlock();
+ return 0;
+ }
+ if (!bpf_try_module_get(ctrl, ctrl->owner)) {
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+ }
+ /* exchange the old ctrl with the new one atomically */
+ ctrl = unrcu_pointer(xchg(&net->smc.hs_ctrl, RCU_INITIALIZER(ctrl)));
+ /* release old ctrl */
+ if (ctrl)
+ bpf_module_put(ctrl, ctrl->owner);
+
+ rcu_read_unlock();
+ return 0;
+}
+
+static int proc_smc_hs_ctrl(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(ctl->data, struct net, smc.hs_ctrl);
+ char val[SMC_HS_CTRL_NAME_MAX];
+ const struct ctl_table tbl = {
+ .data = val,
+ .maxlen = SMC_HS_CTRL_NAME_MAX,
+ };
+ struct smc_hs_ctrl *ctrl;
+ int ret;
+
+ rcu_read_lock();
+ ctrl = rcu_dereference(net->smc.hs_ctrl);
+ if (ctrl)
+ memcpy(val, ctrl->name, sizeof(ctrl->name));
+ else
+ val[0] = '\0';
+ rcu_read_unlock();
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (write)
+ ret = smc_net_replace_smc_hs_ctrl(net, val);
+ return ret;
+}
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
static struct ctl_table smc_table[] = {
{
.procname = "autocorking_size",
@@ -119,6 +184,15 @@ static struct ctl_table smc_table[] = {
.extra1 = &smcr_max_wr_min,
.extra2 = &smcr_max_wr_max,
},
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+ {
+ .procname = "hs_ctrl",
+ .data = &init_net.smc.hs_ctrl,
+ .mode = 0644,
+ .maxlen = SMC_HS_CTRL_NAME_MAX,
+ .proc_handler = proc_smc_hs_ctrl,
+ },
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
};
int __net_init smc_sysctl_net_init(struct net *net)
@@ -129,6 +203,16 @@ int __net_init smc_sysctl_net_init(struct net *net)
table = smc_table;
if (!net_eq(net, &init_net)) {
int i;
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+ struct smc_hs_ctrl *ctrl;
+
+ rcu_read_lock();
+ ctrl = rcu_dereference(init_net.smc.hs_ctrl);
+ if (ctrl && ctrl->flags & SMC_HS_CTRL_FLAG_INHERITABLE &&
+ bpf_try_module_get(ctrl, ctrl->owner))
+ rcu_assign_pointer(net->smc.hs_ctrl, ctrl);
+ rcu_read_unlock();
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
table = kmemdup(table, sizeof(smc_table), GFP_KERNEL);
if (!table)
@@ -161,6 +245,9 @@ err_reg:
if (!net_eq(net, &init_net))
kfree(table);
err_alloc:
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+ smc_net_replace_smc_hs_ctrl(net, NULL);
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
return -ENOMEM;
}
@@ -170,6 +257,10 @@ void __net_exit smc_sysctl_net_exit(struct net *net)
table = net->smc.smc_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->smc.smc_hdr);
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+ smc_net_replace_smc_hs_ctrl(net, NULL);
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
if (!net_eq(net, &init_net))
kfree(table);
}
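Administratively, a registered ctrl is selected per net namespace through the new sysctl. A hedged userspace fragment (assumes fcntl.h, unistd.h and string.h; the selftest below does the equivalent via write_sysctl()):

	/* select the "linkcheck" ctrl for the current netns */
	int fd = open("/proc/sys/net/smc/hs_ctrl", O_WRONLY);

	if (fd >= 0) {
		write(fd, "linkcheck", strlen("linkcheck"));
		close(fd);
	}
	/* writing an empty string clears the currently installed ctrl */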
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 70b28c1e653e..fcd2f9bf78c9 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -123,3 +123,8 @@ CONFIG_XDP_SOCKETS=y
CONFIG_XFRM_INTERFACE=y
CONFIG_TCP_CONG_DCTCP=y
CONFIG_TCP_CONG_BBR=y
+CONFIG_INFINIBAND=y
+CONFIG_SMC=y
+CONFIG_SMC_HS_CTRL_BPF=y
+CONFIG_DIBS=y
+CONFIG_DIBS_LO=y
\ No newline at end of file
diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpf_smc.c b/tools/testing/selftests/bpf/prog_tests/test_bpf_smc.c
new file mode 100644
index 000000000000..de22734abc4d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_bpf_smc.c
@@ -0,0 +1,390 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <linux/genetlink.h>
+#include "network_helpers.h"
+#include "bpf_smc.skel.h"
+
+#ifndef IPPROTO_SMC
+#define IPPROTO_SMC 256
+#endif
+
+#define CLIENT_IP "127.0.0.1"
+#define SERVER_IP "127.0.1.0"
+#define SERVER_IP_VIA_RISK_PATH "127.0.2.0"
+
+#define SERVICE_1 80
+#define SERVICE_2 443
+#define SERVICE_3 8443
+
+#define TEST_NS "bpf_smc_netns"
+
+static struct netns_obj *test_netns;
+
+struct smc_policy_ip_key {
+ __u32 sip;
+ __u32 dip;
+};
+
+struct smc_policy_ip_value {
+ __u8 mode;
+};
+
+#if defined(__s390x__)
+/* s390x has default seid */
+static bool setup_ueid(void) { return true; }
+static void cleanup_ueid(void) {}
+#else
+enum {
+ SMC_NETLINK_ADD_UEID = 10,
+ SMC_NETLINK_REMOVE_UEID
+};
+
+enum {
+ SMC_NLA_EID_TABLE_UNSPEC,
+ SMC_NLA_EID_TABLE_ENTRY, /* string */
+};
+
+struct msgtemplate {
+ struct nlmsghdr n;
+ struct genlmsghdr g;
+ char buf[1024];
+};
+
+#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
+#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
+#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN))
+#define NLA_PAYLOAD(len) ((len) - NLA_HDRLEN)
+
+#define SMC_GENL_FAMILY_NAME "SMC_GEN_NETLINK"
+#define SMC_BPFTEST_UEID "SMC-BPFTEST-UEID"
+
+static uint16_t smc_nl_family_id = -1;
+
+static int send_cmd(int fd, __u16 nlmsg_type, __u32 nlmsg_pid,
+ __u16 nlmsg_flags, __u8 genl_cmd, __u16 nla_type,
+ void *nla_data, int nla_len)
+{
+ struct nlattr *na;
+ struct sockaddr_nl nladdr;
+ int r, buflen;
+ char *buf;
+
+ struct msgtemplate msg = {0};
+
+ msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+ msg.n.nlmsg_type = nlmsg_type;
+ msg.n.nlmsg_flags = nlmsg_flags;
+ msg.n.nlmsg_seq = 0;
+ msg.n.nlmsg_pid = nlmsg_pid;
+ msg.g.cmd = genl_cmd;
+ msg.g.version = 1;
+ na = (struct nlattr *)GENLMSG_DATA(&msg);
+ na->nla_type = nla_type;
+ na->nla_len = nla_len + 1 + NLA_HDRLEN;
+ memcpy(NLA_DATA(na), nla_data, nla_len);
+ msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+
+ buf = (char *)&msg;
+ buflen = msg.n.nlmsg_len;
+ memset(&nladdr, 0, sizeof(nladdr));
+ nladdr.nl_family = AF_NETLINK;
+
+ while ((r = sendto(fd, buf, buflen, 0, (struct sockaddr *)&nladdr,
+ sizeof(nladdr))) < buflen) {
+ if (r > 0) {
+ buf += r;
+ buflen -= r;
+ } else if (errno != EAGAIN) {
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static bool get_smc_nl_family_id(void)
+{
+ struct sockaddr_nl nl_src;
+ struct msgtemplate msg;
+ struct nlattr *nl;
+ int fd, ret;
+ pid_t pid;
+
+ fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+ if (!ASSERT_OK_FD(fd, "nl_family socket"))
+ return false;
+
+ pid = getpid();
+
+ memset(&nl_src, 0, sizeof(nl_src));
+ nl_src.nl_family = AF_NETLINK;
+ nl_src.nl_pid = pid;
+
+ ret = bind(fd, (struct sockaddr *)&nl_src, sizeof(nl_src));
+ if (!ASSERT_OK(ret, "nl_family bind"))
+ goto fail;
+
+ ret = send_cmd(fd, GENL_ID_CTRL, pid,
+ NLM_F_REQUEST, CTRL_CMD_GETFAMILY,
+ CTRL_ATTR_FAMILY_NAME, (void *)SMC_GENL_FAMILY_NAME,
+ strlen(SMC_GENL_FAMILY_NAME));
+ if (!ASSERT_OK(ret, "nl_family query"))
+ goto fail;
+
+ ret = recv(fd, &msg, sizeof(msg), 0);
+ if (!ASSERT_FALSE(msg.n.nlmsg_type == NLMSG_ERROR || ret < 0 ||
+ !NLMSG_OK(&msg.n, ret), "nl_family response"))
+ goto fail;
+
+ nl = (struct nlattr *)GENLMSG_DATA(&msg);
+ nl = (struct nlattr *)((char *)nl + NLA_ALIGN(nl->nla_len));
+ if (!ASSERT_EQ(nl->nla_type, CTRL_ATTR_FAMILY_ID, "nl_family nla type"))
+ goto fail;
+
+ smc_nl_family_id = *(uint16_t *)NLA_DATA(nl);
+ close(fd);
+ return true;
+fail:
+ close(fd);
+ return false;
+}
+
+static bool smc_ueid(int op)
+{
+ struct sockaddr_nl nl_src;
+ struct msgtemplate msg;
+ struct nlmsgerr *err;
+ char test_ueid[32];
+ int fd, ret;
+ pid_t pid;
+
+ /* UEID required */
+ memset(test_ueid, '\x20', sizeof(test_ueid));
+ memcpy(test_ueid, SMC_BPFTEST_UEID, strlen(SMC_BPFTEST_UEID));
+ fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+ if (!ASSERT_OK_FD(fd, "ueid socket"))
+ return false;
+
+ pid = getpid();
+ memset(&nl_src, 0, sizeof(nl_src));
+ nl_src.nl_family = AF_NETLINK;
+ nl_src.nl_pid = pid;
+
+ ret = bind(fd, (struct sockaddr *)&nl_src, sizeof(nl_src));
+ if (!ASSERT_OK(ret, "ueid bind"))
+ goto fail;
+
+ ret = send_cmd(fd, smc_nl_family_id, pid,
+ NLM_F_REQUEST | NLM_F_ACK, op, SMC_NLA_EID_TABLE_ENTRY,
+ (void *)test_ueid, sizeof(test_ueid));
+ if (!ASSERT_OK(ret, "ueid cmd"))
+ goto fail;
+
+ ret = recv(fd, &msg, sizeof(msg), 0);
+ if (!ASSERT_FALSE(ret < 0 ||
+ !NLMSG_OK(&msg.n, ret), "ueid response"))
+ goto fail;
+
+ if (msg.n.nlmsg_type == NLMSG_ERROR) {
+ err = NLMSG_DATA(&msg);
+ switch (op) {
+ case SMC_NETLINK_REMOVE_UEID:
+ if (!ASSERT_FALSE((err->error && err->error != -ENOENT),
+ "ueid remove"))
+ goto fail;
+ break;
+ case SMC_NETLINK_ADD_UEID:
+ if (!ASSERT_OK(err->error, "ueid add"))
+ goto fail;
+ break;
+ default:
+ break;
+ }
+ }
+ close(fd);
+ return true;
+fail:
+ close(fd);
+ return false;
+}
+
+static bool setup_ueid(void)
+{
+ /* get smc nl id */
+ if (!get_smc_nl_family_id())
+ return false;
+ /* clear old ueid for bpftest */
+ smc_ueid(SMC_NETLINK_REMOVE_UEID);
+ /* smc-loopback requires a ueid */
+ return smc_ueid(SMC_NETLINK_ADD_UEID);
+}
+
+static void cleanup_ueid(void)
+{
+ smc_ueid(SMC_NETLINK_REMOVE_UEID);
+}
+#endif /* __s390x__ */
+
+static bool setup_netns(void)
+{
+ test_netns = netns_new(TEST_NS, true);
+ if (!ASSERT_OK_PTR(test_netns, "open net namespace"))
+ goto fail_netns;
+
+ SYS(fail_ip, "ip addr add 127.0.1.0/8 dev lo");
+ SYS(fail_ip, "ip addr add 127.0.2.0/8 dev lo");
+
+ return true;
+fail_ip:
+ netns_free(test_netns);
+fail_netns:
+ return false;
+}
+
+static void cleanup_netns(void)
+{
+ netns_free(test_netns);
+}
+
+static bool setup_smc(void)
+{
+ if (!setup_ueid())
+ return false;
+
+ if (!setup_netns())
+ goto fail_netns;
+
+ return true;
+fail_netns:
+ cleanup_ueid();
+ return false;
+}
+
+static int set_client_addr_cb(int fd, void *opts)
+{
+ const char *src = (const char *)opts;
+ struct sockaddr_in localaddr;
+
+ localaddr.sin_family = AF_INET;
+ localaddr.sin_port = htons(0);
+ localaddr.sin_addr.s_addr = inet_addr(src);
+ return !ASSERT_OK(bind(fd, &localaddr, sizeof(localaddr)), "client bind");
+}
+
+static void run_link(const char *src, const char *dst, int port)
+{
+ struct network_helper_opts opts = {0};
+ int server, client;
+
+ server = start_server_str(AF_INET, SOCK_STREAM, dst, port, NULL);
+ if (!ASSERT_OK_FD(server, "start service_1"))
+ return;
+
+ opts.proto = IPPROTO_TCP;
+ opts.post_socket_cb = set_client_addr_cb;
+ opts.cb_opts = (void *)src;
+
+ client = connect_to_fd_opts(server, &opts);
+ if (!ASSERT_OK_FD(client, "start connect"))
+ goto fail_client;
+
+ close(client);
+fail_client:
+ close(server);
+}
+
+static void block_link(int map_fd, const char *src, const char *dst)
+{
+ struct smc_policy_ip_value val = { .mode = /* block */ 0 };
+ struct smc_policy_ip_key key = {
+ .sip = inet_addr(src),
+ .dip = inet_addr(dst),
+ };
+
+ bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
+}
+
+/*
+ * This test describes a real-life service topology as follows:
+ *
+ * +-------------> service_1
+ * link 1 | |
+ * +--------------------> server | link 2
+ * | | V
+ * | +-------------> service_2
+ * | link 3
+ * client -------------------> server_via_unsafe_path -> service_3
+ *
+ * Among them,
+ * 1. link-1 is very suitable for using SMC.
+ * 2. link-2 is not suitable for using SMC, because the mode of this link is
+ * kind of short-link services.
+ * 3. link-3 is also not suitable for using SMC, because the RDMA link is
+ * unavailable and needs to go through a long timeout before it can fallback
+ * to TCP.
+ * To achieve this goal, we use a customized SMC ip strategy via smc_hs_ctrl.
+ */
+static void test_topo(void)
+{
+ struct bpf_smc *skel;
+ int rc, map_fd;
+
+ skel = bpf_smc__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "bpf_smc__open_and_load"))
+ return;
+
+ rc = bpf_smc__attach(skel);
+ if (!ASSERT_OK(rc, "bpf_smc__attach"))
+ goto fail;
+
+ map_fd = bpf_map__fd(skel->maps.smc_policy_ip);
+ if (!ASSERT_OK_FD(map_fd, "bpf_map__fd"))
+ goto fail;
+
+ /* Mock the transparent replacement process: the protocol is switched
+ * to IPPROTO_SMC accordingly via fmod_ret/update_socket_protocol.
+ */
+ write_sysctl("/proc/sys/net/smc/hs_ctrl", "linkcheck");
+
+ /* Configure the IP policy */
+ block_link(map_fd, CLIENT_IP, SERVER_IP_VIA_RISK_PATH);
+ block_link(map_fd, SERVER_IP, SERVER_IP);
+
+ /* should go with smc */
+ run_link(CLIENT_IP, SERVER_IP, SERVICE_1);
+ /* should go with smc fallback */
+ run_link(SERVER_IP, SERVER_IP, SERVICE_2);
+
+ ASSERT_EQ(skel->bss->smc_cnt, 2, "smc count");
+ ASSERT_EQ(skel->bss->fallback_cnt, 1, "fallback count");
+
+ /* should go with smc */
+ run_link(CLIENT_IP, SERVER_IP, SERVICE_2);
+
+ ASSERT_EQ(skel->bss->smc_cnt, 3, "smc count");
+ ASSERT_EQ(skel->bss->fallback_cnt, 1, "fallback count");
+
+ /* should go with smc fallback */
+ run_link(CLIENT_IP, SERVER_IP_VIA_RISK_PATH, SERVICE_3);
+
+ ASSERT_EQ(skel->bss->smc_cnt, 4, "smc count");
+ ASSERT_EQ(skel->bss->fallback_cnt, 2, "fallback count");
+
+fail:
+ bpf_smc__destroy(skel);
+}
+
+void test_bpf_smc(void)
+{
+ if (!setup_smc()) {
+ printf("setup for smc test failed, test SKIP:\n");
+ test__skip();
+ return;
+ }
+
+ if (test__start_subtest("topo"))
+ test_topo();
+
+ cleanup_ueid();
+ cleanup_netns();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
index 178292d1251a..ee94c281888a 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
@@ -124,10 +124,10 @@ static int send_test_packet(int ifindex)
int n, sock = -1;
__u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN];
- /* The ethernet header is not relevant for this test and doesn't need to
- * be meaningful.
- */
- struct ethhdr eth = { 0 };
+ /* We use the Ethernet header only to identify the test packet */
+ struct ethhdr eth = {
+ .h_source = { 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF },
+ };
memcpy(packet, &eth, sizeof(eth));
memcpy(packet + sizeof(eth), test_payload, TEST_PAYLOAD_LEN);
@@ -160,8 +160,16 @@ static int write_test_packet(int tap_fd)
__u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN];
int n;
- /* The ethernet header doesn't need to be valid for this test */
- memset(packet, 0, sizeof(struct ethhdr));
+ /* The Ethernet header is mostly not relevant. We use it to identify the
+ * test packet, and some BPF helpers we exercise expect to operate on
+ * Ethernet frames carrying IP packets. Pretend that's the case.
+ */
+ struct ethhdr eth = {
+ .h_source = { 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF },
+ .h_proto = htons(ETH_P_IP),
+ };
+
+ memcpy(packet, &eth, sizeof(eth));
memcpy(packet + sizeof(struct ethhdr), test_payload, TEST_PAYLOAD_LEN);
n = write(tap_fd, packet, sizeof(packet));
@@ -171,31 +179,19 @@ static int write_test_packet(int tap_fd)
return 0;
}
-static void assert_test_result(const struct bpf_map *result_map)
-{
- int err;
- __u32 map_key = 0;
- __u8 map_value[TEST_PAYLOAD_LEN];
-
- err = bpf_map__lookup_elem(result_map, &map_key, sizeof(map_key),
- &map_value, TEST_PAYLOAD_LEN, BPF_ANY);
- if (!ASSERT_OK(err, "lookup test_result"))
- return;
-
- ASSERT_MEMEQ(&map_value, &test_payload, TEST_PAYLOAD_LEN,
- "test_result map contains test payload");
-}
-
-static bool clear_test_result(struct bpf_map *result_map)
+static void dump_err_stream(const struct bpf_program *prog)
{
- const __u8 v[sizeof(test_payload)] = {};
- const __u32 k = 0;
- int err;
-
- err = bpf_map__update_elem(result_map, &k, sizeof(k), v, sizeof(v), BPF_ANY);
- ASSERT_OK(err, "update test_result");
+ char buf[512];
+ int ret;
- return err == 0;
+ ret = 0;
+ do {
+ ret = bpf_prog_stream_read(bpf_program__fd(prog),
+ BPF_STREAM_STDERR, buf, sizeof(buf),
+ NULL);
+ if (ret > 0)
+ fwrite(buf, sizeof(buf[0]), ret, stderr);
+ } while (ret > 0);
}
void test_xdp_context_veth(void)
@@ -270,11 +266,14 @@ void test_xdp_context_veth(void)
if (!ASSERT_GE(tx_ifindex, 0, "if_nametoindex tx"))
goto close;
+ skel->bss->test_pass = false;
+
ret = send_test_packet(tx_ifindex);
if (!ASSERT_OK(ret, "send_test_packet"))
goto close;
- assert_test_result(skel->maps.test_result);
+ if (!ASSERT_TRUE(skel->bss->test_pass, "test_pass"))
+ dump_err_stream(tc_prog);
close:
close_netns(nstoken);
@@ -286,7 +285,7 @@ close:
static void test_tuntap(struct bpf_program *xdp_prog,
struct bpf_program *tc_prio_1_prog,
struct bpf_program *tc_prio_2_prog,
- struct bpf_map *result_map)
+ bool *test_pass)
{
LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS);
LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1);
@@ -295,8 +294,7 @@ static void test_tuntap(struct bpf_program *xdp_prog,
int tap_ifindex;
int ret;
- if (!clear_test_result(result_map))
- return;
+ *test_pass = false;
ns = netns_new(TAP_NETNS, true);
if (!ASSERT_OK_PTR(ns, "create and open ns"))
@@ -340,7 +338,8 @@ static void test_tuntap(struct bpf_program *xdp_prog,
if (!ASSERT_OK(ret, "write_test_packet"))
goto close;
- assert_test_result(result_map);
+ if (!ASSERT_TRUE(*test_pass, "test_pass"))
+ dump_err_stream(tc_prio_2_prog ? : tc_prio_1_prog);
close:
if (tap_fd >= 0)
@@ -411,7 +410,8 @@ static void test_tuntap_mirred(struct bpf_program *xdp_prog,
if (!ASSERT_OK(ret, "write_test_packet"))
goto close;
- ASSERT_TRUE(*test_pass, "test_pass");
+ if (!ASSERT_TRUE(*test_pass, "test_pass"))
+ dump_err_stream(tc_prog);
close:
if (tap_fd >= 0)
@@ -431,61 +431,82 @@ void test_xdp_context_tuntap(void)
test_tuntap(skel->progs.ing_xdp,
skel->progs.ing_cls,
NULL, /* tc prio 2 */
- skel->maps.test_result);
+ &skel->bss->test_pass);
if (test__start_subtest("dynptr_read"))
test_tuntap(skel->progs.ing_xdp,
skel->progs.ing_cls_dynptr_read,
NULL, /* tc prio 2 */
- skel->maps.test_result);
+ &skel->bss->test_pass);
if (test__start_subtest("dynptr_slice"))
test_tuntap(skel->progs.ing_xdp,
skel->progs.ing_cls_dynptr_slice,
NULL, /* tc prio 2 */
- skel->maps.test_result);
+ &skel->bss->test_pass);
if (test__start_subtest("dynptr_write"))
test_tuntap(skel->progs.ing_xdp_zalloc_meta,
skel->progs.ing_cls_dynptr_write,
skel->progs.ing_cls_dynptr_read,
- skel->maps.test_result);
+ &skel->bss->test_pass);
if (test__start_subtest("dynptr_slice_rdwr"))
test_tuntap(skel->progs.ing_xdp_zalloc_meta,
skel->progs.ing_cls_dynptr_slice_rdwr,
skel->progs.ing_cls_dynptr_slice,
- skel->maps.test_result);
+ &skel->bss->test_pass);
if (test__start_subtest("dynptr_offset"))
test_tuntap(skel->progs.ing_xdp_zalloc_meta,
skel->progs.ing_cls_dynptr_offset_wr,
skel->progs.ing_cls_dynptr_offset_rd,
- skel->maps.test_result);
+ &skel->bss->test_pass);
if (test__start_subtest("dynptr_offset_oob"))
test_tuntap(skel->progs.ing_xdp,
skel->progs.ing_cls_dynptr_offset_oob,
skel->progs.ing_cls,
- skel->maps.test_result);
- if (test__start_subtest("clone_data_meta_empty_on_data_write"))
+ &skel->bss->test_pass);
+ if (test__start_subtest("clone_data_meta_survives_data_write"))
test_tuntap_mirred(skel->progs.ing_xdp,
- skel->progs.clone_data_meta_empty_on_data_write,
+ skel->progs.clone_data_meta_survives_data_write,
&skel->bss->test_pass);
- if (test__start_subtest("clone_data_meta_empty_on_meta_write"))
+ if (test__start_subtest("clone_data_meta_survives_meta_write"))
test_tuntap_mirred(skel->progs.ing_xdp,
- skel->progs.clone_data_meta_empty_on_meta_write,
+ skel->progs.clone_data_meta_survives_meta_write,
&skel->bss->test_pass);
- if (test__start_subtest("clone_dynptr_empty_on_data_slice_write"))
+ if (test__start_subtest("clone_meta_dynptr_survives_data_slice_write"))
test_tuntap_mirred(skel->progs.ing_xdp,
- skel->progs.clone_dynptr_empty_on_data_slice_write,
+ skel->progs.clone_meta_dynptr_survives_data_slice_write,
&skel->bss->test_pass);
- if (test__start_subtest("clone_dynptr_empty_on_meta_slice_write"))
+ if (test__start_subtest("clone_meta_dynptr_survives_meta_slice_write"))
test_tuntap_mirred(skel->progs.ing_xdp,
- skel->progs.clone_dynptr_empty_on_meta_slice_write,
+ skel->progs.clone_meta_dynptr_survives_meta_slice_write,
&skel->bss->test_pass);
- if (test__start_subtest("clone_dynptr_rdonly_before_data_dynptr_write"))
+ if (test__start_subtest("clone_meta_dynptr_rw_before_data_dynptr_write"))
test_tuntap_mirred(skel->progs.ing_xdp,
- skel->progs.clone_dynptr_rdonly_before_data_dynptr_write,
+ skel->progs.clone_meta_dynptr_rw_before_data_dynptr_write,
&skel->bss->test_pass);
- if (test__start_subtest("clone_dynptr_rdonly_before_meta_dynptr_write"))
+ if (test__start_subtest("clone_meta_dynptr_rw_before_meta_dynptr_write"))
test_tuntap_mirred(skel->progs.ing_xdp,
- skel->progs.clone_dynptr_rdonly_before_meta_dynptr_write,
+ skel->progs.clone_meta_dynptr_rw_before_meta_dynptr_write,
&skel->bss->test_pass);
+ /* Tests for BPF helpers which touch headroom */
+ if (test__start_subtest("helper_skb_vlan_push_pop"))
+ test_tuntap(skel->progs.ing_xdp,
+ skel->progs.helper_skb_vlan_push_pop,
+ NULL, /* tc prio 2 */
+ &skel->bss->test_pass);
+ if (test__start_subtest("helper_skb_adjust_room"))
+ test_tuntap(skel->progs.ing_xdp,
+ skel->progs.helper_skb_adjust_room,
+ NULL, /* tc prio 2 */
+ &skel->bss->test_pass);
+ if (test__start_subtest("helper_skb_change_head_tail"))
+ test_tuntap(skel->progs.ing_xdp,
+ skel->progs.helper_skb_change_head_tail,
+ NULL, /* tc prio 2 */
+ &skel->bss->test_pass);
+ if (test__start_subtest("helper_skb_change_proto"))
+ test_tuntap(skel->progs.ing_xdp,
+ skel->progs.helper_skb_change_proto,
+ NULL, /* tc prio 2 */
+ &skel->bss->test_pass);
test_xdp_meta__destroy(skel);
}
diff --git a/tools/testing/selftests/bpf/progs/bpf_smc.c b/tools/testing/selftests/bpf/progs/bpf_smc.c
new file mode 100644
index 000000000000..70d8b08f5914
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_smc.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_tracing_net.h"
+
+char _license[] SEC("license") = "GPL";
+
+enum {
+ BPF_SMC_LISTEN = 10,
+};
+
+struct smc_sock___local {
+ struct sock sk;
+ struct smc_sock *listen_smc;
+ bool use_fallback;
+} __attribute__((preserve_access_index));
+
+int smc_cnt = 0;
+int fallback_cnt = 0;
+
+SEC("fentry/smc_release")
+int BPF_PROG(bpf_smc_release, struct socket *sock)
+{
+ /* only count from one side (client) */
+ if (sock->sk->__sk_common.skc_state == BPF_SMC_LISTEN)
+ return 0;
+ smc_cnt++;
+ return 0;
+}
+
+SEC("fentry/smc_switch_to_fallback")
+int BPF_PROG(bpf_smc_switch_to_fallback, struct smc_sock___local *smc)
+{
+ /* only count from one side (client) */
+ if (smc && !smc->listen_smc)
+ fallback_cnt++;
+ return 0;
+}
+
+/* fall back to the default value if no policy was found */
+bool default_ip_strat_value = true;
+
+struct smc_policy_ip_key {
+ __u32 sip;
+ __u32 dip;
+};
+
+struct smc_policy_ip_value {
+ __u8 mode;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(struct smc_policy_ip_key));
+ __uint(value_size, sizeof(struct smc_policy_ip_value));
+ __uint(max_entries, 128);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+} smc_policy_ip SEC(".maps");
+
+static bool smc_check(__u32 src, __u32 dst)
+{
+ struct smc_policy_ip_value *value;
+ struct smc_policy_ip_key key = {
+ .sip = src,
+ .dip = dst,
+ };
+
+ value = bpf_map_lookup_elem(&smc_policy_ip, &key);
+ return value ? value->mode : default_ip_strat_value;
+}
+
+SEC("fmod_ret/update_socket_protocol")
+int BPF_PROG(smc_run, int family, int type, int protocol)
+{
+ struct task_struct *task;
+
+ if (family != AF_INET && family != AF_INET6)
+ return protocol;
+
+ if ((type & 0xf) != SOCK_STREAM)
+ return protocol;
+
+ if (protocol != 0 && protocol != IPPROTO_TCP)
+ return protocol;
+
+ task = bpf_get_current_task_btf();
+ /* Prevent this from affecting other tests */
+ if (!task || !task->nsproxy->net_ns->smc.hs_ctrl)
+ return protocol;
+
+ return IPPROTO_SMC;
+}
+
+SEC("struct_ops")
+int BPF_PROG(bpf_smc_set_tcp_option_cond, const struct tcp_sock *tp,
+ struct inet_request_sock *ireq)
+{
+ return smc_check(ireq->req.__req_common.skc_daddr,
+ ireq->req.__req_common.skc_rcv_saddr);
+}
+
+SEC("struct_ops")
+int BPF_PROG(bpf_smc_set_tcp_option, struct tcp_sock *tp)
+{
+ return smc_check(tp->inet_conn.icsk_inet.sk.__sk_common.skc_rcv_saddr,
+ tp->inet_conn.icsk_inet.sk.__sk_common.skc_daddr);
+}
+
+SEC(".struct_ops")
+struct smc_hs_ctrl linkcheck = {
+ .name = "linkcheck",
+ .syn_option = (void *)bpf_smc_set_tcp_option,
+ .synack_option = (void *)bpf_smc_set_tcp_option_cond,
+};
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c
index d79cb74b571e..0a0f371a2dec 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c
@@ -4,6 +4,7 @@
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>
+#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>
#include "bpf_kfuncs.h"
@@ -11,37 +12,72 @@
#define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem
-/* Demonstrates how metadata can be passed from an XDP program to a TC program
- * using bpf_xdp_adjust_meta.
- * For the sake of testing the metadata support in drivers, the XDP program uses
- * a fixed-size payload after the Ethernet header as metadata. The TC program
- * copies the metadata it receives into a map so it can be checked from
- * userspace.
+/* Demonstrate passing metadata from XDP to TC using bpf_xdp_adjust_meta.
+ *
+ * The XDP program extracts a fixed-size payload following the Ethernet header
+ * and stores it as packet metadata to test the driver's metadata support. The
+ * TC program then verifies that the passed metadata is correct.
*/
-struct {
- __uint(type, BPF_MAP_TYPE_ARRAY);
- __uint(max_entries, 1);
- __type(key, __u32);
- __uint(value_size, META_SIZE);
-} test_result SEC(".maps");
-
bool test_pass;
+static const __u8 smac_want[ETH_ALEN] = {
+ 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF,
+};
+
+static const __u8 meta_want[META_SIZE] = {
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+ 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
+ 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+};
+
+static bool check_smac(const struct ethhdr *eth)
+{
+ return !__builtin_memcmp(eth->h_source, smac_want, ETH_ALEN);
+}
+
+static bool check_metadata(const char *file, int line, __u8 *meta_have)
+{
+ if (!__builtin_memcmp(meta_have, meta_want, META_SIZE))
+ return true;
+
+ bpf_stream_printk(BPF_STREAM_STDERR,
+ "FAIL:%s:%d: metadata mismatch\n"
+ " have:\n %pI6\n %pI6\n"
+ " want:\n %pI6\n %pI6\n",
+ file, line,
+ &meta_have[0x00], &meta_have[0x10],
+ &meta_want[0x00], &meta_want[0x10]);
+ return false;
+}
+
+#define check_metadata(meta_have) check_metadata(__FILE__, __LINE__, meta_have)
+
+static bool check_skb_metadata(const char *file, int line, struct __sk_buff *skb)
+{
+ __u8 *data_meta = ctx_ptr(skb, data_meta);
+ __u8 *data = ctx_ptr(skb, data);
+
+ return data_meta + META_SIZE <= data && (check_metadata)(file, line, data_meta);
+}
+
+#define check_skb_metadata(skb) check_skb_metadata(__FILE__, __LINE__, skb)
+
SEC("tc")
int ing_cls(struct __sk_buff *ctx)
{
- __u8 *data, *data_meta;
- __u32 key = 0;
-
- data_meta = ctx_ptr(ctx, data_meta);
- data = ctx_ptr(ctx, data);
+ __u8 *meta_have = ctx_ptr(ctx, data_meta);
+ __u8 *data = ctx_ptr(ctx, data);
- if (data_meta + META_SIZE > data)
- return TC_ACT_SHOT;
+ if (meta_have + META_SIZE > data)
+ goto out;
- bpf_map_update_elem(&test_result, &key, data_meta, BPF_ANY);
+ if (!check_metadata(meta_have))
+ goto out;
+ test_pass = true;
+out:
return TC_ACT_SHOT;
}
@@ -49,17 +85,17 @@ int ing_cls(struct __sk_buff *ctx)
SEC("tc")
int ing_cls_dynptr_read(struct __sk_buff *ctx)
{
+ __u8 meta_have[META_SIZE];
struct bpf_dynptr meta;
- const __u32 zero = 0;
- __u8 *dst;
-
- dst = bpf_map_lookup_elem(&test_result, &zero);
- if (!dst)
- return TC_ACT_SHOT;
bpf_dynptr_from_skb_meta(ctx, 0, &meta);
- bpf_dynptr_read(dst, META_SIZE, &meta, 0, 0);
+ bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0);
+ if (!check_metadata(meta_have))
+ goto out;
+
+ test_pass = true;
+out:
return TC_ACT_SHOT;
}
@@ -86,20 +122,18 @@ SEC("tc")
int ing_cls_dynptr_slice(struct __sk_buff *ctx)
{
struct bpf_dynptr meta;
- const __u32 zero = 0;
- __u8 *dst, *src;
-
- dst = bpf_map_lookup_elem(&test_result, &zero);
- if (!dst)
- return TC_ACT_SHOT;
+ __u8 *meta_have;
bpf_dynptr_from_skb_meta(ctx, 0, &meta);
- src = bpf_dynptr_slice(&meta, 0, NULL, META_SIZE);
- if (!src)
- return TC_ACT_SHOT;
+ meta_have = bpf_dynptr_slice(&meta, 0, NULL, META_SIZE);
+ if (!meta_have)
+ goto out;
- __builtin_memcpy(dst, src, META_SIZE);
+ if (!check_metadata(meta_have))
+ goto out;
+ test_pass = true;
+out:
return TC_ACT_SHOT;
}
@@ -129,14 +163,12 @@ int ing_cls_dynptr_slice_rdwr(struct __sk_buff *ctx)
SEC("tc")
int ing_cls_dynptr_offset_rd(struct __sk_buff *ctx)
{
- struct bpf_dynptr meta;
const __u32 chunk_len = META_SIZE / 4;
- const __u32 zero = 0;
+ __u8 meta_have[META_SIZE];
+ struct bpf_dynptr meta;
__u8 *dst, *src;
- dst = bpf_map_lookup_elem(&test_result, &zero);
- if (!dst)
- return TC_ACT_SHOT;
+ dst = meta_have;
/* 1. Regular read */
bpf_dynptr_from_skb_meta(ctx, 0, &meta);
@@ -155,9 +187,14 @@ int ing_cls_dynptr_offset_rd(struct __sk_buff *ctx)
/* 4. Read from a slice starting at an offset */
src = bpf_dynptr_slice(&meta, 2 * chunk_len, NULL, chunk_len);
if (!src)
- return TC_ACT_SHOT;
+ goto out;
__builtin_memcpy(dst, src, chunk_len);
+ if (!check_metadata(meta_have))
+ goto out;
+
+ test_pass = true;
+out:
return TC_ACT_SHOT;
}
@@ -254,7 +291,7 @@ int ing_xdp_zalloc_meta(struct xdp_md *ctx)
/* Drop any non-test packets */
if (eth + 1 > ctx_ptr(ctx, data_end))
return XDP_DROP;
- if (eth->h_proto != 0)
+ if (!check_smac(eth))
return XDP_DROP;
ret = bpf_xdp_adjust_meta(ctx, -META_SIZE);
@@ -294,9 +331,9 @@ int ing_xdp(struct xdp_md *ctx)
/* The Linux networking stack may send other packets on the test
* interface that interfere with the test. Just drop them.
- * The test packets can be recognized by their ethertype of zero.
+ * The test packets can be recognized by their source MAC address.
*/
- if (eth->h_proto != 0)
+ if (!check_smac(eth))
return XDP_DROP;
__builtin_memcpy(data_meta, payload, META_SIZE);
@@ -304,22 +341,25 @@ int ing_xdp(struct xdp_md *ctx)
}
/*
- * Check that skb->data_meta..skb->data is empty if prog writes to packet
- * _payload_ using packet pointers. Applies only to cloned skbs.
+ * Check that, when operating on a cloned packet, skb->data_meta..skb->data is
+ * kept intact if prog writes to packet _payload_ using packet pointers.
*/
SEC("tc")
-int clone_data_meta_empty_on_data_write(struct __sk_buff *ctx)
+int clone_data_meta_survives_data_write(struct __sk_buff *ctx)
{
+ __u8 *meta_have = ctx_ptr(ctx, data_meta);
struct ethhdr *eth = ctx_ptr(ctx, data);
if (eth + 1 > ctx_ptr(ctx, data_end))
goto out;
/* Ignore non-test packets */
- if (eth->h_proto != 0)
+ if (!check_smac(eth))
+ goto out;
+
+ if (meta_have + META_SIZE > (__u8 *)eth)
goto out;
- /* Expect no metadata */
- if (ctx->data_meta != ctx->data)
+ if (!check_metadata(meta_have))
goto out;
/* Packet write to trigger unclone in prologue */
@@ -331,40 +371,44 @@ out:
}
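The direct packet write above makes the verifier emit an unclone in the
program prologue. The same copy-on-write can also be requested explicitly;
a sketch under that assumption:

	/* pulling zero bytes still forces a private copy of a cloned
	 * head, which after this series preserves the metadata area
	 */
	if (bpf_skb_pull_data(ctx, 0))
		goto out;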
/*
- * Check that skb->data_meta..skb->data is empty if prog writes to packet
- * _metadata_ using packet pointers. Applies only to cloned skbs.
+ * Check that, when operating on a cloned packet, skb->data_meta..skb->data is
+ * kept intact if prog writes to packet _metadata_ using packet pointers.
*/
SEC("tc")
-int clone_data_meta_empty_on_meta_write(struct __sk_buff *ctx)
+int clone_data_meta_survives_meta_write(struct __sk_buff *ctx)
{
+ __u8 *meta_have = ctx_ptr(ctx, data_meta);
struct ethhdr *eth = ctx_ptr(ctx, data);
- __u8 *md = ctx_ptr(ctx, data_meta);
if (eth + 1 > ctx_ptr(ctx, data_end))
goto out;
/* Ignore non-test packets */
- if (eth->h_proto != 0)
+ if (!check_smac(eth))
goto out;
- if (md + 1 > ctx_ptr(ctx, data)) {
- /* Expect no metadata */
- test_pass = true;
- } else {
- /* Metadata write to trigger unclone in prologue */
- *md = 42;
- }
+ if (meta_have + META_SIZE > (__u8 *)eth)
+ goto out;
+
+ if (!check_metadata(meta_have))
+ goto out;
+
+ /* Metadata write to trigger unclone in prologue */
+ *meta_have = 42;
+
+ test_pass = true;
out:
return TC_ACT_SHOT;
}
/*
- * Check that skb_meta dynptr is writable but empty if prog writes to packet
- * _payload_ using a dynptr slice. Applies only to cloned skbs.
+ * Check that, when operating on a cloned packet, metadata remains intact if
+ * prog creates an r/w slice to packet _payload_.
*/
SEC("tc")
-int clone_dynptr_empty_on_data_slice_write(struct __sk_buff *ctx)
+int clone_meta_dynptr_survives_data_slice_write(struct __sk_buff *ctx)
{
struct bpf_dynptr data, meta;
+ __u8 meta_have[META_SIZE];
struct ethhdr *eth;
bpf_dynptr_from_skb(ctx, 0, &data);
@@ -372,51 +416,45 @@ int clone_dynptr_empty_on_data_slice_write(struct __sk_buff *ctx)
if (!eth)
goto out;
/* Ignore non-test packets */
- if (eth->h_proto != 0)
+ if (!check_smac(eth))
goto out;
- /* Expect no metadata */
bpf_dynptr_from_skb_meta(ctx, 0, &meta);
- if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0)
+ bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0);
+ if (!check_metadata(meta_have))
goto out;
- /* Packet write to trigger unclone in prologue */
- eth->h_proto = 42;
-
test_pass = true;
out:
return TC_ACT_SHOT;
}
/*
- * Check that skb_meta dynptr is writable but empty if prog writes to packet
- * _metadata_ using a dynptr slice. Applies only to cloned skbs.
+ * Check that, when operating on a cloned packet, metadata remains intact if
+ * prog creates an r/w slice to packet _metadata_.
*/
SEC("tc")
-int clone_dynptr_empty_on_meta_slice_write(struct __sk_buff *ctx)
+int clone_meta_dynptr_survives_meta_slice_write(struct __sk_buff *ctx)
{
struct bpf_dynptr data, meta;
const struct ethhdr *eth;
- __u8 *md;
+ __u8 *meta_have;
bpf_dynptr_from_skb(ctx, 0, &data);
eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth));
if (!eth)
goto out;
/* Ignore non-test packets */
- if (eth->h_proto != 0)
+ if (!check_smac(eth))
goto out;
- /* Expect no metadata */
bpf_dynptr_from_skb_meta(ctx, 0, &meta);
- if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0)
+ meta_have = bpf_dynptr_slice_rdwr(&meta, 0, NULL, META_SIZE);
+ if (!meta_have)
goto out;
- /* Metadata write to trigger unclone in prologue */
- bpf_dynptr_from_skb_meta(ctx, 0, &meta);
- md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md));
- if (md)
- *md = 42;
+ if (!check_metadata(meta_have))
+ goto out;
test_pass = true;
out:
@@ -424,34 +462,40 @@ out:
}
/*
- * Check that skb_meta dynptr is read-only before prog writes to packet payload
- * using dynptr_write helper. Applies only to cloned skbs.
+ * Check that, when operating on a cloned packet, the skb_meta dynptr is
+ * read-write before prog writes to packet _payload_ using the dynptr_write
+ * helper, and that metadata remains intact both before and after the write.
*/
SEC("tc")
-int clone_dynptr_rdonly_before_data_dynptr_write(struct __sk_buff *ctx)
+int clone_meta_dynptr_rw_before_data_dynptr_write(struct __sk_buff *ctx)
{
struct bpf_dynptr data, meta;
+ __u8 meta_have[META_SIZE];
const struct ethhdr *eth;
+ int err;
bpf_dynptr_from_skb(ctx, 0, &data);
eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth));
if (!eth)
goto out;
/* Ignore non-test packets */
- if (eth->h_proto != 0)
+ if (!check_smac(eth))
goto out;
- /* Expect read-only metadata before unclone */
+ /* Expect read-write metadata before unclone */
bpf_dynptr_from_skb_meta(ctx, 0, &meta);
- if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE)
+ if (bpf_dynptr_is_rdonly(&meta))
+ goto out;
+
+ err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0);
+ if (err || !check_metadata(meta_have))
goto out;
/* Helper write to payload will unclone the packet */
bpf_dynptr_write(&data, offsetof(struct ethhdr, h_proto), "x", 1, 0);
- /* Expect no metadata after unclone */
- bpf_dynptr_from_skb_meta(ctx, 0, &meta);
- if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != 0)
+ err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0);
+ if (err || !check_metadata(meta_have))
goto out;
test_pass = true;
@@ -460,31 +504,165 @@ out:
}
/*
- * Check that skb_meta dynptr is read-only if prog writes to packet
- * metadata using dynptr_write helper. Applies only to cloned skbs.
+ * Check that, when operating on a cloned packet, the skb_meta dynptr is
+ * read-write before prog writes to packet _metadata_ using the dynptr_write
+ * helper, and that metadata remains intact both before and after the write.
*/
SEC("tc")
-int clone_dynptr_rdonly_before_meta_dynptr_write(struct __sk_buff *ctx)
+int clone_meta_dynptr_rw_before_meta_dynptr_write(struct __sk_buff *ctx)
{
struct bpf_dynptr data, meta;
+ __u8 meta_have[META_SIZE];
const struct ethhdr *eth;
+ int err;
bpf_dynptr_from_skb(ctx, 0, &data);
eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth));
if (!eth)
goto out;
/* Ignore non-test packets */
- if (eth->h_proto != 0)
+ if (!check_smac(eth))
goto out;
- /* Expect read-only metadata */
+ /* Expect read-write metadata before unclone */
bpf_dynptr_from_skb_meta(ctx, 0, &meta);
- if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE)
+ if (bpf_dynptr_is_rdonly(&meta))
goto out;
- /* Metadata write. Expect failure. */
- bpf_dynptr_from_skb_meta(ctx, 0, &meta);
- if (bpf_dynptr_write(&meta, 0, "x", 1, 0) != -EINVAL)
+ err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0);
+ if (err || !check_metadata(meta_have))
+ goto out;
+
+ /* Helper write to metadata will unclone the packet */
+ bpf_dynptr_write(&meta, 0, &meta_have[0], 1, 0);
+
+ err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0);
+ if (err || !check_metadata(meta_have))
+ goto out;
+
+ test_pass = true;
+out:
+ return TC_ACT_SHOT;
+}
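Before this series, a dynptr write to a cloned skb's metadata failed with
-EINVAL (see the removed check above); it now unclones the head and
succeeds. A sketch of checking the return value instead of ignoring it:

	err = bpf_dynptr_write(&meta, 0, &meta_have[0], 1, 0);
	if (err)	/* no longer expected to fail on clones */
		goto out;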
+
+SEC("tc")
+int helper_skb_vlan_push_pop(struct __sk_buff *ctx)
+{
+ int err;
+
+ /* bpf_skb_vlan_push keeps the primary VLAN tag in the skb's offload
+ * fields, as if HW-accelerated. Only pushing a secondary tag modifies
+ * the MAC header in the head buffer.
+ */
+ err = bpf_skb_vlan_push(ctx, 0, 42);
+ if (err)
+ goto out;
+ err = bpf_skb_vlan_push(ctx, 0, 207);
+ if (err)
+ goto out;
+
+ if (!check_skb_metadata(ctx))
+ goto out;
+
+ err = bpf_skb_vlan_pop(ctx);
+ if (err)
+ goto out;
+ err = bpf_skb_vlan_pop(ctx);
+ if (err)
+ goto out;
+
+ if (!check_skb_metadata(ctx))
+ goto out;
+
+ test_pass = true;
+out:
+ return TC_ACT_SHOT;
+}
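After the second push above, the first tag (42) has been materialized in the
MAC header while tag 207 stays in the skb's offload fields. A hedged sketch
of verifying the in-header tag with direct packet access:

	struct ethhdr *eth = ctx_ptr(ctx, data);

	if (eth + 1 > ctx_ptr(ctx, data_end))
		goto out;
	/* the in-header tag carries the first-pushed VLAN ID */
	if (eth->h_proto != bpf_htons(ETH_P_8021Q))
		goto out;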
+
+SEC("tc")
+int helper_skb_adjust_room(struct __sk_buff *ctx)
+{
+ int err;
+
+ /* Grow a 1 byte hole after the MAC header */
+ err = bpf_skb_adjust_room(ctx, 1, BPF_ADJ_ROOM_MAC, 0);
+ if (err)
+ goto out;
+
+ if (!check_skb_metadata(ctx))
+ goto out;
+
+ /* Remove the 1 byte hole after the MAC header */
+ err = bpf_skb_adjust_room(ctx, -1, BPF_ADJ_ROOM_MAC, 0);
+ if (err)
+ goto out;
+
+ if (!check_skb_metadata(ctx))
+ goto out;
+
+ /* Grow a 256 byte hole to trigger head reallocation */
+ err = bpf_skb_adjust_room(ctx, 256, BPF_ADJ_ROOM_MAC, 0);
+ if (err)
+ goto out;
+
+ if (!check_skb_metadata(ctx))
+ goto out;
+
+ test_pass = true;
+out:
+ return TC_ACT_SHOT;
+}
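A hole grown with BPF_ADJ_ROOM_MAC opens up directly after the MAC header,
i.e. at offset ETH_HLEN. As a usage sketch (not part of the patch), it could
be filled with a custom tag via bpf_skb_store_bytes:

	__u8 tag = 0xaa;	/* hypothetical 1-byte tag */

	err = bpf_skb_store_bytes(ctx, ETH_HLEN, &tag, sizeof(tag), 0);
	if (err)
		goto out;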
+
+SEC("tc")
+int helper_skb_change_head_tail(struct __sk_buff *ctx)
+{
+ int err;
+
+ /* Reserve 1 extra byte in the front for packet data */
+ err = bpf_skb_change_head(ctx, 1, 0);
+ if (err)
+ goto out;
+
+ if (!check_skb_metadata(ctx))
+ goto out;
+
+ /* Reserve 256 extra bytes in the front to trigger head reallocation */
+ err = bpf_skb_change_head(ctx, 256, 0);
+ if (err)
+ goto out;
+
+ if (!check_skb_metadata(ctx))
+ goto out;
+
+ /* Reserve 4k extra bytes in the back to trigger head reallocation */
+ err = bpf_skb_change_tail(ctx, ctx->len + 4096, 0);
+ if (err)
+ goto out;
+
+ if (!check_skb_metadata(ctx))
+ goto out;
+
+ test_pass = true;
+out:
+ return TC_ACT_SHOT;
+}
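Note that bpf_skb_change_tail() takes an absolute packet length rather than
a delta, hence ctx->len + 4096 above. Trimming back would look like this
(a sketch):

	/* shrink back to the pre-grow length; also an absolute value */
	err = bpf_skb_change_tail(ctx, ctx->len - 4096, 0);
	if (err)
		goto out;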
+
+SEC("tc")
+int helper_skb_change_proto(struct __sk_buff *ctx)
+{
+ int err;
+
+ err = bpf_skb_change_proto(ctx, bpf_htons(ETH_P_IPV6), 0);
+ if (err)
+ goto out;
+
+ if (!check_skb_metadata(ctx))
+ goto out;
+
+ err = bpf_skb_change_proto(ctx, bpf_htons(ETH_P_IP), 0);
+ if (err)
+ goto out;
+
+ if (!check_skb_metadata(ctx))
goto out;
test_pass = true;