From 9bc95a95abbe91e9315c1fe27dc124019bd2592c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 27 Aug 2023 08:28:37 -0700 Subject: bpf: Mark BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE deprecated Now 'BPF_MAP_TYPE_CGRP_STORAGE + local percpu ptr' can cover all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE functionality and more. So mark BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE deprecated. Also make changes in selftests/bpf/test_bpftool_synctypes.py and selftest libbpf_str to fix otherwise test errors. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230827152837.2003563-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8790b3962e4b..73b155e52204 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -932,7 +932,14 @@ enum bpf_map_type { */ BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, - BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE is available to bpf programs + * attaching to a cgroup. The new mechanism (BPF_MAP_TYPE_CGRP_STORAGE + + * local percpu kptr) supports all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE + * functionality and more. So mark * BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE + * deprecated. + */ + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED, BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, BPF_MAP_TYPE_SK_STORAGE, -- cgit v1.2.3 From a9c2a608549bb1a2363d289d63907640afcf22af Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 13 Sep 2023 10:13:49 -0700 Subject: bpf: expose information about supported xdp metadata kfunc Add new xdp-rx-metadata-features member to netdev netlink which exports a bitmask of supported kfuncs. Most of the patch is autogenerated (headers), the only relevant part is netdev.yaml and the changes in netdev-genl.c to marshal into netlink. Example output on veth: $ ip link add veth0 type veth peer name veth1 # ifndex == 12 $ ./tools/net/ynl/samples/netdev 12 Select ifc ($ifindex; or 0 = dump; or -2 ntf check): 12 veth1[12] xdp-features (23): basic redirect rx-sg xdp-rx-metadata-features (3): timestamp hash xdp-zc-max-segs=0 Cc: netdev@vger.kernel.org Cc: Willem de Bruijn Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230913171350.369987-3-sdf@google.com Signed-off-by: Martin KaFai Lau --- Documentation/netlink/specs/netdev.yaml | 21 +++++++++++++++++++++ Documentation/networking/xdp-rx-metadata.rst | 7 +++++++ include/net/xdp.h | 5 ++++- include/uapi/linux/netdev.h | 16 ++++++++++++++++ kernel/bpf/offload.c | 2 +- net/core/netdev-genl.c | 12 +++++++++++- net/core/xdp.c | 4 ++-- tools/include/uapi/linux/netdev.h | 16 ++++++++++++++++ 8 files changed, 78 insertions(+), 5 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 1c7284fd535b..c46fcc78fc04 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -42,6 +42,19 @@ definitions: doc: This feature informs if netdev implements non-linear XDP buffer support in ndo_xdp_xmit callback. + - + type: flags + name: xdp-rx-metadata + render-max: true + entries: + - + name: timestamp + doc: + Device is capable of exposing receive HW timestamp via bpf_xdp_metadata_rx_timestamp(). + - + name: hash + doc: + Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash(). attribute-sets: - @@ -68,6 +81,13 @@ attribute-sets: type: u32 checks: min: 1 + - + name: xdp-rx-metadata-features + doc: Bitmask of supported XDP receive metadata features. + See Documentation/networking/xdp-rx-metadata.rst for more details. + type: u64 + enum: xdp-rx-metadata + enum-as-flags: true operations: list: @@ -84,6 +104,7 @@ operations: - ifindex - xdp-features - xdp-zc-max-segs + - xdp-rx-metadata-features dump: reply: *dev-all - diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst index 25ce72af81c2..205696780b78 100644 --- a/Documentation/networking/xdp-rx-metadata.rst +++ b/Documentation/networking/xdp-rx-metadata.rst @@ -105,6 +105,13 @@ bpf_tail_call Adding programs that access metadata kfuncs to the ``BPF_MAP_TYPE_PROG_ARRAY`` is currently not supported. +Supported Devices +================= + +It is possible to query which kfunc the particular netdev implements via +netlink. See ``xdp-rx-metadata-features`` attribute set in +``Documentation/netlink/specs/netdev.yaml``. + Example ======= diff --git a/include/net/xdp.h b/include/net/xdp.h index d59e12f8f311..349c36fb5fd8 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -386,19 +386,22 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum + * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ + NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ + NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ enum xdp_rx_metadata { -#define XDP_METADATA_KFUNC(name, _, __) name, +#define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index c1634b95c223..2943a151d4f1 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -38,11 +38,27 @@ enum netdev_xdp_act { NETDEV_XDP_ACT_MASK = 127, }; +/** + * enum netdev_xdp_rx_metadata + * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW + * timestamp via bpf_xdp_metadata_rx_timestamp(). + * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet + * hash via bpf_xdp_metadata_rx_hash(). + */ +enum netdev_xdp_rx_metadata { + NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, + NETDEV_XDP_RX_METADATA_HASH = 2, + + /* private: */ + NETDEV_XDP_RX_METADATA_MASK = 3, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 6aa6de8d715d..e7a1752b5a09 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -845,7 +845,7 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) if (!ops) goto out; -#define XDP_METADATA_KFUNC(name, _, xmo) \ +#define XDP_METADATA_KFUNC(name, _, __, xmo) \ if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index c1aea8b756b6..fe61f85bcf33 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "netdev-genl-gen.h" @@ -12,15 +13,24 @@ static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { + u64 xdp_rx_meta = 0; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; +#define XDP_METADATA_KFUNC(_, flag, __, xmo) \ + if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ + xdp_rx_meta |= flag; +XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, - netdev->xdp_features, NETDEV_A_DEV_PAD)) { + netdev->xdp_features, NETDEV_A_DEV_PAD) || + nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + xdp_rx_meta, NETDEV_A_DEV_PAD)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } diff --git a/net/core/xdp.c b/net/core/xdp.c index bab563b2f812..df4789ab512d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, __diag_pop(); BTF_SET8_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, name, __) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_SET8_END(xdp_metadata_kfunc_ids) @@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { }; BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted) -#define XDP_METADATA_KFUNC(name, str, _) BTF_ID(func, str) +#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index c1634b95c223..2943a151d4f1 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -38,11 +38,27 @@ enum netdev_xdp_act { NETDEV_XDP_ACT_MASK = 127, }; +/** + * enum netdev_xdp_rx_metadata + * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW + * timestamp via bpf_xdp_metadata_rx_timestamp(). + * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet + * hash via bpf_xdp_metadata_rx_hash(). + */ +enum netdev_xdp_rx_metadata { + NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, + NETDEV_XDP_RX_METADATA_HASH = 2, + + /* private: */ + NETDEV_XDP_RX_METADATA_MASK = 3, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) -- cgit v1.2.3 From e2b2cd592adbd303bcc02451d32fedd511000fb0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 20 Sep 2023 23:31:38 +0200 Subject: bpf: Add missed value to kprobe_multi link info Add missed value to kprobe_multi link info to hold the stats of missed kprobe_multi probe. The missed counter gets incremented when fprobe fails the recursion check or there's no rethook available for return probe. In either case the attached bpf program is not executed. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Tested-by: Song Liu Reviewed-by: Song Liu Acked-by: Hou Tao Link: https://lore.kernel.org/bpf/20230920213145.1941596-3-jolsa@kernel.org --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 3 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5f13db15a3c7..1da5b1bcce71 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6532,6 +6532,7 @@ struct bpf_link_info { __aligned_u64 addrs; __u32 count; /* in/out: kprobe_multi function count */ __u32 flags; + __u64 missed; } kprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 54827d04c9a6..6aec6e7d612a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2614,6 +2614,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; info->kprobe_multi.flags = kmulti_link->flags; + info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) return 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5f13db15a3c7..1da5b1bcce71 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6532,6 +6532,7 @@ struct bpf_link_info { __aligned_u64 addrs; __u32 count; /* in/out: kprobe_multi function count */ __u32 flags; + __u64 missed; } kprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ -- cgit v1.2.3 From 3acf8ace68230e9558cf916847f1cc9f208abdf1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 20 Sep 2023 23:31:39 +0200 Subject: bpf: Add missed value to kprobe perf link info Add missed value to kprobe attached through perf link info to hold the stats of missed kprobe handler execution. The kprobe's missed counter gets incremented when kprobe handler is not executed due to another kprobe running on the same cpu. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230920213145.1941596-4-jolsa@kernel.org --- include/linux/trace_events.h | 6 ++++-- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 14 ++++++++------ kernel/trace/bpf_trace.c | 5 +++-- kernel/trace/trace_kprobe.c | 14 +++++++++++--- tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 28 insertions(+), 13 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 21ae37e49319..5eb88a66eb68 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -761,7 +761,8 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, - u64 *probe_offset, u64 *probe_addr); + u64 *probe_offset, u64 *probe_addr, + unsigned long *missed); int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); #else @@ -801,7 +802,7 @@ static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) static inline int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, - u64 *probe_addr) + u64 *probe_addr, unsigned long *missed) { return -EOPNOTSUPP; } @@ -877,6 +878,7 @@ extern void perf_kprobe_destroy(struct perf_event *event); extern int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, const char **symbol, u64 *probe_offset, u64 *probe_addr, + unsigned long *missed, bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1da5b1bcce71..70bfa997e896 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6548,6 +6548,7 @@ struct bpf_link_info { __u32 name_len; __u32 offset; /* offset from func_name */ __u64 addr; + __u64 missed; } kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */ struct { __aligned_u64 tp_name; /* in/out */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 85c1d908f70f..6b5280f14a53 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3374,7 +3374,7 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) static int bpf_perf_link_fill_common(const struct perf_event *event, char __user *uname, u32 ulen, u64 *probe_offset, u64 *probe_addr, - u32 *fd_type) + u32 *fd_type, unsigned long *missed) { const char *buf; u32 prog_id; @@ -3385,7 +3385,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, return -EINVAL; err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, - probe_offset, probe_addr); + probe_offset, probe_addr, missed); if (err) return err; if (!uname) @@ -3408,6 +3408,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, static int bpf_perf_link_fill_kprobe(const struct perf_event *event, struct bpf_link_info *info) { + unsigned long missed; char __user *uname; u64 addr, offset; u32 ulen, type; @@ -3416,7 +3417,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, - &type); + &type, &missed); if (err) return err; if (type == BPF_FD_TYPE_KRETPROBE) @@ -3425,6 +3426,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_KPROBE; info->perf_event.kprobe.offset = offset; + info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) addr = 0; info->perf_event.kprobe.addr = addr; @@ -3444,7 +3446,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, - &type); + &type, NULL); if (err) return err; @@ -3480,7 +3482,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; - return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL); + return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, @@ -4813,7 +4815,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, err = bpf_get_perf_event_info(event, &prog_id, &fd_type, &buf, &probe_offset, - &probe_addr); + &probe_addr, NULL); if (!err) err = bpf_task_fd_query_copy(attr, uattr, prog_id, fd_type, buf, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6aec6e7d612a..f6a7d2524949 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2384,7 +2384,8 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, - u64 *probe_offset, u64 *probe_addr) + u64 *probe_offset, u64 *probe_addr, + unsigned long *missed) { bool is_tracepoint, is_syscall_tp; struct bpf_prog *prog; @@ -2419,7 +2420,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, #ifdef CONFIG_KPROBE_EVENTS if (flags & TRACE_EVENT_FL_KPROBE) err = bpf_get_kprobe_info(event, fd_type, buf, - probe_offset, probe_addr, + probe_offset, probe_addr, missed, event->attr.type == PERF_TYPE_TRACEPOINT); #endif #ifdef CONFIG_UPROBE_EVENTS diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3d7a180a8427..961a78ffd6d2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1189,6 +1189,12 @@ static const struct file_operations kprobe_events_ops = { .write = probes_write, }; +static unsigned long trace_kprobe_missed(struct trace_kprobe *tk) +{ + return trace_kprobe_is_return(tk) ? + tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; +} + /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { @@ -1200,8 +1206,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) return 0; tk = to_trace_kprobe(ev); - nmissed = trace_kprobe_is_return(tk) ? - tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; + nmissed = trace_kprobe_missed(tk); seq_printf(m, " %-44s %15lu %15lu\n", trace_probe_name(&tk->tp), trace_kprobe_nhit(tk), @@ -1547,7 +1552,8 @@ NOKPROBE_SYMBOL(kretprobe_perf_func); int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, const char **symbol, u64 *probe_offset, - u64 *probe_addr, bool perf_type_tracepoint) + u64 *probe_addr, unsigned long *missed, + bool perf_type_tracepoint) { const char *pevent = trace_event_name(event->tp_event); const char *group = event->tp_event->class->system; @@ -1566,6 +1572,8 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, *probe_addr = kallsyms_show_value(current_cred()) ? (unsigned long)tk->rp.kp.addr : 0; *symbol = tk->symbol; + if (missed) + *missed = trace_kprobe_missed(tk); return 0; } #endif /* CONFIG_PERF_EVENTS */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1da5b1bcce71..70bfa997e896 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6548,6 +6548,7 @@ struct bpf_link_info { __u32 name_len; __u32 offset; /* offset from func_name */ __u64 addr; + __u64 missed; } kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */ struct { __aligned_u64 tp_name; /* in/out */ -- cgit v1.2.3 From d6247ecb6c1e17d7a33317090627f5bfe563cbb2 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 4 Oct 2023 11:23:38 -0500 Subject: bpf: Add ability to pin bpf timer to calling CPU BPF supports creating high resolution timers using bpf_timer_* helper functions. Currently, only the BPF_F_TIMER_ABS flag is supported, which specifies that the timeout should be interpreted as absolute time. It would also be useful to be able to pin that timer to a core. For example, if you wanted to make a subset of cores run without timer interrupts, and only have the timer be invoked on a single core. This patch adds support for this with a new BPF_F_TIMER_CPU_PIN flag. When specified, the HRTIMER_MODE_PINNED flag is passed to hrtimer_start(). A subsequent patch will update selftests to validate. Signed-off-by: David Vernet Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Hou Tao Link: https://lore.kernel.org/bpf/20231004162339.200702-2-void@manifault.com --- include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/helpers.c | 5 ++++- tools/include/uapi/linux/bpf.h | 4 ++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 70bfa997e896..a7d4a1a69f21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5096,6 +5096,8 @@ union bpf_attr { * **BPF_F_TIMER_ABS** * Start the timer in absolute expire value instead of the * default relative one. + * **BPF_F_TIMER_CPU_PIN** + * Timer will be pinned to the CPU of the caller. * * Return * 0 on success. @@ -7309,9 +7311,11 @@ struct bpf_core_relo { * Flags to control bpf_timer_start() behaviour. * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is * relative to current time. + * - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller. */ enum { BPF_F_TIMER_ABS = (1ULL << 0), + BPF_F_TIMER_CPU_PIN = (1ULL << 1), }; /* BPF numbers iterator state */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index dd1c69ee3375..d2840dd5b00d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1272,7 +1272,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla if (in_nmi()) return -EOPNOTSUPP; - if (flags > BPF_F_TIMER_ABS) + if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; @@ -1286,6 +1286,9 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla else mode = HRTIMER_MODE_REL_SOFT; + if (flags & BPF_F_TIMER_CPU_PIN) + mode |= HRTIMER_MODE_PINNED; + hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); out: __bpf_spin_unlock_irqrestore(&timer->lock); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 70bfa997e896..a7d4a1a69f21 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5096,6 +5096,8 @@ union bpf_attr { * **BPF_F_TIMER_ABS** * Start the timer in absolute expire value instead of the * default relative one. + * **BPF_F_TIMER_CPU_PIN** + * Timer will be pinned to the CPU of the caller. * * Return * 0 on success. @@ -7309,9 +7311,11 @@ struct bpf_core_relo { * Flags to control bpf_timer_start() behaviour. * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is * relative to current time. + * - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller. */ enum { BPF_F_TIMER_ABS = (1ULL << 0), + BPF_F_TIMER_CPU_PIN = (1ULL << 1), }; /* BPF numbers iterator state */ -- cgit v1.2.3 From dab4e1f06cabb6834de14264394ccab197007302 Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Sat, 7 Oct 2023 10:14:14 +0200 Subject: bpf: Derive source IP addr via bpf_*_fib_lookup() Extend the bpf_fib_lookup() helper by making it to return the source IPv4/IPv6 address if the BPF_FIB_LOOKUP_SRC flag is set. For example, the following snippet can be used to derive the desired source IP address: struct bpf_fib_lookup p = { .ipv4_dst = ip4->daddr }; ret = bpf_skb_fib_lookup(skb, p, sizeof(p), BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_SKIP_NEIGH); if (ret != BPF_FIB_LKUP_RET_SUCCESS) return TC_ACT_SHOT; /* the p.ipv4_src now contains the source address */ The inability to derive the proper source address may cause malfunctions in BPF-based dataplanes for hosts containing netdevs with more than one routable IP address or for multi-homed hosts. For example, Cilium implements packet masquerading in BPF. If an egressing netdev to which the Cilium's BPF prog is attached has multiple IP addresses, then only one [hardcoded] IP address can be used for masquerading. This breaks connectivity if any other IP address should have been selected instead, for example, when a public and private addresses are attached to the same egress interface. The change was tested with Cilium [1]. Nikolay Aleksandrov helped to figure out the IPv6 addr selection. [1]: https://github.com/cilium/cilium/pull/28283 Signed-off-by: Martynas Pumputis Link: https://lore.kernel.org/r/20231007081415.33502-2-m@lambda.lt Signed-off-by: Martin KaFai Lau --- include/net/ipv6_stubs.h | 5 +++++ include/uapi/linux/bpf.h | 10 ++++++++++ net/core/filter.c | 18 +++++++++++++++++- net/ipv6/af_inet6.c | 1 + tools/include/uapi/linux/bpf.h | 10 ++++++++++ 5 files changed, 43 insertions(+), 1 deletion(-) (limited to 'tools/include/uapi/linux') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index c48186bf4737..21da31e1dff5 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -85,6 +85,11 @@ struct ipv6_bpf_stub { sockptr_t optval, unsigned int optlen); int (*ipv6_getsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); + int (*ipv6_dev_get_saddr)(struct net *net, + const struct net_device *dst_dev, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr); }; extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a7d4a1a69f21..e0aa457f94a9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3264,6 +3264,11 @@ union bpf_attr { * and *params*->smac will not be set as output. A common * use case is to call **bpf_redirect_neigh**\ () after * doing **bpf_fib_lookup**\ (). + * **BPF_FIB_LOOKUP_SRC** + * Derive and set source IP addr in *params*->ipv{4,6}_src + * for the nexthop. If the src addr cannot be derived, + * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this + * case, *params*->dmac and *params*->smac are not set either. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -6964,6 +6969,7 @@ enum { BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), + BPF_FIB_LOOKUP_SRC = (1U << 4), }; enum { @@ -6976,6 +6982,7 @@ enum { BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_FIB_LKUP_RET_NO_SRC_ADDR, /* failed to derive IP src addr */ }; struct bpf_fib_lookup { @@ -7010,6 +7017,9 @@ struct bpf_fib_lookup { __u32 rt_metric; }; + /* input: source address to consider for lookup + * output: source address result from lookup + */ union { __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ diff --git a/net/core/filter.c b/net/core/filter.c index a094694899c9..3880bf0b740d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5850,6 +5850,9 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, params->rt_metric = res.fi->fib_priority; params->ifindex = dev->ifindex; + if (flags & BPF_FIB_LOOKUP_SRC) + params->ipv4_src = fib_result_prefsrc(net, &res); + /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ @@ -5992,6 +5995,18 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, params->rt_metric = res.f6i->fib6_metric; params->ifindex = dev->ifindex; + if (flags & BPF_FIB_LOOKUP_SRC) { + if (res.f6i->fib6_prefsrc.plen) { + *src = res.f6i->fib6_prefsrc.addr; + } else { + err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev, + &fl6.daddr, 0, + src); + if (err) + return BPF_FIB_LKUP_RET_NO_SRC_ADDR; + } + } + if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; @@ -6010,7 +6025,8 @@ set_fwd_params: #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ - BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID) + BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ + BPF_FIB_LOOKUP_SRC) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c6ad0d6e99b5..6337fb4504fd 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1061,6 +1061,7 @@ static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .udp6_lib_lookup = __udp6_lib_lookup, .ipv6_setsockopt = do_ipv6_setsockopt, .ipv6_getsockopt = do_ipv6_getsockopt, + .ipv6_dev_get_saddr = ipv6_dev_get_saddr, }; static int __init inet6_init(void) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a7d4a1a69f21..e0aa457f94a9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3264,6 +3264,11 @@ union bpf_attr { * and *params*->smac will not be set as output. A common * use case is to call **bpf_redirect_neigh**\ () after * doing **bpf_fib_lookup**\ (). + * **BPF_FIB_LOOKUP_SRC** + * Derive and set source IP addr in *params*->ipv{4,6}_src + * for the nexthop. If the src addr cannot be derived, + * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this + * case, *params*->dmac and *params*->smac are not set either. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -6964,6 +6969,7 @@ enum { BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), BPF_FIB_LOOKUP_TBID = (1U << 3), + BPF_FIB_LOOKUP_SRC = (1U << 4), }; enum { @@ -6976,6 +6982,7 @@ enum { BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_FIB_LKUP_RET_NO_SRC_ADDR, /* failed to derive IP src addr */ }; struct bpf_fib_lookup { @@ -7010,6 +7017,9 @@ struct bpf_fib_lookup { __u32 rt_metric; }; + /* input: source address to consider for lookup + * output: source address result from lookup + */ union { __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ -- cgit v1.2.3 From 859051dd165ec6cc915f0f2114699021144fd249 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Wed, 11 Oct 2023 20:51:06 +0200 Subject: bpf: Implement cgroup sockaddr hooks for unix sockets These hooks allows intercepting connect(), getsockname(), getpeername(), sendmsg() and recvmsg() for unix sockets. The unix socket hooks get write access to the address length because the address length is not fixed when dealing with unix sockets and needs to be modified when a unix socket address is modified by the hook. Because abstract socket unix addresses start with a NUL byte, we cannot recalculate the socket address in kernelspace after running the hook by calculating the length of the unix socket path using strlen(). These hooks can be used when users want to multiplex syscall to a single unix socket to multiple different processes behind the scenes by redirecting the connect() and other syscalls to process specific sockets. We do not implement support for intercepting bind() because when using bind() with unix sockets with a pathname address, this creates an inode in the filesystem which must be cleaned up. If we rewrite the address, the user might try to clean up the wrong file, leaking the socket in the filesystem where it is never cleaned up. Until we figure out a solution for this (and a use case for intercepting bind()), we opt to not allow rewriting the sockaddr in bind() calls. We also implement recvmsg() support for connected streams so that after a connect() that is modified by a sockaddr hook, any corresponding recmvsg() on the connected socket can also be modified to make the connected program think it is connected to the "intended" remote. Reviewed-by: Kuniyuki Iwashima Signed-off-by: Daan De Meyer Link: https://lore.kernel.org/r/20231011185113.140426-5-daan.j.demeyer@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf-cgroup-defs.h | 5 +++++ include/linux/bpf-cgroup.h | 17 +++++++++++++++++ include/uapi/linux/bpf.h | 13 +++++++++---- kernel/bpf/cgroup.c | 11 +++++++++-- kernel/bpf/syscall.c | 15 +++++++++++++++ kernel/bpf/verifier.c | 5 ++++- net/core/filter.c | 14 ++++++++++++-- net/unix/af_unix.c | 35 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 13 +++++++++---- 9 files changed, 114 insertions(+), 14 deletions(-) (limited to 'tools/include/uapi/linux') diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h index 7b121bd780eb..0985221d5478 100644 --- a/include/linux/bpf-cgroup-defs.h +++ b/include/linux/bpf-cgroup-defs.h @@ -28,19 +28,24 @@ enum cgroup_bpf_attach_type { CGROUP_INET6_BIND, CGROUP_INET4_CONNECT, CGROUP_INET6_CONNECT, + CGROUP_UNIX_CONNECT, CGROUP_INET4_POST_BIND, CGROUP_INET6_POST_BIND, CGROUP_UDP4_SENDMSG, CGROUP_UDP6_SENDMSG, + CGROUP_UNIX_SENDMSG, CGROUP_SYSCTL, CGROUP_UDP4_RECVMSG, CGROUP_UDP6_RECVMSG, + CGROUP_UNIX_RECVMSG, CGROUP_GETSOCKOPT, CGROUP_SETSOCKOPT, CGROUP_INET4_GETPEERNAME, CGROUP_INET6_GETPEERNAME, + CGROUP_UNIX_GETPEERNAME, CGROUP_INET4_GETSOCKNAME, CGROUP_INET6_GETSOCKNAME, + CGROUP_UNIX_GETSOCKNAME, CGROUP_INET_SOCK_RELEASE, CGROUP_LSM_START, CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1, diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 31561e789715..98b8cea904fe 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -48,19 +48,24 @@ to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) CGROUP_ATYPE(CGROUP_INET6_BIND); CGROUP_ATYPE(CGROUP_INET4_CONNECT); CGROUP_ATYPE(CGROUP_INET6_CONNECT); + CGROUP_ATYPE(CGROUP_UNIX_CONNECT); CGROUP_ATYPE(CGROUP_INET4_POST_BIND); CGROUP_ATYPE(CGROUP_INET6_POST_BIND); CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); + CGROUP_ATYPE(CGROUP_UNIX_SENDMSG); CGROUP_ATYPE(CGROUP_SYSCTL); CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); + CGROUP_ATYPE(CGROUP_UNIX_RECVMSG); CGROUP_ATYPE(CGROUP_GETSOCKOPT); CGROUP_ATYPE(CGROUP_SETSOCKOPT); CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); + CGROUP_ATYPE(CGROUP_UNIX_GETPEERNAME); CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_UNIX_GETSOCKNAME); CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); default: return CGROUP_BPF_ATTACH_TYPE_INVALID; @@ -289,18 +294,27 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL) +#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_CONNECT, NULL) + #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx) +#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_SENDMSG, t_ctx) + #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL) +#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_RECVMSG, NULL) + /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by * sk_to_full_sk(). @@ -492,10 +506,13 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e0aa457f94a9..7ba61b75bc0e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1047,6 +1047,11 @@ enum bpf_attach_type { BPF_TCX_INGRESS, BPF_TCX_EGRESS, BPF_TRACE_UPROBE_MULTI, + BPF_CGROUP_UNIX_CONNECT, + BPF_CGROUP_UNIX_SENDMSG, + BPF_CGROUP_UNIX_RECVMSG, + BPF_CGROUP_UNIX_GETPEERNAME, + BPF_CGROUP_UNIX_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; @@ -2704,8 +2709,8 @@ union bpf_attr { * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: @@ -2943,8 +2948,8 @@ union bpf_attr { * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. * It supports the same set of *optname*\ s that is supported by diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ac37bd53aee0..74ad2215e1ba 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1458,7 +1458,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). * - * socket is expected to be of type INET or INET6. + * socket is expected to be of type INET, INET6 or UNIX. * * This function will return %-EPERM if an attached program is found and * returned value != 1 during execution. In all other cases, 0 is returned. @@ -1482,7 +1482,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, /* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX). */ - if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 && + sk->sk_family != AF_UNIX) return 0; if (!ctx.uaddr) { @@ -2533,10 +2534,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return NULL; default: return &bpf_get_retval_proto; @@ -2548,10 +2552,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return NULL; default: return &bpf_set_retval_proto; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6b5280f14a53..8677837f3deb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2446,14 +2446,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: return 0; default: return -EINVAL; @@ -3678,14 +3683,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; case BPF_CGROUP_SOCK_OPS: return BPF_PROG_TYPE_SOCK_OPS; @@ -3942,14 +3952,19 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eed7350e15f4..e777f50401b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14797,10 +14797,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME || env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME) range = tnum_range(1, 1); if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) diff --git a/net/core/filter.c b/net/core/filter.c index ff0bd9f20b95..cc2e4babc85f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7875,14 +7875,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; @@ -7893,14 +7898,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; @@ -8948,8 +8958,8 @@ static bool sock_addr_is_valid_access(int off, int size, if (off % size != 0) return false; - /* Disallow access to IPv6 fields from IPv4 contex and vise - * versa. + /* Disallow access to fields not belonging to the attach type's address + * family. */ switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3e8a04a13668..e10d07c76044 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -116,6 +116,7 @@ #include #include #include +#include #include "scm.h" @@ -1381,6 +1382,10 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; + err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); + if (err) + goto out; + if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !unix_sk(sk)->addr) { @@ -1490,6 +1495,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (err) goto out; + err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); + if (err) + goto out; + if ((test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) { err = unix_autobind(sk); @@ -1770,6 +1779,13 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) } else { err = addr->len; memcpy(sunaddr, addr->name, addr->len); + + if (peer) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, + CGROUP_UNIX_GETPEERNAME); + else + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, + CGROUP_UNIX_GETSOCKNAME); } sock_put(sk); out: @@ -1922,6 +1938,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = unix_validate_addr(sunaddr, msg->msg_namelen); if (err) goto out; + + err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, + msg->msg_name, + &msg->msg_namelen, + NULL); + if (err) + goto out; } else { sunaddr = NULL; err = -ENOTCONN; @@ -2390,9 +2413,14 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); - if (msg->msg_name) + if (msg->msg_name) { unix_copy_addr(msg, skb->sk); + BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, + msg->msg_name, + &msg->msg_namelen); + } + if (size > skb->len - skip) size = skb->len - skip; else if (size < skb->len - skip) @@ -2744,6 +2772,11 @@ unlock: DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, state->msg->msg_name); unix_copy_addr(state->msg, skb->sk); + + BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, + state->msg->msg_name, + &state->msg->msg_namelen); + sunaddr = NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e0aa457f94a9..7ba61b75bc0e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1047,6 +1047,11 @@ enum bpf_attach_type { BPF_TCX_INGRESS, BPF_TCX_EGRESS, BPF_TRACE_UPROBE_MULTI, + BPF_CGROUP_UNIX_CONNECT, + BPF_CGROUP_UNIX_SENDMSG, + BPF_CGROUP_UNIX_RECVMSG, + BPF_CGROUP_UNIX_GETPEERNAME, + BPF_CGROUP_UNIX_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; @@ -2704,8 +2709,8 @@ union bpf_attr { * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: @@ -2943,8 +2948,8 @@ union bpf_attr { * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. * It supports the same set of *optname*\ s that is supported by -- cgit v1.2.3 From 35dfaad7188cdc043fde31709c796f5a692ba2bd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Oct 2023 23:48:58 +0200 Subject: netkit, bpf: Add bpf programmable net device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work adds a new, minimal BPF-programmable device called "netkit" (former PoC code-name "meta") we recently presented at LSF/MM/BPF. The core idea is that BPF programs are executed within the drivers xmit routine and therefore e.g. in case of containers/Pods moving BPF processing closer to the source. One of the goals was that in case of Pod egress traffic, this allows to move BPF programs from hostns tcx ingress into the device itself, providing earlier drop or forward mechanisms, for example, if the BPF program determines that the skb must be sent out of the node, then a redirect to the physical device can take place directly without going through per-CPU backlog queue. This helps to shift processing for such traffic from softirq to process context, leading to better scheduling decisions/performance (see measurements in the slides). In this initial version, the netkit device ships as a pair, but we plan to extend this further so it can also operate in single device mode. The pair comes with a primary and a peer device. Only the primary device, typically residing in hostns, can manage BPF programs for itself and its peer. The peer device is designated for containers/Pods and cannot attach/detach BPF programs. Upon the device creation, the user can set the default policy to 'pass' or 'drop' for the case when no BPF program is attached. Additionally, the device can be operated in L3 (default) or L2 mode. The management of BPF programs is done via bpf_mprog, so that multi-attach is supported right from the beginning with similar API and dependency controls as tcx. For details on the latter see commit 053c8e1f235d ("bpf: Add generic attach/detach/query API for multi-progs"). tc BPF compatibility is provided, so that existing programs can be easily migrated. Going forward, we plan to use netkit devices in Cilium as the main device type for connecting Pods. They will be operated in L3 mode in order to simplify a Pod's neighbor management and the peer will operate in default drop mode, so that no traffic is leaving between the time when a Pod is brought up by the CNI plugin and programs attached by the agent. Additionally, the programs we attach via tcx on the physical devices are using bpf_redirect_peer() for inbound traffic into netkit device, hence the latter is also supporting the ndo_get_peer_dev callback. Similarly, we use bpf_redirect_neigh() for the way out, pushing from netkit peer to phys device directly. Also, BIG TCP is supported on netkit device. For the follow-up work in single device mode, we plan to convert Cilium's cilium_host/_net devices into a single one. An extensive test suite for checking device operations and the BPF program and link management API comes as BPF selftests in this series. Co-developed-by: Nikolay Aleksandrov Signed-off-by: Nikolay Aleksandrov Signed-off-by: Daniel Borkmann Reviewed-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://github.com/borkmann/iproute2/tree/pr/netkit Link: http://vger.kernel.org/bpfconf2023_material/tcx_meta_netdev_borkmann.pdf (24ff.) Link: https://lore.kernel.org/r/20231024214904.29825-2-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- MAINTAINERS | 9 + drivers/net/Kconfig | 9 + drivers/net/Makefile | 1 + drivers/net/netkit.c | 940 +++++++++++++++++++++++++++++++++++++++++ include/net/netkit.h | 38 ++ include/uapi/linux/bpf.h | 14 + include/uapi/linux/if_link.h | 24 ++ kernel/bpf/syscall.c | 30 +- tools/include/uapi/linux/bpf.h | 14 + 9 files changed, 1074 insertions(+), 5 deletions(-) create mode 100644 drivers/net/netkit.c create mode 100644 include/net/netkit.h (limited to 'tools/include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index ed33b9df8b3d..43be6197e655 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3795,6 +3795,15 @@ L: bpf@vger.kernel.org S: Odd Fixes K: (?:\b|_)bpf(?:\b|_) +BPF [NETKIT] (BPF-programmable network device) +M: Daniel Borkmann +M: Nikolay Aleksandrov +L: bpf@vger.kernel.org +L: netdev@vger.kernel.org +S: Supported +F: drivers/net/netkit.c +F: include/net/netkit.h + BPF [NETWORKING] (struct_ops, reuseport) M: Martin KaFai Lau L: bpf@vger.kernel.org diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 44eeb5d61ba9..af0da4bb429b 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -448,6 +448,15 @@ config NLMON diagnostics, etc. This is mostly intended for developers or support to debug netlink issues. If unsure, say N. +config NETKIT + bool "BPF-programmable network device" + depends on BPF_SYSCALL + help + The netkit device is a virtual networking device where BPF programs + can be attached to the device(s) transmission routine in order to + implement the driver's internal logic. The device can be configured + to operate in L3 or L2 mode. If unsure, say N. + config NET_VRF tristate "Virtual Routing and Forwarding (Lite)" depends on IP_MULTIPLE_TABLES diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 8a83db32509d..7cab36f94782 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_MDIO) += mdio.o obj-$(CONFIG_NET) += loopback.o obj-$(CONFIG_NETDEV_LEGACY_INIT) += Space.o obj-$(CONFIG_NETCONSOLE) += netconsole.o +obj-$(CONFIG_NETKIT) += netkit.o obj-y += phy/ obj-y += pse-pd/ obj-y += mdio/ diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c new file mode 100644 index 000000000000..7e484f9fd3ae --- /dev/null +++ b/drivers/net/netkit.c @@ -0,0 +1,940 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2023 Isovalent */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define DRV_NAME "netkit" + +struct netkit { + /* Needed in fast-path */ + struct net_device __rcu *peer; + struct bpf_mprog_entry __rcu *active; + enum netkit_action policy; + struct bpf_mprog_bundle bundle; + + /* Needed in slow-path */ + enum netkit_mode mode; + bool primary; + u32 headroom; +}; + +struct netkit_link { + struct bpf_link link; + struct net_device *dev; + u32 location; +}; + +static __always_inline int +netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, + enum netkit_action ret) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *prog; + + bpf_mprog_foreach_prog(entry, fp, prog) { + bpf_compute_data_pointers(skb); + ret = bpf_prog_run(prog, skb); + if (ret != NETKIT_NEXT) + break; + } + return ret; +} + +static void netkit_prep_forward(struct sk_buff *skb, bool xnet) +{ + skb_scrub_packet(skb, xnet); + skb->priority = 0; + nf_skip_egress(skb, true); +} + +static struct netkit *netkit_priv(const struct net_device *dev) +{ + return netdev_priv(dev); +} + +static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + enum netkit_action ret = READ_ONCE(nk->policy); + netdev_tx_t ret_dev = NET_XMIT_SUCCESS; + const struct bpf_mprog_entry *entry; + struct net_device *peer; + + rcu_read_lock(); + peer = rcu_dereference(nk->peer); + if (unlikely(!peer || !(peer->flags & IFF_UP) || + !pskb_may_pull(skb, ETH_HLEN) || + skb_orphan_frags(skb, GFP_ATOMIC))) + goto drop; + netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer))); + skb->dev = peer; + entry = rcu_dereference(nk->active); + if (entry) + ret = netkit_run(entry, skb, ret); + switch (ret) { + case NETKIT_NEXT: + case NETKIT_PASS: + skb->protocol = eth_type_trans(skb, skb->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + __netif_rx(skb); + break; + case NETKIT_REDIRECT: + skb_do_redirect(skb); + break; + case NETKIT_DROP: + default: +drop: + kfree_skb(skb); + dev_core_stats_tx_dropped_inc(dev); + ret_dev = NET_XMIT_DROP; + break; + } + rcu_read_unlock(); + return ret_dev; +} + +static int netkit_open(struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + + if (!peer) + return -ENOTCONN; + if (peer->flags & IFF_UP) { + netif_carrier_on(dev); + netif_carrier_on(peer); + } + return 0; +} + +static int netkit_close(struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + + netif_carrier_off(dev); + if (peer) + netif_carrier_off(peer); + return 0; +} + +static int netkit_get_iflink(const struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer; + int iflink = 0; + + rcu_read_lock(); + peer = rcu_dereference(nk->peer); + if (peer) + iflink = peer->ifindex; + rcu_read_unlock(); + return iflink; +} + +static void netkit_set_multicast(struct net_device *dev) +{ + /* Nothing to do, we receive whatever gets pushed to us! */ +} + +static void netkit_set_headroom(struct net_device *dev, int headroom) +{ + struct netkit *nk = netkit_priv(dev), *nk2; + struct net_device *peer; + + if (headroom < 0) + headroom = NET_SKB_PAD; + + rcu_read_lock(); + peer = rcu_dereference(nk->peer); + if (unlikely(!peer)) + goto out; + + nk2 = netkit_priv(peer); + nk->headroom = headroom; + headroom = max(nk->headroom, nk2->headroom); + + peer->needed_headroom = headroom; + dev->needed_headroom = headroom; +out: + rcu_read_unlock(); +} + +static struct net_device *netkit_peer_dev(struct net_device *dev) +{ + return rcu_dereference(netkit_priv(dev)->peer); +} + +static void netkit_uninit(struct net_device *dev); + +static const struct net_device_ops netkit_netdev_ops = { + .ndo_open = netkit_open, + .ndo_stop = netkit_close, + .ndo_start_xmit = netkit_xmit, + .ndo_set_rx_mode = netkit_set_multicast, + .ndo_set_rx_headroom = netkit_set_headroom, + .ndo_get_iflink = netkit_get_iflink, + .ndo_get_peer_dev = netkit_peer_dev, + .ndo_uninit = netkit_uninit, + .ndo_features_check = passthru_features_check, +}; + +static void netkit_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strscpy(info->driver, DRV_NAME, sizeof(info->driver)); +} + +static const struct ethtool_ops netkit_ethtool_ops = { + .get_drvinfo = netkit_get_drvinfo, +}; + +static void netkit_setup(struct net_device *dev) +{ + static const netdev_features_t netkit_features_hw_vlan = + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_STAG_TX | + NETIF_F_HW_VLAN_STAG_RX; + static const netdev_features_t netkit_features = + netkit_features_hw_vlan | + NETIF_F_SG | + NETIF_F_FRAGLIST | + NETIF_F_HW_CSUM | + NETIF_F_RXCSUM | + NETIF_F_SCTP_CRC | + NETIF_F_HIGHDMA | + NETIF_F_GSO_SOFTWARE | + NETIF_F_GSO_ENCAP_ALL; + + ether_setup(dev); + dev->max_mtu = ETH_MAX_MTU; + + dev->flags |= IFF_NOARP; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + dev->priv_flags |= IFF_PHONY_HEADROOM; + dev->priv_flags |= IFF_NO_QUEUE; + + dev->ethtool_ops = &netkit_ethtool_ops; + dev->netdev_ops = &netkit_netdev_ops; + + dev->features |= netkit_features | NETIF_F_LLTX; + dev->hw_features = netkit_features; + dev->hw_enc_features = netkit_features; + dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; + dev->vlan_features = dev->features & ~netkit_features_hw_vlan; + + dev->needs_free_netdev = true; + + netif_set_tso_max_size(dev, GSO_MAX_SIZE); +} + +static struct net *netkit_get_link_net(const struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + + return peer ? dev_net(peer) : dev_net(dev); +} + +static int netkit_check_policy(int policy, struct nlattr *tb, + struct netlink_ext_ack *extack) +{ + switch (policy) { + case NETKIT_PASS: + case NETKIT_DROP: + return 0; + default: + NL_SET_ERR_MSG_ATTR(extack, tb, + "Provided default xmit policy not supported"); + return -EINVAL; + } +} + +static int netkit_check_mode(int mode, struct nlattr *tb, + struct netlink_ext_ack *extack) +{ + switch (mode) { + case NETKIT_L2: + case NETKIT_L3: + return 0; + default: + NL_SET_ERR_MSG_ATTR(extack, tb, + "Provided device mode can only be L2 or L3"); + return -EINVAL; + } +} + +static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct nlattr *attr = tb[IFLA_ADDRESS]; + + if (!attr) + return 0; + NL_SET_ERR_MSG_ATTR(extack, attr, + "Setting Ethernet address is not supported"); + return -EOPNOTSUPP; +} + +static struct rtnl_link_ops netkit_link_ops; + +static int netkit_new_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr; + enum netkit_action default_prim = NETKIT_PASS; + enum netkit_action default_peer = NETKIT_PASS; + enum netkit_mode mode = NETKIT_L3; + unsigned char ifname_assign_type; + struct ifinfomsg *ifmp = NULL; + struct net_device *peer; + char ifname[IFNAMSIZ]; + struct netkit *nk; + struct net *net; + int err; + + if (data) { + if (data[IFLA_NETKIT_MODE]) { + attr = data[IFLA_NETKIT_MODE]; + mode = nla_get_u32(attr); + err = netkit_check_mode(mode, attr, extack); + if (err < 0) + return err; + } + if (data[IFLA_NETKIT_PEER_INFO]) { + attr = data[IFLA_NETKIT_PEER_INFO]; + ifmp = nla_data(attr); + err = rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack); + if (err < 0) + return err; + err = netkit_validate(peer_tb, NULL, extack); + if (err < 0) + return err; + tbp = peer_tb; + } + if (data[IFLA_NETKIT_POLICY]) { + attr = data[IFLA_NETKIT_POLICY]; + default_prim = nla_get_u32(attr); + err = netkit_check_policy(default_prim, attr, extack); + if (err < 0) + return err; + } + if (data[IFLA_NETKIT_PEER_POLICY]) { + attr = data[IFLA_NETKIT_PEER_POLICY]; + default_peer = nla_get_u32(attr); + err = netkit_check_policy(default_peer, attr, extack); + if (err < 0) + return err; + } + } + + if (ifmp && tbp[IFLA_IFNAME]) { + nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); + ifname_assign_type = NET_NAME_USER; + } else { + strscpy(ifname, "nk%d", IFNAMSIZ); + ifname_assign_type = NET_NAME_ENUM; + } + + net = rtnl_link_get_net(src_net, tbp); + if (IS_ERR(net)) + return PTR_ERR(net); + + peer = rtnl_create_link(net, ifname, ifname_assign_type, + &netkit_link_ops, tbp, extack); + if (IS_ERR(peer)) { + put_net(net); + return PTR_ERR(peer); + } + + netif_inherit_tso_max(peer, dev); + + if (mode == NETKIT_L2) + eth_hw_addr_random(peer); + if (ifmp && dev->ifindex) + peer->ifindex = ifmp->ifi_index; + + nk = netkit_priv(peer); + nk->primary = false; + nk->policy = default_peer; + nk->mode = mode; + bpf_mprog_bundle_init(&nk->bundle); + RCU_INIT_POINTER(nk->active, NULL); + RCU_INIT_POINTER(nk->peer, NULL); + + err = register_netdevice(peer); + put_net(net); + if (err < 0) + goto err_register_peer; + netif_carrier_off(peer); + if (mode == NETKIT_L2) + dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); + + err = rtnl_configure_link(peer, NULL, 0, NULL); + if (err < 0) + goto err_configure_peer; + + if (mode == NETKIT_L2) + eth_hw_addr_random(dev); + if (tb[IFLA_IFNAME]) + nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); + else + strscpy(dev->name, "nk%d", IFNAMSIZ); + + nk = netkit_priv(dev); + nk->primary = true; + nk->policy = default_prim; + nk->mode = mode; + bpf_mprog_bundle_init(&nk->bundle); + RCU_INIT_POINTER(nk->active, NULL); + RCU_INIT_POINTER(nk->peer, NULL); + + err = register_netdevice(dev); + if (err < 0) + goto err_configure_peer; + netif_carrier_off(dev); + if (mode == NETKIT_L2) + dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL); + + rcu_assign_pointer(netkit_priv(dev)->peer, peer); + rcu_assign_pointer(netkit_priv(peer)->peer, dev); + return 0; +err_configure_peer: + unregister_netdevice(peer); + return err; +err_register_peer: + free_netdev(peer); + return err; +} + +static struct bpf_mprog_entry *netkit_entry_fetch(struct net_device *dev, + bool bundle_fallback) +{ + struct netkit *nk = netkit_priv(dev); + struct bpf_mprog_entry *entry; + + ASSERT_RTNL(); + entry = rcu_dereference_rtnl(nk->active); + if (entry) + return entry; + if (bundle_fallback) + return &nk->bundle.a; + return NULL; +} + +static void netkit_entry_update(struct net_device *dev, + struct bpf_mprog_entry *entry) +{ + struct netkit *nk = netkit_priv(dev); + + ASSERT_RTNL(); + rcu_assign_pointer(nk->active, entry); +} + +static void netkit_entry_sync(void) +{ + synchronize_rcu(); +} + +static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 which) +{ + struct net_device *dev; + struct netkit *nk; + + ASSERT_RTNL(); + + switch (which) { + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: + break; + default: + return ERR_PTR(-EINVAL); + } + + dev = __dev_get_by_index(net, ifindex); + if (!dev) + return ERR_PTR(-ENODEV); + if (dev->netdev_ops != &netkit_netdev_ops) + return ERR_PTR(-ENXIO); + + nk = netkit_priv(dev); + if (!nk->primary) + return ERR_PTR(-EACCES); + if (which == BPF_NETKIT_PEER) { + dev = rcu_dereference_rtnl(nk->peer); + if (!dev) + return ERR_PTR(-ENODEV); + } + return dev; +} + +int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_mprog_entry *entry, *entry_new; + struct bpf_prog *replace_prog = NULL; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex, + attr->attach_type); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } + entry = netkit_entry_fetch(dev, true); + if (attr->attach_flags & BPF_F_REPLACE) { + replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, + prog->type); + if (IS_ERR(replace_prog)) { + ret = PTR_ERR(replace_prog); + replace_prog = NULL; + goto out; + } + } + ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog, + attr->attach_flags, attr->relative_fd, + attr->expected_revision); + if (!ret) { + if (entry != entry_new) { + netkit_entry_update(dev, entry_new); + netkit_entry_sync(); + } + bpf_mprog_commit(entry); + } +out: + if (replace_prog) + bpf_prog_put(replace_prog); + rtnl_unlock(); + return ret; +} + +int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex, + attr->attach_type); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } + entry = netkit_entry_fetch(dev, false); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags, + attr->relative_fd, attr->expected_revision); + if (!ret) { + if (!bpf_mprog_total(entry_new)) + entry_new = NULL; + netkit_entry_update(dev, entry_new); + netkit_entry_sync(); + bpf_mprog_commit(entry); + } +out: + rtnl_unlock(); + return ret; +} + +int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) +{ + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = netkit_dev_fetch(current->nsproxy->net_ns, + attr->query.target_ifindex, + attr->query.attach_type); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } + ret = bpf_mprog_query(attr, uattr, netkit_entry_fetch(dev, false)); +out: + rtnl_unlock(); + return ret; +} + +static struct netkit_link *netkit_link(const struct bpf_link *link) +{ + return container_of(link, struct netkit_link, link); +} + +static int netkit_link_prog_attach(struct bpf_link *link, u32 flags, + u32 id_or_fd, u64 revision) +{ + struct netkit_link *nkl = netkit_link(link); + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev = nkl->dev; + int ret; + + ASSERT_RTNL(); + entry = netkit_entry_fetch(dev, true); + ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags, + id_or_fd, revision); + if (!ret) { + if (entry != entry_new) { + netkit_entry_update(dev, entry_new); + netkit_entry_sync(); + } + bpf_mprog_commit(entry); + } + return ret; +} + +static void netkit_link_release(struct bpf_link *link) +{ + struct netkit_link *nkl = netkit_link(link); + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = nkl->dev; + if (!dev) + goto out; + entry = netkit_entry_fetch(dev, false); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0); + if (!ret) { + if (!bpf_mprog_total(entry_new)) + entry_new = NULL; + netkit_entry_update(dev, entry_new); + netkit_entry_sync(); + bpf_mprog_commit(entry); + nkl->dev = NULL; + } +out: + WARN_ON_ONCE(ret); + rtnl_unlock(); +} + +static int netkit_link_update(struct bpf_link *link, struct bpf_prog *nprog, + struct bpf_prog *oprog) +{ + struct netkit_link *nkl = netkit_link(link); + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = nkl->dev; + if (!dev) { + ret = -ENOLINK; + goto out; + } + if (oprog && link->prog != oprog) { + ret = -EPERM; + goto out; + } + oprog = link->prog; + if (oprog == nprog) { + bpf_prog_put(nprog); + goto out; + } + entry = netkit_entry_fetch(dev, false); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog, + BPF_F_REPLACE | BPF_F_ID, + link->prog->aux->id, 0); + if (!ret) { + WARN_ON_ONCE(entry != entry_new); + oprog = xchg(&link->prog, nprog); + bpf_prog_put(oprog); + bpf_mprog_commit(entry); + } +out: + rtnl_unlock(); + return ret; +} + +static void netkit_link_dealloc(struct bpf_link *link) +{ + kfree(netkit_link(link)); +} + +static void netkit_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) +{ + const struct netkit_link *nkl = netkit_link(link); + u32 ifindex = 0; + + rtnl_lock(); + if (nkl->dev) + ifindex = nkl->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); + seq_printf(seq, "attach_type:\t%u (%s)\n", + nkl->location, + nkl->location == BPF_NETKIT_PRIMARY ? "primary" : "peer"); +} + +static int netkit_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct netkit_link *nkl = netkit_link(link); + u32 ifindex = 0; + + rtnl_lock(); + if (nkl->dev) + ifindex = nkl->dev->ifindex; + rtnl_unlock(); + + info->netkit.ifindex = ifindex; + info->netkit.attach_type = nkl->location; + return 0; +} + +static int netkit_link_detach(struct bpf_link *link) +{ + netkit_link_release(link); + return 0; +} + +static const struct bpf_link_ops netkit_link_lops = { + .release = netkit_link_release, + .detach = netkit_link_detach, + .dealloc = netkit_link_dealloc, + .update_prog = netkit_link_update, + .show_fdinfo = netkit_link_fdinfo, + .fill_link_info = netkit_link_fill_info, +}; + +static int netkit_link_init(struct netkit_link *nkl, + struct bpf_link_primer *link_primer, + const union bpf_attr *attr, + struct net_device *dev, + struct bpf_prog *prog) +{ + bpf_link_init(&nkl->link, BPF_LINK_TYPE_NETKIT, + &netkit_link_lops, prog); + nkl->location = attr->link_create.attach_type; + nkl->dev = dev; + return bpf_link_prime(&nkl->link, link_primer); +} + +int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct netkit_link *nkl; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = netkit_dev_fetch(current->nsproxy->net_ns, + attr->link_create.target_ifindex, + attr->link_create.attach_type); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } + nkl = kzalloc(sizeof(*nkl), GFP_KERNEL_ACCOUNT); + if (!nkl) { + ret = -ENOMEM; + goto out; + } + ret = netkit_link_init(nkl, &link_primer, attr, dev, prog); + if (ret) { + kfree(nkl); + goto out; + } + ret = netkit_link_prog_attach(&nkl->link, + attr->link_create.flags, + attr->link_create.netkit.relative_fd, + attr->link_create.netkit.expected_revision); + if (ret) { + nkl->dev = NULL; + bpf_link_cleanup(&link_primer); + goto out; + } + ret = bpf_link_settle(&link_primer); +out: + rtnl_unlock(); + return ret; +} + +static void netkit_release_all(struct net_device *dev) +{ + struct bpf_mprog_entry *entry; + struct bpf_tuple tuple = {}; + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + + entry = netkit_entry_fetch(dev, false); + if (!entry) + return; + netkit_entry_update(dev, NULL); + netkit_entry_sync(); + bpf_mprog_foreach_tuple(entry, fp, cp, tuple) { + if (tuple.link) + netkit_link(tuple.link)->dev = NULL; + else + bpf_prog_put(tuple.prog); + } +} + +static void netkit_uninit(struct net_device *dev) +{ + netkit_release_all(dev); +} + +static void netkit_del_link(struct net_device *dev, struct list_head *head) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + + RCU_INIT_POINTER(nk->peer, NULL); + unregister_netdevice_queue(dev, head); + if (peer) { + nk = netkit_priv(peer); + RCU_INIT_POINTER(nk->peer, NULL); + unregister_netdevice_queue(peer, head); + } +} + +static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + enum netkit_action policy; + struct nlattr *attr; + int err; + + if (!nk->primary) { + NL_SET_ERR_MSG(extack, + "netkit link settings can be changed only through the primary device"); + return -EACCES; + } + + if (data[IFLA_NETKIT_MODE]) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_MODE], + "netkit link operating mode cannot be changed after device creation"); + return -EACCES; + } + + if (data[IFLA_NETKIT_POLICY]) { + attr = data[IFLA_NETKIT_POLICY]; + policy = nla_get_u32(attr); + err = netkit_check_policy(policy, attr, extack); + if (err) + return err; + WRITE_ONCE(nk->policy, policy); + } + + if (data[IFLA_NETKIT_PEER_POLICY]) { + err = -EOPNOTSUPP; + attr = data[IFLA_NETKIT_PEER_POLICY]; + policy = nla_get_u32(attr); + if (peer) + err = netkit_check_policy(policy, attr, extack); + if (err) + return err; + nk = netkit_priv(peer); + WRITE_ONCE(nk->policy, policy); + } + + return 0; +} + +static size_t netkit_get_size(const struct net_device *dev) +{ + return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */ + nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */ + 0; +} + +static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct netkit *nk = netkit_priv(dev); + struct net_device *peer = rtnl_dereference(nk->peer); + + if (nla_put_u8(skb, IFLA_NETKIT_PRIMARY, nk->primary)) + return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_POLICY, nk->policy)) + return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode)) + return -EMSGSIZE; + + if (peer) { + nk = netkit_priv(peer); + if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy)) + return -EMSGSIZE; + } + + return 0; +} + +static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { + [IFLA_NETKIT_PEER_INFO] = { .len = sizeof(struct ifinfomsg) }, + [IFLA_NETKIT_POLICY] = { .type = NLA_U32 }, + [IFLA_NETKIT_MODE] = { .type = NLA_U32 }, + [IFLA_NETKIT_PEER_POLICY] = { .type = NLA_U32 }, + [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, + .reject_message = "Primary attribute is read-only" }, +}; + +static struct rtnl_link_ops netkit_link_ops = { + .kind = DRV_NAME, + .priv_size = sizeof(struct netkit), + .setup = netkit_setup, + .newlink = netkit_new_link, + .dellink = netkit_del_link, + .changelink = netkit_change_link, + .get_link_net = netkit_get_link_net, + .get_size = netkit_get_size, + .fill_info = netkit_fill_info, + .policy = netkit_policy, + .validate = netkit_validate, + .maxtype = IFLA_NETKIT_MAX, +}; + +static __init int netkit_init(void) +{ + BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT || + (int)NETKIT_PASS != (int)TCX_PASS || + (int)NETKIT_DROP != (int)TCX_DROP || + (int)NETKIT_REDIRECT != (int)TCX_REDIRECT); + + return rtnl_link_register(&netkit_link_ops); +} + +static __exit void netkit_exit(void) +{ + rtnl_link_unregister(&netkit_link_ops); +} + +module_init(netkit_init); +module_exit(netkit_exit); + +MODULE_DESCRIPTION("BPF-programmable network device"); +MODULE_AUTHOR("Daniel Borkmann "); +MODULE_AUTHOR("Nikolay Aleksandrov "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK(DRV_NAME); diff --git a/include/net/netkit.h b/include/net/netkit.h new file mode 100644 index 000000000000..0ba2e6b847ca --- /dev/null +++ b/include/net/netkit.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2023 Isovalent */ +#ifndef __NET_NETKIT_H +#define __NET_NETKIT_H + +#include + +#ifdef CONFIG_NETKIT +int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); +int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +#else +static inline int netkit_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_prog_detach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int netkit_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} +#endif /* CONFIG_NETKIT */ +#endif /* __NET_NETKIT_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7ba61b75bc0e..0f6cdf52b1da 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1052,6 +1052,8 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_RECVMSG, BPF_CGROUP_UNIX_GETPEERNAME, BPF_CGROUP_UNIX_GETSOCKNAME, + BPF_NETKIT_PRIMARY, + BPF_NETKIT_PEER, __MAX_BPF_ATTACH_TYPE }; @@ -1071,6 +1073,7 @@ enum bpf_link_type { BPF_LINK_TYPE_NETFILTER = 10, BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, + BPF_LINK_TYPE_NETKIT = 13, MAX_BPF_LINK_TYPE, }; @@ -1656,6 +1659,13 @@ union bpf_attr { __u32 flags; __u32 pid; } uprobe_multi; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } netkit; }; } link_create; @@ -6576,6 +6586,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } tcx; + struct { + __u32 ifindex; + __u32 attach_type; + } netkit; }; } __attribute__((aligned(8))); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index fac351a93aed..a0aa05a28cf2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -756,6 +756,30 @@ struct tunnel_msg { __u32 ifindex; }; +/* netkit section */ +enum netkit_action { + NETKIT_NEXT = -1, + NETKIT_PASS = 0, + NETKIT_DROP = 2, + NETKIT_REDIRECT = 7, +}; + +enum netkit_mode { + NETKIT_L2, + NETKIT_L3, +}; + +enum { + IFLA_NETKIT_UNSPEC, + IFLA_NETKIT_PEER_INFO, + IFLA_NETKIT_PRIMARY, + IFLA_NETKIT_POLICY, + IFLA_NETKIT_PEER_POLICY, + IFLA_NETKIT_MODE, + __IFLA_NETKIT_MAX, +}; +#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) + /* VXLAN section */ /* include statistics in the dump */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a9b2cb500bf7..0ed286b8a0f0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,8 +35,9 @@ #include #include #include -#include +#include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -3730,6 +3731,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_LSM; case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: return BPF_PROG_TYPE_SCHED_CLS; default: return BPF_PROG_TYPE_UNSPEC; @@ -3781,7 +3784,9 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, return 0; case BPF_PROG_TYPE_SCHED_CLS: if (attach_type != BPF_TCX_INGRESS && - attach_type != BPF_TCX_EGRESS) + attach_type != BPF_TCX_EGRESS && + attach_type != BPF_NETKIT_PRIMARY && + attach_type != BPF_NETKIT_PEER) return -EINVAL; return 0; default: @@ -3864,7 +3869,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_prog_attach(attr, prog); + if (attr->attach_type == BPF_TCX_INGRESS || + attr->attach_type == BPF_TCX_EGRESS) + ret = tcx_prog_attach(attr, prog); + else + ret = netkit_prog_attach(attr, prog); break; default: ret = -EINVAL; @@ -3925,7 +3934,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) ret = cgroup_bpf_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_prog_detach(attr, prog); + if (attr->attach_type == BPF_TCX_INGRESS || + attr->attach_type == BPF_TCX_EGRESS) + ret = tcx_prog_detach(attr, prog); + else + ret = netkit_prog_detach(attr, prog); break; default: ret = -EINVAL; @@ -3992,6 +4005,9 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: return tcx_prog_query(attr, uattr); + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: + return netkit_prog_query(attr, uattr); default: return -EINVAL; } @@ -4973,7 +4989,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_xdp_link_attach(attr, prog); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_link_attach(attr, prog); + if (attr->link_create.attach_type == BPF_TCX_INGRESS || + attr->link_create.attach_type == BPF_TCX_EGRESS) + ret = tcx_link_attach(attr, prog); + else + ret = netkit_link_attach(attr, prog); break; case BPF_PROG_TYPE_NETFILTER: ret = bpf_nf_link_attach(attr, prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7ba61b75bc0e..0f6cdf52b1da 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1052,6 +1052,8 @@ enum bpf_attach_type { BPF_CGROUP_UNIX_RECVMSG, BPF_CGROUP_UNIX_GETPEERNAME, BPF_CGROUP_UNIX_GETSOCKNAME, + BPF_NETKIT_PRIMARY, + BPF_NETKIT_PEER, __MAX_BPF_ATTACH_TYPE }; @@ -1071,6 +1073,7 @@ enum bpf_link_type { BPF_LINK_TYPE_NETFILTER = 10, BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, + BPF_LINK_TYPE_NETKIT = 13, MAX_BPF_LINK_TYPE, }; @@ -1656,6 +1659,13 @@ union bpf_attr { __u32 flags; __u32 pid; } uprobe_multi; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } netkit; }; } link_create; @@ -6576,6 +6586,10 @@ struct bpf_link_info { __u32 ifindex; __u32 attach_type; } tcx; + struct { + __u32 ifindex; + __u32 attach_type; + } netkit; }; } __attribute__((aligned(8))); -- cgit v1.2.3 From 5c1b994de4be8a27afa3281be2ff58b38e8bc50c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Oct 2023 23:48:59 +0200 Subject: tools: Sync if_link uapi header Sync if_link uapi header to the latest version as we need the refresher in tooling for netkit device. Given it's been a while since the last sync and the diff is fairly big, it has been done as its own commit. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20231024214904.29825-3-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- tools/include/uapi/linux/if_link.h | 141 +++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) (limited to 'tools/include/uapi/linux') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 39e659c83cfd..a0aa05a28cf2 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -211,6 +211,9 @@ struct rtnl_link_stats { * @rx_nohandler: Number of packets received on the interface * but dropped by the networking stack because the device is * not designated to receive packets (e.g. backup link in a bond). + * + * @rx_otherhost_dropped: Number of packets dropped due to mismatch + * in destination MAC address. */ struct rtnl_link_stats64 { __u64 rx_packets; @@ -243,6 +246,23 @@ struct rtnl_link_stats64 { __u64 rx_compressed; __u64 tx_compressed; __u64 rx_nohandler; + + __u64 rx_otherhost_dropped; +}; + +/* Subset of link stats useful for in-HW collection. Meaning of the fields is as + * for struct rtnl_link_stats64. + */ +struct rtnl_hw_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; }; /* The struct should be in sync with struct ifmap */ @@ -350,7 +370,13 @@ enum { IFLA_GRO_MAX_SIZE, IFLA_TSO_MAX_SIZE, IFLA_TSO_MAX_SEGS, + IFLA_ALLMULTI, /* Allmulti count: > 0 means acts ALLMULTI */ + + IFLA_DEVLINK_PORT, + IFLA_GSO_IPV4_MAX_SIZE, + IFLA_GRO_IPV4_MAX_SIZE, + IFLA_DPLL_PIN, __IFLA_MAX }; @@ -539,6 +565,12 @@ enum { IFLA_BRPORT_MRP_IN_OPEN, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + IFLA_BRPORT_LOCKED, + IFLA_BRPORT_MAB, + IFLA_BRPORT_MCAST_N_GROUPS, + IFLA_BRPORT_MCAST_MAX_GROUPS, + IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, + IFLA_BRPORT_BACKUP_NHID, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) @@ -716,7 +748,79 @@ enum ipvlan_mode { #define IPVLAN_F_PRIVATE 0x01 #define IPVLAN_F_VEPA 0x02 +/* Tunnel RTM header */ +struct tunnel_msg { + __u8 family; + __u8 flags; + __u16 reserved2; + __u32 ifindex; +}; + +/* netkit section */ +enum netkit_action { + NETKIT_NEXT = -1, + NETKIT_PASS = 0, + NETKIT_DROP = 2, + NETKIT_REDIRECT = 7, +}; + +enum netkit_mode { + NETKIT_L2, + NETKIT_L3, +}; + +enum { + IFLA_NETKIT_UNSPEC, + IFLA_NETKIT_PEER_INFO, + IFLA_NETKIT_PRIMARY, + IFLA_NETKIT_POLICY, + IFLA_NETKIT_PEER_POLICY, + IFLA_NETKIT_MODE, + __IFLA_NETKIT_MAX, +}; +#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) + /* VXLAN section */ + +/* include statistics in the dump */ +#define TUNNEL_MSG_FLAG_STATS 0x01 + +#define TUNNEL_MSG_VALID_USER_FLAGS TUNNEL_MSG_FLAG_STATS + +/* Embedded inside VXLAN_VNIFILTER_ENTRY_STATS */ +enum { + VNIFILTER_ENTRY_STATS_UNSPEC, + VNIFILTER_ENTRY_STATS_RX_BYTES, + VNIFILTER_ENTRY_STATS_RX_PKTS, + VNIFILTER_ENTRY_STATS_RX_DROPS, + VNIFILTER_ENTRY_STATS_RX_ERRORS, + VNIFILTER_ENTRY_STATS_TX_BYTES, + VNIFILTER_ENTRY_STATS_TX_PKTS, + VNIFILTER_ENTRY_STATS_TX_DROPS, + VNIFILTER_ENTRY_STATS_TX_ERRORS, + VNIFILTER_ENTRY_STATS_PAD, + __VNIFILTER_ENTRY_STATS_MAX +}; +#define VNIFILTER_ENTRY_STATS_MAX (__VNIFILTER_ENTRY_STATS_MAX - 1) + +enum { + VXLAN_VNIFILTER_ENTRY_UNSPEC, + VXLAN_VNIFILTER_ENTRY_START, + VXLAN_VNIFILTER_ENTRY_END, + VXLAN_VNIFILTER_ENTRY_GROUP, + VXLAN_VNIFILTER_ENTRY_GROUP6, + VXLAN_VNIFILTER_ENTRY_STATS, + __VXLAN_VNIFILTER_ENTRY_MAX +}; +#define VXLAN_VNIFILTER_ENTRY_MAX (__VXLAN_VNIFILTER_ENTRY_MAX - 1) + +enum { + VXLAN_VNIFILTER_UNSPEC, + VXLAN_VNIFILTER_ENTRY, + __VXLAN_VNIFILTER_MAX +}; +#define VXLAN_VNIFILTER_MAX (__VXLAN_VNIFILTER_MAX - 1) + enum { IFLA_VXLAN_UNSPEC, IFLA_VXLAN_ID, @@ -748,6 +852,8 @@ enum { IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, + IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ + IFLA_VXLAN_LOCALBYPASS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) @@ -781,6 +887,7 @@ enum { IFLA_GENEVE_LABEL, IFLA_GENEVE_TTL_INHERIT, IFLA_GENEVE_DF, + IFLA_GENEVE_INNER_PROTO_INHERIT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) @@ -826,6 +933,8 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, + IFLA_GTP_CREATE_SOCKETS, + IFLA_GTP_RESTART_COUNT, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) @@ -1162,6 +1271,17 @@ enum { #define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) +enum { + IFLA_STATS_GETSET_UNSPEC, + IFLA_STATS_GET_FILTERS, /* Nest of IFLA_STATS_LINK_xxx, each a u32 with + * a filter mask for the corresponding group. + */ + IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS, /* 0 or 1 as u8 */ + __IFLA_STATS_GETSET_MAX, +}; + +#define IFLA_STATS_GETSET_MAX (__IFLA_STATS_GETSET_MAX - 1) + /* These are embedded into IFLA_STATS_LINK_XSTATS: * [IFLA_STATS_LINK_XSTATS] * -> [LINK_XSTATS_TYPE_xxx] @@ -1179,10 +1299,21 @@ enum { enum { IFLA_OFFLOAD_XSTATS_UNSPEC, IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO, /* HW stats info. A nest */ + IFLA_OFFLOAD_XSTATS_L3_STATS, /* struct rtnl_hw_stats64 */ __IFLA_OFFLOAD_XSTATS_MAX }; #define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) +enum { + IFLA_OFFLOAD_XSTATS_HW_S_INFO_UNSPEC, + IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, /* u8 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, /* u8 */ + __IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX, +}; +#define IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX \ + (__IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX - 1) + /* XDP section */ #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) @@ -1281,4 +1412,14 @@ enum { #define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) +/* DSA section */ + +enum { + IFLA_DSA_UNSPEC, + IFLA_DSA_MASTER, + __IFLA_DSA_MAX, +}; + +#define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ -- cgit v1.2.3