From 0e33661de493db325435d565a4a722120ae4cbf3 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 23 Nov 2016 16:52:25 +0100 Subject: bpf: add new prog type for cgroup socket filtering This program type is similar to BPF_PROG_TYPE_SOCKET_FILTER, except that it does not allow BPF_LD_[ABS|IND] instructions and hooks up the bpf_skb_load_bytes() helper. Programs of this type will be attached to cgroups for network filtering and accounting. Signed-off-by: Daniel Mack Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 9 +++++++++ net/core/filter.c | 23 +++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7d9b2832c280..5ae679fac993 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -98,8 +98,17 @@ enum bpf_prog_type { BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, + BPF_PROG_TYPE_CGROUP_SKB, }; +enum bpf_attach_type { + BPF_CGROUP_INET_INGRESS, + BPF_CGROUP_INET_EGRESS, + __MAX_BPF_ATTACH_TYPE +}; + +#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ diff --git a/net/core/filter.c b/net/core/filter.c index dece94fef005..2de302d68038 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2630,6 +2630,17 @@ xdp_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + default: + return sk_filter_func_proto(func_id); + } +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2992,6 +3003,12 @@ static const struct bpf_verifier_ops xdp_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; +static const struct bpf_verifier_ops cg_skb_ops = { + .get_func_proto = cg_skb_func_proto, + .is_valid_access = 
sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3012,12 +3029,18 @@ static struct bpf_prog_type_list xdp_type __read_mostly = { .type = BPF_PROG_TYPE_XDP, }; +static struct bpf_prog_type_list cg_skb_type __read_mostly = { + .ops = &cg_skb_ops, + .type = BPF_PROG_TYPE_CGROUP_SKB, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); + bpf_register_prog_type(&cg_skb_type); return 0; } -- cgit v1.2.3 From 3007098494bec614fb55dee7bc0410bb7db5ad18 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 23 Nov 2016 16:52:26 +0100 Subject: cgroup: add support for eBPF programs This patch adds two sets of eBPF program pointers to struct cgroup. One for such that are directly pinned to a cgroup, and one for such that are effective for it. To illustrate the logic behind that, assume the following example cgroup hierarchy. A - B - C \ D - E If only B has a program attached, it will be effective for B, C, D and E. If D then attaches a program itself, that will be effective for both D and E, and the program in B will only affect B and C. Only one program of a given type is effective for a cgroup. Attaching and detaching programs will be done through the bpf(2) syscall. For now, ingress and egress inet socket filtering are the only supported use-cases. Signed-off-by: Daniel Mack Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf-cgroup.h | 79 +++++++++++++++++++++ include/linux/cgroup-defs.h | 4 ++ init/Kconfig | 12 ++++ kernel/bpf/Makefile | 1 + kernel/bpf/cgroup.c | 167 ++++++++++++++++++++++++++++++++++++++++++++ kernel/cgroup.c | 18 +++++ 6 files changed, 281 insertions(+) create mode 100644 include/linux/bpf-cgroup.h create mode 100644 kernel/bpf/cgroup.c diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h new file mode 100644 index 000000000000..ec80d0c0953e --- /dev/null +++ b/include/linux/bpf-cgroup.h @@ -0,0 +1,79 @@ +#ifndef _BPF_CGROUP_H +#define _BPF_CGROUP_H + +#include +#include +#include + +struct sock; +struct cgroup; +struct sk_buff; + +#ifdef CONFIG_CGROUP_BPF + +extern struct static_key_false cgroup_bpf_enabled_key; +#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) + +struct cgroup_bpf { + /* + * Store two sets of bpf_prog pointers, one for programs that are + * pinned directly to this cgroup, and one for those that are effective + * when this cgroup is accessed. + */ + struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; + struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE]; +}; + +void cgroup_bpf_put(struct cgroup *cgrp); +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); + +void __cgroup_bpf_update(struct cgroup *cgrp, + struct cgroup *parent, + struct bpf_prog *prog, + enum bpf_attach_type type); + +/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type); + +int __cgroup_bpf_run_filter(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type); + +/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. 
*/ +#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter(sk, skb, \ + BPF_CGROUP_INET_INGRESS); \ + \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + typeof(sk) __sk = sk_to_full_sk(sk); \ + if (sk_fullsock(__sk)) \ + __ret = __cgroup_bpf_run_filter(__sk, skb, \ + BPF_CGROUP_INET_EGRESS); \ + } \ + __ret; \ +}) + +#else + +struct cgroup_bpf {}; +static inline void cgroup_bpf_put(struct cgroup *cgrp) {} +static inline void cgroup_bpf_inherit(struct cgroup *cgrp, + struct cgroup *parent) {} + +#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) + +#endif /* CONFIG_CGROUP_BPF */ + +#endif /* _BPF_CGROUP_H */ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5b17de62c962..861b4677fc5b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -300,6 +301,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + /* used to store eBPF programs */ + struct cgroup_bpf bpf; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/init/Kconfig b/init/Kconfig index 34407f15e6d3..405120b5f13e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1154,6 +1154,18 @@ config CGROUP_PERF Say N if unsure. +config CGROUP_BPF + bool "Support for eBPF programs attached to cgroups" + depends on BPF_SYSCALL && SOCK_CGROUP_DATA + help + Allow attaching eBPF programs to a cgroup using the bpf(2) + syscall command BPF_PROG_ATTACH. + + In which context these programs are accessed depends on the type + of attachment. For instance, programs that are attached using + BPF_CGROUP_INET_INGRESS will be executed on the ingress path of + inet sockets. 
+ config CGROUP_DEBUG bool "Example controller" default n diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c4d89d6e2058..1276474ac3cd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +obj-$(CONFIG_CGROUP_BPF) += cgroup.o diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c new file mode 100644 index 000000000000..a0ab43f264b0 --- /dev/null +++ b/kernel/bpf/cgroup.c @@ -0,0 +1,167 @@ +/* + * Functions to manage eBPF programs attached to cgroups + * + * Copyright (c) 2016 Daniel Mack + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +EXPORT_SYMBOL(cgroup_bpf_enabled_key); + +/** + * cgroup_bpf_put() - put references of all bpf programs + * @cgrp: the cgroup to modify + */ +void cgroup_bpf_put(struct cgroup *cgrp) +{ + unsigned int type; + + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { + struct bpf_prog *prog = cgrp->bpf.prog[type]; + + if (prog) { + bpf_prog_put(prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } + } +} + +/** + * cgroup_bpf_inherit() - inherit effective programs from parent + * @cgrp: the cgroup to modify + * @parent: the parent to inherit from + */ +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +{ + unsigned int type; + + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { + struct bpf_prog *e; + + e = rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + rcu_assign_pointer(cgrp->bpf.effective[type], e); + } +} + +/** + * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + 
* propagate the change to descendants + * @cgrp: The cgroup whose descendants to traverse + * @parent: The parent of @cgrp, or %NULL if @cgrp is the root + * @prog: A new program to pin + * @type: Type of pinning operation (ingress/egress) + * + * Each cgroup has a set of two pointers for bpf programs; one for eBPF + * programs it owns, and one which is effective for execution. + * + * If @prog is not %NULL, this function attaches a new program to the cgroup and + * releases the one that is currently attached, if any. @prog is then made + * the effective program of type @type in that cgroup. + * + * If @prog is %NULL, the currently attached program of type @type is released, + * and the effective program of the parent cgroup (if any) is inherited to + * @cgrp. + * + * Then, the descendants of @cgrp are walked and the effective program for + * each of them is set to the effective program of @cgrp unless the + * descendant has its own program attached, in which case the subbranch is + * skipped. This ensures that delegated subcgroups with own programs are left + * untouched. + * + * Must be called with cgroup_mutex held. + */ +void __cgroup_bpf_update(struct cgroup *cgrp, + struct cgroup *parent, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct bpf_prog *old_prog, *effective; + struct cgroup_subsys_state *pos; + + old_prog = xchg(cgrp->bpf.prog + type, prog); + + effective = (!prog && parent) ? 
+ rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)) : + prog; + + css_for_each_descendant_pre(pos, &cgrp->self) { + struct cgroup *desc = container_of(pos, struct cgroup, self); + + /* skip the subtree if the descendant has its own program */ + if (desc->bpf.prog[type] && desc != cgrp) + pos = css_rightmost_descendant(pos); + else + rcu_assign_pointer(desc->bpf.effective[type], + effective); + } + + if (prog) + static_branch_inc(&cgroup_bpf_enabled_key); + + if (old_prog) { + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } +} + +/** + * __cgroup_bpf_run_filter() - Run a program for packet filtering + * @sk: The socket sending or receiving traffic + * @skb: The skb that is being sent or received + * @type: The type of program to be executed + * + * If no socket is passed, or the socket is not of type INET or INET6, + * this function does nothing and returns 0. + * + * The program type passed in via @type must be suitable for network + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret = 0; + + if (!sk || !sk_fullsock(sk)) + return 0; + + if (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) { + unsigned int offset = skb->data - skb_network_header(skb); + + __skb_push(skb, offset); + ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 
0 : -EPERM; + __skb_pull(skb, offset); + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 85bc9beb046d..2ee9ec3051b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + + cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); + if (parent) + cgroup_bpf_inherit(cgrp, parent); + cgroup_propagate_control(cgrp); /* @cgrp doesn't have dir yet so the following will only create csses */ @@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void) } subsys_initcall(cgroup_namespaces_init); +#ifdef CONFIG_CGROUP_BPF +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + mutex_lock(&cgroup_mutex); + __cgroup_bpf_update(cgrp, parent, prog, type); + mutex_unlock(&cgroup_mutex); +} +#endif /* CONFIG_CGROUP_BPF */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) -- cgit v1.2.3 From f4324551489e8781d838f941b7aee4208e52e8bf Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 23 Nov 2016 16:52:27 +0100 Subject: bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands Extend the bpf(2) syscall by two new commands, BPF_PROG_ATTACH and BPF_PROG_DETACH which allow attaching and detaching eBPF programs to a target. On the API level, the target could be anything that has an fd in userspace, hence the name of the field in union bpf_attr is called 'target_fd'. 
When called with BPF_CGROUP_INET_{E,IN}GRESS, the target is expected to be a valid file descriptor of a cgroup v2 directory which has the bpf controller enabled. These are the only use-cases implemented by this patch at this point, but more can be added. If a program of the given type already exists in the given cgroup, the program is swapped atomically, so userspace does not have to drop an existing program first before installing a new one, which would otherwise leave a gap in which no program is attached. For more information on the propagation logic to subcgroups, please refer to the bpf cgroup controller implementation. The API is guarded by CAP_NET_ADMIN. Signed-off-by: Daniel Mack Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 +++++ kernel/bpf/syscall.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5ae679fac993..1370a9d1456f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -73,6 +73,8 @@ enum bpf_cmd { BPF_PROG_LOAD, BPF_OBJ_PIN, BPF_OBJ_GET, + BPF_PROG_ATTACH, + BPF_PROG_DETACH, }; enum bpf_map_type { @@ -159,6 +161,12 @@ union bpf_attr { __aligned_u64 pathname; __u32 bpf_fd; }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + __u32 target_fd; /* container object to attach to */ + __u32 attach_bpf_fd; /* eBPF program to attach */ + __u32 attach_type; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eb15498b8d55..1090d16a31c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -835,6 +835,77 @@ static int bpf_obj_get(const union bpf_attr *attr) return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); } +#ifdef CONFIG_CGROUP_BPF + +#define BPF_PROG_ATTACH_LAST_FIELD attach_type + +static int bpf_prog_attach(const union bpf_attr *attr) +{ + 
struct bpf_prog *prog; + struct cgroup *cgrp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_ATTACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_CGROUP_SKB); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) { + bpf_prog_put(prog); + return PTR_ERR(cgrp); + } + + cgroup_bpf_update(cgrp, prog, attr->attach_type); + cgroup_put(cgrp); + break; + + default: + return -EINVAL; + } + + return 0; +} + +#define BPF_PROG_DETACH_LAST_FIELD attach_type + +static int bpf_prog_detach(const union bpf_attr *attr) +{ + struct cgroup *cgrp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_DETACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + cgroup_bpf_update(cgrp, NULL, attr->attach_type); + cgroup_put(cgrp); + break; + + default: + return -EINVAL; + } + + return 0; +} +#endif /* CONFIG_CGROUP_BPF */ + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -901,6 +972,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; + +#ifdef CONFIG_CGROUP_BPF + case BPF_PROG_ATTACH: + err = bpf_prog_attach(&attr); + break; + case BPF_PROG_DETACH: + err = bpf_prog_detach(&attr); + break; +#endif + default: err = -EINVAL; break; -- cgit v1.2.3 From c11cd3a6ec3a817c6b71b00c559e25d855f7e5b4 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 23 Nov 2016 16:52:28 +0100 Subject: net: filter: run cgroup eBPF ingress programs If the cgroup associated with the receiving socket has an eBPF programs installed, run them from sk_filter_trim_cap(). 
eBPF programs used in this context are expected to either return 1 to let the packet pass, or != 1 to drop them. The programs have access to the skb through bpf_skb_load_bytes(), and the payload starts at the network headers (L3). Note that cgroup_bpf_run_filter() is stubbed out as static inline nop for !CONFIG_CGROUP_BPF, and is otherwise guarded by a static key if the feature is unused. Signed-off-by: Daniel Mack Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 2de302d68038..ea315af56511 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -78,6 +78,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; + err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); + if (err) + return err; + err = security_sock_rcv_skb(sk, skb); if (err) return err; -- cgit v1.2.3 From 33b486793cb31311f3a91ae4fe4be5926e7677b0 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 23 Nov 2016 16:52:29 +0100 Subject: net: ipv4, ipv6: run cgroup eBPF egress programs If the cgroup associated with the receiving socket has an eBPF programs installed, run them from ip_output(), ip6_output() and ip_mc_output(). From mentioned functions we have two socket contexts as per 7026b1ddb6b8 ("netfilter: Pass socket pointer down through okfn()."). We explicitly need to use sk instead of skb->sk here, since otherwise the same program would run multiple times on egress when encap devices are involved, which is not desired in our case. eBPF programs used in this context are expected to either return 1 to let the packet pass, or != 1 to drop them. The programs have access to the skb through bpf_skb_load_bytes(), and the payload starts at the network headers (L3). 
Note that cgroup_bpf_run_filter() is stubbed out as static inline nop for !CONFIG_CGROUP_BPF, and is otherwise guarded by a static key if the feature is unused. Signed-off-by: Daniel Mack Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 26 ++++++++++++++++++++++++-- net/ipv6/ip6_output.c | 9 +++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 358f2c82b030..9af2b7853be4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -285,6 +286,13 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -303,6 +311,20 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } +static int ip_mc_finish_output(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } + + return dev_loopback_xmit(net, sk, skb); +} + int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); @@ -340,7 +362,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, - dev_loopback_xmit); + ip_mc_finish_output); } /* Multicasts with ttl 0 must not go beyond the host */ @@ -356,7 +378,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, 
newskb->dev, - dev_loopback_xmit); + ip_mc_finish_output); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 312cbd0e5038..70d0de404197 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -39,6 +39,7 @@ #include #include +#include #include #include @@ -131,6 +132,14 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + if (ret) { + kfree_skb(skb); + return ret; + } + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) -- cgit v1.2.3 From d8c5b17f2bc0de09fbbfa14d90e8168163a579e7 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 23 Nov 2016 16:52:30 +0100 Subject: samples: bpf: add userspace example for attaching eBPF programs to cgroups Add a simple userpace program to demonstrate the new API to attach eBPF programs to cgroups. This is what it does: * Create arraymap in kernel with 4 byte keys and 8 byte values * Load eBPF program The eBPF program accesses the map passed in to store two pieces of information. The number of invocations of the program, which maps to the number of packets received, is stored to key 0. Key 1 is incremented on each iteration by the number of bytes stored in the skb. * Detach any eBPF program previously attached to the cgroup * Attach the new program to the cgroup using BPF_PROG_ATTACH * Once a second, read map[0] and map[1] to see how many bytes and packets were seen on any socket of tasks in the given cgroup. The program takes a cgroup path as 1st argument, and either "ingress" or "egress" as 2nd. Optionally, "drop" can be passed as 3rd argument, which will make the generated eBPF program return 0 instead of 1, so the kernel will drop the packet. 
libbpf gained two new wrappers for the new syscall commands. Signed-off-by: Daniel Mack Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/Makefile | 2 + samples/bpf/libbpf.c | 21 ++++++ samples/bpf/libbpf.h | 3 + samples/bpf/test_cgrp2_attach.c | 147 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 173 insertions(+) create mode 100644 samples/bpf/test_cgrp2_attach.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index f394ac616ed8..fb17206ddb57 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -22,6 +22,7 @@ hostprogs-y += spintest hostprogs-y += map_perf_test hostprogs-y += test_overhead hostprogs-y += test_cgrp2_array_pin +hostprogs-y += test_cgrp2_attach hostprogs-y += xdp1 hostprogs-y += xdp2 hostprogs-y += test_current_task_under_cgroup @@ -49,6 +50,7 @@ spintest-objs := bpf_load.o libbpf.o spintest_user.o map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o +test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o xdp1-objs := bpf_load.o libbpf.o xdp1_user.o # reuse xdp1 source intentionally xdp2-objs := bpf_load.o libbpf.o xdp1_user.o diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c index 9969e35550c3..9ce707bf02a7 100644 --- a/samples/bpf/libbpf.c +++ b/samples/bpf/libbpf.c @@ -104,6 +104,27 @@ int bpf_prog_load(enum bpf_prog_type prog_type, return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); } +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type) +{ + union bpf_attr attr = { + .target_fd = target_fd, + .attach_bpf_fd = prog_fd, + .attach_type = type, + }; + + return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr)); +} + +int bpf_prog_detach(int target_fd, enum bpf_attach_type type) +{ + union bpf_attr attr = { + .target_fd = target_fd, + .attach_type = type, + }; + + return syscall(__NR_bpf, BPF_PROG_DETACH, &attr, 
sizeof(attr)); +} + int bpf_obj_pin(int fd, const char *pathname) { union bpf_attr attr = { diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h index de96a935068d..94a901d86fc2 100644 --- a/samples/bpf/libbpf.h +++ b/samples/bpf/libbpf.h @@ -15,6 +15,9 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_len, const char *license, int kern_version); +int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type); +int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); + int bpf_obj_pin(int fd, const char *pathname); int bpf_obj_get(const char *pathname); diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c new file mode 100644 index 000000000000..63ef2083f766 --- /dev/null +++ b/samples/bpf/test_cgrp2_attach.c @@ -0,0 +1,147 @@ +/* eBPF example program: + * + * - Creates arraymap in kernel with 4 bytes keys and 8 byte values + * + * - Loads eBPF program + * + * The eBPF program accesses the map passed in to store two pieces of + * information. The number of invocations of the program, which maps + * to the number of packets received, is stored to key 0. Key 1 is + * incremented on each iteration by the number of bytes stored in + * the skb. + * + * - Detaches any eBPF program previously attached to the cgroup + * + * - Attaches the new program to a cgroup using BPF_PROG_ATTACH + * + * - Every second, reads map[0] and map[1] to see how many bytes and + * packets were seen on any socket of tasks in the given cgroup. 
+ */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "libbpf.h" + +enum { + MAP_KEY_PACKETS, + MAP_KEY_BYTES, +}; + +static int prog_load(int map_fd, int verdict) +{ + struct bpf_insn prog[] = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save r6 so it's not clobbered by BPF_CALL */ + + /* Count packets */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* load map fd to r1 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Count bytes */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + + return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SKB, + prog, sizeof(prog), "GPL", 0); +} + +static int usage(const char *argv0) +{ + printf("Usage: %s [drop]\n", argv0); + return EXIT_FAILURE; +} + +int main(int argc, char **argv) +{ + int cg_fd, map_fd, prog_fd, key, ret; + long long pkt_cnt, byte_cnt; + enum bpf_attach_type type; + int verdict = 1; + + if (argc < 3) + return usage(argv[0]); + 
+ if (strcmp(argv[2], "ingress") == 0) + type = BPF_CGROUP_INET_INGRESS; + else if (strcmp(argv[2], "egress") == 0) + type = BPF_CGROUP_INET_EGRESS; + else + return usage(argv[0]); + + if (argc > 3 && strcmp(argv[3], "drop") == 0) + verdict = 0; + + cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); + if (cg_fd < 0) { + printf("Failed to open cgroup path: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, + sizeof(key), sizeof(byte_cnt), + 256, 0); + if (map_fd < 0) { + printf("Failed to create map: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + prog_fd = prog_load(map_fd, verdict); + printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); + + if (prog_fd < 0) { + printf("Failed to load prog: '%s'\n", strerror(errno)); + return EXIT_FAILURE; + } + + ret = bpf_prog_detach(cg_fd, type); + printf("bpf_prog_detach() returned '%s' (%d)\n", strerror(errno), errno); + + ret = bpf_prog_attach(prog_fd, cg_fd, type); + if (ret < 0) { + printf("Failed to attach prog to cgroup: '%s'\n", + strerror(errno)); + return EXIT_FAILURE; + } + + while (1) { + key = MAP_KEY_PACKETS; + assert(bpf_lookup_elem(map_fd, &key, &pkt_cnt) == 0); + + key = MAP_KEY_BYTES; + assert(bpf_lookup_elem(map_fd, &key, &byte_cnt) == 0); + + printf("cgroup received %lld packets, %lld bytes\n", + pkt_cnt, byte_cnt); + sleep(1); + } + + return EXIT_SUCCESS; +} -- cgit v1.2.3