summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorStanislav Fomichev <sdf@google.com>2019-06-27 23:38:47 +0300
committerAlexei Starovoitov <ast@kernel.org>2019-06-28 01:25:16 +0300
commit0d01da6afc5402f60325c5da31b22f7d56689b49 (patch)
treeafb8dd2975f8aa50d3577cb42310eb298d6491ac /net
parent3b1c667e47e4066cfe61610825ad50bc6b4a57e1 (diff)
downloadlinux-0d01da6afc5402f60325c5da31b22f7d56689b49.tar.xz
bpf: implement getsockopt and setsockopt hooks
Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks. BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before passing them down to the kernel or bypass kernel completely. BPF_CGROUP_GETSOCKOPT can can inspect/modify getsockopt arguments that kernel returns. Both hooks reuse existing PTR_TO_PACKET{,_END} infrastructure. The buffer memory is pre-allocated (because I don't think there is a precedent for working with __user memory from bpf). This might be slow to do for each {s,g}etsockopt call, that's why I've added __cgroup_bpf_prog_array_is_empty that exits early if there is nothing attached to a cgroup. Note, however, that there is a race between __cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup program layout might have changed; this should not be a problem because in general there is a race between multiple calls to {s,g}etsocktop and user adding/removing bpf progs from a cgroup. The return code of the BPF program is handled as follows: * 0: EPERM * 1: success, continue with next BPF program in the cgroup chain v9: * allow overwriting setsockopt arguments (Alexei Starovoitov): * use set_fs (same as kernel_setsockopt) * buffer is always kzalloc'd (no small on-stack buffer) v8: * use s32 for optlen (Andrii Nakryiko) v7: * return only 0 or 1 (Alexei Starovoitov) * always run all progs (Alexei Starovoitov) * use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov) (decided to use optval=-1 instead, optval=0 might be a valid input) * call getsockopt hook after kernel handlers (Alexei Starovoitov) v6: * rework cgroup chaining; stop as soon as bpf program returns 0 or 2; see patch with the documentation for the details * drop Andrii's and Martin's Acked-by (not sure they are comfortable with the new state of things) v5: * skip copy_to_user() and put_user() when ret == 0 (Martin Lau) v4: * don't export bpf_sk_fullsock helper (Martin Lau) * size != sizeof(__u64) for uapi pointers (Martin Lau) * offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau) v3: * typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko) * reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii Nakryiko) * use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau) * use BPF_FIELD_SIZEOF() for consistency (Martin Lau) * new CG_SOCKOPT_ACCESS macro to wrap repeated parts v2: * moved bpf_sockopt_kern fields around to remove a hole (Martin Lau) * aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau) * bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau) * added [0,2] return code check to verifier (Martin Lau) * dropped unused buf[64] from the stack (Martin Lau) * use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau) * dropped bpf_target_off from ctx rewrites (Martin Lau) * use return code for kernel bypass (Martin Lau & Andrii Nakryiko) Cc: Andrii Nakryiko <andriin@fb.com> Cc: Martin Lau <kafai@fb.com> Signed-off-by: Stanislav Fomichev <sdf@google.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'net')
-rw-r--r--net/core/filter.c2
-rw-r--r--net/socket.c30
2 files changed, 31 insertions, 1 deletions
diff --git a/net/core/filter.c b/net/core/filter.c
index 2014d76e0d2a..dc8534be12fc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5651,7 +5651,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
return (unsigned long)NULL;
}
-static const struct bpf_func_proto bpf_tcp_sock_proto = {
+const struct bpf_func_proto bpf_tcp_sock_proto = {
.func = bpf_tcp_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
diff --git a/net/socket.c b/net/socket.c
index 963df5dbdd54..0ddfbfb761d9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2051,6 +2051,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
static int __sys_setsockopt(int fd, int level, int optname,
char __user *optval, int optlen)
{
+ mm_segment_t oldfs = get_fs();
+ char *kernel_optval = NULL;
int err, fput_needed;
struct socket *sock;
@@ -2063,6 +2065,22 @@ static int __sys_setsockopt(int fd, int level, int optname,
if (err)
goto out_put;
+ err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level,
+ &optname, optval, &optlen,
+ &kernel_optval);
+
+ if (err < 0) {
+ goto out_put;
+ } else if (err > 0) {
+ err = 0;
+ goto out_put;
+ }
+
+ if (kernel_optval) {
+ set_fs(KERNEL_DS);
+ optval = (char __user __force *)kernel_optval;
+ }
+
if (level == SOL_SOCKET)
err =
sock_setsockopt(sock, level, optname, optval,
@@ -2071,6 +2089,11 @@ static int __sys_setsockopt(int fd, int level, int optname,
err =
sock->ops->setsockopt(sock, level, optname, optval,
optlen);
+
+ if (kernel_optval) {
+ set_fs(oldfs);
+ kfree(kernel_optval);
+ }
out_put:
fput_light(sock->file, fput_needed);
}
@@ -2093,6 +2116,7 @@ static int __sys_getsockopt(int fd, int level, int optname,
{
int err, fput_needed;
struct socket *sock;
+ int max_optlen;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock != NULL) {
@@ -2100,6 +2124,8 @@ static int __sys_getsockopt(int fd, int level, int optname,
if (err)
goto out_put;
+ max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+
if (level == SOL_SOCKET)
err =
sock_getsockopt(sock, level, optname, optval,
@@ -2108,6 +2134,10 @@ static int __sys_getsockopt(int fd, int level, int optname,
err =
sock->ops->getsockopt(sock, level, optname, optval,
optlen);
+
+ err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
+ optval, optlen,
+ max_optlen, err);
out_put:
fput_light(sock->file, fput_needed);
}