Merge branch 'bpf_iter_tcp_udp'

Yonghong Song says: ==================== bpf iterator implments traversal of kernel data structures and these data structures are passed to a bpf program for processing. This gives great flexibility for users to examine kernel data structure without using e.g. /proc/net which has limited and fixed format. Commit 138d0be35b14 ("net: bpf: Add netlink and ipv6_route bpf_iter targets") implemented bpf iterators for netlink and ipv6_route. This patch set intends to implement bpf iterators for tcp and udp. Currently, /proc/net/tcp is used to print tcp4 stats and /proc/net/tcp6 is used to print tcp6 stats. /proc/net/udp[6] have similar usage model. In contrast, only one tcp iterator is implemented and it is bpf program resposibility to filter based on socket family. The same is for udp. This will avoid another unnecessary traversal pass if users want to check both tcp4 and tcp6. Several helpers are also implemented in this patch bpf_skc_to_{tcp, tcp6, tcp_timewait, tcp_request, udp6}_sock The argument for these helpers is not a fixed btf_id. For example, bpf_skc_to_tcp(struct sock_common *), or bpf_skc_to_tcp(struct sock *), or bpf_skc_to_tcp(struct inet_sock *), ... are all valid. At runtime, the helper will check whether pointer cast is legal or not. Please see Patch #5 for details. Since btf_id's for both arguments and return value are known at build time, the btf_id's are pre-computed once vmlinux btf becomes valid. Jiri's "adding d_path helper" patch set https://lore.kernel.org/bpf/20200616100512.2168860-1-jolsa@kernel.org/T/ provides a way to pre-compute btf id during vmlinux build time. This can be applied here as well. A followup patch can convert to build time btf id computation after Jiri's patch landed. Changelogs: v4 -> v5: - fix bpf_skc_to_udp6_sock helper as besides sk_protocol, sk_family, sk_type == SOCK_DGRAM is also needed to differentiate from SOCK_RAW (Eric) v3 -> v4: - fix bpf_skc_to_{tcp_timewait, tcp_request}_sock helper implementation as just checking sk->sk_state is not enough (Martin) - fix a few kernel test robot reported failures - move bpf_tracing_net.h from libbpf to selftests (Andrii) - remove __weak attribute from selftests CONFIG_HZ variables (Andrii) v2 -> v3: - change sock_cast*/SOCK_CAST* names to btf_sock* names for generality (Martin) - change gpl_license to false (Martin) - fix helper to cast to tcp timewait/request socket. (Martin) v1 -> v2: - guard init_sock_cast_types() defination properly with CONFIG_NET (Martin) - reuse the btf_ids, computed for new helper argument, for return values (Martin) - using BTF_TYPE_EMIT to express intent of btf type generation (Andrii) - abstract out common net macros into bpf_tracing_net.h (Andrii) ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
author: Alexei Starovoitov <ast@kernel.org> 2020-06-25 04:38:00 +0300
committer: Alexei Starovoitov <ast@kernel.org> 2020-06-25 04:39:08 +0300
commit: 74765da176a0627e34bacf72826ec60c075cb62c (patch)
tree: a6deaa85cbbadde0d12d69826e1dd57d70130b6a /net/ipv4/tcp_ipv4.c
parent: f9bcf96837f158db6ea982d15cd2c8161ca6bc23 (diff)
parent: cfcd75f9bf12301dfdcfe9ff6dfb240997e7745f (diff)
download: linux-74765da176a0627e34bacf72826ec60c075cb62c.tar.xz
1 files changed, 147 insertions, 6 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ad6435ba6d72..ea0df9fd7618 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2211,13 +2211,18 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
  */
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
-	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
+	struct tcp_seq_afinfo *afinfo;
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct inet_listen_hashbucket *ilb;
 	struct hlist_nulls_node *node;
 	struct sock *sk = cur;
 
+	if (st->bpf_seq_afinfo)
+		afinfo = st->bpf_seq_afinfo;
+	else
+		afinfo = PDE_DATA(file_inode(seq->file));
+
 	if (!sk) {
 get_head:
 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
@@ -2235,7 +2240,8 @@ get_sk:
 	sk_nulls_for_each_from(sk, node) {
 		if (!net_eq(sock_net(sk), net))
 			continue;
-		if (sk->sk_family == afinfo->family)
+		if (afinfo->family == AF_UNSPEC ||
+		    sk->sk_family == afinfo->family)
 			return sk;
 	}
 	spin_unlock(&ilb->lock);
@@ -2272,11 +2278,16 @@ static inline bool empty_bucket(const struct tcp_iter_state *st)
  */
 static void *established_get_first(struct seq_file *seq)
 {
-	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
+	struct tcp_seq_afinfo *afinfo;
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	void *rc = NULL;
 
+	if (st->bpf_seq_afinfo)
+		afinfo = st->bpf_seq_afinfo;
+	else
+		afinfo = PDE_DATA(file_inode(seq->file));
+
 	st->offset = 0;
 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
 		struct sock *sk;
@@ -2289,7 +2300,8 @@ static void *established_get_first(struct seq_file *seq)
 
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
-			if (sk->sk_family != afinfo->family ||
+			if ((afinfo->family != AF_UNSPEC &&
+			     sk->sk_family != afinfo->family) ||
 			    !net_eq(sock_net(sk), net)) {
 				continue;
 			}
@@ -2304,19 +2316,25 @@ out:
 
 static void *established_get_next(struct seq_file *seq, void *cur)
 {
-	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
+	struct tcp_seq_afinfo *afinfo;
 	struct sock *sk = cur;
 	struct hlist_nulls_node *node;
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 
+	if (st->bpf_seq_afinfo)
+		afinfo = st->bpf_seq_afinfo;
+	else
+		afinfo = PDE_DATA(file_inode(seq->file));
+
 	++st->num;
 	++st->offset;
 
 	sk = sk_nulls_next(sk);
 
 	sk_nulls_for_each_from(sk, node) {
-		if (sk->sk_family == afinfo->family &&
+		if ((afinfo->family == AF_UNSPEC ||
+		     sk->sk_family == afinfo->family) &&
 		    net_eq(sock_net(sk), net))
 			return sk;
 	}
@@ -2595,6 +2613,74 @@ out:
 	return 0;
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+struct bpf_iter__tcp {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct sock_common *, sk_common);
+	uid_t uid __aligned(8);
+};
+
+static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+			     struct sock_common *sk_common, uid_t uid)
+{
+	struct bpf_iter__tcp ctx;
+
+	meta->seq_num--;  /* skip SEQ_START_TOKEN */
+	ctx.meta = meta;
+	ctx.sk_common = sk_common;
+	ctx.uid = uid;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	struct sock *sk = v;
+	uid_t uid;
+
+	if (v == SEQ_START_TOKEN)
+		return 0;
+
+	if (sk->sk_state == TCP_TIME_WAIT) {
+		uid = 0;
+	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		const struct request_sock *req = v;
+
+		uid = from_kuid_munged(seq_user_ns(seq),
+				       sock_i_uid(req->rsk_listener));
+	} else {
+		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+	}
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	return tcp_prog_seq_show(prog, &meta, v, uid);
+}
+
+static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	if (!v) {
+		meta.seq = seq;
+		prog = bpf_iter_get_info(&meta, true);
+		if (prog)
+			(void)tcp_prog_seq_show(prog, &meta, v, 0);
+	}
+
+	tcp_seq_stop(seq, v);
+}
+
+static const struct seq_operations bpf_iter_tcp_seq_ops = {
+	.show		= bpf_iter_tcp_seq_show,
+	.start		= tcp_seq_start,
+	.next		= tcp_seq_next,
+	.stop		= bpf_iter_tcp_seq_stop,
+};
+#endif
+
 static const struct seq_operations tcp4_seq_ops = {
 	.show		= tcp4_seq_show,
 	.start		= tcp_seq_start,
@@ -2826,8 +2912,63 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
        .exit_batch = tcp_sk_exit_batch,
 };
 
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
+		     struct sock_common *sk_common, uid_t uid)
+
+static int bpf_iter_init_tcp(void *priv_data)
+{
+	struct tcp_iter_state *st = priv_data;
+	struct tcp_seq_afinfo *afinfo;
+	int ret;
+
+	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
+	if (!afinfo)
+		return -ENOMEM;
+
+	afinfo->family = AF_UNSPEC;
+	st->bpf_seq_afinfo = afinfo;
+	ret = bpf_iter_init_seq_net(priv_data);
+	if (ret)
+		kfree(afinfo);
+	return ret;
+}
+
+static void bpf_iter_fini_tcp(void *priv_data)
+{
+	struct tcp_iter_state *st = priv_data;
+
+	kfree(st->bpf_seq_afinfo);
+	bpf_iter_fini_seq_net(priv_data);
+}
+
+static const struct bpf_iter_reg tcp_reg_info = {
+	.target			= "tcp",
+	.seq_ops		= &bpf_iter_tcp_seq_ops,
+	.init_seq_private	= bpf_iter_init_tcp,
+	.fini_seq_private	= bpf_iter_fini_tcp,
+	.seq_priv_size		= sizeof(struct tcp_iter_state),
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__tcp, sk_common),
+		  PTR_TO_BTF_ID_OR_NULL },
+	},
+};
+
+static void __init bpf_iter_register(void)
+{
+	if (bpf_iter_reg_target(&tcp_reg_info))
+		pr_warn("Warning: could not register bpf iterator tcp\n");
+}
+
+#endif
+
 void __init tcp_v4_init(void)
 {
 	if (register_pernet_subsys(&tcp_sk_ops))
 		panic("Failed to create the TCP control socket.\n");
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+	bpf_iter_register();
+#endif
 }
author	Alexei Starovoitov <ast@kernel.org>	2020-06-25 04:38:00 +0300
committer	Alexei Starovoitov <ast@kernel.org>	2020-06-25 04:39:08 +0300
commit	74765da176a0627e34bacf72826ec60c075cb62c (patch)
tree	a6deaa85cbbadde0d12d69826e1dd57d70130b6a /net/ipv4/tcp_ipv4.c
parent	f9bcf96837f158db6ea982d15cd2c8161ca6bc23 (diff)
parent	cfcd75f9bf12301dfdcfe9ff6dfb240997e7745f (diff)
download	linux-74765da176a0627e34bacf72826ec60c075cb62c.tar.xz