author    Alexei Starovoitov <ast@kernel.org>  2020-07-18 06:18:18 +0300
committer Alexei Starovoitov <ast@kernel.org>  2020-07-18 06:19:32 +0300
commit    e57892f50a07953053dcb1e0c9431197e569c258
tree      c300e4e02983d21d2d2c58dfe08261c36786c519 /include/linux
parent    bfdfa51702dec67e9fcd52568b4cf3c7f799db8b
parent    0ab5539f85840d3c4e5a8a4783901c0038f8321e
Merge branch 'bpf-socket-lookup'
Jakub Sitnicki says:

====================

Changelog
=========

v4 -> v5:
- Enforce BPF prog return value to be SK_DROP or SK_PASS. (Andrii)
- Simplify prog runners now that only SK_DROP/PASS can be returned.
- Enable bpf_perf_event_output from the start. (Andrii)
- Drop patch "selftests/bpf: Rename test_sk_lookup_kern.c to
  test_ref_track_kern.c".
- Remove tests for narrow loads from context at an offset wider in
  size than the target field, while we are discussing how to fix it:
  https://lore.kernel.org/bpf/20200710173123.427983-1-jakub@cloudflare.com/
- Rebase onto recent bpf-next (bfdfa51702de).
- Other minor changes called out in per-patch changelogs;
  see patches 2, 4, 6, 13-15.
- Carried over Andrii's Acks where nothing changed.

v3 -> v4:
- Reduce BPF prog return codes to SK_DROP/SK_PASS. (Lorenz)
- Default to drop on an illegal return value from the BPF prog. (Lorenz)
- Extend bpf_sk_assign to accept a NULL socket pointer.
- Switch to saner return values and add docs for the new prog_array
  API. (Andrii)
- Add support for narrow loads from BPF context fields. (Yonghong)
- Fix broken build when IPv6 is compiled as a module. (kernel test robot)
- Fix null/wild-ptr-deref on BPF context access.
- Rebase to recent bpf-next (eef8a42d6ce0).
- Other minor changes called out in per-patch changelogs;
  see patches 1-2, 4, 6, 8, 10-12, 14, 16.

v2 -> v3:
- Switch to link-based program attachment.
- Support for multi-prog attachment.
- Ability to skip reuseport socket selection.
- Code on the RX path is guarded by a static key.
- struct in6_addr's are no longer copied into the BPF prog context.
- BPF prog context is initialized as late as possible.
- Changes called out in patches 1-2, 4, 6, 8, 10-14, 16.
- Patches dropped:
  01/17 flow_dissector: Extract attach/detach/query helpers
  03/17 inet: Store layer 4 protocol in inet_hashinfo
  08/17 udp: Store layer 4 protocol in udp_table

v1 -> v2:
- Changes called out in patches 2, 13-15, 17.
- Rebase to recent bpf-next (b4563facdcae).

RFCv2 -> v1:
- Switch to fetching a socket from a map and selecting a socket with
  bpf_sk_assign, instead of having a dedicated helper that does both.
- Run reuseport logic on sockets selected by BPF sk_lookup.
- Allow BPF sk_lookup to fail the lookup with no match.
- Go back to having just 2 hash table lookups in UDP.

RFCv1 -> RFCv2:
- Make socket lookup redirection map-based. The BPF program now uses a
  dedicated helper and a SOCKARRAY map to select the socket to
  redirect to. A consequence of this change is that the
  bpf_inet_lookup context is now read-only.
- Look for connected UDP sockets before allowing redirection from BPF.
  This makes connected UDP sockets work as expected in the presence of
  an inet_lookup prog.
- Share the code for BPF_PROG_{ATTACH,DETACH,QUERY} with
  flow_dissector, the only other per-netns BPF prog type.

Overview
========

This series proposes a new BPF program type named
BPF_PROG_TYPE_SK_LOOKUP, or BPF sk_lookup for short.

A BPF sk_lookup program runs when the transport layer is looking up a
listening socket for a new connection request (TCP), or when looking
up an unconnected socket for a packet (UDP). This serves as a
mechanism to overcome the limits of what the bind() API allows one to
express. The two use cases driving this work are:

(1) steer packets destined to an IP range, fixed port, to a single
    socket:

      192.0.2.0/24, port 80 -> NGINX socket

(2) steer packets destined to an IP address, any port, to a single
    socket:

      198.51.100.1, any port -> L7 proxy socket

In its context, the program receives information about the packet that
triggered the socket lookup, namely the IP version, the L4 protocol
identifier, and the address 4-tuple.

To select a socket, the BPF program fetches it from a map holding
socket references, such as SOCKMAP or SOCKHASH, calls the
bpf_sk_assign(ctx, sk, ...) helper to record the selection, and
returns SK_PASS. The transport layer then uses the selected socket as
the result of the socket lookup. Alternatively, the program can fail
the lookup (SK_DROP), or let the lookup continue as usual (SK_PASS
without selecting a socket).
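To make this concrete, below is a minimal sketch of an sk_lookup
program, modeled on the description above and on the selftests added
in patches 14-15. The map name, key, and SEC() spelling are
illustrative assumptions, not taken verbatim from this merge:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* Socket(s) to steer new connections to; populated by user space
   * with a listening socket FD. (Name and size are illustrative.)
   */
  struct {
          __uint(type, BPF_MAP_TYPE_SOCKMAP);
          __uint(max_entries, 1);
          __type(key, __u32);
          __type(value, __u64);
  } redir_map SEC(".maps");

  SEC("sk_lookup/redir")
  int redir_prog(struct bpf_sk_lookup *ctx)
  {
          const __u32 key = 0;
          struct bpf_sock *sk;
          long err;

          /* Fetch the socket reference from the map. */
          sk = bpf_map_lookup_elem(&redir_map, &key);
          if (!sk)
                  return SK_PASS; /* no socket: regular lookup runs */

          /* Record the selection, then drop the acquired reference. */
          err = bpf_sk_assign(ctx, sk, 0);
          bpf_sk_release(sk);

          return err ? SK_DROP : SK_PASS;
  }

  char _license[] SEC("license") = "GPL";

Returning SK_PASS without a prior bpf_sk_assign, as in the empty-map
case above, lets the regular hashtable-based lookup proceed.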
This lets the user match packets with listening (TCP) or receiving
(UDP) sockets freely at the last possible point on the receive path,
where we know that packets are destined for local delivery after
undergoing policing, filtering, and routing.

The program is attached to a network namespace, similar to BPF
flow_dissector. We add a new attach type, BPF_SK_LOOKUP, for this.
Multiple programs can be attached at the same time, in which case
their return values are aggregated according to the rules outlined in
the patch #4 description.

Series structure
================

Patches are organized as follows:

 1: enables multiple link-based prog attachments for bpf-netns
 2: introduces sk_lookup program type
 3-4: hook up the program to run on ipv4/tcp socket lookup
 5-6: hook up the program to run on ipv6/tcp socket lookup
 7-8: hook up the program to run on ipv4/udp socket lookup
 9-10: hook up the program to run on ipv6/udp socket lookup
 11-13: libbpf & bpftool support for sk_lookup
 14-15: verifier and selftests for sk_lookup

Patches are also available on GH:

  https://github.com/jsitnicki/linux/commits/bpf-inet-lookup-v5

Follow-up work
==============

I'll follow up with the items below, which IMHO don't block the review:

- benchmark results for the udp6 small-packet flood scenario,
- user docs for the new BPF prog type,
  Documentation/bpf/prog_sk_lookup.rst,
- timeout for accept() in tests, after extending network_helper.[ch].

Thanks to the reviewers for their feedback on this patch series:

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andriin@fb.com>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Cc: Marek Majkowski <marek@cloudflare.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Yonghong Song <yhs@fb.com>

-jkbs

[RFCv1] https://lore.kernel.org/bpf/20190618130050.8344-1-jakub@cloudflare.com/
[RFCv2] https://lore.kernel.org/bpf/20190828072250.29828-1-jakub@cloudflare.com/
[v1] https://lore.kernel.org/bpf/20200511185218.1422406-18-jakub@cloudflare.com/
[v2] https://lore.kernel.org/bpf/20200506125514.1020829-1-jakub@cloudflare.com/
[v3] https://lore.kernel.org/bpf/20200702092416.11961-1-jakub@cloudflare.com/
[v4] https://lore.kernel.org/bpf/20200713174654.642628-1-jakub@cloudflare.com/
====================

Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/bpf-netns.h |   3 +
-rw-r--r--  include/linux/bpf.h       |   4 +
-rw-r--r--  include/linux/bpf_types.h |   2 +
-rw-r--r--  include/linux/filter.h    | 147 +++++++
4 files changed, 156 insertions(+), 0 deletions(-)
diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h
index 47d5b0c708c9..722f799c1a2e 100644
--- a/include/linux/bpf-netns.h
+++ b/include/linux/bpf-netns.h
@@ -8,6 +8,7 @@
enum netns_bpf_attach_type {
NETNS_BPF_INVALID = -1,
NETNS_BPF_FLOW_DISSECTOR = 0,
+ NETNS_BPF_SK_LOOKUP,
MAX_NETNS_BPF_ATTACH_TYPE
};
@@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type)
switch (attach_type) {
case BPF_FLOW_DISSECTOR:
return NETNS_BPF_FLOW_DISSECTOR;
+ case BPF_SK_LOOKUP:
+ return NETNS_BPF_SK_LOOKUP;
default:
return NETNS_BPF_INVALID;
}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 54ad426dbea1..adb16bdc5f0a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -249,6 +249,7 @@ enum bpf_arg_type {
ARG_PTR_TO_INT, /* pointer to int */
ARG_PTR_TO_LONG, /* pointer to long */
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */
+ ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */
ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */
ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */
ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */
@@ -928,6 +929,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
void bpf_prog_array_delete_safe(struct bpf_prog_array *progs,
struct bpf_prog *old_prog);
+int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index);
+int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
+ struct bpf_prog *prog);
int bpf_prog_array_copy_info(struct bpf_prog_array *array,
u32 *prog_ids, u32 request_cnt,
u32 *prog_cnt);
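/* Editorial note (not part of the patch): the two _at() helpers
 * declared above support the link-based multi-program attachment from
 * patch 1. They address a program by its position in the array, so
 * that a bpf_link can update or clear its own slot, e.g.:
 *
 *	err = bpf_prog_array_update_at(run_array, idx, new_prog);
 *	...
 *	err = bpf_prog_array_delete_safe_at(run_array, idx);
 *
 * where delete_safe_at clears the slot in place rather than shrinking
 * the array, keeping it safe for concurrent readers. run_array, idx,
 * and new_prog are assumed names; locking and error handling elided.
 */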
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a18ae82a298a..a52a5688418e 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -64,6 +64,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
#ifdef CONFIG_INET
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport,
struct sk_reuseport_md, struct sk_reuseport_kern)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup,
+ struct bpf_sk_lookup, struct bpf_sk_lookup_kern)
#endif
#if defined(CONFIG_BPF_JIT)
BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops,
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0b0144752d78..8252572db918 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1278,4 +1278,151 @@ struct bpf_sockopt_kern {
s32 retval;
};
+struct bpf_sk_lookup_kern {
+ u16 family;
+ u16 protocol;
+ struct {
+ __be32 saddr;
+ __be32 daddr;
+ } v4;
+ struct {
+ const struct in6_addr *saddr;
+ const struct in6_addr *daddr;
+ } v6;
+ __be16 sport;
+ u16 dport;
+ struct sock *selected_sk;
+ bool no_reuseport;
+};
+
+extern struct static_key_false bpf_sk_lookup_enabled;
+
+/* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup.
+ *
+ * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and
+ * SK_DROP. Their meaning is as follows:
+ *
+ * SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result
+ * SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup
+ * SK_DROP : terminate lookup with -ECONNREFUSED
+ *
+ * This macro aggregates return values and selected sockets from
+ * multiple BPF programs according to following rules in order:
+ *
+ * 1. If any program returned SK_PASS and a non-NULL ctx.selected_sk,
+ * macro result is SK_PASS and last ctx.selected_sk is used.
+ * 2. If any program returned SK_DROP return value,
+ * macro result is SK_DROP.
+ * 3. Otherwise result is SK_PASS and ctx.selected_sk is NULL.
+ *
+ * Caller must ensure that the prog array is non-NULL, and that the
+ * array as well as the programs it contains remain valid.
+ */
+#define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func) \
+ ({ \
+ struct bpf_sk_lookup_kern *_ctx = &(ctx); \
+ struct bpf_prog_array_item *_item; \
+ struct sock *_selected_sk = NULL; \
+ bool _no_reuseport = false; \
+ struct bpf_prog *_prog; \
+ bool _all_pass = true; \
+ u32 _ret; \
+ \
+ migrate_disable(); \
+ _item = &(array)->items[0]; \
+ while ((_prog = READ_ONCE(_item->prog))) { \
+ /* restore most recent selection */ \
+ _ctx->selected_sk = _selected_sk; \
+ _ctx->no_reuseport = _no_reuseport; \
+ \
+ _ret = func(_prog, _ctx); \
+ if (_ret == SK_PASS && _ctx->selected_sk) { \
+ /* remember last non-NULL socket */ \
+ _selected_sk = _ctx->selected_sk; \
+ _no_reuseport = _ctx->no_reuseport; \
+ } else if (_ret == SK_DROP && _all_pass) { \
+ _all_pass = false; \
+ } \
+ _item++; \
+ } \
+ _ctx->selected_sk = _selected_sk; \
+ _ctx->no_reuseport = _no_reuseport; \
+ migrate_enable(); \
+ _all_pass || _selected_sk ? SK_PASS : SK_DROP; \
+ })
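/* Editorial note (not part of the patch): a worked example of the
 * aggregation rules above. Suppose three programs run in order:
 *
 *	prog A: SK_PASS, ctx.selected_sk == NULL	(no opinion)
 *	prog B: SK_DROP					(attempted veto)
 *	prog C: SK_PASS, ctx.selected_sk == sk1		(selects sk1)
 *
 * Rule 1 applies: the macro evaluates to SK_PASS with sk1 as the
 * selection, despite prog B's SK_DROP. Had prog C returned SK_PASS
 * without selecting a socket, rule 2 would apply instead and the
 * result would be SK_DROP.
 */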
+
+static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const u16 dport,
+ struct sock **psk)
+{
+ struct bpf_prog_array *run_array;
+ struct sock *selected_sk = NULL;
+ bool no_reuseport = false;
+
+ rcu_read_lock();
+ run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
+ if (run_array) {
+ struct bpf_sk_lookup_kern ctx = {
+ .family = AF_INET,
+ .protocol = protocol,
+ .v4.saddr = saddr,
+ .v4.daddr = daddr,
+ .sport = sport,
+ .dport = dport,
+ };
+ u32 act;
+
+ act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN);
+ if (act == SK_PASS) {
+ selected_sk = ctx.selected_sk;
+ no_reuseport = ctx.no_reuseport;
+ } else {
+ selected_sk = ERR_PTR(-ECONNREFUSED);
+ }
+ }
+ rcu_read_unlock();
+ *psk = selected_sk;
+ return no_reuseport;
+}
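/* Editorial sketch (not part of the patch): how a transport-layer
 * caller might consume the v4 runner, loosely modeled on the TCP/IPv4
 * hookup in patches 3-4. lookup_reuseport() is an assumed stand-in
 * for SO_REUSEPORT group selection; skb arguments are elided.
 */
static inline struct sock *inet_lookup_run_bpf_sketch(struct net *net,
						      __be32 saddr, __be16 sport,
						      __be32 daddr, u16 hnum)
{
	struct sock *sk, *reuse_sk;
	bool no_reuseport;

	no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP,
					    saddr, sport, daddr, hnum, &sk);
	/* ERR_PTR means SK_DROP, NULL means fall back to the htable
	 * lookup, and a socket with no_reuseport set is used as-is.
	 */
	if (no_reuseport || IS_ERR_OR_NULL(sk))
		return sk;

	/* BPF may have selected a reuseport group member; run the
	 * usual group selection on it unless the program opted out.
	 */
	reuse_sk = lookup_reuseport(net, sk, saddr, sport, daddr, hnum);
	if (reuse_sk)
		sk = reuse_sk;
	return sk;
}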
+
+#if IS_ENABLED(CONFIG_IPV6)
+static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const u16 dport,
+ struct sock **psk)
+{
+ struct bpf_prog_array *run_array;
+ struct sock *selected_sk = NULL;
+ bool no_reuseport = false;
+
+ rcu_read_lock();
+ run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
+ if (run_array) {
+ struct bpf_sk_lookup_kern ctx = {
+ .family = AF_INET6,
+ .protocol = protocol,
+ .v6.saddr = saddr,
+ .v6.daddr = daddr,
+ .sport = sport,
+ .dport = dport,
+ };
+ u32 act;
+
+ act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN);
+ if (act == SK_PASS) {
+ selected_sk = ctx.selected_sk;
+ no_reuseport = ctx.no_reuseport;
+ } else {
+ selected_sk = ERR_PTR(-ECONNREFUSED);
+ }
+ }
+ rcu_read_unlock();
+ *psk = selected_sk;
+ return no_reuseport;
+}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
#endif /* __LINUX_FILTER_H__ */
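For completeness, a hedged user-space sketch of attaching such a
program with the link-based API this series adds (libbpf support lands
in patches 11-13). The object path, program section, and error
handling are illustrative; the calls shown (bpf_object__open_file,
bpf_program__attach_netns) are existing libbpf APIs:

  #include <fcntl.h>
  #include <bpf/libbpf.h>

  int attach_sk_lookup(void)
  {
          struct bpf_object *obj;
          struct bpf_program *prog;
          struct bpf_link *link;
          int netns_fd;

          obj = bpf_object__open_file("sk_lookup_prog.o", NULL);
          if (libbpf_get_error(obj))
                  return -1;
          if (bpf_object__load(obj))
                  return -1;

          prog = bpf_object__find_program_by_title(obj, "sk_lookup/redir");
          netns_fd = open("/proc/self/ns/net", O_RDONLY);
          if (!prog || netns_fd < 0)
                  return -1;

          /* Link-based, per-netns attachment (BPF_SK_LOOKUP). */
          link = bpf_program__attach_netns(prog, netns_fd);
          if (libbpf_get_error(link))
                  return -1;

          /* The attachment lives as long as the link; multiple links
           * can coexist in one namespace, with their return values
           * aggregated as described above.
           */
          return 0;
  }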