diff options
-rw-r--r-- | Documentation/networking/bpf_flow_dissector.rst | 126 | ||||
-rw-r--r-- | Documentation/networking/index.rst | 1 | ||||
-rw-r--r-- | net/core/filter.c | 16 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 4 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 68 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/progs/bpf_flow.c | 19 |
6 files changed, 208 insertions, 26 deletions
diff --git a/Documentation/networking/bpf_flow_dissector.rst b/Documentation/networking/bpf_flow_dissector.rst new file mode 100644 index 000000000000..b375ae2ec2c4 --- /dev/null +++ b/Documentation/networking/bpf_flow_dissector.rst @@ -0,0 +1,126 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== +BPF Flow Dissector +================== + +Overview +======== + +Flow dissector is a routine that parses metadata out of the packets. It's +used in the various places in the networking subsystem (RFS, flow hash, etc). + +BPF flow dissector is an attempt to reimplement C-based flow dissector logic +in BPF to gain all the benefits of BPF verifier (namely, limits on the +number of instructions and tail calls). + +API +=== + +BPF flow dissector programs operate on an ``__sk_buff``. However, only the +limited set of fields is allowed: ``data``, ``data_end`` and ``flow_keys``. +``flow_keys`` is ``struct bpf_flow_keys`` and contains flow dissector input +and output arguments. + +The inputs are: + * ``nhoff`` - initial offset of the networking header + * ``thoff`` - initial offset of the transport header, initialized to nhoff + * ``n_proto`` - L3 protocol type, parsed out of L2 header + +Flow dissector BPF program should fill out the rest of the ``struct +bpf_flow_keys`` fields. Input arguments ``nhoff/thoff/n_proto`` should be +also adjusted accordingly. + +The return code of the BPF program is either BPF_OK to indicate successful +dissection, or BPF_DROP to indicate parsing error. + +__sk_buff->data +=============== + +In the VLAN-less case, this is what the initial state of the BPF flow +dissector looks like:: + + +------+------+------------+-----------+ + | DMAC | SMAC | ETHER_TYPE | L3_HEADER | + +------+------+------------+-----------+ + ^ + | + +-- flow dissector starts here + + +.. code:: c + + skb->data + flow_keys->nhoff point to the first byte of L3_HEADER + flow_keys->thoff = nhoff + flow_keys->n_proto = ETHER_TYPE + +In case of VLAN, flow dissector can be called with the two different states. + +Pre-VLAN parsing:: + + +------+------+------+-----+-----------+-----------+ + | DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | + +------+------+------+-----+-----------+-----------+ + ^ + | + +-- flow dissector starts here + +.. code:: c + + skb->data + flow_keys->nhoff point the to first byte of TCI + flow_keys->thoff = nhoff + flow_keys->n_proto = TPID + +Please note that TPID can be 802.1AD and, hence, BPF program would +have to parse VLAN information twice for double tagged packets. + + +Post-VLAN parsing:: + + +------+------+------+-----+-----------+-----------+ + | DMAC | SMAC | TPID | TCI |ETHER_TYPE | L3_HEADER | + +------+------+------+-----+-----------+-----------+ + ^ + | + +-- flow dissector starts here + +.. code:: c + + skb->data + flow_keys->nhoff point the to first byte of L3_HEADER + flow_keys->thoff = nhoff + flow_keys->n_proto = ETHER_TYPE + +In this case VLAN information has been processed before the flow dissector +and BPF flow dissector is not required to handle it. + + +The takeaway here is as follows: BPF flow dissector program can be called with +the optional VLAN header and should gracefully handle both cases: when single +or double VLAN is present and when it is not present. The same program +can be called for both cases and would have to be written carefully to +handle both cases. + + +Reference Implementation +======================== + +See ``tools/testing/selftests/bpf/progs/bpf_flow.c`` for the reference +implementation and ``tools/testing/selftests/bpf/flow_dissector_load.[hc]`` +for the loader. bpftool can be used to load BPF flow dissector program as well. + +The reference implementation is organized as follows: + * ``jmp_table`` map that contains sub-programs for each supported L3 protocol + * ``_dissect`` routine - entry point; it does input ``n_proto`` parsing and + does ``bpf_tail_call`` to the appropriate L3 handler + +Since BPF at this point doesn't support looping (or any jumping back), +jmp_table is used instead to handle multiple levels of encapsulation (and +IPv6 options). + + +Current Limitations +=================== +BPF flow dissector doesn't support exporting all the metadata that in-kernel +C-based implementation can export. Notable example is single VLAN (802.1Q) +and double VLAN (802.1AD) tags. Please refer to the ``struct bpf_flow_keys`` +for a set of information that's currently can be exported from the BPF context. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 5449149be496..984e68f9e026 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -9,6 +9,7 @@ Contents: netdev-FAQ af_xdp batman-adv + bpf_flow_dissector can can_ucan_protocol device_drivers/freescale/dpaa2/index diff --git a/net/core/filter.c b/net/core/filter.c index 647c63a7b25b..fc92ebc4e200 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6613,14 +6613,8 @@ static bool flow_dissector_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - if (type == BPF_WRITE) { - switch (off) { - case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): - break; - default: - return false; - } - } + if (type == BPF_WRITE) + return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): @@ -6632,11 +6626,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): info->reg_type = PTR_TO_FLOW_KEYS; break; - case bpf_ctx_range(struct __sk_buff, tc_classid): - case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range_till(struct __sk_buff, family, local_port): - case bpf_ctx_range(struct __sk_buff, tstamp): - case bpf_ctx_range(struct __sk_buff, wire_len): + default: return false; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index bb1a54747d64..94a450b2191a 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -707,6 +707,7 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog, /* Pass parameters to the BPF program */ memset(flow_keys, 0, sizeof(*flow_keys)); cb->qdisc_cb.flow_keys = flow_keys; + flow_keys->n_proto = skb->protocol; flow_keys->nhoff = skb_network_offset(skb); flow_keys->thoff = flow_keys->nhoff; @@ -716,7 +717,8 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog, /* Restore state */ memcpy(cb, &cb_saved, sizeof(cb_saved)); - flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, 0, skb->len); + flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, + skb_network_offset(skb), skb->len); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, skb->len); diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index bcbd928c96ab..fc818bc1d729 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -39,6 +39,58 @@ static struct bpf_flow_keys pkt_v6_flow_keys = { .n_proto = __bpf_constant_htons(ETH_P_IPV6), }; +#define VLAN_HLEN 4 + +static struct { + struct ethhdr eth; + __u16 vlan_tci; + __u16 vlan_proto; + struct iphdr iph; + struct tcphdr tcp; +} __packed pkt_vlan_v4 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_8021Q), + .vlan_proto = __bpf_constant_htons(ETH_P_IP), + .iph.ihl = 5, + .iph.protocol = IPPROTO_TCP, + .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES), + .tcp.urg_ptr = 123, + .tcp.doff = 5, +}; + +static struct bpf_flow_keys pkt_vlan_v4_flow_keys = { + .nhoff = VLAN_HLEN, + .thoff = VLAN_HLEN + sizeof(struct iphdr), + .addr_proto = ETH_P_IP, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IP), +}; + +static struct { + struct ethhdr eth; + __u16 vlan_tci; + __u16 vlan_proto; + __u16 vlan_tci2; + __u16 vlan_proto2; + struct ipv6hdr iph; + struct tcphdr tcp; +} __packed pkt_vlan_v6 = { + .eth.h_proto = __bpf_constant_htons(ETH_P_8021AD), + .vlan_proto = __bpf_constant_htons(ETH_P_8021Q), + .vlan_proto2 = __bpf_constant_htons(ETH_P_IPV6), + .iph.nexthdr = IPPROTO_TCP, + .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), + .tcp.urg_ptr = 123, + .tcp.doff = 5, +}; + +static struct bpf_flow_keys pkt_vlan_v6_flow_keys = { + .nhoff = VLAN_HLEN * 2, + .thoff = VLAN_HLEN * 2 + sizeof(struct ipv6hdr), + .addr_proto = ETH_P_IPV6, + .ip_proto = IPPROTO_TCP, + .n_proto = __bpf_constant_htons(ETH_P_IPV6), +}; + void test_flow_dissector(void) { struct bpf_flow_keys flow_keys; @@ -68,5 +120,21 @@ void test_flow_dissector(void) err, errno, retval, duration, size, sizeof(flow_keys)); CHECK_FLOW_KEYS("ipv6_flow_keys", flow_keys, pkt_v6_flow_keys); + err = bpf_prog_test_run(prog_fd, 10, &pkt_vlan_v4, sizeof(pkt_vlan_v4), + &flow_keys, &size, &retval, &duration); + CHECK(size != sizeof(flow_keys) || err || retval != 1, "vlan_ipv4", + "err %d errno %d retval %d duration %d size %u/%lu\n", + err, errno, retval, duration, size, sizeof(flow_keys)); + CHECK_FLOW_KEYS("vlan_ipv4_flow_keys", flow_keys, + pkt_vlan_v4_flow_keys); + + err = bpf_prog_test_run(prog_fd, 10, &pkt_vlan_v6, sizeof(pkt_vlan_v6), + &flow_keys, &size, &retval, &duration); + CHECK(size != sizeof(flow_keys) || err || retval != 1, "vlan_ipv6", + "err %d errno %d retval %d duration %d size %u/%lu\n", + err, errno, retval, duration, size, sizeof(flow_keys)); + CHECK_FLOW_KEYS("vlan_ipv6_flow_keys", flow_keys, + pkt_vlan_v6_flow_keys); + bpf_object__close(obj); } diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c index 284660f5aa95..75b17cada539 100644 --- a/tools/testing/selftests/bpf/progs/bpf_flow.c +++ b/tools/testing/selftests/bpf/progs/bpf_flow.c @@ -92,7 +92,6 @@ static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto) { struct bpf_flow_keys *keys = skb->flow_keys; - keys->n_proto = proto; switch (proto) { case bpf_htons(ETH_P_IP): bpf_tail_call(skb, &jmp_table, IP); @@ -119,10 +118,9 @@ static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto) SEC("flow_dissector") int _dissect(struct __sk_buff *skb) { - if (!skb->vlan_present) - return parse_eth_proto(skb, skb->protocol); - else - return parse_eth_proto(skb, skb->vlan_proto); + struct bpf_flow_keys *keys = skb->flow_keys; + + return parse_eth_proto(skb, keys->n_proto); } /* Parses on IPPROTO_* */ @@ -336,15 +334,9 @@ PROG(VLAN)(struct __sk_buff *skb) { struct bpf_flow_keys *keys = skb->flow_keys; struct vlan_hdr *vlan, _vlan; - __be16 proto; - - /* Peek back to see if single or double-tagging */ - if (bpf_skb_load_bytes(skb, keys->thoff - sizeof(proto), &proto, - sizeof(proto))) - return BPF_DROP; /* Account for double-tagging */ - if (proto == bpf_htons(ETH_P_8021AD)) { + if (keys->n_proto == bpf_htons(ETH_P_8021AD)) { vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan); if (!vlan) return BPF_DROP; @@ -352,6 +344,7 @@ PROG(VLAN)(struct __sk_buff *skb) if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q)) return BPF_DROP; + keys->nhoff += sizeof(*vlan); keys->thoff += sizeof(*vlan); } @@ -359,12 +352,14 @@ PROG(VLAN)(struct __sk_buff *skb) if (!vlan) return BPF_DROP; + keys->nhoff += sizeof(*vlan); keys->thoff += sizeof(*vlan); /* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/ if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) || vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q)) return BPF_DROP; + keys->n_proto = vlan->h_vlan_encapsulated_proto; return parse_eth_proto(skb, vlan->h_vlan_encapsulated_proto); } |