From 8050c0f0274a15841756968857cfb07b3ab809ae Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:02 +0100 Subject: bpf: allow bpf_csum_diff to feed bpf_l3_csum_replace as well Commit 7d672345ed29 ("bpf: add generic bpf_csum_diff helper") added a generic checksum diff helper that can feed bpf_l4_csum_replace() with a target __wsum diff that is to be applied to the L4 checksum. This facility is very flexible, can be cascaded, allows for adding, removing, or diffing data, or for calculating the pseudo header checksum from scratch, but it can also be reused for working with the IPv4 header checksum. Thus, analogous to bpf_l4_csum_replace(), add a case for header field value of 0 to change the checksum at a given offset through a new helper csum_replace_by_diff(). Also, in addition to that, this provides an easy to use interface for feeding precalculated diffs f.e. coming from a map. It nicely complements bpf_l3_csum_replace() that currently allows only for csum updates of 2 and 4 byte diffs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/net/checksum.h | 5 +++++ net/core/filter.c | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/include/net/checksum.h b/include/net/checksum.h index 10a16b5bd1c7..abffc64e7300 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -120,6 +120,11 @@ static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) #define CSUM_MANGLED_0 ((__force __sum16)0xffff) +static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +{ + *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); +} + static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); diff --git a/net/core/filter.c b/net/core/filter.c index 69f4ffc0a282..356a251657a5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1447,6 +1447,12 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EFAULT; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + csum_replace_by_diff(ptr, to); + break; case 2: csum_replace2(ptr, from, to); break; -- cgit v1.2.3 From 8afd54c87ad7089734ef0527937a256586ba828a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:03 +0100 Subject: bpf: add flags to bpf_skb_store_bytes for clearing hash When overwriting parts of the packet with bpf_skb_store_bytes() that were fed previously into skb->hash calculation, we should clear the current hash with skb_clear_hash(), so that a next skb_get_hash() call can determine the correct hash related to this skb. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ee2193287cbe..2e3e90309904 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -305,6 +305,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_store_bytes flags. */ #define BPF_F_RECOMPUTE_CSUM (1ULL << 0) +#define BPF_F_INVALIDATE_HASH (1ULL << 1) /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. diff --git a/net/core/filter.c b/net/core/filter.c index 356a251657a5..a1fe246a6147 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1353,7 +1353,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) unsigned int len = (unsigned int) r4; void *ptr; - if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; /* bpf verifier guarantees that: @@ -1384,6 +1384,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) if (flags & BPF_F_RECOMPUTE_CSUM) skb_postpush_rcsum(skb, ptr, len); + if (flags & BPF_F_INVALIDATE_HASH) + skb_clear_hash(skb); return 0; } -- cgit v1.2.3 From 577c50aade0f34926e4a47f61629739e6da91af6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:04 +0100 Subject: bpf: make helper function protos static They are only used here, so there's no reason they should not be static. Only the vlan push/pop protos are used in the test_bpf suite. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- net/core/filter.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index a1fe246a6147..ce4e18dd2c89 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1390,7 +1390,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) return 0; } -const struct bpf_func_proto bpf_skb_store_bytes_proto = { +static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1421,7 +1421,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_load_bytes_proto = { +static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1472,7 +1472,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return 0; } -const struct bpf_func_proto bpf_l3_csum_replace_proto = { +static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1531,7 +1531,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return 0; } -const struct bpf_func_proto bpf_l4_csum_replace_proto = { +static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1570,7 +1570,7 @@ static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) return csum_partial(sp->diff, diff_size, seed); } -const struct bpf_func_proto bpf_csum_diff_proto = { +static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1608,7 +1608,7 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) return dev_queue_xmit(skb2); } -const struct bpf_func_proto bpf_clone_redirect_proto = { +static const 
struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1660,7 +1660,7 @@ int skb_do_redirect(struct sk_buff *skb) return dev_queue_xmit(skb); } -const struct bpf_func_proto bpf_redirect_proto = { +static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1799,7 +1799,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1861,7 +1861,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, -- cgit v1.2.3 From 2208087061c4ad88de188911367effc550144836 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:05 +0100 Subject: bpf: allow to propagate df in bpf_skb_set_tunnel_key Added by 9a628224a61b ("ip_tunnel: Add dont fragment flag."), allow to feed df flag into tunneling facilities (currently supported on TX by vxlan, geneve and gre) as a hint from eBPF's bpf_skb_set_tunnel_key() helper. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e3e90309904..21ee6d52016f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -330,6 +330,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key flags. 
*/ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) +#define BPF_F_DONT_FRAGMENT (1ULL << 2) /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure diff --git a/net/core/filter.c b/net/core/filter.c index ce4e18dd2c89..6c9d15561d04 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1819,7 +1819,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX))) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_DONT_FRAGMENT))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -1844,6 +1845,9 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info->mode = IP_TUNNEL_INFO_TX; info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM; + if (flags & BPF_F_DONT_FRAGMENT) + info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; -- cgit v1.2.3 From 14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:06 +0100 Subject: bpf: support for access to tunnel options After eBPF being able to programmatically access/manage tunnel key meta data via commit d3aa45ce6b94 ("bpf: add helpers to access tunnel metadata") and more recently also for IPv6 through c6c33454072f ("bpf: support ipv6 for bpf_skb_{set,get}_tunnel_key"), this work adds two complementary helpers to generically access their auxiliary tunnel options. Geneve and vxlan support this facility. For geneve, TLVs can be pushed, and for the vxlan case its GBP extension. I.e. setting tunnel key for geneve case only makes sense, if we can also read/write TLVs into it. 
In the GBP case, it provides the flexibility to easily map the group policy ID in combination with other helpers or maps. I chose to model this as two separate helpers, bpf_skb_{set,get}_tunnel_opt(), for a couple of reasons. bpf_skb_{set,get}_tunnel_key() is already rather complex by itself, and there may be cases for tunnel key backends where tunnel options are not always needed. If we would have integrated this into bpf_skb_{set,get}_tunnel_key() nevertheless, we are very limited with remaining helper arguments, so keeping compatibility on structs in case of passing in a flat buffer gets more cumbersome. Separating both also allows for more flexibility and future extensibility, f.e. options could be fed directly from a map, etc. Moreover, change geneve's xmit path to test only for info->options_len instead of TUNNEL_GENEVE_OPT flag. This makes it more consistent with vxlan's xmit path and allows for avoiding to specify a protocol flag in the API on xmit, so it can be protocol agnostic. Having info->options_len is enough information that is needed. Tested with vxlan and geneve. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- drivers/net/geneve.c | 4 +-- include/uapi/linux/bpf.h | 11 +++++++ net/core/filter.c | 83 ++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 90 insertions(+), 8 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index bc5da357e16d..36db4cf0579c 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -940,7 +940,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, u8 vni[3]; tunnel_id_to_vni(key->tun_id, vni); - if (key->tun_flags & TUNNEL_GENEVE_OPT) + if (info->options_len) opts = ip_tunnel_info_opts(info); if (key->tun_flags & TUNNEL_CSUM) @@ -1027,7 +1027,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, u8 vni[3]; tunnel_id_to_vni(key->tun_id, vni); - if (key->tun_flags & TUNNEL_GENEVE_OPT) + if (info->options_len) opts = ip_tunnel_info_opts(info); if (key->tun_flags & TUNNEL_CSUM) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 21ee6d52016f..9221f653fee3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -298,6 +298,17 @@ enum bpf_func_id { * Return: csum result */ BPF_FUNC_csum_diff, + + /** + * bpf_skb_[gs]et_tunnel_opt(skb, opt, size) + * retrieve or populate tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: 0 on success for set, option size for get + */ + BPF_FUNC_skb_get_tunnel_opt, + BPF_FUNC_skb_set_tunnel_opt, __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 6c9d15561d04..012a10c2da94 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1809,6 +1809,32 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; +static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *to = (u8 *) (long) r2; + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + + if 
(unlikely(!info || + !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) + return -ENOENT; + if (unlikely(size < info->options_len)) + return -ENOMEM; + + ip_tunnel_info_opts_get(to, info); + + return info->options_len; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { + .func = bpf_skb_get_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + static struct metadata_dst __percpu *md_dst; static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) @@ -1875,17 +1901,58 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +#define BPF_TUNLEN_MAX 255 + +static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *from = (u8 *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct metadata_dst *md = this_cpu_ptr(md_dst); + + if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) + return -EINVAL; + if (unlikely(size > BPF_TUNLEN_MAX)) + return -ENOMEM; + + ip_tunnel_info_opts_set(info, from, size); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .func = bpf_skb_set_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto * +bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* race is not possible, since it's called from - * verifier that is holding verifier mutex + BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info, + options_len) != 1); + + /* Race is not possible, since it's called from verifier + * that is holding verifier mutex. 
*/ - md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX, + GFP_KERNEL); if (!md_dst) return NULL; } - return &bpf_skb_set_tunnel_key_proto; + + switch (which) { + case BPF_FUNC_skb_set_tunnel_key: + return &bpf_skb_set_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return &bpf_skb_set_tunnel_opt_proto; + default: + return NULL; + } } static const struct bpf_func_proto * @@ -1939,7 +2006,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: - return bpf_get_skb_set_tunnel_key_proto(); + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: -- cgit v1.2.3 From db3c6139e6ead91b42e7c2ad044ed8beaee884e6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:07 +0100 Subject: bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. 
Also the !tos test in vxlan needs to be handled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann Acked-by: Paolo Abeni Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- drivers/net/geneve.c | 6 ++---- drivers/net/vxlan.c | 24 ++++++++++++------------ include/net/ip_tunnels.h | 15 +++++++++++++++ net/core/filter.c | 2 +- net/ipv4/ip_gre.c | 10 ++++++---- 5 files changed, 36 insertions(+), 21 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 36db4cf0579c..6a0cbbe03e5d 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -775,10 +775,10 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, struct flowi4 *fl4, struct ip_tunnel_info *info) { + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct geneve_dev *geneve = netdev_priv(dev); struct dst_cache *dst_cache; struct rtable *rt = NULL; - bool use_cache = true; __u8 tos; memset(fl4, 0, sizeof(*fl4)); @@ -804,7 +804,6 @@ static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, dst_cache = &geneve->dst_cache; } - use_cache = use_cache && !skb->mark; if (use_cache) { rt = dst_cache_get_ip4(dst_cache, &fl4->saddr); if (rt) @@ -832,11 +831,11 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, struct flowi6 *fl6, struct ip_tunnel_info *info) { + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct geneve_dev *geneve = netdev_priv(dev); struct geneve_sock *gs6 = geneve->sock6; struct dst_entry *dst = NULL; struct dst_cache *dst_cache; - bool use_cache = true; __u8 prio; memset(fl6, 0, sizeof(*fl6)); @@ -862,7 +861,6 @@ static struct dst_entry 
*geneve_get_v6_dst(struct sk_buff *skb, dst_cache = &geneve->dst_cache; } - use_cache = use_cache && !skb->mark; if (use_cache) { dst = dst_cache_get_ip6(dst_cache, &fl6->saddr); if (dst) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index fc998a3bd234..7294a459b13c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1756,17 +1756,15 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct sk_buff *skb, int oif, u8 tos, __be32 daddr, __be32 *saddr, struct dst_cache *dst_cache, - struct ip_tunnel_info *info) + const struct ip_tunnel_info *info) { + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct rtable *rt = NULL; - bool use_cache = false; struct flowi4 fl4; - /* when the ip_tunnel_info is availble, the tos used for lookup is - * packet independent, so we can use the cache - */ - if (!skb->mark && (!tos || info)) { - use_cache = true; + if (tos && !info) + use_cache = false; + if (use_cache) { rt = dst_cache_get_ip4(dst_cache, saddr); if (rt) return rt; @@ -1794,13 +1792,15 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, struct sk_buff *skb, int oif, const struct in6_addr *daddr, struct in6_addr *saddr, - struct dst_cache *dst_cache) + struct dst_cache *dst_cache, + const struct ip_tunnel_info *info) { + bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct dst_entry *ndst; struct flowi6 fl6; int err; - if (!skb->mark) { + if (use_cache) { ndst = dst_cache_get_ip6(dst_cache, saddr); if (ndst) return ndst; @@ -1820,7 +1820,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, return ERR_PTR(err); *saddr = fl6.saddr; - if (!skb->mark) + if (use_cache) dst_cache_set_ip6(dst_cache, ndst, saddr); return ndst; } @@ -2018,7 +2018,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ndst = vxlan6_get_route(vxlan, skb, rdst ? 
rdst->remote_ifindex : 0, &dst->sin6.sin6_addr, &saddr, - dst_cache); + dst_cache, info); if (IS_ERR(ndst)) { netdev_dbg(dev, "no route to %pI6\n", &dst->sin6.sin6_addr); @@ -2387,7 +2387,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) return -EINVAL; ndst = vxlan6_get_route(vxlan, skb, 0, &info->key.u.ipv6.dst, - &info->key.u.ipv6.src, NULL); + &info->key.u.ipv6.src, NULL, info); if (IS_ERR(ndst)) return PTR_ERR(ndst); dst_release(ndst); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5f28b606633e..e1395d70fb48 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -140,6 +140,7 @@ struct ip_tunnel { #define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) #define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) +#define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT) @@ -206,6 +207,20 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } +static inline bool +ip_tunnel_dst_cache_usable(const struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + if (skb->mark) + return false; + if (!info) + return true; + if (info->key.tun_flags & TUNNEL_NOCACHE) + return false; + + return true; +} + static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { diff --git a/net/core/filter.c b/net/core/filter.c index 012a10c2da94..a66dc03c261f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1870,7 +1870,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM; + info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; if (flags & BPF_F_DONT_FRAGMENT) info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 202437d6087b..31936d387cfd 100644 
--- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -527,11 +527,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; + struct rtable *rt = NULL; struct flowi4 fl; - struct rtable *rt; int min_headroom; int tunnel_hlen; __be16 df, flags; + bool use_cache; int err; tun_info = skb_tunnel_info(skb); @@ -540,13 +541,14 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - rt = !skb->mark ? dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr) : - NULL; + use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr); if (!rt) { rt = gre_get_rt(skb, dev, &fl, key); if (IS_ERR(rt)) goto err_free_skb; - if (!skb->mark) + if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl.saddr); } -- cgit v1.2.3 From 1400615d64cf5afee533aff8234c837da465841b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:08 +0100 Subject: vxlan: allow setting ipv6 traffic class We can already do that for IPv4, but IPv6 support was missing. Add it for vxlan, so it can be used with collect metadata frontends. Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 7294a459b13c..2399099e68cf 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1789,7 +1789,7 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, #if IS_ENABLED(CONFIG_IPV6) static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, - struct sk_buff *skb, int oif, + struct sk_buff *skb, int oif, u8 tos, const struct in6_addr *daddr, struct in6_addr *saddr, struct dst_cache *dst_cache, @@ -1800,6 +1800,8 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, struct flowi6 fl6; int err; + if (tos && !info) + use_cache = false; if (use_cache) { ndst = dst_cache_get_ip6(dst_cache, saddr); if (ndst) @@ -1808,6 +1810,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; + fl6.flowi6_tos = RT_TOS(tos); fl6.daddr = *daddr; fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr; fl6.flowi6_mark = skb->mark; @@ -2016,7 +2019,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, sk = vxlan->vn6_sock->sock->sk; ndst = vxlan6_get_route(vxlan, skb, - rdst ? rdst->remote_ifindex : 0, + rdst ? rdst->remote_ifindex : 0, tos, &dst->sin6.sin6_addr, &saddr, dst_cache, info); if (IS_ERR(ndst)) { @@ -2053,6 +2056,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (!info) udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); + tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? 
: ip6_dst_hoplimit(ndst); skb_scrub_packet(skb, xnet); err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), @@ -2062,8 +2066,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, return; } udp_tunnel6_xmit_skb(ndst, sk, skb, dev, - &saddr, &dst->sin6.sin6_addr, - 0, ttl, src_port, dst_port, !udp_sum); + &saddr, &dst->sin6.sin6_addr, tos, ttl, + src_port, dst_port, !udp_sum); #endif } @@ -2385,7 +2389,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) if (!vxlan->vn6_sock) return -EINVAL; - ndst = vxlan6_get_route(vxlan, skb, 0, + ndst = vxlan6_get_route(vxlan, skb, 0, info->key.tos, &info->key.u.ipv6.dst, &info->key.u.ipv6.src, NULL, info); if (IS_ERR(ndst)) -- cgit v1.2.3