diff options
Diffstat (limited to 'drivers/net/vxlan.c')
-rw-r--r-- | drivers/net/vxlan.c | 462 |
1 files changed, 348 insertions, 114 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 7fbd89fbe107..f8528a4cf54f 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -61,12 +61,6 @@ #define FDB_AGE_DEFAULT 300 /* 5 min */ #define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ -#define VXLAN_N_VID (1u << 24) -#define VXLAN_VID_MASK (VXLAN_N_VID - 1) -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) - -#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */ - /* UDP port for VXLAN traffic. * The IANA assigned port is 4789, but the Linux default is 8472 * for compatibility with early adopters. @@ -269,15 +263,20 @@ static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); } -/* Find VXLAN socket based on network namespace, address family and UDP port */ -static struct vxlan_sock *vxlan_find_sock(struct net *net, - sa_family_t family, __be16 port) +/* Find VXLAN socket based on network namespace, address family and UDP port + * and enabled unshareable flags. + */ +static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, + __be16 port, u32 flags) { struct vxlan_sock *vs; + flags &= VXLAN_F_RCV_FLAGS; + hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { if (inet_sk(vs->sock->sk)->inet_sport == port && - inet_sk(vs->sock->sk)->sk.sk_family == family) + inet_sk(vs->sock->sk)->sk.sk_family == family && + vs->flags == flags) return vs; } return NULL; @@ -297,11 +296,12 @@ static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id) /* Look up VNI in a per net namespace table */ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, - sa_family_t family, __be16 port) + sa_family_t family, __be16 port, + u32 flags) { struct vxlan_sock *vs; - vs = vxlan_find_sock(net, family, port); + vs = vxlan_find_sock(net, family, port, flags); if (!vs) return NULL; @@ -340,6 +340,11 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, ndm->ndm_flags = fdb->flags; ndm->ndm_type = RTN_UNICAST; + if (!net_eq(dev_net(vxlan->dev), vxlan->net) && + nla_put_s32(skb, NDA_LINK_NETNSID, + peernet2id(dev_net(vxlan->dev), vxlan->net))) + goto nla_put_failure; + if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; @@ -364,7 +369,8 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -379,6 +385,7 @@ static inline size_t vxlan_nlmsg_size(void) + nla_total_size(sizeof(__be16)) /* NDA_PORT */ + nla_total_size(sizeof(__be32)) /* NDA_VNI */ + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ + + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */ + nla_total_size(sizeof(struct nda_cacheinfo)); } @@ -545,15 +552,56 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, return 1; } -static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, + unsigned int off, + struct vxlanhdr *vh, size_t hdrlen, + u32 data, struct gro_remcsum *grc, + bool nopartial) +{ + size_t start, offset, plen; + + if (skb->remcsum_offload) + return NULL; + + if (!NAPI_GRO_CB(skb)->csum_valid) + return NULL; + + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; + offset = start + ((data & VXLAN_RCO_UDP) ? + offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check)); + + plen = hdrlen + offset + sizeof(u16); + + /* Pull checksum that will be written */ + if (skb_gro_header_hard(skb, off + plen)) { + vh = skb_gro_header_slow(skb, off + plen, off); + if (!vh) + return NULL; + } + + skb_gro_remcsum_process(skb, (void *)vh + hdrlen, + start, offset, grc, nopartial); + + skb->remcsum_offload = 1; + + return vh; +} + +static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff) { struct sk_buff *p, **pp = NULL; struct vxlanhdr *vh, *vh2; - struct ethhdr *eh, *eh2; - unsigned int hlen, off_vx, off_eth; - const struct packet_offload *ptype; - __be16 type; + unsigned int hlen, off_vx; int flush = 1; + struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock, + udp_offloads); + u32 flags; + struct gro_remcsum grc; + + skb_gro_remcsum_init(&grc); off_vx = skb_gro_offset(skb); hlen = off_vx + sizeof(*vh); @@ -563,15 +611,19 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff if (unlikely(!vh)) goto out; } + skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); - off_eth = skb_gro_offset(skb); - hlen = off_eth + sizeof(*eh); - eh = skb_gro_header_fast(skb, off_eth); - if (skb_gro_header_hard(skb, hlen)) { - eh = skb_gro_header_slow(skb, hlen, off_eth); - if (unlikely(!eh)) + flags = ntohl(vh->vx_flags); + + if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { + vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), + ntohl(vh->vx_vni), &grc, + !!(vs->flags & + VXLAN_F_REMCSUM_NOPARTIAL)); + + if (!vh) goto out; } @@ -582,54 +634,28 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff continue; vh2 = (struct vxlanhdr *)(p->data + off_vx); - eh2 = (struct ethhdr *)(p->data + off_eth); - if (vh->vx_vni != vh2->vx_vni || compare_ether_header(eh, eh2)) { + if (vh->vx_flags != vh2->vx_flags || + vh->vx_vni != vh2->vx_vni) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } - type = eh->h_proto; + pp = eth_gro_receive(head, skb); - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (ptype == NULL) { - flush = 1; - goto out_unlock; - } - - skb_gro_pull(skb, sizeof(*eh)); /* pull inner eth header */ - skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); - pp = ptype->callbacks.gro_receive(head, skb); - -out_unlock: - rcu_read_unlock(); out: + skb_gro_remcsum_cleanup(skb, &grc); NAPI_GRO_CB(skb)->flush |= flush; return pp; } -static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) +static int vxlan_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { - struct ethhdr *eh; - struct packet_offload *ptype; - __be16 type; - int vxlan_len = sizeof(struct vxlanhdr) + sizeof(struct ethhdr); - int err = -ENOSYS; - udp_tunnel_gro_complete(skb, nhoff); - eh = (struct ethhdr *)(skb->data + nhoff + sizeof(struct vxlanhdr)); - type = eh->h_proto; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype != NULL) - err = ptype->callbacks.gro_complete(skb, nhoff + vxlan_len); - - rcu_read_unlock(); - return err; + return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } /* Notify netdevs that UDP port started listening */ @@ -991,7 +1017,7 @@ static bool vxlan_snoop(struct net_device *dev, if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", - src_mac, &rdst->remote_ip, &src_ip); + src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; f->updated = jiffies; @@ -1131,33 +1157,103 @@ static void vxlan_igmp_leave(struct work_struct *work) dev_put(vxlan->dev); } +static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, + size_t hdrlen, u32 data, bool nopartial) +{ + size_t start, offset, plen; + + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; + offset = start + ((data & VXLAN_RCO_UDP) ? + offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check)); + + plen = hdrlen + offset + sizeof(u16); + + if (!pskb_may_pull(skb, plen)) + return NULL; + + vh = (struct vxlanhdr *)(udp_hdr(skb) + 1); + + skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset, + nopartial); + + return vh; +} + /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct vxlan_sock *vs; struct vxlanhdr *vxh; + u32 flags, vni; + struct vxlan_metadata md = {0}; /* Need Vxlan and inner Ethernet header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) goto error; - /* Return packets with reserved bits set */ vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); - if (vxh->vx_flags != htonl(VXLAN_FLAGS) || - (vxh->vx_vni & htonl(0xff))) { - netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", - ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); - goto error; + flags = ntohl(vxh->vx_flags); + vni = ntohl(vxh->vx_vni); + + if (flags & VXLAN_HF_VNI) { + flags &= ~VXLAN_HF_VNI; + } else { + /* VNI flag always required to be set */ + goto bad_flags; } if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) goto drop; + vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); vs = rcu_dereference_sk_user_data(sk); if (!vs) goto drop; - vs->rcv(vs, skb, vxh->vx_vni); + if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { + vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni, + !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); + if (!vxh) + goto drop; + + flags &= ~VXLAN_HF_RCO; + vni &= VXLAN_VNI_MASK; + } + + /* For backwards compatibility, only allow reserved fields to be + * used by VXLAN extensions if explicitly requested. + */ + if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) { + struct vxlanhdr_gbp *gbp; + + gbp = (struct vxlanhdr_gbp *)vxh; + md.gbp = ntohs(gbp->policy_id); + + if (gbp->dont_learn) + md.gbp |= VXLAN_GBP_DONT_LEARN; + + if (gbp->policy_applied) + md.gbp |= VXLAN_GBP_POLICY_APPLIED; + + flags &= ~VXLAN_GBP_USED_BITS; + } + + if (flags || vni & ~VXLAN_VNI_MASK) { + /* If there are any unprocessed flags remaining treat + * this as a malformed packet. This behavior diverges from + * VXLAN RFC (RFC7348) which stipulates that bits in reserved + * in reserved fields are to be ignored. The approach here + * maintains compatbility with previous stack code, and also + * is more robust and provides a little more security in + * adding extensions to VXLAN. + */ + + goto bad_flags; + } + + md.vni = vxh->vx_vni; + vs->rcv(vs, skb, &md); return 0; drop: @@ -1165,13 +1261,17 @@ drop: kfree_skb(skb); return 0; +bad_flags: + netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", + ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); + error: /* Return non vxlan pkt */ return 1; } -static void vxlan_rcv(struct vxlan_sock *vs, - struct sk_buff *skb, __be32 vx_vni) +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, + struct vxlan_metadata *md) { struct iphdr *oip = NULL; struct ipv6hdr *oip6 = NULL; @@ -1182,7 +1282,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, int err = 0; union vxlan_addr *remote_ip; - vni = ntohl(vx_vni) >> 8; + vni = ntohl(md->vni) >> 8; /* Is this VNI defined? */ vxlan = vxlan_vs_find_vni(vs, vni); if (!vxlan) @@ -1216,6 +1316,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, goto drop; skb_reset_network_header(skb); + skb->mark = md->gbp; if (oip6) err = IP6_ECN_decapsulate(oip6, skb); @@ -1565,20 +1666,54 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) return false; } +static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, + struct vxlan_metadata *md) +{ + struct vxlanhdr_gbp *gbp; + + if (!md->gbp) + return; + + gbp = (struct vxlanhdr_gbp *)vxh; + vxh->vx_flags |= htonl(VXLAN_HF_GBP); + + if (md->gbp & VXLAN_GBP_DONT_LEARN) + gbp->dont_learn = 1; + + if (md->gbp & VXLAN_GBP_POLICY_APPLIED) + gbp->policy_applied = 1; + + gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); +} + #if IS_ENABLED(CONFIG_IPV6) -static int vxlan6_xmit_skb(struct vxlan_sock *vs, - struct dst_entry *dst, struct sk_buff *skb, +static int vxlan6_xmit_skb(struct dst_entry *dst, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, __u8 prio, __u8 ttl, - __be16 src_port, __be16 dst_port, __be32 vni, - bool xnet) + __be16 src_port, __be16 dst_port, + struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; int min_headroom; int err; - bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk); + bool udp_sum = !(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX); + int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + u16 hdrlen = sizeof(struct vxlanhdr); + + if ((vxflags & VXLAN_F_REMCSUM_TX) && + skb->ip_summed == CHECKSUM_PARTIAL) { + int csum_start = skb_checksum_start_offset(skb); + + if (csum_start <= VXLAN_MAX_REMCSUM_START && + !(csum_start & VXLAN_RCO_SHIFT_MASK) && + (skb->csum_offset == offsetof(struct udphdr, check) || + skb->csum_offset == offsetof(struct tcphdr, check))) { + udp_sum = false; + type |= SKB_GSO_TUNNEL_REMCSUM; + } + } - skb = udp_tunnel_handle_offloads(skb, udp_sum); + skb = iptunnel_handle_offloads(skb, udp_sum, type); if (IS_ERR(skb)) { err = -EINVAL; goto err; @@ -1588,7 +1723,7 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + VXLAN_HLEN + sizeof(struct ipv6hdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); @@ -1604,13 +1739,33 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, } vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); - vxh->vx_flags = htonl(VXLAN_FLAGS); - vxh->vx_vni = vni; + vxh->vx_flags = htonl(VXLAN_HF_VNI); + vxh->vx_vni = md->vni; + + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> + VXLAN_RCO_SHIFT; + + if (skb->csum_offset == offsetof(struct udphdr, check)) + data |= VXLAN_RCO_UDP; + + vxh->vx_vni |= htonl(data); + vxh->vx_flags |= htonl(VXLAN_HF_RCO); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + } + + if (vxflags & VXLAN_F_GBP) + vxlan_build_gbp_hdr(vxh, vxflags, md); skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio, - ttl, src_port, dst_port); + udp_tunnel6_xmit_skb(dst, skb, dev, saddr, daddr, prio, + ttl, src_port, dst_port, + !!(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX)); return 0; err: dst_release(dst); @@ -1618,23 +1773,38 @@ err: } #endif -int vxlan_xmit_skb(struct vxlan_sock *vs, - struct rtable *rt, struct sk_buff *skb, +int vxlan_xmit_skb(struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, - __be16 src_port, __be16 dst_port, __be32 vni, bool xnet) + __be16 src_port, __be16 dst_port, + struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; int min_headroom; int err; - bool udp_sum = !vs->sock->sk->sk_no_check_tx; + bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM); + int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + u16 hdrlen = sizeof(struct vxlanhdr); + + if ((vxflags & VXLAN_F_REMCSUM_TX) && + skb->ip_summed == CHECKSUM_PARTIAL) { + int csum_start = skb_checksum_start_offset(skb); + + if (csum_start <= VXLAN_MAX_REMCSUM_START && + !(csum_start & VXLAN_RCO_SHIFT_MASK) && + (skb->csum_offset == offsetof(struct udphdr, check) || + skb->csum_offset == offsetof(struct tcphdr, check))) { + udp_sum = false; + type |= SKB_GSO_TUNNEL_REMCSUM; + } + } - skb = udp_tunnel_handle_offloads(skb, udp_sum); + skb = iptunnel_handle_offloads(skb, udp_sum, type); if (IS_ERR(skb)) return PTR_ERR(skb); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + VXLAN_HLEN + sizeof(struct iphdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); @@ -1648,13 +1818,33 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, return -ENOMEM; vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); - vxh->vx_flags = htonl(VXLAN_FLAGS); - vxh->vx_vni = vni; + vxh->vx_flags = htonl(VXLAN_HF_VNI); + vxh->vx_vni = md->vni; + + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> + VXLAN_RCO_SHIFT; + + if (skb->csum_offset == offsetof(struct udphdr, check)) + data |= VXLAN_RCO_UDP; + + vxh->vx_vni |= htonl(data); + vxh->vx_flags |= htonl(VXLAN_HF_RCO); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + } + + if (vxflags & VXLAN_F_GBP) + vxlan_build_gbp_hdr(vxh, vxflags, md); skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos, - ttl, df, src_port, dst_port, xnet); + return udp_tunnel_xmit_skb(rt, skb, src, dst, tos, + ttl, df, src_port, dst_port, xnet, + !(vxflags & VXLAN_F_UDP_CSUM)); } EXPORT_SYMBOL_GPL(vxlan_xmit_skb); @@ -1711,6 +1901,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, const struct iphdr *old_iph; struct flowi4 fl4; union vxlan_addr *dst; + struct vxlan_metadata md; __be16 src_port = 0, dst_port; u32 vni; __be16 df = 0; @@ -1772,7 +1963,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ip_rt_put(rt); dst_vxlan = vxlan_find_vni(vxlan->net, vni, - dst->sa.sa_family, dst_port); + dst->sa.sa_family, dst_port, + vxlan->flags); if (!dst_vxlan) goto tx_error; vxlan_encap_bypass(skb, vxlan, dst_vxlan); @@ -1781,12 +1973,14 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - - err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb, - fl4.saddr, dst->sin.sin_addr.s_addr, - tos, ttl, df, src_port, dst_port, - htonl(vni << 8), - !net_eq(vxlan->net, dev_net(vxlan->dev))); + md.vni = htonl(vni << 8); + md.gbp = skb->mark; + + err = vxlan_xmit_skb(rt, skb, fl4.saddr, + dst->sin.sin_addr.s_addr, tos, ttl, df, + src_port, dst_port, &md, + !net_eq(vxlan->net, dev_net(vxlan->dev)), + vxlan->flags); if (err < 0) { /* skb is already freed. */ skb = NULL; @@ -1830,7 +2024,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_release(ndst); dst_vxlan = vxlan_find_vni(vxlan->net, vni, - dst->sa.sa_family, dst_port); + dst->sa.sa_family, dst_port, + vxlan->flags); if (!dst_vxlan) goto tx_error; vxlan_encap_bypass(skb, vxlan, dst_vxlan); @@ -1838,11 +2033,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ttl = ttl ? : ip6_dst_hoplimit(ndst); + md.vni = htonl(vni << 8); + md.gbp = skb->mark; - err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb, - dev, &fl6.saddr, &fl6.daddr, 0, ttl, - src_port, dst_port, htonl(vni << 8), - !net_eq(vxlan->net, dev_net(vxlan->dev))); + err = vxlan6_xmit_skb(ndst, skb, dev, &fl6.saddr, &fl6.daddr, + 0, ttl, src_port, dst_port, &md, + !net_eq(vxlan->net, dev_net(vxlan->dev)), + vxlan->flags); #endif } @@ -1998,7 +2195,7 @@ static int vxlan_init(struct net_device *dev) spin_lock(&vn->sock_lock); vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, - vxlan->dst_port); + vxlan->dst_port, vxlan->flags); if (vs && atomic_add_unless(&vs->refcnt, 1, 0)) { /* If we have a socket with same port already, reuse it */ vxlan_vs_add_dev(vs, vxlan); @@ -2242,6 +2439,10 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, + [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, + [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, + [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, + [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -2311,15 +2512,11 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, if (ipv6) { udp_conf.family = AF_INET6; - udp_conf.use_udp6_tx_checksums = - !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); udp_conf.use_udp6_rx_checksums = !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); } else { udp_conf.family = AF_INET; udp_conf.local_ip.s_addr = INADDR_ANY; - udp_conf.use_udp_checksums = - !!(flags & VXLAN_F_UDP_CSUM); } udp_conf.local_udp_port = port; @@ -2363,6 +2560,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, atomic_set(&vs->refcnt, 1); vs->rcv = rcv; vs->data = data; + vs->flags = (flags & VXLAN_F_RCV_FLAGS); /* Initialize the vxlan udp offloads structure */ vs->udp_offloads.port = port; @@ -2401,7 +2599,7 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, return vs; spin_lock(&vn->sock_lock); - vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); + vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, flags); if (vs && ((vs->rcv != rcv) || !atomic_add_unless(&vs->refcnt, 1, 0))) vs = ERR_PTR(-EBUSY); @@ -2432,10 +2630,10 @@ static void vxlan_sock_work(struct work_struct *work) dev_put(vxlan->dev); } -static int vxlan_newlink(struct net *net, struct net_device *dev, +static int vxlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; __u32 vni; @@ -2445,7 +2643,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, if (!data[IFLA_VXLAN_ID]) return -EINVAL; - vxlan->net = dev_net(dev); + vxlan->net = src_net; vni = nla_get_u32(data[IFLA_VXLAN_ID]); dst->remote_vni = vni; @@ -2481,7 +2679,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, if (data[IFLA_VXLAN_LINK] && (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) { struct net_device *lowerdev - = __dev_get_by_index(net, dst->remote_ifindex); + = __dev_get_by_index(src_net, dst->remote_ifindex); if (!lowerdev) { pr_info("ifindex %d does not exist\n", dst->remote_ifindex); @@ -2557,8 +2755,22 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; - if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET, - vxlan->dst_port)) { + if (data[IFLA_VXLAN_REMCSUM_TX] && + nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX])) + vxlan->flags |= VXLAN_F_REMCSUM_TX; + + if (data[IFLA_VXLAN_REMCSUM_RX] && + nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX])) + vxlan->flags |= VXLAN_F_REMCSUM_RX; + + if (data[IFLA_VXLAN_GBP]) + vxlan->flags |= VXLAN_F_GBP; + + if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) + vxlan->flags |= VXLAN_F_REMCSUM_NOPARTIAL; + + if (vxlan_find_vni(src_net, vni, use_ipv6 ? AF_INET6 : AF_INET, + vxlan->dst_port, vxlan->flags)) { pr_info("duplicate VNI %u\n", vni); return -EEXIST; } @@ -2625,6 +2837,8 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ 0; } @@ -2690,18 +2904,37 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, - !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX))) + !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || + nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, + !!(vxlan->flags & VXLAN_F_REMCSUM_TX)) || + nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, + !!(vxlan->flags & VXLAN_F_REMCSUM_RX))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) goto nla_put_failure; + if (vxlan->flags & VXLAN_F_GBP && + nla_put_flag(skb, IFLA_VXLAN_GBP)) + goto nla_put_failure; + + if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL && + nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) + goto nla_put_failure; + return 0; nla_put_failure: return -EMSGSIZE; } +static struct net *vxlan_get_link_net(const struct net_device *dev) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + + return vxlan->net; +} + static struct rtnl_link_ops vxlan_link_ops __read_mostly = { .kind = "vxlan", .maxtype = IFLA_VXLAN_MAX, @@ -2713,6 +2946,7 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = { .dellink = vxlan_dellink, .get_size = vxlan_get_size, .fill_info = vxlan_fill_info, + .get_link_net = vxlan_get_link_net, }; static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, |