summaryrefslogtreecommitdiff
path: root/drivers/net/vxlan.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-07-05 22:31:59 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2017-07-05 22:31:59 +0300
commit5518b69b76680a4f2df96b1deca260059db0c2de (patch)
treef33cd1519c8efb4590500f2f9617400be233238c /drivers/net/vxlan.c
parent8ad06e56dcbc1984ef0ff8f6e3c19982c5809f73 (diff)
parent0e72582270c07850b92cac351c8b97d4f9c123b9 (diff)
downloadlinux-5518b69b76680a4f2df96b1deca260059db0c2de.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Reasonably busy this cycle, but perhaps not as busy as in the 4.12 merge window: 1) Several optimizations for UDP processing under high load from Paolo Abeni. 2) Support pacing internally in TCP when using the sch_fq packet scheduler for this is not practical. From Eric Dumazet. 3) Support mutliple filter chains per qdisc, from Jiri Pirko. 4) Move to 1ms TCP timestamp clock, from Eric Dumazet. 5) Add batch dequeueing to vhost_net, from Jason Wang. 6) Flesh out more completely SCTP checksum offload support, from Davide Caratti. 7) More plumbing of extended netlink ACKs, from David Ahern, Pablo Neira Ayuso, and Matthias Schiffer. 8) Add devlink support to nfp driver, from Simon Horman. 9) Add RTM_F_FIB_MATCH flag to RTM_GETROUTE queries, from Roopa Prabhu. 10) Add stack depth tracking to BPF verifier and use this information in the various eBPF JITs. From Alexei Starovoitov. 11) Support XDP on qed device VFs, from Yuval Mintz. 12) Introduce BPF PROG ID for better introspection of installed BPF programs. From Martin KaFai Lau. 13) Add bpf_set_hash helper for TC bpf programs, from Daniel Borkmann. 14) For loads, allow narrower accesses in bpf verifier checking, from Yonghong Song. 15) Support MIPS in the BPF selftests and samples infrastructure, the MIPS eBPF JIT will be merged in via the MIPS GIT tree. From David Daney. 16) Support kernel based TLS, from Dave Watson and others. 17) Remove completely DST garbage collection, from Wei Wang. 18) Allow installing TCP MD5 rules using prefixes, from Ivan Delalande. 19) Add XDP support to Intel i40e driver, from Björn Töpel 20) Add support for TC flower offload in nfp driver, from Simon Horman, Pieter Jansen van Vuuren, Benjamin LaHaise, Jakub Kicinski, and Bert van Leeuwen. 21) IPSEC offloading support in mlx5, from Ilan Tayari. 22) Add HW PTP support to macb driver, from Rafal Ozieblo. 23) Networking refcount_t conversions, From Elena Reshetova. 24) Add sock_ops support to BPF, from Lawrence Brako. This is useful for tuning the TCP sockopt settings of a group of applications, currently via CGROUPs" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1899 commits) net: phy: dp83867: add workaround for incorrect RX_CTRL pin strap dt-bindings: phy: dp83867: provide a workaround for incorrect RX_CTRL pin strap cxgb4: Support for get_ts_info ethtool method cxgb4: Add PTP Hardware Clock (PHC) support cxgb4: time stamping interface for PTP nfp: default to chained metadata prepend format nfp: remove legacy MAC address lookup nfp: improve order of interfaces in breakout mode net: macb: remove extraneous return when MACB_EXT_DESC is defined bpf: add missing break in for the TCP_BPF_SNDCWND_CLAMP case bpf: fix return in load_bpf_file mpls: fix rtm policy in mpls_getroute net, ax25: convert ax25_cb.refcount from atomic_t to refcount_t net, ax25: convert ax25_route.refcount from atomic_t to refcount_t net, ax25: convert ax25_uid_assoc.refcount from atomic_t to refcount_t net, sctp: convert sctp_ep_common.refcnt from atomic_t to refcount_t net, sctp: convert sctp_transport.refcnt from atomic_t to refcount_t net, sctp: convert sctp_chunk.refcnt from atomic_t to refcount_t net, sctp: convert sctp_datamsg.refcnt from atomic_t to refcount_t net, sctp: convert sctp_auth_bytes.refcnt from atomic_t to refcount_t ...
Diffstat (limited to 'drivers/net/vxlan.c')
-rw-r--r--drivers/net/vxlan.c472
1 files changed, 288 insertions, 184 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 5fa798a5c9a6..96aa7e6cf214 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -226,26 +226,37 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
return NULL;
}
-static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, __be32 vni)
+static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
+ __be32 vni)
{
- struct vxlan_dev *vxlan;
+ struct vxlan_dev_node *node;
/* For flow based devices, map all packets to VNI 0 */
if (vs->flags & VXLAN_F_COLLECT_METADATA)
vni = 0;
- hlist_for_each_entry_rcu(vxlan, vni_head(vs, vni), hlist) {
- if (vxlan->default_dst.remote_vni == vni)
- return vxlan;
+ hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
+ if (node->vxlan->default_dst.remote_vni != vni)
+ continue;
+
+ if (IS_ENABLED(CONFIG_IPV6)) {
+ const struct vxlan_config *cfg = &node->vxlan->cfg;
+
+ if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) &&
+ cfg->remote_ifindex != ifindex)
+ continue;
+ }
+
+ return node->vxlan;
}
return NULL;
}
/* Look up VNI in a per net namespace table */
-static struct vxlan_dev *vxlan_find_vni(struct net *net, __be32 vni,
- sa_family_t family, __be16 port,
- u32 flags)
+static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
+ __be32 vni, sa_family_t family,
+ __be16 port, u32 flags)
{
struct vxlan_sock *vs;
@@ -253,7 +264,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, __be32 vni,
if (!vs)
return NULL;
- return vxlan_vs_find_vni(vs, vni);
+ return vxlan_vs_find_vni(vs, ifindex, vni);
}
/* Fill in neighbour message in skbuff. */
@@ -305,7 +316,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
goto nla_put_failure;
- if ((vxlan->flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
+ if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
nla_put_u32(skb, NDA_SRC_VNI,
be32_to_cpu(fdb->vni)))
goto nla_put_failure;
@@ -419,7 +430,7 @@ static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
const u8 *mac, __be32 vni)
{
- if (vxlan->flags & VXLAN_F_COLLECT_METADATA)
+ if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
return &vxlan->fdb_head[eth_vni_hash(mac, vni)];
else
return &vxlan->fdb_head[eth_hash(mac)];
@@ -434,7 +445,7 @@ static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
hlist_for_each_entry_rcu(f, head, hlist) {
if (ether_addr_equal(mac, f->eth_addr)) {
- if (vxlan->flags & VXLAN_F_COLLECT_METADATA) {
+ if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
if (vni == f->vni)
return f;
} else {
@@ -957,20 +968,28 @@ out:
*/
static bool vxlan_snoop(struct net_device *dev,
union vxlan_addr *src_ip, const u8 *src_mac,
- __be32 vni)
+ u32 src_ifindex, __be32 vni)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_fdb *f;
+ u32 ifindex = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (src_ip->sa.sa_family == AF_INET6 &&
+ (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ ifindex = src_ifindex;
+#endif
f = vxlan_find_mac(vxlan, src_mac, vni);
if (likely(f)) {
struct vxlan_rdst *rdst = first_remote_rcu(f);
- if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip)))
+ if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
+ rdst->remote_ifindex == ifindex))
return false;
/* Don't migrate static entries, drop packets */
- if (f->state & NUD_NOARP)
+ if (f->state & (NUD_PERMANENT | NUD_NOARP))
return true;
if (net_ratelimit())
@@ -993,7 +1012,7 @@ static bool vxlan_snoop(struct net_device *dev,
vxlan->cfg.dst_port,
vni,
vxlan->default_dst.remote_vni,
- 0, NTF_SELF);
+ ifindex, NTF_SELF);
spin_unlock(&vxlan->hash_lock);
}
@@ -1015,11 +1034,11 @@ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
/* The vxlan_sock is only used by dev, leaving group has
* no effect on other vxlan devices.
*/
- if (family == AF_INET && sock4 && atomic_read(&sock4->refcnt) == 1)
+ if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
return false;
#if IS_ENABLED(CONFIG_IPV6)
sock6 = rtnl_dereference(dev->vn6_sock);
- if (family == AF_INET6 && sock6 && atomic_read(&sock6->refcnt) == 1)
+ if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
return false;
#endif
@@ -1056,7 +1075,7 @@ static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
if (!vs)
return false;
- if (!atomic_dec_and_test(&vs->refcnt))
+ if (!refcount_dec_and_test(&vs->refcnt))
return false;
vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
@@ -1077,10 +1096,10 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan)
#if IS_ENABLED(CONFIG_IPV6)
struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
- rcu_assign_pointer(vxlan->vn6_sock, NULL);
+ RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
#endif
- rcu_assign_pointer(vxlan->vn4_sock, NULL);
+ RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
synchronize_net();
vxlan_vs_del_dev(vxlan);
@@ -1264,6 +1283,7 @@ static bool vxlan_set_mac(struct vxlan_dev *vxlan,
struct sk_buff *skb, __be32 vni)
{
union vxlan_addr saddr;
+ u32 ifindex = skb->dev->ifindex;
skb_reset_mac_header(skb);
skb->protocol = eth_type_trans(skb, vxlan->dev);
@@ -1284,8 +1304,8 @@ static bool vxlan_set_mac(struct vxlan_dev *vxlan,
#endif
}
- if ((vxlan->flags & VXLAN_F_LEARN) &&
- vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, vni))
+ if ((vxlan->cfg.flags & VXLAN_F_LEARN) &&
+ vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni))
return false;
return true;
@@ -1351,7 +1371,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
- vxlan = vxlan_vs_find_vni(vs, vni);
+ vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
if (!vxlan)
goto drop;
@@ -1507,7 +1527,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
if (netif_rx_ni(reply) == NET_RX_DROP)
dev->stats.rx_dropped++;
- } else if (vxlan->flags & VXLAN_F_L3MISS) {
+ } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
union vxlan_addr ipa = {
.sin.sin_addr.s_addr = tip,
.sin.sin_family = AF_INET,
@@ -1584,10 +1604,8 @@ static struct sk_buff *vxlan_na_create(struct sk_buff *request,
skb_pull(reply, sizeof(struct ipv6hdr));
skb_reset_transport_header(reply);
- na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen);
-
/* Neighbor Advertisement */
- memset(na, 0, sizeof(*na)+na_olen);
+ na = skb_put_zero(reply, sizeof(*na) + na_olen);
na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
na->icmph.icmp6_router = isrouter;
na->icmph.icmp6_override = 1;
@@ -1667,7 +1685,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
if (netif_rx_ni(reply) == NET_RX_DROP)
dev->stats.rx_dropped++;
- } else if (vxlan->flags & VXLAN_F_L3MISS) {
+ } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
union vxlan_addr ipa = {
.sin6.sin6_addr = msg->target,
.sin6.sin6_family = AF_INET6,
@@ -1700,7 +1718,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
return false;
pip = ip_hdr(skb);
n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
- if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
+ if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
union vxlan_addr ipa = {
.sin.sin_addr.s_addr = pip->daddr,
.sin.sin_family = AF_INET,
@@ -1721,7 +1739,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
return false;
pip6 = ipv6_hdr(skb);
n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
- if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
+ if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
union vxlan_addr ipa = {
.sin6.sin6_addr = pip6->daddr,
.sin6.sin6_family = AF_INET6,
@@ -1829,7 +1847,7 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
if (err)
return err;
- vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+ vxh = __skb_push(skb, sizeof(*vxh));
vxh->vx_flags = VXLAN_HF_VNI;
vxh->vx_vni = vxlan_vni_field(vni);
@@ -1995,8 +2013,9 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
#endif
}
- if (dst_vxlan->flags & VXLAN_F_LEARN)
- vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source, vni);
+ if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
+ vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source, 0,
+ vni);
u64_stats_update_begin(&tx_stats->syncp);
tx_stats->tx_packets++;
@@ -2014,8 +2033,10 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
}
static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
- struct vxlan_dev *vxlan, union vxlan_addr *daddr,
- __be16 dst_port, __be32 vni, struct dst_entry *dst,
+ struct vxlan_dev *vxlan,
+ union vxlan_addr *daddr,
+ __be16 dst_port, int dst_ifindex, __be32 vni,
+ struct dst_entry *dst,
u32 rt_flags)
{
#if IS_ENABLED(CONFIG_IPV6)
@@ -2031,9 +2052,9 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
struct vxlan_dev *dst_vxlan;
dst_release(dst);
- dst_vxlan = vxlan_find_vni(vxlan->net, vni,
+ dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
daddr->sa.sa_family, dst_port,
- vxlan->flags);
+ vxlan->cfg.flags);
if (!dst_vxlan) {
dev->stats.tx_errors++;
kfree_skb(skb);
@@ -2063,8 +2084,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
struct dst_entry *ndst = NULL;
__be32 vni, label;
__u8 tos, ttl;
+ int ifindex;
int err;
- u32 flags = vxlan->flags;
+ u32 flags = vxlan->cfg.flags;
bool udp_sum = false;
bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
@@ -2083,6 +2105,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
vni = (rdst->remote_vni) ? : default_vni;
+ ifindex = rdst->remote_ifindex;
local_ip = vxlan->cfg.saddr;
dst_cache = &rdst->dst_cache;
md->gbp = skb->mark;
@@ -2116,6 +2139,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
dst = &remote_ip;
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
vni = tunnel_id_to_key32(info->key.tun_id);
+ ifindex = 0;
dst_cache = &info->dst_cache;
if (info->options_len)
md = ip_tunnel_info_opts(info);
@@ -2133,8 +2157,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
struct rtable *rt;
__be16 df = 0;
- rt = vxlan_get_route(vxlan, dev, sock4, skb,
- rdst ? rdst->remote_ifindex : 0, tos,
+ rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
dst->sin.sin_addr.s_addr,
&local_ip.sin.sin_addr.s_addr,
dst_port, src_port,
@@ -2147,8 +2170,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
/* Bypass encapsulation if the destination is local */
if (!info) {
err = encap_bypass_if_local(skb, dev, vxlan, dst,
- dst_port, vni, &rt->dst,
- rt->rt_flags);
+ dst_port, ifindex, vni,
+ &rt->dst, rt->rt_flags);
if (err)
goto out_unlock;
} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
@@ -2170,8 +2193,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
} else {
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
- ndst = vxlan6_get_route(vxlan, dev, sock6, skb,
- rdst ? rdst->remote_ifindex : 0, tos,
+ ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
label, &dst->sin6.sin6_addr,
&local_ip.sin6.sin6_addr,
dst_port, src_port,
@@ -2186,8 +2208,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
err = encap_bypass_if_local(skb, dev, vxlan, dst,
- dst_port, vni, ndst,
- rt6i_flags);
+ dst_port, ifindex, vni,
+ ndst, rt6i_flags);
if (err)
goto out_unlock;
}
@@ -2246,7 +2268,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
skb_reset_mac_header(skb);
- if (vxlan->flags & VXLAN_F_COLLECT_METADATA) {
+ if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
info->mode & IP_TUNNEL_INFO_TX) {
vni = tunnel_id_to_key32(info->key.tun_id);
@@ -2259,7 +2281,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
}
}
- if (vxlan->flags & VXLAN_F_PROXY) {
+ if (vxlan->cfg.flags & VXLAN_F_PROXY) {
eth = eth_hdr(skb);
if (ntohs(eth->h_proto) == ETH_P_ARP)
return arp_reduce(dev, skb, vni);
@@ -2279,7 +2301,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
f = vxlan_find_mac(vxlan, eth->h_dest, vni);
did_rsc = false;
- if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) &&
+ if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
(ntohs(eth->h_proto) == ETH_P_IP ||
ntohs(eth->h_proto) == ETH_P_IPV6)) {
did_rsc = route_shortcircuit(dev, skb);
@@ -2290,7 +2312,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
if (f == NULL) {
f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
if (f == NULL) {
- if ((vxlan->flags & VXLAN_F_L2MISS) &&
+ if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
!is_multicast_ether_addr(eth->h_dest))
vxlan_fdb_miss(vxlan, eth->h_dest);
@@ -2365,17 +2387,22 @@ static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
spin_lock(&vn->sock_lock);
- hlist_del_init_rcu(&vxlan->hlist);
+ hlist_del_init_rcu(&vxlan->hlist4.hlist);
+#if IS_ENABLED(CONFIG_IPV6)
+ hlist_del_init_rcu(&vxlan->hlist6.hlist);
+#endif
spin_unlock(&vn->sock_lock);
}
-static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan)
+static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
+ struct vxlan_dev_node *node)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
__be32 vni = vxlan->default_dst.remote_vni;
+ node->vxlan = vxlan;
spin_lock(&vn->sock_lock);
- hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
+ hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
spin_unlock(&vn->sock_lock);
}
@@ -2486,10 +2513,7 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
struct vxlan_rdst *dst = &vxlan->default_dst;
struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
dst->remote_ifindex);
- bool use_ipv6 = false;
-
- if (dst->remote_ip.sa.sa_family == AF_INET6)
- use_ipv6 = true;
+ bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6);
/* This check is different than dev->max_mtu, because it looks at
* the lowerdev->mtu, rather than the static dev->max_mtu
@@ -2625,6 +2649,10 @@ static void vxlan_setup(struct net_device *dev)
netif_keep_dst(dev);
dev->priv_flags |= IFF_NO_QUEUE;
+ /* MTU range: 68 - 65535 */
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = ETH_MAX_MTU;
+
INIT_LIST_HEAD(&vxlan->next);
spin_lock_init(&vxlan->hash_lock);
@@ -2632,8 +2660,6 @@ static void vxlan_setup(struct net_device *dev)
vxlan->age_timer.function = vxlan_cleanup;
vxlan->age_timer.data = (unsigned long) vxlan;
- vxlan->cfg.dst_port = htons(vxlan_port);
-
vxlan->dev = dev;
gro_cells_init(&vxlan->gro_cells, dev);
@@ -2689,7 +2715,8 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
[IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG },
};
-static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
+static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
if (tb[IFLA_ADDRESS]) {
if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
@@ -2703,11 +2730,19 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
}
}
+ if (tb[IFLA_MTU]) {
+ u32 mtu = nla_get_u32(tb[IFLA_MTU]);
+
+ if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU)
+ return -EINVAL;
+ }
+
if (!data)
return -EINVAL;
if (data[IFLA_VXLAN_ID]) {
- __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+ u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+
if (id >= VXLAN_N_VID)
return -ERANGE;
}
@@ -2790,7 +2825,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
}
vs->sock = sock;
- atomic_set(&vs->refcnt, 1);
+ refcount_set(&vs->refcnt, 1);
vs->flags = (flags & VXLAN_F_RCV_FLAGS);
spin_lock(&vn->sock_lock);
@@ -2819,12 +2854,13 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
struct vxlan_sock *vs = NULL;
+ struct vxlan_dev_node *node;
if (!vxlan->cfg.no_share) {
spin_lock(&vn->sock_lock);
vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
- vxlan->cfg.dst_port, vxlan->flags);
- if (vs && !atomic_add_unless(&vs->refcnt, 1, 0)) {
+ vxlan->cfg.dst_port, vxlan->cfg.flags);
+ if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
spin_unlock(&vn->sock_lock);
return -EBUSY;
}
@@ -2832,23 +2868,27 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
}
if (!vs)
vs = vxlan_socket_create(vxlan->net, ipv6,
- vxlan->cfg.dst_port, vxlan->flags);
+ vxlan->cfg.dst_port, vxlan->cfg.flags);
if (IS_ERR(vs))
return PTR_ERR(vs);
#if IS_ENABLED(CONFIG_IPV6)
- if (ipv6)
+ if (ipv6) {
rcu_assign_pointer(vxlan->vn6_sock, vs);
- else
+ node = &vxlan->hlist6;
+ } else
#endif
+ {
rcu_assign_pointer(vxlan->vn4_sock, vs);
- vxlan_vs_add_dev(vs, vxlan);
+ node = &vxlan->hlist4;
+ }
+ vxlan_vs_add_dev(vs, vxlan, node);
return 0;
}
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
- bool metadata = vxlan->flags & VXLAN_F_COLLECT_METADATA;
- bool ipv6 = vxlan->flags & VXLAN_F_IPV6 || metadata;
+ bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
+ bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
bool ipv4 = !ipv6 || metadata;
int ret = 0;
@@ -2868,116 +2908,176 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan)
return ret;
}
-static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
- struct vxlan_config *conf,
- bool changelink)
+static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
+ struct net_device **lower,
+ struct vxlan_dev *old)
{
struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
- struct vxlan_dev *vxlan = netdev_priv(dev), *tmp;
- struct vxlan_rdst *dst = &vxlan->default_dst;
- unsigned short needed_headroom = ETH_HLEN;
+ struct vxlan_dev *tmp;
bool use_ipv6 = false;
- __be16 default_port = vxlan->cfg.dst_port;
- struct net_device *lowerdev = NULL;
- if (!changelink) {
- if (conf->flags & VXLAN_F_GPE) {
- /* For now, allow GPE only together with
- * COLLECT_METADATA. This can be relaxed later; in such
- * case, the other side of the PtP link will have to be
- * provided.
- */
- if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
- !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
- pr_info("unsupported combination of extensions\n");
- return -EINVAL;
- }
- vxlan_raw_setup(dev);
- } else {
- vxlan_ether_setup(dev);
+ if (conf->flags & VXLAN_F_GPE) {
+ /* For now, allow GPE only together with
+ * COLLECT_METADATA. This can be relaxed later; in such
+ * case, the other side of the PtP link will have to be
+ * provided.
+ */
+ if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
+ !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
+ return -EINVAL;
}
-
- /* MTU range: 68 - 65535 */
- dev->min_mtu = ETH_MIN_MTU;
- dev->max_mtu = ETH_MAX_MTU;
- vxlan->net = src_net;
}
- dst->remote_vni = conf->vni;
+ if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
+ /* Unless IPv6 is explicitly requested, assume IPv4 */
+ conf->remote_ip.sa.sa_family = AF_INET;
+ conf->saddr.sa.sa_family = AF_INET;
+ } else if (!conf->remote_ip.sa.sa_family) {
+ conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
+ } else if (!conf->saddr.sa.sa_family) {
+ conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
+ }
- memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));
+ if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family)
+ return -EINVAL;
- /* Unless IPv6 is explicitly requested, assume IPv4 */
- if (!dst->remote_ip.sa.sa_family)
- dst->remote_ip.sa.sa_family = AF_INET;
+ if (vxlan_addr_multicast(&conf->saddr))
+ return -EINVAL;
- if (dst->remote_ip.sa.sa_family == AF_INET6 ||
- vxlan->cfg.saddr.sa.sa_family == AF_INET6) {
+ if (conf->saddr.sa.sa_family == AF_INET6) {
if (!IS_ENABLED(CONFIG_IPV6))
return -EPFNOSUPPORT;
use_ipv6 = true;
- vxlan->flags |= VXLAN_F_IPV6;
+ conf->flags |= VXLAN_F_IPV6;
+
+ if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
+ int local_type =
+ ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
+ int remote_type =
+ ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);
+
+ if (local_type & IPV6_ADDR_LINKLOCAL) {
+ if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
+ (remote_type != IPV6_ADDR_ANY))
+ return -EINVAL;
+
+ conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
+ } else {
+ if (remote_type ==
+ (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL))
+ return -EINVAL;
+
+ conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
+ }
+ }
}
- if (conf->label && !use_ipv6) {
- pr_info("label only supported in use with IPv6\n");
+ if (conf->label && !use_ipv6)
return -EINVAL;
- }
- if (conf->remote_ifindex &&
- conf->remote_ifindex != vxlan->cfg.remote_ifindex) {
- lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
- dst->remote_ifindex = conf->remote_ifindex;
+ if (conf->remote_ifindex) {
+ struct net_device *lowerdev;
- if (!lowerdev) {
- pr_info("ifindex %d does not exist\n",
- dst->remote_ifindex);
+ lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
+ if (!lowerdev)
return -ENODEV;
- }
#if IS_ENABLED(CONFIG_IPV6)
if (use_ipv6) {
struct inet6_dev *idev = __in6_dev_get(lowerdev);
- if (idev && idev->cnf.disable_ipv6) {
- pr_info("IPv6 is disabled via sysctl\n");
+ if (idev && idev->cnf.disable_ipv6)
return -EPERM;
- }
}
#endif
- if (!conf->mtu)
- dev->mtu = lowerdev->mtu -
- (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
+ *lower = lowerdev;
+ } else {
+ if (vxlan_addr_multicast(&conf->remote_ip))
+ return -EINVAL;
- needed_headroom = lowerdev->hard_header_len;
- } else if (!conf->remote_ifindex &&
- vxlan_addr_multicast(&dst->remote_ip)) {
- pr_info("multicast destination requires interface to be specified\n");
- return -EINVAL;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (conf->flags & VXLAN_F_IPV6_LINKLOCAL)
+ return -EINVAL;
+#endif
+
+ *lower = NULL;
}
- if (lowerdev) {
- dev->gso_max_size = lowerdev->gso_max_size;
- dev->gso_max_segs = lowerdev->gso_max_segs;
+ if (!conf->dst_port) {
+ if (conf->flags & VXLAN_F_GPE)
+ conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */
+ else
+ conf->dst_port = htons(vxlan_port);
}
- if (conf->mtu) {
- int max_mtu = ETH_MAX_MTU;
+ if (!conf->age_interval)
+ conf->age_interval = FDB_AGE_DEFAULT;
- if (lowerdev)
- max_mtu = lowerdev->mtu;
+ list_for_each_entry(tmp, &vn->vxlan_list, next) {
+ if (tmp == old)
+ continue;
- max_mtu -= (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
+ if (tmp->cfg.vni != conf->vni)
+ continue;
+ if (tmp->cfg.dst_port != conf->dst_port)
+ continue;
+ if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
+ (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
+ continue;
- if (conf->mtu < dev->min_mtu || conf->mtu > dev->max_mtu)
- return -EINVAL;
+ if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
+ tmp->cfg.remote_ifindex != conf->remote_ifindex)
+ continue;
+
+ return -EEXIST;
+ }
- dev->mtu = conf->mtu;
+ return 0;
+}
- if (conf->mtu > max_mtu)
- dev->mtu = max_mtu;
+static void vxlan_config_apply(struct net_device *dev,
+ struct vxlan_config *conf,
+ struct net_device *lowerdev,
+ struct net *src_net,
+ bool changelink)
+{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_rdst *dst = &vxlan->default_dst;
+ unsigned short needed_headroom = ETH_HLEN;
+ bool use_ipv6 = !!(conf->flags & VXLAN_F_IPV6);
+ int max_mtu = ETH_MAX_MTU;
+
+ if (!changelink) {
+ if (conf->flags & VXLAN_F_GPE)
+ vxlan_raw_setup(dev);
+ else
+ vxlan_ether_setup(dev);
+
+ if (conf->mtu)
+ dev->mtu = conf->mtu;
+
+ vxlan->net = src_net;
}
+ dst->remote_vni = conf->vni;
+
+ memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));
+
+ if (lowerdev) {
+ dst->remote_ifindex = conf->remote_ifindex;
+
+ dev->gso_max_size = lowerdev->gso_max_size;
+ dev->gso_max_segs = lowerdev->gso_max_segs;
+
+ needed_headroom = lowerdev->hard_header_len;
+
+ max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
+ VXLAN_HEADROOM);
+ }
+
+ if (dev->mtu > max_mtu)
+ dev->mtu = max_mtu;
+
if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA)
needed_headroom += VXLAN6_HEADROOM;
else
@@ -2985,31 +3085,21 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
dev->needed_headroom = needed_headroom;
memcpy(&vxlan->cfg, conf, sizeof(*conf));
- if (!vxlan->cfg.dst_port) {
- if (conf->flags & VXLAN_F_GPE)
- vxlan->cfg.dst_port = htons(4790); /* IANA VXLAN-GPE port */
- else
- vxlan->cfg.dst_port = default_port;
- }
- vxlan->flags |= conf->flags;
+}
- if (!vxlan->cfg.age_interval)
- vxlan->cfg.age_interval = FDB_AGE_DEFAULT;
+static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
+ struct vxlan_config *conf,
+ bool changelink)
+{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct net_device *lowerdev;
+ int ret;
- if (changelink)
- return 0;
+ ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan);
+ if (ret)
+ return ret;
- list_for_each_entry(tmp, &vn->vxlan_list, next) {
- if (tmp->cfg.vni == conf->vni &&
- (tmp->default_dst.remote_ip.sa.sa_family == AF_INET6 ||
- tmp->cfg.saddr.sa.sa_family == AF_INET6) == use_ipv6 &&
- tmp->cfg.dst_port == vxlan->cfg.dst_port &&
- (tmp->flags & VXLAN_F_RCV_FLAGS) ==
- (vxlan->flags & VXLAN_F_RCV_FLAGS)) {
- pr_info("duplicate VNI %u\n", be32_to_cpu(conf->vni));
- return -EEXIST;
- }
- }
+ vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);
return 0;
}
@@ -3073,22 +3163,35 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
}
if (data[IFLA_VXLAN_GROUP]) {
+ if (changelink && (conf->remote_ip.sa.sa_family != AF_INET))
+ return -EOPNOTSUPP;
+
conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
+ conf->remote_ip.sa.sa_family = AF_INET;
} else if (data[IFLA_VXLAN_GROUP6]) {
if (!IS_ENABLED(CONFIG_IPV6))
return -EPFNOSUPPORT;
+ if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6))
+ return -EOPNOTSUPP;
+
conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
conf->remote_ip.sa.sa_family = AF_INET6;
}
if (data[IFLA_VXLAN_LOCAL]) {
+ if (changelink && (conf->saddr.sa.sa_family != AF_INET))
+ return -EOPNOTSUPP;
+
conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
conf->saddr.sa.sa_family = AF_INET;
} else if (data[IFLA_VXLAN_LOCAL6]) {
if (!IS_ENABLED(CONFIG_IPV6))
return -EPFNOSUPPORT;
+ if (changelink && (conf->saddr.sa.sa_family != AF_INET6))
+ return -EOPNOTSUPP;
+
/* TODO: respect scope id */
conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
conf->saddr.sa.sa_family = AF_INET6;
@@ -3108,12 +3211,10 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
IPV6_FLOWLABEL_MASK;
if (data[IFLA_VXLAN_LEARNING]) {
- if (nla_get_u8(data[IFLA_VXLAN_LEARNING])) {
+ if (nla_get_u8(data[IFLA_VXLAN_LEARNING]))
conf->flags |= VXLAN_F_LEARN;
- } else {
+ else
conf->flags &= ~VXLAN_F_LEARN;
- vxlan->flags &= ~VXLAN_F_LEARN;
- }
} else if (!changelink) {
/* default to learn on a new device */
conf->flags |= VXLAN_F_LEARN;
@@ -3246,7 +3347,8 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
}
static int vxlan_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
struct vxlan_config conf;
int err;
@@ -3259,7 +3361,8 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
}
static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
- struct nlattr *data[])
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_rdst *dst = &vxlan->default_dst;
@@ -3396,43 +3499,44 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
nla_put_u8(skb, IFLA_VXLAN_LEARNING,
- !!(vxlan->flags & VXLAN_F_LEARN)) ||
+ !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
nla_put_u8(skb, IFLA_VXLAN_PROXY,
- !!(vxlan->flags & VXLAN_F_PROXY)) ||
- nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
+ !!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
+ nla_put_u8(skb, IFLA_VXLAN_RSC,
+ !!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
nla_put_u8(skb, IFLA_VXLAN_L2MISS,
- !!(vxlan->flags & VXLAN_F_L2MISS)) ||
+ !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
nla_put_u8(skb, IFLA_VXLAN_L3MISS,
- !!(vxlan->flags & VXLAN_F_L3MISS)) ||
+ !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
- !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) ||
+ !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
- !(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
+ !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
- !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
+ !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
- !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
+ !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
- !!(vxlan->flags & VXLAN_F_REMCSUM_TX)) ||
+ !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
- !!(vxlan->flags & VXLAN_F_REMCSUM_RX)))
+ !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)))
goto nla_put_failure;
if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
goto nla_put_failure;
- if (vxlan->flags & VXLAN_F_GBP &&
+ if (vxlan->cfg.flags & VXLAN_F_GBP &&
nla_put_flag(skb, IFLA_VXLAN_GBP))
goto nla_put_failure;
- if (vxlan->flags & VXLAN_F_GPE &&
+ if (vxlan->cfg.flags & VXLAN_F_GPE &&
nla_put_flag(skb, IFLA_VXLAN_GPE))
goto nla_put_failure;
- if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL &&
+ if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
goto nla_put_failure;