-rw-r--r-- | drivers/net/Makefile | 2
-rw-r--r-- | drivers/net/vxlan/Makefile | 7
-rw-r--r-- | drivers/net/vxlan/vxlan_core.c (renamed from drivers/net/vxlan.c) | 434
-rw-r--r-- | drivers/net/vxlan/vxlan_multicast.c | 272
-rw-r--r-- | drivers/net/vxlan/vxlan_private.h | 162
-rw-r--r-- | drivers/net/vxlan/vxlan_vnifilter.c | 999
-rw-r--r-- | include/net/vxlan.h | 54
-rw-r--r-- | include/uapi/linux/if_link.h | 49
-rw-r--r-- | include/uapi/linux/rtnetlink.h | 9
-rw-r--r-- | security/selinux/nlmsgtab.c | 5
-rwxr-xr-x | tools/testing/selftests/net/test_vxlan_vnifiltering.sh | 579
11 files changed, 2307 insertions, 265 deletions
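
For reference, the sketch below shows how userspace could exercise the RTM_NEWTUNNEL / VXLAN_VNIFILTER_ENTRY interface that this patch introduces. It is not part of the patch: it assumes the patched UAPI headers (tunnel_msg, RTM_NEWTUNNEL, VXLAN_VNIFILTER_ENTRY_*) are installed, that a device named "vxlan0" (name chosen here only for illustration) already exists in collect-metadata mode with vnifilter enabled, and that it runs with CAP_NET_ADMIN.

/*
 * Minimal sketch (not part of this patch): add VNIs 100-200 to a
 * vnifilter-enabled vxlan device over rtnetlink.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>

/* Append one u32 attribute inside the nest and return the next position. */
static struct rtattr *add_u32(struct rtattr *nest, struct rtattr *pos,
			      unsigned short type, __u32 value)
{
	pos->rta_type = type;
	pos->rta_len = RTA_LENGTH(sizeof(value));
	memcpy(RTA_DATA(pos), &value, sizeof(value));
	nest->rta_len += RTA_ALIGN(pos->rta_len);
	return (struct rtattr *)((char *)pos + RTA_ALIGN(pos->rta_len));
}

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct tunnel_msg tmsg;
		char buf[64];
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct rtattr *nest, *pos;
	int fd;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tunnel_msg));
	req.nlh.nlmsg_type = RTM_NEWTUNNEL;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.tmsg.family = PF_BRIDGE;	/* handlers are registered under PF_BRIDGE */
	req.tmsg.ifindex = if_nametoindex("vxlan0");	/* assumed device name */

	/* VXLAN_VNIFILTER_ENTRY { START = 100, END = 200 } */
	nest = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	nest->rta_type = VXLAN_VNIFILTER_ENTRY | NLA_F_NESTED;
	nest->rta_len = RTA_LENGTH(0);
	pos = (struct rtattr *)((char *)nest + RTA_ALIGN(nest->rta_len));
	pos = add_u32(nest, pos, VXLAN_VNIFILTER_ENTRY_START, 100);
	add_u32(nest, pos, VXLAN_VNIFILTER_ENTRY_END, 200);
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(nest->rta_len);

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0 || sendto(fd, &req, req.nlh.nlmsg_len, 0,
			     (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		perror("rtnetlink");
		return 1;
	}
	close(fd);	/* ACK/extack handling omitted for brevity */
	return 0;
}

The message layout mirrors the kernel side below: a tunnel_msg ancillary header (family PF_BRIDGE, ifindex of the vxlan device) followed by one or more nested VXLAN_VNIFILTER_ENTRY attributes carrying a VNI range and an optional per-range group address. With a new enough iproute2 the same operation should roughly correspond to "bridge vni add dev vxlan0 vni 100-200".
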
diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 50b23e71065f..3f1192d3c52d 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_TUN) += tun.o obj-$(CONFIG_TAP) += tap.o obj-$(CONFIG_VETH) += veth.o obj-$(CONFIG_VIRTIO_NET) += virtio_net.o -obj-$(CONFIG_VXLAN) += vxlan.o +obj-$(CONFIG_VXLAN) += vxlan/ obj-$(CONFIG_GENEVE) += geneve.o obj-$(CONFIG_BAREUDP) += bareudp.o obj-$(CONFIG_GTP) += gtp.o diff --git a/drivers/net/vxlan/Makefile b/drivers/net/vxlan/Makefile new file mode 100644 index 000000000000..d4c255499b72 --- /dev/null +++ b/drivers/net/vxlan/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the vxlan driver +# + +obj-$(CONFIG_VXLAN) += vxlan.o + +vxlan-objs := vxlan_core.o vxlan_multicast.o vxlan_vnifilter.o diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan/vxlan_core.c index d0dc90d3dac2..4ab09dd5a32a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -34,10 +34,10 @@ #include <net/ip6_checksum.h> #endif +#include "vxlan_private.h" + #define VXLAN_VERSION "0.1" -#define PORT_HASH_BITS 8 -#define PORT_HASH_SIZE (1<<PORT_HASH_BITS) #define FDB_AGE_DEFAULT 300 /* 5 min */ #define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ @@ -53,41 +53,15 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -static unsigned int vxlan_net_id; -static struct rtnl_link_ops vxlan_link_ops; +unsigned int vxlan_net_id; -static const u8 all_zeros_mac[ETH_ALEN + 2]; +const u8 all_zeros_mac[ETH_ALEN + 2]; +static struct rtnl_link_ops vxlan_link_ops; static int vxlan_sock_add(struct vxlan_dev *vxlan); static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); -/* per-network namespace private data for this module */ -struct vxlan_net { - struct list_head vxlan_list; - struct hlist_head sock_list[PORT_HASH_SIZE]; - spinlock_t sock_lock; - struct notifier_block nexthop_notifier_block; -}; - -/* Forwarding table entry */ -struct vxlan_fdb { - struct hlist_node hlist; /* linked list of entries */ - struct rcu_head rcu; - unsigned long updated; /* jiffies */ - unsigned long used; - struct list_head remotes; - u8 eth_addr[ETH_ALEN]; - u16 state; /* see ndm_state */ - __be32 vni; - u16 flags; /* see ndm_flags and below */ - struct list_head nh_list; - struct nexthop __rcu *nh; - struct vxlan_dev __rcu *vdev; -}; - -#define NTF_VXLAN_ADDED_BY_USER 0x100 - /* salt for hash table */ static u32 vxlan_salt __read_mostly; @@ -98,17 +72,6 @@ static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) } #if IS_ENABLED(CONFIG_IPV6) -static inline -bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) -{ - if (a->sa.sa_family != b->sa.sa_family) - return false; - if (a->sa.sa_family == AF_INET6) - return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); - else - return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; -} - static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) { if (nla_len(nla) >= sizeof(struct in6_addr)) { @@ -135,12 +98,6 @@ static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, #else /* !CONFIG_IPV6 */ -static inline -bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) -{ - return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; -} - static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) { if (nla_len(nla) >= sizeof(struct in6_addr)) { @@ -161,37 +118,6 @@ static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, } #endif -/* 
Virtual Network hash table head */ -static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni) -{ - return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)]; -} - -/* Socket hash table head */ -static inline struct hlist_head *vs_head(struct net *net, __be16 port) -{ - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - - return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; -} - -/* First remote destination for a forwarding entry. - * Guaranteed to be non-NULL because remotes are never deleted. - */ -static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) -{ - if (rcu_access_pointer(fdb->nh)) - return NULL; - return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); -} - -static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) -{ - if (rcu_access_pointer(fdb->nh)) - return NULL; - return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); -} - /* Find VXLAN socket based on network namespace, address family, UDP port, * enabled unshareable flags and socket device binding (see l3mdev with * non-default VRF). @@ -213,18 +139,29 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, return NULL; } -static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex, - __be32 vni) +static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, + int ifindex, __be32 vni, + struct vxlan_vni_node **vninode) { + struct vxlan_vni_node *vnode; struct vxlan_dev_node *node; /* For flow based devices, map all packets to VNI 0 */ - if (vs->flags & VXLAN_F_COLLECT_METADATA) + if (vs->flags & VXLAN_F_COLLECT_METADATA && + !(vs->flags & VXLAN_F_VNIFILTER)) vni = 0; hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) { - if (node->vxlan->default_dst.remote_vni != vni) + if (!node->vxlan) continue; + vnode = NULL; + if (node->vxlan->cfg.flags & VXLAN_F_VNIFILTER) { + vnode = vxlan_vnifilter_lookup(node->vxlan, vni); + if (!vnode) + continue; + } else if (node->vxlan->default_dst.remote_vni != vni) { + continue; + } if (IS_ENABLED(CONFIG_IPV6)) { const struct vxlan_config *cfg = &node->vxlan->cfg; @@ -234,6 +171,8 @@ static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex, continue; } + if (vninode) + *vninode = vnode; return node->vxlan; } @@ -251,7 +190,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex, if (!vs) return NULL; - return vxlan_vs_find_vni(vs, ifindex, vni); + return vxlan_vs_find_vni(vs, ifindex, vni, NULL); } /* Fill in neighbour message in skbuff. 
*/ @@ -493,7 +432,7 @@ static u32 eth_hash(const unsigned char *addr) return hash_64(value, FDB_HASH_BITS); } -static u32 eth_vni_hash(const unsigned char *addr, __be32 vni) +u32 eth_vni_hash(const unsigned char *addr, __be32 vni) { /* use 1 byte of OUI and 3 bytes of NIC */ u32 key = get_unaligned((u32 *)(addr + 2)); @@ -501,7 +440,7 @@ static u32 eth_vni_hash(const unsigned char *addr, __be32 vni) return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); } -static u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) +u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) return eth_vni_hash(mac, vni); @@ -920,12 +859,12 @@ err_inval: return err; } -static int vxlan_fdb_create(struct vxlan_dev *vxlan, - const u8 *mac, union vxlan_addr *ip, - __u16 state, __be16 port, __be32 src_vni, - __be32 vni, __u32 ifindex, __u16 ndm_flags, - u32 nhid, struct vxlan_fdb **fdb, - struct netlink_ext_ack *extack) +int vxlan_fdb_create(struct vxlan_dev *vxlan, + const u8 *mac, union vxlan_addr *ip, + __u16 state, __be16 port, __be32 src_vni, + __be32 vni, __u32 ifindex, __u16 ndm_flags, + u32 nhid, struct vxlan_fdb **fdb, + struct netlink_ext_ack *extack) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; @@ -1150,13 +1089,13 @@ err_notify: } /* Add new entry to forwarding table -- assumes lock held */ -static int vxlan_fdb_update(struct vxlan_dev *vxlan, - const u8 *mac, union vxlan_addr *ip, - __u16 state, __u16 flags, - __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, u32 nhid, - bool swdev_notify, - struct netlink_ext_ack *extack) +int vxlan_fdb_update(struct vxlan_dev *vxlan, + const u8 *mac, union vxlan_addr *ip, + __u16 state, __u16 flags, + __be16 port, __be32 src_vni, __be32 vni, + __u32 ifindex, __u16 ndm_flags, u32 nhid, + bool swdev_notify, + struct netlink_ext_ack *extack) { struct vxlan_fdb *f; @@ -1307,10 +1246,10 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return err; } -static int __vxlan_fdb_delete(struct vxlan_dev *vxlan, - const unsigned char *addr, union vxlan_addr ip, - __be16 port, __be32 src_vni, __be32 vni, - u32 ifindex, bool swdev_notify) +int __vxlan_fdb_delete(struct vxlan_dev *vxlan, + const unsigned char *addr, union vxlan_addr ip, + __be16 port, __be32 src_vni, __be32 vni, + u32 ifindex, bool swdev_notify) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; @@ -1519,56 +1458,6 @@ static bool vxlan_snoop(struct net_device *dev, return false; } -/* See if multicast group is already in use by other ID */ -static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) -{ - struct vxlan_dev *vxlan; - struct vxlan_sock *sock4; -#if IS_ENABLED(CONFIG_IPV6) - struct vxlan_sock *sock6; -#endif - unsigned short family = dev->default_dst.remote_ip.sa.sa_family; - - sock4 = rtnl_dereference(dev->vn4_sock); - - /* The vxlan_sock is only used by dev, leaving group has - * no effect on other vxlan devices. 
- */ - if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1) - return false; -#if IS_ENABLED(CONFIG_IPV6) - sock6 = rtnl_dereference(dev->vn6_sock); - if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1) - return false; -#endif - - list_for_each_entry(vxlan, &vn->vxlan_list, next) { - if (!netif_running(vxlan->dev) || vxlan == dev) - continue; - - if (family == AF_INET && - rtnl_dereference(vxlan->vn4_sock) != sock4) - continue; -#if IS_ENABLED(CONFIG_IPV6) - if (family == AF_INET6 && - rtnl_dereference(vxlan->vn6_sock) != sock6) - continue; -#endif - - if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip, - &dev->default_dst.remote_ip)) - continue; - - if (vxlan->default_dst.remote_ifindex != - dev->default_dst.remote_ifindex) - continue; - - return true; - } - - return false; -} - static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) { struct vxlan_net *vn; @@ -1602,7 +1491,10 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan) RCU_INIT_POINTER(vxlan->vn4_sock, NULL); synchronize_net(); - vxlan_vs_del_dev(vxlan); + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) + vxlan_vs_del_vnigrp(vxlan); + else + vxlan_vs_del_dev(vxlan); if (__vxlan_sock_release_prep(sock4)) { udp_tunnel_sock_release(sock4->sock); @@ -1617,76 +1509,6 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan) #endif } -/* Update multicast group membership when first VNI on - * multicast address is brought up - */ -static int vxlan_igmp_join(struct vxlan_dev *vxlan) -{ - struct sock *sk; - union vxlan_addr *ip = &vxlan->default_dst.remote_ip; - int ifindex = vxlan->default_dst.remote_ifindex; - int ret = -EINVAL; - - if (ip->sa.sa_family == AF_INET) { - struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); - struct ip_mreqn mreq = { - .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, - .imr_ifindex = ifindex, - }; - - sk = sock4->sock->sk; - lock_sock(sk); - ret = ip_mc_join_group(sk, &mreq); - release_sock(sk); -#if IS_ENABLED(CONFIG_IPV6) - } else { - struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); - - sk = sock6->sock->sk; - lock_sock(sk); - ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex, - &ip->sin6.sin6_addr); - release_sock(sk); -#endif - } - - return ret; -} - -/* Inverse of vxlan_igmp_join when last VNI is brought down */ -static int vxlan_igmp_leave(struct vxlan_dev *vxlan) -{ - struct sock *sk; - union vxlan_addr *ip = &vxlan->default_dst.remote_ip; - int ifindex = vxlan->default_dst.remote_ifindex; - int ret = -EINVAL; - - if (ip->sa.sa_family == AF_INET) { - struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); - struct ip_mreqn mreq = { - .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, - .imr_ifindex = ifindex, - }; - - sk = sock4->sock->sk; - lock_sock(sk); - ret = ip_mc_leave_group(sk, &mreq); - release_sock(sk); -#if IS_ENABLED(CONFIG_IPV6) - } else { - struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); - - sk = sock6->sock->sk; - lock_sock(sk); - ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, - &ip->sin6.sin6_addr); - release_sock(sk); -#endif - } - - return ret; -} - static bool vxlan_remcsum(struct vxlanhdr *unparsed, struct sk_buff *skb, u32 vxflags) { @@ -1828,6 +1650,7 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) { + struct vxlan_vni_node *vninode = NULL; struct vxlan_dev *vxlan; struct vxlan_sock *vs; struct vxlanhdr unparsed; @@ -1860,7 +1683,7 @@ static int vxlan_rcv(struct 
sock *sk, struct sk_buff *skb) vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); - vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni); + vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode); if (!vxlan) goto drop; @@ -1930,6 +1753,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { ++vxlan->dev->stats.rx_frame_errors; ++vxlan->dev->stats.rx_errors; + vxlan_vnifilter_count(vxlan, vni, vninode, + VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; } @@ -1938,10 +1763,13 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (unlikely(!(vxlan->dev->flags & IFF_UP))) { rcu_read_unlock(); atomic_long_inc(&vxlan->dev->rx_dropped); + vxlan_vnifilter_count(vxlan, vni, vninode, + VXLAN_VNI_STATS_RX_DROPS, 0); goto drop; } dev_sw_netstats_rx_add(vxlan->dev, skb->len); + vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len); gro_cells_receive(&vxlan->gro_cells, skb); rcu_read_unlock(); @@ -1975,7 +1803,7 @@ static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb) return -ENOENT; vni = vxlan_vni(hdr->vx_vni); - vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni); + vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, NULL); if (!vxlan) return -ENOENT; @@ -2049,8 +1877,12 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) reply->ip_summed = CHECKSUM_UNNECESSARY; reply->pkt_type = PACKET_HOST; - if (netif_rx_ni(reply) == NET_RX_DROP) + if (netif_rx_ni(reply) == NET_RX_DROP) { dev->stats.rx_dropped++; + vxlan_vnifilter_count(vxlan, vni, NULL, + VXLAN_VNI_STATS_RX_DROPS, 0); + } + } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { union vxlan_addr ipa = { .sin.sin_addr.s_addr = tip, @@ -2204,9 +2036,11 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) if (reply == NULL) goto out; - if (netif_rx_ni(reply) == NET_RX_DROP) + if (netif_rx_ni(reply) == NET_RX_DROP) { dev->stats.rx_dropped++; - + vxlan_vnifilter_count(vxlan, vni, NULL, + VXLAN_VNI_STATS_RX_DROPS, 0); + } } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { union vxlan_addr ipa = { .sin6.sin6_addr = msg->target, @@ -2540,15 +2374,20 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, tx_stats->tx_packets++; tx_stats->tx_bytes += len; u64_stats_update_end(&tx_stats->syncp); + vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len); if (__netif_rx(skb) == NET_RX_SUCCESS) { u64_stats_update_begin(&rx_stats->syncp); rx_stats->rx_packets++; rx_stats->rx_bytes += len; u64_stats_update_end(&rx_stats->syncp); + vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX, + len); } else { drop: dev->stats.rx_dropped++; + vxlan_vnifilter_count(dst_vxlan, vni, NULL, + VXLAN_VNI_STATS_RX_DROPS, 0); } rcu_read_unlock(); } @@ -2578,6 +2417,8 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, vxlan->cfg.flags); if (!dst_vxlan) { dev->stats.tx_errors++; + vxlan_vnifilter_count(vxlan, vni, NULL, + VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb(skb); return -ENOENT; @@ -2601,15 +2442,19 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, union vxlan_addr remote_ip, local_ip; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; + unsigned int pkt_len = skb->len; __be16 src_port = 0, dst_port; struct dst_entry *ndst = NULL; - __be32 vni, label; __u8 tos, ttl; int ifindex; int err; u32 flags = vxlan->cfg.flags; bool udp_sum = false; bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); + __be32 vni = 0; +#if 
IS_ENABLED(CONFIG_IPV6) + __be32 label; +#endif info = skb_tunnel_info(skb); @@ -2647,7 +2492,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX); else udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); +#if IS_ENABLED(CONFIG_IPV6) label = vxlan->cfg.label; +#endif } else { if (!info) { WARN_ONCE(1, "%s: Missing encapsulation instructions\n", @@ -2674,7 +2521,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ttl = info->key.ttl; tos = info->key.tos; +#if IS_ENABLED(CONFIG_IPV6) label = info->key.label; +#endif udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); } src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, @@ -2821,12 +2670,14 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, label, src_port, dst_port, !udp_sum); #endif } + vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); out_unlock: rcu_read_unlock(); return; drop: dev->stats.tx_dropped++; + vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); return; @@ -2838,6 +2689,7 @@ tx_error: dev->stats.tx_carrier_errors++; dst_release(ndst); dev->stats.tx_errors++; + vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb(skb); } @@ -2870,6 +2722,8 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, drop: dev->stats.tx_dropped++; + vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, + VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); } @@ -2944,6 +2798,8 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) vxlan_fdb_miss(vxlan, eth->h_dest); dev->stats.tx_dropped++; + vxlan_vnifilter_count(vxlan, vni, NULL, + VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb(skb); return NETDEV_TX_OK; } @@ -3044,6 +2900,9 @@ static int vxlan_init(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) + vxlan_vnigroup_init(vxlan); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; @@ -3073,6 +2932,9 @@ static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) + vxlan_vnigroup_uninit(vxlan); + gro_cells_destroy(&vxlan->gro_cells); vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); @@ -3090,14 +2952,10 @@ static int vxlan_open(struct net_device *dev) if (ret < 0) return ret; - if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { - ret = vxlan_igmp_join(vxlan); - if (ret == -EADDRINUSE) - ret = 0; - if (ret) { - vxlan_sock_release(vxlan); - return ret; - } + ret = vxlan_multicast_join(vxlan); + if (ret) { + vxlan_sock_release(vxlan); + return ret; } if (vxlan->cfg.age_interval) @@ -3134,12 +2992,9 @@ static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all) static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); int ret = 0; - if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && - !vxlan_group_used(vn, vxlan)) - ret = vxlan_igmp_leave(vxlan); + vxlan_multicast_leave(vxlan); del_timer_sync(&vxlan->age_timer); @@ -3369,6 +3224,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, [IFLA_VXLAN_DF] = { .type = NLA_U8 }, + [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, }; static int 
vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -3554,6 +3410,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; struct vxlan_sock *vs = NULL; struct vxlan_dev_node *node; int l3mdev_index = 0; @@ -3589,7 +3446,12 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) rcu_assign_pointer(vxlan->vn4_sock, vs); node = &vxlan->hlist4; } - vxlan_vs_add_dev(vs, vxlan, node); + + if (metadata && (vxlan->cfg.flags & VXLAN_F_VNIFILTER)) + vxlan_vs_add_vnigrp(vxlan, vs, ipv6); + else + vxlan_vs_add_dev(vs, vxlan, node); + return 0; } @@ -3616,13 +3478,42 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan) return ret; } +int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan, + struct vxlan_config *conf, __be32 vni) +{ + struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); + struct vxlan_dev *tmp; + + list_for_each_entry(tmp, &vn->vxlan_list, next) { + if (tmp == vxlan) + continue; + if (tmp->cfg.flags & VXLAN_F_VNIFILTER) { + if (!vxlan_vnifilter_lookup(tmp, vni)) + continue; + } else if (tmp->cfg.vni != vni) { + continue; + } + if (tmp->cfg.dst_port != conf->dst_port) + continue; + if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) != + (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6))) + continue; + + if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) && + tmp->cfg.remote_ifindex != conf->remote_ifindex) + continue; + + return -EEXIST; + } + + return 0; +} + static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, struct net_device **lower, struct vxlan_dev *old, struct netlink_ext_ack *extack) { - struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); - struct vxlan_dev *tmp; bool use_ipv6 = false; if (conf->flags & VXLAN_F_GPE) { @@ -3755,22 +3646,7 @@ static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, if (!conf->age_interval) conf->age_interval = FDB_AGE_DEFAULT; - list_for_each_entry(tmp, &vn->vxlan_list, next) { - if (tmp == old) - continue; - - if (tmp->cfg.vni != conf->vni) - continue; - if (tmp->cfg.dst_port != conf->dst_port) - continue; - if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) != - (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6))) - continue; - - if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) && - tmp->cfg.remote_ifindex != conf->remote_ifindex) - continue; - + if (vxlan_vni_in_use(src_net, old, conf, conf->vni)) { NL_SET_ERR_MSG(extack, "A VXLAN device with the specified VNI already exists"); return -EEXIST; @@ -4226,6 +4102,21 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], if (data[IFLA_VXLAN_DF]) conf->df = nla_get_u8(data[IFLA_VXLAN_DF]); + if (data[IFLA_VXLAN_VNIFILTER]) { + err = vxlan_nl2flag(conf, data, IFLA_VXLAN_VNIFILTER, + VXLAN_F_VNIFILTER, changelink, false, + extack); + if (err) + return err; + + if ((conf->flags & VXLAN_F_VNIFILTER) && + !(conf->flags & VXLAN_F_COLLECT_METADATA)) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_VNIFILTER], + "vxlan vnifilter only valid in collect metadata mode"); + return -EINVAL; + } + } + return 0; } @@ -4301,6 +4192,19 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], dst->remote_ifindex, true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); + + /* If vni filtering device, also update fdb entries of + * all vnis that were using default remote ip + */ + 
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { + err = vxlan_vnilist_update_group(vxlan, &dst->remote_ip, + &conf.remote_ip, extack); + if (err) { + netdev_adjacent_change_abort(dst->remote_dev, + lowerdev, dev); + return err; + } + } } if (conf.age_interval != vxlan->cfg.age_interval) @@ -4446,6 +4350,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) goto nla_put_failure; + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER && + nla_put_u8(skb, IFLA_VXLAN_VNIFILTER, + !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))) + goto nla_put_failure; + return 0; nla_put_failure: @@ -4805,6 +4714,8 @@ static int __init vxlan_init_module(void) if (rc) goto out4; + vxlan_vnifilter_init(); + return 0; out4: unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); @@ -4819,6 +4730,7 @@ late_initcall(vxlan_init_module); static void __exit vxlan_cleanup_module(void) { + vxlan_vnifilter_uninit(); rtnl_link_unregister(&vxlan_link_ops); unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); unregister_netdevice_notifier(&vxlan_notifier_block); diff --git a/drivers/net/vxlan/vxlan_multicast.c b/drivers/net/vxlan/vxlan_multicast.c new file mode 100644 index 000000000000..a7f2d67dc61b --- /dev/null +++ b/drivers/net/vxlan/vxlan_multicast.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Vxlan multicast group handling + * + */ +#include <linux/kernel.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <linux/igmp.h> +#include <net/vxlan.h> + +#include "vxlan_private.h" + +/* Update multicast group membership when first VNI on + * multicast address is brought up + */ +int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip, + int rifindex) +{ + union vxlan_addr *ip = (rip ? : &vxlan->default_dst.remote_ip); + int ifindex = (rifindex ? : vxlan->default_dst.remote_ifindex); + int ret = -EINVAL; + struct sock *sk; + + if (ip->sa.sa_family == AF_INET) { + struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, + .imr_ifindex = ifindex, + }; + + sk = sock4->sock->sk; + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); + + sk = sock6->sock->sk; + lock_sock(sk); + ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex, + &ip->sin6.sin6_addr); + release_sock(sk); +#endif + } + + return ret; +} + +int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip, + int rifindex) +{ + union vxlan_addr *ip = (rip ? : &vxlan->default_dst.remote_ip); + int ifindex = (rifindex ? 
: vxlan->default_dst.remote_ifindex); + int ret = -EINVAL; + struct sock *sk; + + if (ip->sa.sa_family == AF_INET) { + struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, + .imr_ifindex = ifindex, + }; + + sk = sock4->sock->sk; + lock_sock(sk); + ret = ip_mc_leave_group(sk, &mreq); + release_sock(sk); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); + + sk = sock6->sock->sk; + lock_sock(sk); + ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, + &ip->sin6.sin6_addr); + release_sock(sk); +#endif + } + + return ret; +} + +static bool vxlan_group_used_match(union vxlan_addr *ip, int ifindex, + union vxlan_addr *rip, int rifindex) +{ + if (!vxlan_addr_multicast(rip)) + return false; + + if (!vxlan_addr_equal(rip, ip)) + return false; + + if (rifindex != ifindex) + return false; + + return true; +} + +static bool vxlan_group_used_by_vnifilter(struct vxlan_dev *vxlan, + union vxlan_addr *ip, int ifindex) +{ + struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); + struct vxlan_vni_node *v, *tmp; + + if (vxlan_group_used_match(ip, ifindex, + &vxlan->default_dst.remote_ip, + vxlan->default_dst.remote_ifindex)) + return true; + + list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { + if (!vxlan_addr_multicast(&v->remote_ip)) + continue; + + if (vxlan_group_used_match(ip, ifindex, + &v->remote_ip, + vxlan->default_dst.remote_ifindex)) + return true; + } + + return false; +} + +/* See if multicast group is already in use by other ID */ +bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev, + __be32 vni, union vxlan_addr *rip, int rifindex) +{ + union vxlan_addr *ip = (rip ? : &dev->default_dst.remote_ip); + int ifindex = (rifindex ? : dev->default_dst.remote_ifindex); + struct vxlan_dev *vxlan; + struct vxlan_sock *sock4; +#if IS_ENABLED(CONFIG_IPV6) + struct vxlan_sock *sock6; +#endif + unsigned short family = dev->default_dst.remote_ip.sa.sa_family; + + sock4 = rtnl_dereference(dev->vn4_sock); + + /* The vxlan_sock is only used by dev, leaving group has + * no effect on other vxlan devices. 
+ */ + if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1) + return false; + +#if IS_ENABLED(CONFIG_IPV6) + sock6 = rtnl_dereference(dev->vn6_sock); + if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1) + return false; +#endif + + list_for_each_entry(vxlan, &vn->vxlan_list, next) { + if (!netif_running(vxlan->dev) || vxlan == dev) + continue; + + if (family == AF_INET && + rtnl_dereference(vxlan->vn4_sock) != sock4) + continue; +#if IS_ENABLED(CONFIG_IPV6) + if (family == AF_INET6 && + rtnl_dereference(vxlan->vn6_sock) != sock6) + continue; +#endif + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { + if (!vxlan_group_used_by_vnifilter(vxlan, ip, ifindex)) + continue; + } else { + if (!vxlan_group_used_match(ip, ifindex, + &vxlan->default_dst.remote_ip, + vxlan->default_dst.remote_ifindex)) + continue; + } + + return true; + } + + return false; +} + +static int vxlan_multicast_join_vnigrp(struct vxlan_dev *vxlan) +{ + struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); + struct vxlan_vni_node *v, *tmp, *vgood = NULL; + int ret = 0; + + list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { + if (!vxlan_addr_multicast(&v->remote_ip)) + continue; + /* skip if address is same as default address */ + if (vxlan_addr_equal(&v->remote_ip, + &vxlan->default_dst.remote_ip)) + continue; + ret = vxlan_igmp_join(vxlan, &v->remote_ip, 0); + if (ret == -EADDRINUSE) + ret = 0; + if (ret) + goto out; + vgood = v; + } +out: + if (ret) { + list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { + if (!vxlan_addr_multicast(&v->remote_ip)) + continue; + if (vxlan_addr_equal(&v->remote_ip, + &vxlan->default_dst.remote_ip)) + continue; + vxlan_igmp_leave(vxlan, &v->remote_ip, 0); + if (v == vgood) + break; + } + } + + return ret; +} + +static int vxlan_multicast_leave_vnigrp(struct vxlan_dev *vxlan) +{ + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); + struct vxlan_vni_node *v, *tmp; + int last_err = 0, ret; + + list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { + if (vxlan_addr_multicast(&v->remote_ip) && + !vxlan_group_used(vn, vxlan, v->vni, &v->remote_ip, + 0)) { + ret = vxlan_igmp_leave(vxlan, &v->remote_ip, 0); + if (ret) + last_err = ret; + } + } + + return last_err; +} + +int vxlan_multicast_join(struct vxlan_dev *vxlan) +{ + int ret = 0; + + if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { + ret = vxlan_igmp_join(vxlan, &vxlan->default_dst.remote_ip, + vxlan->default_dst.remote_ifindex); + if (ret == -EADDRINUSE) + ret = 0; + if (ret) + return ret; + } + + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) + return vxlan_multicast_join_vnigrp(vxlan); + + return 0; +} + +int vxlan_multicast_leave(struct vxlan_dev *vxlan) +{ + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + int ret = 0; + + if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && + !vxlan_group_used(vn, vxlan, 0, NULL, 0)) { + ret = vxlan_igmp_leave(vxlan, &vxlan->default_dst.remote_ip, + vxlan->default_dst.remote_ifindex); + if (ret) + return ret; + } + + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) + return vxlan_multicast_leave_vnigrp(vxlan); + + return 0; +} diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h new file mode 100644 index 000000000000..599c3b4fdd5e --- /dev/null +++ b/drivers/net/vxlan/vxlan_private.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Vxlan private header file + * + */ + +#ifndef _VXLAN_PRIVATE_H +#define 
_VXLAN_PRIVATE_H + +#include <linux/rhashtable.h> + +extern unsigned int vxlan_net_id; +extern const u8 all_zeros_mac[ETH_ALEN + 2]; +extern const struct rhashtable_params vxlan_vni_rht_params; + +#define PORT_HASH_BITS 8 +#define PORT_HASH_SIZE (1 << PORT_HASH_BITS) + +/* per-network namespace private data for this module */ +struct vxlan_net { + struct list_head vxlan_list; + struct hlist_head sock_list[PORT_HASH_SIZE]; + spinlock_t sock_lock; + struct notifier_block nexthop_notifier_block; +}; + +/* Forwarding table entry */ +struct vxlan_fdb { + struct hlist_node hlist; /* linked list of entries */ + struct rcu_head rcu; + unsigned long updated; /* jiffies */ + unsigned long used; + struct list_head remotes; + u8 eth_addr[ETH_ALEN]; + u16 state; /* see ndm_state */ + __be32 vni; + u16 flags; /* see ndm_flags and below */ + struct list_head nh_list; + struct nexthop __rcu *nh; + struct vxlan_dev __rcu *vdev; +}; + +#define NTF_VXLAN_ADDED_BY_USER 0x100 + +/* Virtual Network hash table head */ +static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni) +{ + return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)]; +} + +/* Socket hash table head */ +static inline struct hlist_head *vs_head(struct net *net, __be16 port) +{ + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + + return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; +} + +/* First remote destination for a forwarding entry. + * Guaranteed to be non-NULL because remotes are never deleted. + */ +static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) +{ + if (rcu_access_pointer(fdb->nh)) + return NULL; + return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); +} + +static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) +{ + if (rcu_access_pointer(fdb->nh)) + return NULL; + return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); +} + +#if IS_ENABLED(CONFIG_IPV6) +static inline +bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) +{ + if (a->sa.sa_family != b->sa.sa_family) + return false; + if (a->sa.sa_family == AF_INET6) + return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); + else + return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; +} + +#else /* !CONFIG_IPV6 */ + +static inline +bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) +{ + return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; +} + +#endif + +static inline struct vxlan_vni_node * +vxlan_vnifilter_lookup(struct vxlan_dev *vxlan, __be32 vni) +{ + struct vxlan_vni_group *vg; + + vg = rcu_dereference_rtnl(vxlan->vnigrp); + if (!vg) + return NULL; + + return rhashtable_lookup_fast(&vg->vni_hash, &vni, + vxlan_vni_rht_params); +} + +/* vxlan_core.c */ +int vxlan_fdb_create(struct vxlan_dev *vxlan, + const u8 *mac, union vxlan_addr *ip, + __u16 state, __be16 port, __be32 src_vni, + __be32 vni, __u32 ifindex, __u16 ndm_flags, + u32 nhid, struct vxlan_fdb **fdb, + struct netlink_ext_ack *extack); +int __vxlan_fdb_delete(struct vxlan_dev *vxlan, + const unsigned char *addr, union vxlan_addr ip, + __be16 port, __be32 src_vni, __be32 vni, + u32 ifindex, bool swdev_notify); +u32 eth_vni_hash(const unsigned char *addr, __be32 vni); +u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni); +int vxlan_fdb_update(struct vxlan_dev *vxlan, + const u8 *mac, union vxlan_addr *ip, + __u16 state, __u16 flags, + __be16 port, __be32 src_vni, __be32 vni, + __u32 ifindex, __u16 ndm_flags, u32 nhid, + bool 
swdev_notify, struct netlink_ext_ack *extack); +int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan, + struct vxlan_config *conf, __be32 vni); + +/* vxlan_vnifilter.c */ +int vxlan_vnigroup_init(struct vxlan_dev *vxlan); +void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan); + +void vxlan_vnifilter_init(void); +void vxlan_vnifilter_uninit(void); +void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni, + struct vxlan_vni_node *vninode, + int type, unsigned int len); + +void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan, + struct vxlan_sock *vs, + bool ipv6); +void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan); +int vxlan_vnilist_update_group(struct vxlan_dev *vxlan, + union vxlan_addr *old_remote_ip, + union vxlan_addr *new_remote_ip, + struct netlink_ext_ack *extack); + + +/* vxlan_multicast.c */ +int vxlan_multicast_join(struct vxlan_dev *vxlan); +int vxlan_multicast_leave(struct vxlan_dev *vxlan); +bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev, + __be32 vni, union vxlan_addr *rip, int rifindex); +int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip, + int rifindex); +int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip, + int rifindex); +#endif diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c new file mode 100644 index 000000000000..9f28d0b6a6b2 --- /dev/null +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -0,0 +1,999 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Vxlan vni filter for collect metadata mode + * + * Authors: Roopa Prabhu <roopa@nvidia.com> + * + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/etherdevice.h> +#include <linux/rhashtable.h> +#include <net/rtnetlink.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/vxlan.h> + +#include "vxlan_private.h" + +static inline int vxlan_vni_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct vxlan_vni_node *vnode = ptr; + __be32 vni = *(__be32 *)arg->key; + + return vnode->vni != vni; +} + +const struct rhashtable_params vxlan_vni_rht_params = { + .head_offset = offsetof(struct vxlan_vni_node, vnode), + .key_offset = offsetof(struct vxlan_vni_node, vni), + .key_len = sizeof(__be32), + .nelem_hint = 3, + .max_size = VXLAN_N_VID, + .obj_cmpfn = vxlan_vni_cmp, + .automatic_shrinking = true, +}; + +static void vxlan_vs_add_del_vninode(struct vxlan_dev *vxlan, + struct vxlan_vni_node *v, + bool del) +{ + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + struct vxlan_dev_node *node; + struct vxlan_sock *vs; + + spin_lock(&vn->sock_lock); + if (del) { + if (!hlist_unhashed(&v->hlist4.hlist)) + hlist_del_init_rcu(&v->hlist4.hlist); +#if IS_ENABLED(CONFIG_IPV6) + if (!hlist_unhashed(&v->hlist6.hlist)) + hlist_del_init_rcu(&v->hlist6.hlist); +#endif + goto out; + } + +#if IS_ENABLED(CONFIG_IPV6) + vs = rtnl_dereference(vxlan->vn6_sock); + if (vs && v) { + node = &v->hlist6; + hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni)); + } +#endif + vs = rtnl_dereference(vxlan->vn4_sock); + if (vs && v) { + node = &v->hlist4; + hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni)); + } +out: + spin_unlock(&vn->sock_lock); +} + +void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan, + struct vxlan_sock *vs, + bool ipv6) +{ + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); + struct vxlan_vni_node *v, *tmp; + struct vxlan_dev_node *node; + + if (!vg) + return; + + 
spin_lock(&vn->sock_lock); + list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6) + node = &v->hlist6; + else +#endif + node = &v->hlist4; + node->vxlan = vxlan; + hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni)); + } + spin_unlock(&vn->sock_lock); +} + +void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan) +{ + struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp); + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + struct vxlan_vni_node *v, *tmp; + + if (!vg) + return; + + spin_lock(&vn->sock_lock); + list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { + hlist_del_init_rcu(&v->hlist4.hlist); +#if IS_ENABLED(CONFIG_IPV6) + hlist_del_init_rcu(&v->hlist6.hlist); +#endif + } + spin_unlock(&vn->sock_lock); +} + +static void vxlan_vnifilter_stats_get(const struct vxlan_vni_node *vninode, + struct vxlan_vni_stats *dest) +{ + int i; + + memset(dest, 0, sizeof(*dest)); + for_each_possible_cpu(i) { + struct vxlan_vni_stats_pcpu *pstats; + struct vxlan_vni_stats temp; + unsigned int start; + + pstats = per_cpu_ptr(vninode->stats, i); + do { + start = u64_stats_fetch_begin_irq(&pstats->syncp); + memcpy(&temp, &pstats->stats, sizeof(temp)); + } while (u64_stats_fetch_retry_irq(&pstats->syncp, start)); + + dest->rx_packets += temp.rx_packets; + dest->rx_bytes += temp.rx_bytes; + dest->rx_drops += temp.rx_drops; + dest->rx_errors += temp.rx_errors; + dest->tx_packets += temp.tx_packets; + dest->tx_bytes += temp.tx_bytes; + dest->tx_drops += temp.tx_drops; + dest->tx_errors += temp.tx_errors; + } +} + +static void vxlan_vnifilter_stats_add(struct vxlan_vni_node *vninode, + int type, unsigned int len) +{ + struct vxlan_vni_stats_pcpu *pstats = this_cpu_ptr(vninode->stats); + + u64_stats_update_begin(&pstats->syncp); + switch (type) { + case VXLAN_VNI_STATS_RX: + pstats->stats.rx_bytes += len; + pstats->stats.rx_packets++; + break; + case VXLAN_VNI_STATS_RX_DROPS: + pstats->stats.rx_drops++; + break; + case VXLAN_VNI_STATS_RX_ERRORS: + pstats->stats.rx_errors++; + break; + case VXLAN_VNI_STATS_TX: + pstats->stats.tx_bytes += len; + pstats->stats.tx_packets++; + break; + case VXLAN_VNI_STATS_TX_DROPS: + pstats->stats.tx_drops++; + break; + case VXLAN_VNI_STATS_TX_ERRORS: + pstats->stats.tx_errors++; + break; + } + u64_stats_update_end(&pstats->syncp); +} + +void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni, + struct vxlan_vni_node *vninode, + int type, unsigned int len) +{ + struct vxlan_vni_node *vnode; + + if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) + return; + + if (vninode) { + vnode = vninode; + } else { + vnode = vxlan_vnifilter_lookup(vxlan, vni); + if (!vnode) + return; + } + + vxlan_vnifilter_stats_add(vnode, type, len); +} + +static u32 vnirange(struct vxlan_vni_node *vbegin, + struct vxlan_vni_node *vend) +{ + return (be32_to_cpu(vend->vni) - be32_to_cpu(vbegin->vni)); +} + +static size_t vxlan_vnifilter_entry_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct tunnel_msg)) + + nla_total_size(0) /* VXLAN_VNIFILTER_ENTRY */ + + nla_total_size(sizeof(u32)) /* VXLAN_VNIFILTER_ENTRY_START */ + + nla_total_size(sizeof(u32)) /* VXLAN_VNIFILTER_ENTRY_END */ + + nla_total_size(sizeof(struct in6_addr));/* VXLAN_VNIFILTER_ENTRY_GROUP{6} */ +} + +static int __vnifilter_entry_fill_stats(struct sk_buff *skb, + const struct vxlan_vni_node *vbegin) +{ + struct vxlan_vni_stats vstats; + struct nlattr *vstats_attr; + + vstats_attr = nla_nest_start(skb, VXLAN_VNIFILTER_ENTRY_STATS); + if (!vstats_attr) + goto out_stats_err; 
+ + vxlan_vnifilter_stats_get(vbegin, &vstats); + if (nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_BYTES, + vstats.rx_bytes, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_PKTS, + vstats.rx_packets, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_DROPS, + vstats.rx_drops, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_ERRORS, + vstats.rx_errors, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_BYTES, + vstats.tx_bytes, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_PKTS, + vstats.tx_packets, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_DROPS, + vstats.tx_drops, VNIFILTER_ENTRY_STATS_PAD) || + nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_ERRORS, + vstats.tx_errors, VNIFILTER_ENTRY_STATS_PAD)) + goto out_stats_err; + + nla_nest_end(skb, vstats_attr); + + return 0; + +out_stats_err: + nla_nest_cancel(skb, vstats_attr); + return -EMSGSIZE; +} + +static bool vxlan_fill_vni_filter_entry(struct sk_buff *skb, + struct vxlan_vni_node *vbegin, + struct vxlan_vni_node *vend, + bool fill_stats) +{ + struct nlattr *ventry; + u32 vs = be32_to_cpu(vbegin->vni); + u32 ve = 0; + + if (vbegin != vend) + ve = be32_to_cpu(vend->vni); + + ventry = nla_nest_start(skb, VXLAN_VNIFILTER_ENTRY); + if (!ventry) + return false; + + if (nla_put_u32(skb, VXLAN_VNIFILTER_ENTRY_START, vs)) + goto out_err; + + if (ve && nla_put_u32(skb, VXLAN_VNIFILTER_ENTRY_END, ve)) + goto out_err; + + if (!vxlan_addr_any(&vbegin->remote_ip)) { + if (vbegin->remote_ip.sa.sa_family == AF_INET) { + if (nla_put_in_addr(skb, VXLAN_VNIFILTER_ENTRY_GROUP, + vbegin->remote_ip.sin.sin_addr.s_addr)) + goto out_err; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (nla_put_in6_addr(skb, VXLAN_VNIFILTER_ENTRY_GROUP6, + &vbegin->remote_ip.sin6.sin6_addr)) + goto out_err; +#endif + } + } + + if (fill_stats && __vnifilter_entry_fill_stats(skb, vbegin)) + goto out_err; + + nla_nest_end(skb, ventry); + + return true; + +out_err: + nla_nest_cancel(skb, ventry); + + return false; +} + +static void vxlan_vnifilter_notify(const struct vxlan_dev *vxlan, + struct vxlan_vni_node *vninode, int cmd) +{ + struct tunnel_msg *tmsg; + struct sk_buff *skb; + struct nlmsghdr *nlh; + struct net *net = dev_net(vxlan->dev); + int err = -ENOBUFS; + + skb = nlmsg_new(vxlan_vnifilter_entry_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out_err; + + err = -EMSGSIZE; + nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*tmsg), 0); + if (!nlh) + goto out_err; + tmsg = nlmsg_data(nlh); + memset(tmsg, 0, sizeof(*tmsg)); + tmsg->family = AF_BRIDGE; + tmsg->ifindex = vxlan->dev->ifindex; + + if (!vxlan_fill_vni_filter_entry(skb, vninode, vninode, false)) + goto out_err; + + nlmsg_end(skb, nlh); + rtnl_notify(skb, net, 0, RTNLGRP_TUNNEL, NULL, GFP_KERNEL); + + return; + +out_err: + rtnl_set_sk_err(net, RTNLGRP_TUNNEL, err); + + kfree_skb(skb); +} + +static int vxlan_vnifilter_dump_dev(const struct net_device *dev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct vxlan_vni_node *tmp, *v, *vbegin = NULL, *vend = NULL; + struct vxlan_dev *vxlan = netdev_priv(dev); + struct tunnel_msg *new_tmsg, *tmsg; + int idx = 0, s_idx = cb->args[1]; + struct vxlan_vni_group *vg; + struct nlmsghdr *nlh; + bool dump_stats; + int err = 0; + + if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) + return -EINVAL; + + /* RCU needed because of the vni locking rules (rcu || rtnl) */ + vg = 
rcu_dereference(vxlan->vnigrp); + if (!vg || !vg->num_vnis) + return 0; + + tmsg = nlmsg_data(cb->nlh); + dump_stats = !!(tmsg->flags & TUNNEL_MSG_FLAG_STATS); + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RTM_NEWTUNNEL, sizeof(*new_tmsg), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + new_tmsg = nlmsg_data(nlh); + memset(new_tmsg, 0, sizeof(*new_tmsg)); + new_tmsg->family = PF_BRIDGE; + new_tmsg->ifindex = dev->ifindex; + + list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { + if (idx < s_idx) { + idx++; + continue; + } + if (!vbegin) { + vbegin = v; + vend = v; + continue; + } + if (!dump_stats && vnirange(vend, v) == 1 && + vxlan_addr_equal(&v->remote_ip, &vend->remote_ip)) { + goto update_end; + } else { + if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend, + dump_stats)) { + err = -EMSGSIZE; + break; + } + idx += vnirange(vbegin, vend) + 1; + vbegin = v; + } +update_end: + vend = v; + } + + if (!err && vbegin) { + if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend, dump_stats)) + err = -EMSGSIZE; + } + + cb->args[1] = err ? idx : 0; + + nlmsg_end(skb, nlh); + + return err; +} + +static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx = 0, err = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + struct tunnel_msg *tmsg; + struct net_device *dev; + + tmsg = nlmsg_data(cb->nlh); + + if (tmsg->flags & ~TUNNEL_MSG_VALID_USER_FLAGS) { + NL_SET_ERR_MSG(cb->extack, "Invalid tunnelmsg flags in ancillary header"); + return -EINVAL; + } + + rcu_read_lock(); + if (tmsg->ifindex) { + dev = dev_get_by_index_rcu(net, tmsg->ifindex); + if (!dev) { + err = -ENODEV; + goto out_err; + } + err = vxlan_vnifilter_dump_dev(dev, skb, cb); + /* if the dump completed without an error we return 0 here */ + if (err != -EMSGSIZE) + goto out_err; + } else { + for_each_netdev_rcu(net, dev) { + if (!netif_is_vxlan(dev)) + continue; + if (idx < s_idx) + goto skip; + err = vxlan_vnifilter_dump_dev(dev, skb, cb); + if (err == -EMSGSIZE) + break; +skip: + idx++; + } + } + cb->args[0] = idx; + rcu_read_unlock(); + + return skb->len; + +out_err: + rcu_read_unlock(); + + return err; +} + +static const struct nla_policy vni_filter_entry_policy[VXLAN_VNIFILTER_ENTRY_MAX + 1] = { + [VXLAN_VNIFILTER_ENTRY_START] = { .type = NLA_U32 }, + [VXLAN_VNIFILTER_ENTRY_END] = { .type = NLA_U32 }, + [VXLAN_VNIFILTER_ENTRY_GROUP] = { .type = NLA_BINARY, + .len = sizeof_field(struct iphdr, daddr) }, + [VXLAN_VNIFILTER_ENTRY_GROUP6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, +}; + +static const struct nla_policy vni_filter_policy[VXLAN_VNIFILTER_MAX + 1] = { + [VXLAN_VNIFILTER_ENTRY] = { .type = NLA_NESTED }, +}; + +static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, + union vxlan_addr *old_remote_ip, + union vxlan_addr *remote_ip, + struct netlink_ext_ack *extack) +{ + struct vxlan_rdst *dst = &vxlan->default_dst; + u32 hash_index; + int err = 0; + + hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); + spin_lock_bh(&vxlan->hash_lock[hash_index]); + if (remote_ip && !vxlan_addr_any(remote_ip)) { + err = vxlan_fdb_update(vxlan, all_zeros_mac, + remote_ip, + NUD_REACHABLE | NUD_PERMANENT, + NLM_F_APPEND | NLM_F_CREATE, + vxlan->cfg.dst_port, + vni, + vni, + dst->remote_ifindex, + NTF_SELF, 0, true, extack); + if (err) { + spin_unlock_bh(&vxlan->hash_lock[hash_index]); + return err; + } + } + + if (old_remote_ip && !vxlan_addr_any(old_remote_ip)) { + __vxlan_fdb_delete(vxlan, all_zeros_mac, + *old_remote_ip, 
+ vxlan->cfg.dst_port, + vni, vni, + dst->remote_ifindex, + true); + } + spin_unlock_bh(&vxlan->hash_lock[hash_index]); + + return err; +} + +static int vxlan_vni_update_group(struct vxlan_dev *vxlan, + struct vxlan_vni_node *vninode, + union vxlan_addr *group, + bool create, bool *changed, + struct netlink_ext_ack *extack) +{ + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + struct vxlan_rdst *dst = &vxlan->default_dst; + union vxlan_addr *newrip = NULL, *oldrip = NULL; + union vxlan_addr old_remote_ip; + int ret = 0; + + memcpy(&old_remote_ip, &vninode->remote_ip, sizeof(old_remote_ip)); + + /* if per vni remote ip is not present use vxlan dev + * default dst remote ip for fdb entry + */ + if (group && !vxlan_addr_any(group)) { + newrip = group; + } else { + if (!vxlan_addr_any(&dst->remote_ip)) + newrip = &dst->remote_ip; + } + + /* if old rip exists, and no newrip, + * explicitly delete old rip + */ + if (!newrip && !vxlan_addr_any(&old_remote_ip)) + oldrip = &old_remote_ip; + + if (!newrip && !oldrip) + return 0; + + if (!create && oldrip && newrip && vxlan_addr_equal(oldrip, newrip)) + return 0; + + ret = vxlan_update_default_fdb_entry(vxlan, vninode->vni, + oldrip, newrip, + extack); + if (ret) + goto out; + + if (group) + memcpy(&vninode->remote_ip, group, sizeof(vninode->remote_ip)); + + if (vxlan->dev->flags & IFF_UP) { + if (vxlan_addr_multicast(&old_remote_ip) && + !vxlan_group_used(vn, vxlan, vninode->vni, + &old_remote_ip, + vxlan->default_dst.remote_ifindex)) { + ret = vxlan_igmp_leave(vxlan, &old_remote_ip, + 0); + if (ret) + goto out; + } + + if (vxlan_addr_multicast(&vninode->remote_ip)) { + ret = vxlan_igmp_join(vxlan, &vninode->remote_ip, 0); + if (ret == -EADDRINUSE) + ret = 0; + if (ret) + goto out; + } + } + + *changed = true; + + return 0; +out: + return ret; +} + +int vxlan_vnilist_update_group(struct vxlan_dev *vxlan, + union vxlan_addr *old_remote_ip, + union vxlan_addr *new_remote_ip, + struct netlink_ext_ack *extack) +{ + struct list_head *headp, *hpos; + struct vxlan_vni_group *vg; + struct vxlan_vni_node *vent; + int ret; + + vg = rtnl_dereference(vxlan->vnigrp); + + headp = &vg->vni_list; + list_for_each_prev(hpos, headp) { + vent = list_entry(hpos, struct vxlan_vni_node, vlist); + if (vxlan_addr_any(&vent->remote_ip)) { + ret = vxlan_update_default_fdb_entry(vxlan, vent->vni, + old_remote_ip, + new_remote_ip, + extack); + if (ret) + return ret; + } + } + + return 0; +} + +static void vxlan_vni_delete_group(struct vxlan_dev *vxlan, + struct vxlan_vni_node *vninode) +{ + struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); + struct vxlan_rdst *dst = &vxlan->default_dst; + + /* if per vni remote_ip not present, delete the + * default dst remote_ip previously added for this vni + */ + if (!vxlan_addr_any(&vninode->remote_ip) || + !vxlan_addr_any(&dst->remote_ip)) + __vxlan_fdb_delete(vxlan, all_zeros_mac, + (vxlan_addr_any(&vninode->remote_ip) ? 
+ dst->remote_ip : vninode->remote_ip), + vxlan->cfg.dst_port, + vninode->vni, vninode->vni, + dst->remote_ifindex, + true); + + if (vxlan->dev->flags & IFF_UP) { + if (vxlan_addr_multicast(&vninode->remote_ip) && + !vxlan_group_used(vn, vxlan, vninode->vni, + &vninode->remote_ip, + dst->remote_ifindex)) { + vxlan_igmp_leave(vxlan, &vninode->remote_ip, 0); + } + } +} + +static int vxlan_vni_update(struct vxlan_dev *vxlan, + struct vxlan_vni_group *vg, + __be32 vni, union vxlan_addr *group, + bool *changed, + struct netlink_ext_ack *extack) +{ + struct vxlan_vni_node *vninode; + int ret; + + vninode = rhashtable_lookup_fast(&vg->vni_hash, &vni, + vxlan_vni_rht_params); + if (!vninode) + return 0; + + ret = vxlan_vni_update_group(vxlan, vninode, group, false, changed, + extack); + if (ret) + return ret; + + if (changed) + vxlan_vnifilter_notify(vxlan, vninode, RTM_NEWTUNNEL); + + return 0; +} + +static void __vxlan_vni_add_list(struct vxlan_vni_group *vg, + struct vxlan_vni_node *v) +{ + struct list_head *headp, *hpos; + struct vxlan_vni_node *vent; + + headp = &vg->vni_list; + list_for_each_prev(hpos, headp) { + vent = list_entry(hpos, struct vxlan_vni_node, vlist); + if (be32_to_cpu(v->vni) < be32_to_cpu(vent->vni)) + continue; + else + break; + } + list_add_rcu(&v->vlist, hpos); + vg->num_vnis++; +} + +static void __vxlan_vni_del_list(struct vxlan_vni_group *vg, + struct vxlan_vni_node *v) +{ + list_del_rcu(&v->vlist); + vg->num_vnis--; +} + +static struct vxlan_vni_node *vxlan_vni_alloc(struct vxlan_dev *vxlan, + __be32 vni) +{ + struct vxlan_vni_node *vninode; + + vninode = kzalloc(sizeof(*vninode), GFP_ATOMIC); + if (!vninode) + return NULL; + vninode->stats = netdev_alloc_pcpu_stats(struct vxlan_vni_stats_pcpu); + if (!vninode->stats) { + kfree(vninode); + return NULL; + } + vninode->vni = vni; + vninode->hlist4.vxlan = vxlan; +#if IS_ENABLED(CONFIG_IPV6) + vninode->hlist6.vxlan = vxlan; +#endif + + return vninode; +} + +static int vxlan_vni_add(struct vxlan_dev *vxlan, + struct vxlan_vni_group *vg, + u32 vni, union vxlan_addr *group, + struct netlink_ext_ack *extack) +{ + struct vxlan_vni_node *vninode; + __be32 v = cpu_to_be32(vni); + bool changed = false; + int err = 0; + + if (vxlan_vnifilter_lookup(vxlan, v)) + return vxlan_vni_update(vxlan, vg, v, group, &changed, extack); + + err = vxlan_vni_in_use(vxlan->net, vxlan, &vxlan->cfg, v); + if (err) { + NL_SET_ERR_MSG(extack, "VNI in use"); + return err; + } + + vninode = vxlan_vni_alloc(vxlan, v); + if (!vninode) + return -ENOMEM; + + err = rhashtable_lookup_insert_fast(&vg->vni_hash, + &vninode->vnode, + vxlan_vni_rht_params); + if (err) { + kfree(vninode); + return err; + } + + __vxlan_vni_add_list(vg, vninode); + + if (vxlan->dev->flags & IFF_UP) + vxlan_vs_add_del_vninode(vxlan, vninode, false); + + err = vxlan_vni_update_group(vxlan, vninode, group, true, &changed, + extack); + + if (changed) + vxlan_vnifilter_notify(vxlan, vninode, RTM_NEWTUNNEL); + + return err; +} + +static void vxlan_vni_node_rcu_free(struct rcu_head *rcu) +{ + struct vxlan_vni_node *v; + + v = container_of(rcu, struct vxlan_vni_node, rcu); + free_percpu(v->stats); + kfree(v); +} + +static int vxlan_vni_del(struct vxlan_dev *vxlan, + struct vxlan_vni_group *vg, + u32 vni, struct netlink_ext_ack *extack) +{ + struct vxlan_vni_node *vninode; + __be32 v = cpu_to_be32(vni); + int err = 0; + + vg = rtnl_dereference(vxlan->vnigrp); + + vninode = rhashtable_lookup_fast(&vg->vni_hash, &v, + vxlan_vni_rht_params); + if (!vninode) { + err = -ENOENT; + goto out; + } 
+ + vxlan_vni_delete_group(vxlan, vninode); + + err = rhashtable_remove_fast(&vg->vni_hash, + &vninode->vnode, + vxlan_vni_rht_params); + if (err) + goto out; + + __vxlan_vni_del_list(vg, vninode); + + vxlan_vnifilter_notify(vxlan, vninode, RTM_DELTUNNEL); + + if (vxlan->dev->flags & IFF_UP) + vxlan_vs_add_del_vninode(vxlan, vninode, true); + + call_rcu(&vninode->rcu, vxlan_vni_node_rcu_free); + + return 0; +out: + return err; +} + +static int vxlan_vni_add_del(struct vxlan_dev *vxlan, __u32 start_vni, + __u32 end_vni, union vxlan_addr *group, + int cmd, struct netlink_ext_ack *extack) +{ + struct vxlan_vni_group *vg; + int v, err = 0; + + vg = rtnl_dereference(vxlan->vnigrp); + + for (v = start_vni; v <= end_vni; v++) { + switch (cmd) { + case RTM_NEWTUNNEL: + err = vxlan_vni_add(vxlan, vg, v, group, extack); + break; + case RTM_DELTUNNEL: + err = vxlan_vni_del(vxlan, vg, v, extack); + break; + default: + err = -EOPNOTSUPP; + break; + } + if (err) + goto out; + } + + return 0; +out: + return err; +} + +static int vxlan_process_vni_filter(struct vxlan_dev *vxlan, + struct nlattr *nlvnifilter, + int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *vattrs[VXLAN_VNIFILTER_ENTRY_MAX + 1]; + u32 vni_start = 0, vni_end = 0; + union vxlan_addr group; + int err; + + err = nla_parse_nested(vattrs, + VXLAN_VNIFILTER_ENTRY_MAX, + nlvnifilter, vni_filter_entry_policy, + extack); + if (err) + return err; + + if (vattrs[VXLAN_VNIFILTER_ENTRY_START]) { + vni_start = nla_get_u32(vattrs[VXLAN_VNIFILTER_ENTRY_START]); + vni_end = vni_start; + } + + if (vattrs[VXLAN_VNIFILTER_ENTRY_END]) + vni_end = nla_get_u32(vattrs[VXLAN_VNIFILTER_ENTRY_END]); + + if (!vni_start && !vni_end) { + NL_SET_ERR_MSG_ATTR(extack, nlvnifilter, + "vni start nor end found in vni entry"); + return -EINVAL; + } + + if (vattrs[VXLAN_VNIFILTER_ENTRY_GROUP]) { + group.sin.sin_addr.s_addr = + nla_get_in_addr(vattrs[VXLAN_VNIFILTER_ENTRY_GROUP]); + group.sa.sa_family = AF_INET; + } else if (vattrs[VXLAN_VNIFILTER_ENTRY_GROUP6]) { + group.sin6.sin6_addr = + nla_get_in6_addr(vattrs[VXLAN_VNIFILTER_ENTRY_GROUP6]); + group.sa.sa_family = AF_INET6; + } else { + memset(&group, 0, sizeof(group)); + } + + if (vxlan_addr_multicast(&group) && !vxlan->default_dst.remote_ifindex) { + NL_SET_ERR_MSG(extack, + "Local interface required for multicast remote group"); + + return -EINVAL; + } + + err = vxlan_vni_add_del(vxlan, vni_start, vni_end, &group, cmd, + extack); + if (err) + return err; + + return 0; +} + +void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan) +{ + struct vxlan_vni_node *v, *tmp; + struct vxlan_vni_group *vg; + + vg = rtnl_dereference(vxlan->vnigrp); + list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) { + rhashtable_remove_fast(&vg->vni_hash, &v->vnode, + vxlan_vni_rht_params); + hlist_del_init_rcu(&v->hlist4.hlist); +#if IS_ENABLED(CONFIG_IPV6) + hlist_del_init_rcu(&v->hlist6.hlist); +#endif + __vxlan_vni_del_list(vg, v); + vxlan_vnifilter_notify(vxlan, v, RTM_DELTUNNEL); + call_rcu(&v->rcu, vxlan_vni_node_rcu_free); + } + rhashtable_destroy(&vg->vni_hash); + kfree(vg); +} + +int vxlan_vnigroup_init(struct vxlan_dev *vxlan) +{ + struct vxlan_vni_group *vg; + int ret; + + vg = kzalloc(sizeof(*vg), GFP_KERNEL); + if (!vg) + return -ENOMEM; + ret = rhashtable_init(&vg->vni_hash, &vxlan_vni_rht_params); + if (ret) { + kfree(vg); + return ret; + } + INIT_LIST_HEAD(&vg->vni_list); + rcu_assign_pointer(vxlan->vnigrp, vg); + + return 0; +} + +static int vxlan_vnifilter_process(struct sk_buff *skb, struct nlmsghdr *nlh, + struct 
netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct tunnel_msg *tmsg; + struct vxlan_dev *vxlan; + struct net_device *dev; + struct nlattr *attr; + int err, vnis = 0; + int rem; + + /* this should validate the header and check for remaining bytes */ + err = nlmsg_parse(nlh, sizeof(*tmsg), NULL, VXLAN_VNIFILTER_MAX, + vni_filter_policy, extack); + if (err < 0) + return err; + + tmsg = nlmsg_data(nlh); + dev = __dev_get_by_index(net, tmsg->ifindex); + if (!dev) + return -ENODEV; + + if (!netif_is_vxlan(dev)) { + NL_SET_ERR_MSG_MOD(extack, "The device is not a vxlan device"); + return -EINVAL; + } + + vxlan = netdev_priv(dev); + + if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) + return -EOPNOTSUPP; + + nlmsg_for_each_attr(attr, nlh, sizeof(*tmsg), rem) { + switch (nla_type(attr)) { + case VXLAN_VNIFILTER_ENTRY: + err = vxlan_process_vni_filter(vxlan, attr, + nlh->nlmsg_type, extack); + break; + default: + continue; + } + vnis++; + if (err) + break; + } + + if (!vnis) { + NL_SET_ERR_MSG_MOD(extack, "No vnis found to process"); + err = -EINVAL; + } + + return err; +} + +void vxlan_vnifilter_init(void) +{ + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETTUNNEL, NULL, + vxlan_vnifilter_dump, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWTUNNEL, + vxlan_vnifilter_process, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELTUNNEL, + vxlan_vnifilter_process, NULL, 0); +} + +void vxlan_vnifilter_uninit(void) +{ + rtnl_unregister(PF_BRIDGE, RTM_GETTUNNEL); + rtnl_unregister(PF_BRIDGE, RTM_NEWTUNNEL); + rtnl_unregister(PF_BRIDGE, RTM_DELTUNNEL); +} diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 5a934bebe630..bca5b01af247 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -227,11 +227,56 @@ struct vxlan_config { enum ifla_vxlan_df df; }; +enum { + VXLAN_VNI_STATS_RX, + VXLAN_VNI_STATS_RX_DROPS, + VXLAN_VNI_STATS_RX_ERRORS, + VXLAN_VNI_STATS_TX, + VXLAN_VNI_STATS_TX_DROPS, + VXLAN_VNI_STATS_TX_ERRORS, +}; + +struct vxlan_vni_stats { + u64 rx_packets; + u64 rx_bytes; + u64 rx_drops; + u64 rx_errors; + u64 tx_packets; + u64 tx_bytes; + u64 tx_drops; + u64 tx_errors; +}; + +struct vxlan_vni_stats_pcpu { + struct vxlan_vni_stats stats; + struct u64_stats_sync syncp; +}; + struct vxlan_dev_node { struct hlist_node hlist; struct vxlan_dev *vxlan; }; +struct vxlan_vni_node { + struct rhash_head vnode; + struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ +#if IS_ENABLED(CONFIG_IPV6) + struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */ +#endif + struct list_head vlist; + __be32 vni; + union vxlan_addr remote_ip; /* default remote ip for this vni */ + struct vxlan_vni_stats_pcpu __percpu *stats; + + struct rcu_head rcu; +}; + +struct vxlan_vni_group { + struct rhashtable vni_hash; + struct list_head vni_list; + u32 num_vnis; +}; + /* Pseudo network device */ struct vxlan_dev { struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ @@ -254,6 +299,8 @@ struct vxlan_dev { struct vxlan_config cfg; + struct vxlan_vni_group __rcu *vnigrp; + struct hlist_head fdb_head[FDB_HASH_SIZE]; }; @@ -274,6 +321,7 @@ struct vxlan_dev { #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 #define VXLAN_F_TTL_INHERIT 0x10000 +#define VXLAN_F_VNIFILTER 0x20000 /* Flags that are used in the receive path. 
These flags must match in * order for a socket to be shareable @@ -283,7 +331,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ - VXLAN_F_COLLECT_METADATA) + VXLAN_F_COLLECT_METADATA | \ + VXLAN_F_VNIFILTER) /* Flags that can be set together with VXLAN_F_GPE. */ #define VXLAN_F_ALLOWED_GPE (VXLAN_F_GPE | \ @@ -292,7 +341,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM_TX | \ VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ - VXLAN_F_COLLECT_METADATA) + VXLAN_F_COLLECT_METADATA | \ + VXLAN_F_VNIFILTER) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index be09d2ad4b5d..e315e53125f4 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -713,7 +713,55 @@ enum ipvlan_mode { #define IPVLAN_F_PRIVATE 0x01 #define IPVLAN_F_VEPA 0x02 +/* Tunnel RTM header */ +struct tunnel_msg { + __u8 family; + __u8 flags; + __u16 reserved2; + __u32 ifindex; +}; + /* VXLAN section */ + +/* include statistics in the dump */ +#define TUNNEL_MSG_FLAG_STATS 0x01 + +#define TUNNEL_MSG_VALID_USER_FLAGS TUNNEL_MSG_FLAG_STATS + +/* Embedded inside VXLAN_VNIFILTER_ENTRY_STATS */ +enum { + VNIFILTER_ENTRY_STATS_UNSPEC, + VNIFILTER_ENTRY_STATS_RX_BYTES, + VNIFILTER_ENTRY_STATS_RX_PKTS, + VNIFILTER_ENTRY_STATS_RX_DROPS, + VNIFILTER_ENTRY_STATS_RX_ERRORS, + VNIFILTER_ENTRY_STATS_TX_BYTES, + VNIFILTER_ENTRY_STATS_TX_PKTS, + VNIFILTER_ENTRY_STATS_TX_DROPS, + VNIFILTER_ENTRY_STATS_TX_ERRORS, + VNIFILTER_ENTRY_STATS_PAD, + __VNIFILTER_ENTRY_STATS_MAX +}; +#define VNIFILTER_ENTRY_STATS_MAX (__VNIFILTER_ENTRY_STATS_MAX - 1) + +enum { + VXLAN_VNIFILTER_ENTRY_UNSPEC, + VXLAN_VNIFILTER_ENTRY_START, + VXLAN_VNIFILTER_ENTRY_END, + VXLAN_VNIFILTER_ENTRY_GROUP, + VXLAN_VNIFILTER_ENTRY_GROUP6, + VXLAN_VNIFILTER_ENTRY_STATS, + __VXLAN_VNIFILTER_ENTRY_MAX +}; +#define VXLAN_VNIFILTER_ENTRY_MAX (__VXLAN_VNIFILTER_ENTRY_MAX - 1) + +enum { + VXLAN_VNIFILTER_UNSPEC, + VXLAN_VNIFILTER_ENTRY, + __VXLAN_VNIFILTER_MAX +}; +#define VXLAN_VNIFILTER_MAX (__VXLAN_VNIFILTER_MAX - 1) + enum { IFLA_VXLAN_UNSPEC, IFLA_VXLAN_ID, @@ -745,6 +793,7 @@ enum { IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, + IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 93d934cc4613..0970cb4b1b88 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -185,6 +185,13 @@ enum { RTM_GETNEXTHOPBUCKET, #define RTM_GETNEXTHOPBUCKET RTM_GETNEXTHOPBUCKET + RTM_NEWTUNNEL = 120, +#define RTM_NEWTUNNEL RTM_NEWTUNNEL + RTM_DELTUNNEL, +#define RTM_DELTUNNEL RTM_DELTUNNEL + RTM_GETTUNNEL, +#define RTM_GETTUNNEL RTM_GETTUNNEL + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; @@ -756,6 +763,8 @@ enum rtnetlink_groups { #define RTNLGRP_BRVLAN RTNLGRP_BRVLAN RTNLGRP_MCTP_IFADDR, #define RTNLGRP_MCTP_IFADDR RTNLGRP_MCTP_IFADDR + RTNLGRP_TUNNEL, +#define RTNLGRP_TUNNEL RTNLGRP_TUNNEL __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 94ea2a8b2bb7..6ad3ee02e023 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -91,6 +91,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_NEWNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { 
RTM_DELNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_NEWTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_DELTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_GETTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = @@ -176,7 +179,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOPBUCKET + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWTUNNEL + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; diff --git a/tools/testing/selftests/net/test_vxlan_vnifiltering.sh b/tools/testing/selftests/net/test_vxlan_vnifiltering.sh new file mode 100755 index 000000000000..704997ffc244 --- /dev/null +++ b/tools/testing/selftests/net/test_vxlan_vnifiltering.sh @@ -0,0 +1,579 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test is for checking the VXLAN vni filtering api and +# datapath. +# It simulates two hypervisors running two VMs each using four network +# six namespaces: two for the HVs, four for the VMs. Each VM is +# connected to a separate bridge. The VM's use overlapping vlans and +# hence the separate bridge domain. Each vxlan device is a collect +# metadata device with vni filtering and hence has the ability to +# terminate configured vni's only. + +# +--------------------------------+ +------------------------------------+ +# | vm-11 netns | | vm-21 netns | +# | | | | +# |+------------+ +-------------+ | |+-------------+ +----------------+ | +# ||veth-11.10 | |veth-11.20 | | ||veth-21.10 | | veth-21.20 | | +# ||10.0.10.11/24 |10.0.20.11/24| | ||10.0.10.21/24| | 10.0.20.21/24 | | +# |+------|-----+ +|------------+ | |+-----------|-+ +---|------------+ | +# | | | | | | | | +# | | | | | +------------+ | +# | +------------+ | | | veth-21 | | +# | | veth-11 | | | | | | +# | | | | | +-----|------+ | +# | +-----|------+ | | | | +# | | | | | | +# +------------|-------------------+ +---------------|--------------------+ +# +------------|-----------------------------------------|-------------------+ +# | +-----|------+ +-----|------+ | +# | |vethhv-11 | |vethhv-21 | | +# | +----|-------+ +-----|------+ | +# | +---|---+ +---|--+ | +# | | br1 | | br2 | | +# | +---|---+ +---|--+ | +# | +---|----+ +---|--+ | +# | | vxlan1| |vxlan2| | +# | +--|-----+ +--|---+ | +# | | | | +# | | +---------------------+ | | +# | | |veth0 | | | +# | +---------|172.16.0.1/24 -----------+ | +# | |2002:fee1::1/64 | | +# | hv-1 netns +--------|------------+ | +# +-----------------------------|--------------------------------------------+ +# | +# +-----------------------------|--------------------------------------------+ +# | hv-2 netns +--------|-------------+ | +# | | veth0 | | +# | +------| 172.16.0.2/24 |---+ | +# | | | 2002:fee1::2/64 | | | +# | | | | | | +# | | +----------------------+ | - | +# | | | | +# | +-|-------+ +--------|-+ | +# | | vxlan1 | | vxlan2 | | +# | +----|----+ +---|------+ | +# | +--|--+ +-|---+ | +# | | br1 | | br2 | | +# | +--|--+ +--|--+ | +# | +-----|-------+ +----|-------+ | +# | | vethhv-12 | |vethhv-22 | | +# | +------|------+ +-------|----+ | +# +-----------------|----------------------------|---------------------------+ +# | | +# +-----------------|-----------------+ +--------|---------------------------+ +# | +-------|---+ | | +--|---------+ | +# | | veth-12 | | 
| |veth-22 | | +# | +-|--------|+ | | +--|--------|+ | +# | | | | | | | | +# |+----------|--+ +---|-----------+ | |+-------|-----+ +|---------------+ | +# ||veth-12.10 | |veth-12.20 | | ||veth-22.10 | |veth-22.20 | | +# ||10.0.10.12/24| |10.0.20.12/24 | | ||10.0.10.22/24| |10.0.20.22/24 | | +# |+-------------+ +---------------+ | |+-------------+ +----------------+ | +# | | | | +# | | | | +# | vm-12 netns | |vm-22 netns | +# +-----------------------------------+ +------------------------------------+ +# +# +# This test tests the new vxlan vnifiltering api + +ret=0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# all tests in this script. Can be overridden with -t option +TESTS=" + vxlan_vnifilter_api + vxlan_vnifilter_datapath + vxlan_vnifilter_datapath_pervni + vxlan_vnifilter_datapath_mgroup + vxlan_vnifilter_datapath_mgroup_pervni + vxlan_vnifilter_metadata_and_traditional_mix +" +VERBOSE=0 +PAUSE_ON_FAIL=no +PAUSE=no + +which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf " TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf " TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi + + if [ "${PAUSE}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi +} + +run_cmd() +{ + local cmd="$1" + local out + local stderr="2>/dev/null" + + if [ "$VERBOSE" = "1" ]; then + printf "COMMAND: $cmd\n" + stderr= + fi + + out=$(eval $cmd $stderr) + rc=$? + if [ "$VERBOSE" = "1" -a -n "$out" ]; then + echo " $out" + fi + + return $rc +} + +check_hv_connectivity() { + ip netns exec hv-1 ping -c 1 -W 1 $1 &>/dev/null + sleep 1 + ip netns exec hv-1 ping -c 1 -W 1 $2 &>/dev/null + + return $? +} + +check_vm_connectivity() { + run_cmd "ip netns exec vm-11 ping -c 1 -W 1 10.0.10.12" + log_test $? 0 "VM connectivity over $1 (ipv4 default rdst)" + + run_cmd "ip netns exec vm-21 ping -c 1 -W 1 10.0.10.22" + log_test $? 0 "VM connectivity over $1 (ipv6 default rdst)" +} + +cleanup() { + ip link del veth-hv-1 2>/dev/null || true + ip link del vethhv-11 vethhv-12 vethhv-21 vethhv-22 2>/dev/null || true + + for ns in hv-1 hv-2 vm-11 vm-21 vm-12 vm-22 vm-31 vm-32; do + ip netns del $ns 2>/dev/null || true + done +} + +trap cleanup EXIT + +setup-hv-networking() { + hv=$1 + local1=$2 + mask1=$3 + local2=$4 + mask2=$5 + + ip netns add hv-$hv + ip link set veth-hv-$hv netns hv-$hv + ip -netns hv-$hv link set veth-hv-$hv name veth0 + ip -netns hv-$hv addr add $local1/$mask1 dev veth0 + ip -netns hv-$hv addr add $local2/$mask2 dev veth0 + ip -netns hv-$hv link set veth0 up +} + +# Setups a "VM" simulated by a netns an a veth pair +# example: setup-vm <hvid> <vmid> <brid> <VATTRS> <mcast_for_bum> +# VATTRS = comma separated "<vlan>-<v[46]>-<localip>-<remoteip>-<VTYPE>-<vxlandstport>" +# VTYPE = vxlan device type. 
"default = traditional device, metadata = metadata device +# vnifilter = vnifiltering device, +# vnifilterg = vnifiltering device with per vni group/remote" +# example: +# setup-vm 1 11 1 \ +# 10-v4-172.16.0.1-239.1.1.100-vnifilterg,20-v4-172.16.0.1-239.1.1.100-vnifilterg 1 +# +setup-vm() { + hvid=$1 + vmid=$2 + brid=$3 + vattrs=$4 + mcast=$5 + lastvxlandev="" + + # create bridge + ip -netns hv-$hvid link add br$brid type bridge vlan_filtering 1 vlan_default_pvid 0 \ + mcast_snooping 0 + ip -netns hv-$hvid link set br$brid up + + # create vm namespace and interfaces and connect to hypervisor + # namespace + ip netns add vm-$vmid + hvvethif="vethhv-$vmid" + vmvethif="veth-$vmid" + ip link add $hvvethif type veth peer name $vmvethif + ip link set $hvvethif netns hv-$hvid + ip link set $vmvethif netns vm-$vmid + ip -netns hv-$hvid link set $hvvethif up + ip -netns vm-$vmid link set $vmvethif up + ip -netns hv-$hvid link set $hvvethif master br$brid + + # configure VM vlan/vni filtering on hypervisor + for vmap in $(echo $vattrs | cut -d "," -f1- --output-delimiter=' ') + do + local vid=$(echo $vmap | awk -F'-' '{print ($1)}') + local family=$(echo $vmap | awk -F'-' '{print ($2)}') + local localip=$(echo $vmap | awk -F'-' '{print ($3)}') + local group=$(echo $vmap | awk -F'-' '{print ($4)}') + local vtype=$(echo $vmap | awk -F'-' '{print ($5)}') + local port=$(echo $vmap | awk -F'-' '{print ($6)}') + + ip -netns vm-$vmid link add name $vmvethif.$vid link $vmvethif type vlan id $vid + ip -netns vm-$vmid addr add 10.0.$vid.$vmid/24 dev $vmvethif.$vid + ip -netns vm-$vmid link set $vmvethif.$vid up + + tid=$vid + vxlandev="vxlan$brid" + vxlandevflags="" + + if [[ -n $vtype && $vtype == "metadata" ]]; then + vxlandevflags="$vxlandevflags external" + elif [[ -n $vtype && $vtype == "vnifilter" || $vtype == "vnifilterg" ]]; then + vxlandevflags="$vxlandevflags external vnifilter" + tid=$((vid+brid)) + else + vxlandevflags="$vxlandevflags id $tid" + vxlandev="vxlan$tid" + fi + + if [[ -n $vtype && $vtype != "vnifilterg" ]]; then + if [[ -n "$group" && "$group" != "null" ]]; then + if [ $mcast -eq 1 ]; then + vxlandevflags="$vxlandevflags group $group" + else + vxlandevflags="$vxlandevflags remote $group" + fi + fi + fi + + if [[ -n "$port" && "$port" != "default" ]]; then + vxlandevflags="$vxlandevflags dstport $port" + fi + + # create vxlan device + if [ "$vxlandev" != "$lastvxlandev" ]; then + ip -netns hv-$hvid link add $vxlandev type vxlan local $localip $vxlandevflags dev veth0 2>/dev/null + ip -netns hv-$hvid link set $vxlandev master br$brid + ip -netns hv-$hvid link set $vxlandev up + lastvxlandev=$vxlandev + fi + + # add vlan + bridge -netns hv-$hvid vlan add vid $vid dev $hvvethif + bridge -netns hv-$hvid vlan add vid $vid pvid dev $vxlandev + + # Add bridge vni filter for tx + if [[ -n $vtype && $vtype == "metadata" || $vtype == "vnifilter" || $vtype == "vnifilterg" ]]; then + bridge -netns hv-$hvid link set dev $vxlandev vlan_tunnel on + bridge -netns hv-$hvid vlan add dev $vxlandev vid $vid tunnel_info id $tid + fi + + if [[ -n $vtype && $vtype == "metadata" ]]; then + bridge -netns hv-$hvid fdb add 00:00:00:00:00:00 dev $vxlandev \ + src_vni $tid vni $tid dst $group self + elif [[ -n $vtype && $vtype == "vnifilter" ]]; then + # Add per vni rx filter with 'bridge vni' api + bridge -netns hv-$hvid vni add dev $vxlandev vni $tid + elif [[ -n $vtype && $vtype == "vnifilterg" ]]; then + # Add per vni group config with 'bridge vni' api + if [ -n "$group" ]; then + if [ "$family" == "v4" ]; 
then + if [ $mcast -eq 1 ]; then + bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group $group + else + bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote $group + fi + else + if [ $mcast -eq 1 ]; then + bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group6 $group + else + bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote6 $group + fi + fi + fi + fi + done +} + +setup_vnifilter_api() +{ + ip link add veth-host type veth peer name veth-testns + ip netns add testns + ip link set veth-testns netns testns +} + +cleanup_vnifilter_api() +{ + ip link del veth-host 2>/dev/null || true + ip netns del testns 2>/dev/null || true +} + +# tests vxlan filtering api +vxlan_vnifilter_api() +{ + hv1addr1="172.16.0.1" + hv2addr1="172.16.0.2" + hv1addr2="2002:fee1::1" + hv2addr2="2002:fee1::2" + localip="172.16.0.1" + group="239.1.1.101" + + cleanup_vnifilter_api &>/dev/null + setup_vnifilter_api + + # Duplicate vni test + # create non-vnifiltering traditional vni device + run_cmd "ip -netns testns link add vxlan100 type vxlan id 100 local $localip dev veth-testns dstport 4789" + log_test $? 0 "Create traditional vxlan device" + + # create vni filtering device + run_cmd "ip -netns testns link add vxlan-ext1 type vxlan vnifilter local $localip dev veth-testns dstport 4789" + log_test $? 1 "Cannot create vnifilter device without external flag" + + run_cmd "ip -netns testns link add vxlan-ext1 type vxlan external vnifilter local $localip dev veth-testns dstport 4789" + log_test $? 0 "Creating external vxlan device with vnifilter flag" + + run_cmd "bridge -netns testns vni add dev vxlan-ext1 vni 100" + log_test $? 0 "Cannot set in-use vni id on vnifiltering device" + + run_cmd "bridge -netns testns vni add dev vxlan-ext1 vni 200" + log_test $? 0 "Set new vni id on vnifiltering device" + + run_cmd "ip -netns testns link add vxlan-ext2 type vxlan external vnifilter local $localip dev veth-testns dstport 4789" + log_test $? 0 "Create second external vxlan device with vnifilter flag" + + run_cmd "bridge -netns testns vni add dev vxlan-ext2 vni 200" + log_test $? 255 "Cannot set in-use vni id on vnifiltering device" + + run_cmd "bridge -netns testns vni add dev vxlan-ext2 vni 300" + log_test $? 0 "Set new vni id on vnifiltering device" + + # check in bridge vni show + run_cmd "bridge -netns testns vni add dev vxlan-ext2 vni 300" + log_test $? 0 "Update vni id on vnifiltering device" + + run_cmd "bridge -netns testns vni add dev vxlan-ext2 vni 400" + log_test $? 0 "Add new vni id on vnifiltering device" + + # add multicast group per vni + run_cmd "bridge -netns testns vni add dev vxlan-ext1 vni 200 group $group" + log_test $? 0 "Set multicast group on existing vni" + + # add multicast group per vni + run_cmd "bridge -netns testns vni add dev vxlan-ext2 vni 300 group $group" + log_test $? 0 "Set multicast group on existing vni" + + # set vnifilter on an existing external vxlan device + run_cmd "ip -netns testns link set dev vxlan-ext1 type vxlan external vnifilter" + log_test $? 2 "Cannot set vnifilter flag on a device" + + # change vxlan vnifilter flag + run_cmd "ip -netns testns link set dev vxlan-ext1 type vxlan external novnifilter" + log_test $? 
2 "Cannot unset vnifilter flag on a device" +} + +# Sanity test vnifilter datapath +# vnifilter vnis inherit BUM group from +# vxlan device +vxlan_vnifilter_datapath() +{ + hv1addr1="172.16.0.1" + hv2addr1="172.16.0.2" + hv1addr2="2002:fee1::1" + hv2addr2="2002:fee1::2" + + ip link add veth-hv-1 type veth peer name veth-hv-2 + setup-hv-networking 1 $hv1addr1 24 $hv1addr2 64 $hv2addr1 $hv2addr2 + setup-hv-networking 2 $hv2addr1 24 $hv2addr2 64 $hv1addr1 $hv1addr2 + + check_hv_connectivity hv2addr1 hv2addr2 + + setup-vm 1 11 1 10-v4-$hv1addr1-$hv2addr1-vnifilter,20-v4-$hv1addr1-$hv2addr1-vnifilter 0 + setup-vm 1 21 2 10-v6-$hv1addr2-$hv2addr2-vnifilter,20-v6-$hv1addr2-$hv2addr2-vnifilter 0 + + setup-vm 2 12 1 10-v4-$hv2addr1-$hv1addr1-vnifilter,20-v4-$hv2addr1-$hv1addr1-vnifilter 0 + setup-vm 2 22 2 10-v6-$hv2addr2-$hv1addr2-vnifilter,20-v6-$hv2addr2-$hv1addr2-vnifilter 0 + + check_vm_connectivity "vnifiltering vxlan" +} + +# Sanity test vnifilter datapath +# with vnifilter per vni configured BUM +# group/remote +vxlan_vnifilter_datapath_pervni() +{ + hv1addr1="172.16.0.1" + hv2addr1="172.16.0.2" + hv1addr2="2002:fee1::1" + hv2addr2="2002:fee1::2" + + ip link add veth-hv-1 type veth peer name veth-hv-2 + setup-hv-networking 1 $hv1addr1 24 $hv1addr2 64 + setup-hv-networking 2 $hv2addr1 24 $hv2addr2 64 + + check_hv_connectivity hv2addr1 hv2addr2 + + setup-vm 1 11 1 10-v4-$hv1addr1-$hv2addr1-vnifilterg,20-v4-$hv1addr1-$hv2addr1-vnifilterg 0 + setup-vm 1 21 2 10-v6-$hv1addr2-$hv2addr2-vnifilterg,20-v6-$hv1addr2-$hv2addr2-vnifilterg 0 + + setup-vm 2 12 1 10-v4-$hv2addr1-$hv1addr1-vnifilterg,20-v4-$hv2addr1-$hv1addr1-vnifilterg 0 + setup-vm 2 22 2 10-v6-$hv2addr2-$hv1addr2-vnifilterg,20-v6-$hv2addr2-$hv1addr2-vnifilterg 0 + + check_vm_connectivity "vnifiltering vxlan pervni remote" +} + + +vxlan_vnifilter_datapath_mgroup() +{ + hv1addr1="172.16.0.1" + hv2addr1="172.16.0.2" + hv1addr2="2002:fee1::1" + hv2addr2="2002:fee1::2" + group="239.1.1.100" + group6="ff07::1" + + ip link add veth-hv-1 type veth peer name veth-hv-2 + setup-hv-networking 1 $hv1addr1 24 $hv1addr2 64 + setup-hv-networking 2 $hv2addr1 24 $hv2addr2 64 + + check_hv_connectivity hv2addr1 hv2addr2 + + setup-vm 1 11 1 10-v4-$hv1addr1-$group-vnifilter,20-v4-$hv1addr1-$group-vnifilter 1 + setup-vm 1 21 2 "10-v6-$hv1addr2-$group6-vnifilter,20-v6-$hv1addr2-$group6-vnifilter" 1 + + setup-vm 2 12 1 10-v4-$hv2addr1-$group-vnifilter,20-v4-$hv2addr1-$group-vnifilter 1 + setup-vm 2 22 2 10-v6-$hv2addr2-$group6-vnifilter,20-v6-$hv2addr2-$group6-vnifilter 1 + + check_vm_connectivity "vnifiltering vxlan mgroup" +} + +vxlan_vnifilter_datapath_mgroup_pervni() +{ + hv1addr1="172.16.0.1" + hv2addr1="172.16.0.2" + hv1addr2="2002:fee1::1" + hv2addr2="2002:fee1::2" + group="239.1.1.100" + group6="ff07::1" + + ip link add veth-hv-1 type veth peer name veth-hv-2 + setup-hv-networking 1 $hv1addr1 24 $hv1addr2 64 + setup-hv-networking 2 $hv2addr1 24 $hv2addr2 64 + + check_hv_connectivity hv2addr1 hv2addr2 + + setup-vm 1 11 1 10-v4-$hv1addr1-$group-vnifilterg,20-v4-$hv1addr1-$group-vnifilterg 1 + setup-vm 1 21 2 10-v6-$hv1addr2-$group6-vnifilterg,20-v6-$hv1addr2-$group6-vnifilterg 1 + + setup-vm 2 12 1 10-v4-$hv2addr1-$group-vnifilterg,20-v4-$hv2addr1-$group-vnifilterg 1 + setup-vm 2 22 2 10-v6-$hv2addr2-$group6-vnifilterg,20-v6-$hv2addr2-$group6-vnifilterg 1 + + check_vm_connectivity "vnifiltering vxlan pervni mgroup" +} + +vxlan_vnifilter_metadata_and_traditional_mix() +{ + hv1addr1="172.16.0.1" + hv2addr1="172.16.0.2" + hv1addr2="2002:fee1::1" + 
hv2addr2="2002:fee1::2" + + ip link add veth-hv-1 type veth peer name veth-hv-2 + setup-hv-networking 1 $hv1addr1 24 $hv1addr2 64 + setup-hv-networking 2 $hv2addr1 24 $hv2addr2 64 + + check_hv_connectivity hv2addr1 hv2addr2 + + setup-vm 1 11 1 10-v4-$hv1addr1-$hv2addr1-vnifilter,20-v4-$hv1addr1-$hv2addr1-vnifilter 0 + setup-vm 1 21 2 10-v6-$hv1addr2-$hv2addr2-vnifilter,20-v6-$hv1addr2-$hv2addr2-vnifilter 0 + setup-vm 1 31 3 30-v4-$hv1addr1-$hv2addr1-default-4790,40-v6-$hv1addr2-$hv2addr2-default-4790,50-v4-$hv1addr1-$hv2addr1-metadata-4791 0 + + + setup-vm 2 12 1 10-v4-$hv2addr1-$hv1addr1-vnifilter,20-v4-$hv2addr1-$hv1addr1-vnifilter 0 + setup-vm 2 22 2 10-v6-$hv2addr2-$hv1addr2-vnifilter,20-v6-$hv2addr2-$hv1addr2-vnifilter 0 + setup-vm 2 32 3 30-v4-$hv2addr1-$hv1addr1-default-4790,40-v6-$hv2addr2-$hv1addr2-default-4790,50-v4-$hv2addr1-$hv1addr1-metadata-4791 0 + + check_vm_connectivity "vnifiltering vxlan pervni remote mix" + + # check VM connectivity over traditional/non-vxlan filtering vxlan devices + run_cmd "ip netns exec vm-31 ping -c 1 -W 1 10.0.30.32" + log_test $? 0 "VM connectivity over traditional vxlan (ipv4 default rdst)" + + run_cmd "ip netns exec vm-31 ping -c 1 -W 1 10.0.40.32" + log_test $? 0 "VM connectivity over traditional vxlan (ipv6 default rdst)" + + run_cmd "ip netns exec vm-31 ping -c 1 -W 1 10.0.50.32" + log_test $? 0 "VM connectivity over metadata nonfiltering vxlan (ipv4 default rdst)" +} + +while getopts :t:pP46hv o +do + case $o in + t) TESTS=$OPTARG;; + p) PAUSE_ON_FAIL=yes;; + P) PAUSE=yes;; + v) VERBOSE=$(($VERBOSE + 1));; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +# make sure we don't pause twice +[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip; +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip link help vxlan 2>&1 | grep -q "vnifilter" +if [ $? -ne 0 ]; then + echo "SKIP: iproute2 too old, missing vxlan dev vnifilter setting" + sync + exit $ksft_skip +fi + +bridge vni help 2>&1 | grep -q "Usage: bridge vni" +if [ $? -ne 0 ]; then + echo "SKIP: iproute2 bridge lacks vxlan vnifiltering support" + exit $ksft_skip +fi + +# start clean +cleanup &> /dev/null + +for t in $TESTS +do + case $t in + none) setup; exit 0;; + *) $t; cleanup;; + esac +done + +if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} +fi + +exit $ret |