diff options
Diffstat (limited to 'drivers/net/tun.c')
-rw-r--r-- | drivers/net/tun.c | 161 |
1 files changed, 88 insertions, 73 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 504f7f1cad94..b1038c0e2240 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -109,11 +109,11 @@ struct tap_filter { unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; -/* 1024 is probably a high enough limit: modern hypervisors seem to support on - * the order of 100-200 CPUs so this leaves us some breathing space if we want - * to match a queue per guest CPU. - */ -#define MAX_TAP_QUEUES 1024 +/* DEFAULT_MAX_NUM_RSS_QUEUES were choosed to let the rx/tx queues allocated for + * the netdevice to be fit in one page. So we can make sure the success of + * memory allocation. TODO: increase the limit. */ +#define MAX_TAP_QUEUES DEFAULT_MAX_NUM_RSS_QUEUES +#define MAX_TAP_FLOWS 4096 #define TUN_FLOW_EXPIRE (3 * HZ) @@ -180,12 +180,13 @@ struct tun_struct { int debug; #endif spinlock_t lock; - struct kmem_cache *flow_cache; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; + void *security; + u32 flow_count; }; static inline u32 tun_hashfn(u32 rxhash) @@ -209,8 +210,8 @@ static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, struct hlist_head *head, u32 rxhash, u16 queue_index) { - struct tun_flow_entry *e = kmem_cache_alloc(tun->flow_cache, - GFP_ATOMIC); + struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (e) { tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n", rxhash, queue_index); @@ -219,23 +220,18 @@ static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, e->queue_index = queue_index; e->tun = tun; hlist_add_head_rcu(&e->hash_link, head); + ++tun->flow_count; } return e; } -static void tun_flow_free(struct rcu_head *head) -{ - struct tun_flow_entry *e - = container_of(head, struct tun_flow_entry, rcu); - kmem_cache_free(e->tun->flow_cache, e); -} - static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); hlist_del_rcu(&e->hash_link); - call_rcu(&e->rcu, tun_flow_free); + kfree_rcu(e, rcu); + --tun->flow_count; } static void tun_flow_flush(struct tun_struct *tun) @@ -302,11 +298,12 @@ static void tun_flow_cleanup(unsigned long data) } static void tun_flow_update(struct tun_struct *tun, u32 rxhash, - u16 queue_index) + struct tun_file *tfile) { struct hlist_head *head; struct tun_flow_entry *e; unsigned long delay = tun->ageing_time; + u16 queue_index = tfile->queue_index; if (!rxhash) return; @@ -315,7 +312,9 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash, rcu_read_lock(); - if (tun->numqueues == 1) + /* We may get a very small possibility of OOO during switching, not + * worth to optimize.*/ + if (tun->numqueues == 1 || tfile->detached) goto unlock; e = tun_flow_find(head, rxhash); @@ -325,7 +324,8 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash, e->updated = jiffies; } else { spin_lock_bh(&tun->lock); - if (!tun_flow_find(head, rxhash)) + if (!tun_flow_find(head, rxhash) && + tun->flow_count < MAX_TAP_FLOWS) tun_flow_create(tun, head, rxhash, queue_index); if (!timer_pending(&tun->flow_gc_timer)) @@ -412,24 +412,23 @@ static void __tun_detach(struct tun_file *tfile, bool clean) struct tun_struct *tun; struct net_device *dev; - tun = rcu_dereference_protected(tfile->tun, - lockdep_rtnl_is_held()); - if (tun) { + tun = rtnl_dereference(tfile->tun); + + if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); dev = tun->dev; rcu_assign_pointer(tun->tfiles[index], tun->tfiles[tun->numqueues - 1]); - rcu_assign_pointer(tfile->tun, NULL); - ntfile = rcu_dereference_protected(tun->tfiles[index], - lockdep_rtnl_is_held()); + ntfile = rtnl_dereference(tun->tfiles[index]); ntfile->queue_index = index; --tun->numqueues; - if (clean) + if (clean) { + rcu_assign_pointer(tfile->tun, NULL); sock_put(&tfile->sk); - else + } else tun_disable_queue(tun, tfile); synchronize_net(); @@ -437,14 +436,19 @@ static void __tun_detach(struct tun_file *tfile, bool clean) /* Drop read queue */ skb_queue_purge(&tfile->sk.sk_receive_queue); tun_set_real_num_queues(tun); - } else if (tfile->detached && clean) + } else if (tfile->detached && clean) { tun = tun_enable_queue(tfile); + sock_put(&tfile->sk); + } if (clean) { - if (tun && tun->numqueues == 0 && tun->numdisabled == 0 && - !(tun->flags & TUN_PERSIST)) - if (tun->dev->reg_state == NETREG_REGISTERED) + if (tun && tun->numqueues == 0 && tun->numdisabled == 0) { + netif_carrier_off(tun->dev); + + if (!(tun->flags & TUN_PERSIST) && + tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); + } BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags)); @@ -466,19 +470,21 @@ static void tun_detach_all(struct net_device *dev) int i, n = tun->numqueues; for (i = 0; i < n; i++) { - tfile = rcu_dereference_protected(tun->tfiles[i], - lockdep_rtnl_is_held()); + tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); wake_up_all(&tfile->wq.wait); rcu_assign_pointer(tfile->tun, NULL); --tun->numqueues; } + list_for_each_entry(tfile, &tun->disabled, next) { + wake_up_all(&tfile->wq.wait); + rcu_assign_pointer(tfile->tun, NULL); + } BUG_ON(tun->numqueues != 0); synchronize_net(); for (i = 0; i < n; i++) { - tfile = rcu_dereference_protected(tun->tfiles[i], - lockdep_rtnl_is_held()); + tfile = rtnl_dereference(tun->tfiles[i]); /* Drop read queue */ skb_queue_purge(&tfile->sk.sk_receive_queue); sock_put(&tfile->sk); @@ -489,6 +495,9 @@ static void tun_detach_all(struct net_device *dev) sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); + + if (tun->flags & TUN_PERSIST) + module_put(THIS_MODULE); } static int tun_attach(struct tun_struct *tun, struct file *file) @@ -496,8 +505,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file) struct tun_file *tfile = file->private_data; int err; + err = security_tun_dev_attach(tfile->socket.sk, tun->security); + if (err < 0) + goto out; + err = -EINVAL; - if (rcu_dereference_protected(tfile->tun, lockdep_rtnl_is_held())) + if (rtnl_dereference(tfile->tun) && !tfile->detached) goto out; err = -EBUSY; @@ -833,12 +846,6 @@ static int tun_flow_init(struct tun_struct *tun) { int i; - tun->flow_cache = kmem_cache_create("tun_flow_cache", - sizeof(struct tun_flow_entry), 0, 0, - NULL); - if (!tun->flow_cache) - return -ENOMEM; - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) INIT_HLIST_HEAD(&tun->flows[i]); @@ -854,10 +861,6 @@ static void tun_flow_uninit(struct tun_struct *tun) { del_timer_sync(&tun->flow_gc_timer); tun_flow_flush(tun); - - /* Wait for completion of call_rcu()'s */ - rcu_barrier(); - kmem_cache_destroy(tun->flow_cache); } /* Initialize net device. */ @@ -1016,6 +1019,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, skb->data_len += len; skb->len += len; skb->truesize += truesize; + skb_shinfo(skb)->gso_type |= SKB_GSO_SHARED_FRAG; atomic_add(truesize, &skb->sk->sk_wmem_alloc); while (len) { int off = base & ~PAGE_MASK; @@ -1161,16 +1165,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) { + unsigned short gso_type = 0; + pr_debug("GSO!\n"); switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + gso_type = SKB_GSO_TCPV4; break; case VIRTIO_NET_HDR_GSO_TCPV6: - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + gso_type = SKB_GSO_TCPV6; break; case VIRTIO_NET_HDR_GSO_UDP: - skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + gso_type = SKB_GSO_UDP; break; default: tun->dev->stats.rx_frame_errors++; @@ -1179,9 +1185,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + gso_type |= SKB_GSO_TCP_ECN; skb_shinfo(skb)->gso_size = gso.gso_size; + skb_shinfo(skb)->gso_type |= gso_type; if (skb_shinfo(skb)->gso_size == 0) { tun->dev->stats.rx_frame_errors++; kfree_skb(skb); @@ -1206,7 +1213,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, tun->dev->stats.rx_packets++; tun->dev->stats.rx_bytes += len; - tun_flow_update(tun, rxhash, tfile->queue_index); + tun_flow_update(tun, rxhash, tfile); return total_len; } @@ -1389,6 +1396,7 @@ static void tun_free_netdev(struct net_device *dev) BUG_ON(!(list_empty(&tun->disabled))); tun_flow_uninit(tun); + security_tun_dev_free_security(tun->security); free_netdev(dev); } @@ -1562,6 +1570,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) struct net_device *dev; int err; + if (tfile->detached) + return -EINVAL; + dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) @@ -1575,7 +1586,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (tun_not_capable(tun)) return -EPERM; - err = security_tun_dev_attach(tfile->socket.sk); + err = security_tun_dev_open(tun->security); if (err < 0) return err; @@ -1590,6 +1601,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) else { char *name; unsigned long flags = 0; + int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ? + MAX_TAP_QUEUES : 1; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -1613,8 +1626,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) name = ifr->ifr_name; dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, - tun_setup, - MAX_TAP_QUEUES, MAX_TAP_QUEUES); + tun_setup, queues, queues); + if (!dev) return -ENOMEM; @@ -1632,7 +1645,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) spin_lock_init(&tun->lock); - security_tun_dev_post_create(&tfile->sk); + err = security_tun_dev_alloc_security(&tun->security); + if (err < 0) + goto err_free_dev; tun_net_init(dev); @@ -1657,10 +1672,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) device_create_file(&tun->dev->dev, &dev_attr_owner) || device_create_file(&tun->dev->dev, &dev_attr_group)) pr_err("Failed to create tun sysfs files\n"); - - netif_carrier_on(tun->dev); } + netif_carrier_on(tun->dev); + tun_debug(KERN_INFO, tun, "tun_set_iff\n"); if (ifr->ifr_flags & IFF_NO_PI) @@ -1756,8 +1771,7 @@ static void tun_detach_filter(struct tun_struct *tun, int n) struct tun_file *tfile; for (i = 0; i < n; i++) { - tfile = rcu_dereference_protected(tun->tfiles[i], - lockdep_rtnl_is_held()); + tfile = rtnl_dereference(tun->tfiles[i]); sk_detach_filter(tfile->socket.sk); } @@ -1770,8 +1784,7 @@ static int tun_attach_filter(struct tun_struct *tun) struct tun_file *tfile; for (i = 0; i < tun->numqueues; i++) { - tfile = rcu_dereference_protected(tun->tfiles[i], - lockdep_rtnl_is_held()); + tfile = rtnl_dereference(tun->tfiles[i]); ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); if (ret) { tun_detach_filter(tun, i); @@ -1789,8 +1802,7 @@ static void tun_set_sndbuf(struct tun_struct *tun) int i; for (i = 0; i < tun->numqueues; i++) { - tfile = rcu_dereference_protected(tun->tfiles[i], - lockdep_rtnl_is_held()); + tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_sndbuf = tun->sndbuf; } } @@ -1805,22 +1817,24 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) if (ifr->ifr_flags & IFF_ATTACH_QUEUE) { tun = tfile->detached; - if (!tun) + if (!tun) { ret = -EINVAL; - else if (tun_not_capable(tun)) - ret = -EPERM; - else - ret = tun_attach(tun, file); + goto unlock; + } + ret = security_tun_dev_attach_queue(tun->security); + if (ret < 0) + goto unlock; + ret = tun_attach(tun, file); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { - tun = rcu_dereference_protected(tfile->tun, - lockdep_rtnl_is_held()); - if (!tun || !(tun->flags & TUN_TAP_MQ)) + tun = rtnl_dereference(tfile->tun); + if (!tun || !(tun->flags & TUN_TAP_MQ) || tfile->detached) ret = -EINVAL; else __tun_detach(tfile, false); } else ret = -EINVAL; +unlock: rtnl_unlock(); return ret; } @@ -1898,10 +1912,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, /* Disable/Enable persist mode. Keep an extra reference to the * module to prevent the module being unprobed. */ - if (arg) { + if (arg && !(tun->flags & TUN_PERSIST)) { tun->flags |= TUN_PERSIST; __module_get(THIS_MODULE); - } else { + } + if (!arg && (tun->flags & TUN_PERSIST)) { tun->flags &= ~TUN_PERSIST; module_put(THIS_MODULE); } |