summaryrefslogtreecommitdiff
path: root/drivers/net/tun.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/tun.c')
-rw-r--r--drivers/net/tun.c384
1 files changed, 244 insertions, 140 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ebd07ad82431..060135ceaf0e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -113,7 +113,6 @@ do { \
} while (0)
#endif
-#define TUN_HEADROOM 256
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
/* TUN device flags */
@@ -181,6 +180,7 @@ struct tun_file {
};
struct napi_struct napi;
bool napi_enabled;
+ bool napi_frags_enabled;
struct mutex napi_mutex; /* Protects access to the above napi */
struct list_head next;
struct tun_struct *detached;
@@ -313,32 +313,32 @@ static int tun_napi_poll(struct napi_struct *napi, int budget)
}
static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
- bool napi_en)
+ bool napi_en, bool napi_frags)
{
tfile->napi_enabled = napi_en;
+ tfile->napi_frags_enabled = napi_en && napi_frags;
if (napi_en) {
netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
NAPI_POLL_WEIGHT);
napi_enable(&tfile->napi);
- mutex_init(&tfile->napi_mutex);
}
}
-static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile)
+static void tun_napi_disable(struct tun_file *tfile)
{
if (tfile->napi_enabled)
napi_disable(&tfile->napi);
}
-static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
+static void tun_napi_del(struct tun_file *tfile)
{
if (tfile->napi_enabled)
netif_napi_del(&tfile->napi);
}
-static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+static bool tun_napi_frags_enabled(const struct tun_file *tfile)
{
- return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+ return tfile->napi_frags_enabled;
}
#ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -562,12 +562,11 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
e->rps_rxhash = hash;
}
-/* We try to identify a flow through its rxhash first. The reason that
+/* We try to identify a flow through its rxhash. The reason that
* we do not check rxq no. is because some cards(e.g 82599), chooses
* the rxq based on the txq where the last packet of the flow comes. As
* the userspace application move between processors, we may get a
- * different rxq no. here. If we could not get rxhash, then we would
- * hope the rxq no. may help here.
+ * different rxq no. here.
*/
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
@@ -578,18 +577,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
numqueues = READ_ONCE(tun->numqueues);
txq = __skb_get_hash_symmetric(skb);
- if (txq) {
- e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
- if (e) {
- tun_flow_save_rps_rxhash(e, txq);
- txq = e->queue_index;
- } else
- /* use multiply and shift instead of expensive divide */
- txq = ((u64)txq * numqueues) >> 32;
- } else if (likely(skb_rx_queue_recorded(skb))) {
- txq = skb_get_rx_queue(skb);
- while (unlikely(txq >= numqueues))
- txq -= numqueues;
+ e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
+ if (e) {
+ tun_flow_save_rps_rxhash(e, txq);
+ txq = e->queue_index;
+ } else {
+ /* use multiply and shift instead of expensive divide */
+ txq = ((u64)txq * numqueues) >> 32;
}
return txq;
@@ -690,8 +684,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun = rtnl_dereference(tfile->tun);
if (tun && clean) {
- tun_napi_disable(tun, tfile);
- tun_napi_del(tun, tfile);
+ tun_napi_disable(tfile);
+ tun_napi_del(tfile);
}
if (tun && !tfile->detached) {
@@ -758,7 +752,7 @@ static void tun_detach_all(struct net_device *dev)
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
BUG_ON(!tfile);
- tun_napi_disable(tun, tfile);
+ tun_napi_disable(tfile);
tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
RCU_INIT_POINTER(tfile->tun, NULL);
@@ -774,7 +768,7 @@ static void tun_detach_all(struct net_device *dev)
synchronize_net();
for (i = 0; i < n; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
- tun_napi_del(tun, tfile);
+ tun_napi_del(tfile);
/* Drop read queue */
tun_queue_purge(tfile);
xdp_rxq_info_unreg(&tfile->xdp_rxq);
@@ -793,7 +787,7 @@ static void tun_detach_all(struct net_device *dev)
}
static int tun_attach(struct tun_struct *tun, struct file *file,
- bool skip_filter, bool napi)
+ bool skip_filter, bool napi, bool napi_frags)
{
struct tun_file *tfile = file->private_data;
struct net_device *dev = tun->dev;
@@ -866,9 +860,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
tun_enable_queue(tfile);
} else {
sock_hold(&tfile->sk);
- tun_napi_init(tun, tfile, napi);
+ tun_napi_init(tun, tfile, napi, napi_frags);
}
+ if (rtnl_dereference(tun->xdp_prog))
+ sock_set_flag(&tfile->sk, SOCK_XDP);
+
tun_set_real_num_queues(tun);
/* device is allowed to go away first, so no need to hold extra
@@ -1044,16 +1041,13 @@ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
/* Select queue was not called for the skbuff, so we extract the
* RPS hash and save it into the flow_table here.
*/
+ struct tun_flow_entry *e;
__u32 rxhash;
rxhash = __skb_get_hash_symmetric(skb);
- if (rxhash) {
- struct tun_flow_entry *e;
- e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)],
- rxhash);
- if (e)
- tun_flow_save_rps_rxhash(e, rxhash);
- }
+ e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
+ if (e)
+ tun_flow_save_rps_rxhash(e, rxhash);
}
#endif
}
@@ -1153,43 +1147,6 @@ static netdev_features_t tun_net_fix_features(struct net_device *dev,
return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
}
-#ifdef CONFIG_NET_POLL_CONTROLLER
-static void tun_poll_controller(struct net_device *dev)
-{
- /*
- * Tun only receives frames when:
- * 1) the char device endpoint gets data from user space
- * 2) the tun socket gets a sendmsg call from user space
- * If NAPI is not enabled, since both of those are synchronous
- * operations, we are guaranteed never to have pending data when we poll
- * for it so there is nothing to do here but return.
- * We need this though so netpoll recognizes us as an interface that
- * supports polling, which enables bridge devices in virt setups to
- * still use netconsole
- * If NAPI is enabled, however, we need to schedule polling for all
- * queues unless we are using napi_gro_frags(), which we call in
- * process context and not in NAPI context.
- */
- struct tun_struct *tun = netdev_priv(dev);
-
- if (tun->flags & IFF_NAPI) {
- struct tun_file *tfile;
- int i;
-
- if (tun_napi_frags_enabled(tun))
- return;
-
- rcu_read_lock();
- for (i = 0; i < tun->numqueues; i++) {
- tfile = rcu_dereference(tun->tfiles[i]);
- if (tfile->napi_enabled)
- napi_schedule(&tfile->napi);
- }
- rcu_read_unlock();
- }
- return;
-}
-#endif
static void tun_set_headroom(struct net_device *dev, int new_hr)
{
@@ -1241,13 +1198,29 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
struct netlink_ext_ack *extack)
{
struct tun_struct *tun = netdev_priv(dev);
+ struct tun_file *tfile;
struct bpf_prog *old_prog;
+ int i;
old_prog = rtnl_dereference(tun->xdp_prog);
rcu_assign_pointer(tun->xdp_prog, prog);
if (old_prog)
bpf_prog_put(old_prog);
+ for (i = 0; i < tun->numqueues; i++) {
+ tfile = rtnl_dereference(tun->tfiles[i]);
+ if (prog)
+ sock_set_flag(&tfile->sk, SOCK_XDP);
+ else
+ sock_reset_flag(&tfile->sk, SOCK_XDP);
+ }
+ list_for_each_entry(tfile, &tun->disabled, next) {
+ if (prog)
+ sock_set_flag(&tfile->sk, SOCK_XDP);
+ else
+ sock_reset_flag(&tfile->sk, SOCK_XDP);
+ }
+
return 0;
}
@@ -1283,9 +1256,6 @@ static const struct net_device_ops tun_netdev_ops = {
.ndo_start_xmit = tun_net_xmit,
.ndo_fix_features = tun_net_fix_features,
.ndo_select_queue = tun_select_queue,
-#ifdef CONFIG_NET_POLL_CONTROLLER
- .ndo_poll_controller = tun_poll_controller,
-#endif
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
};
@@ -1365,9 +1335,6 @@ static const struct net_device_ops tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_select_queue = tun_select_queue,
-#ifdef CONFIG_NET_POLL_CONTROLLER
- .ndo_poll_controller = tun_poll_controller,
-#endif
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
@@ -1617,6 +1584,55 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
return true;
}
+static struct sk_buff *__tun_build_skb(struct page_frag *alloc_frag, char *buf,
+ int buflen, int len, int pad)
+{
+ struct sk_buff *skb = build_skb(buf, buflen);
+
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ skb_reserve(skb, pad);
+ skb_put(skb, len);
+
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
+ return skb;
+}
+
+static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+ struct xdp_buff *xdp, u32 act)
+{
+ int err;
+
+ switch (act) {
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+ if (err)
+ return err;
+ break;
+ case XDP_TX:
+ err = tun_xdp_tx(tun->dev, xdp);
+ if (err < 0)
+ return err;
+ break;
+ case XDP_PASS:
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(tun->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ break;
+ }
+
+ return act;
+}
+
static struct sk_buff *tun_build_skb(struct tun_struct *tun,
struct tun_file *tfile,
struct iov_iter *from,
@@ -1624,18 +1640,17 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
int len, int *skb_xdp)
{
struct page_frag *alloc_frag = &current->task_frag;
- struct sk_buff *skb;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- unsigned int delta = 0;
char *buf;
size_t copied;
- int err, pad = TUN_RX_PAD;
+ int pad = TUN_RX_PAD;
+ int err = 0;
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog)
- pad += TUN_HEADROOM;
+ pad += XDP_PACKET_HEADROOM;
buflen += SKB_DATA_ALIGN(len + pad);
rcu_read_unlock();
@@ -1654,17 +1669,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
* of xdp_prog above, this should be rare and for simplicity
* we do XDP on skb in case the headroom is not enough.
*/
- if (hdr->gso_type || !xdp_prog)
+ if (hdr->gso_type || !xdp_prog) {
*skb_xdp = 1;
- else
- *skb_xdp = 0;
+ return __tun_build_skb(alloc_frag, buf, buflen, len, pad);
+ }
+
+ *skb_xdp = 0;
local_bh_disable();
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
- if (xdp_prog && !*skb_xdp) {
+ if (xdp_prog) {
struct xdp_buff xdp;
- void *orig_data;
u32 act;
xdp.data_hard_start = buf;
@@ -1672,66 +1688,33 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &tfile->xdp_rxq;
- orig_data = xdp.data;
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
- switch (act) {
- case XDP_REDIRECT:
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
- err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
- xdp_do_flush_map();
- if (err)
- goto err_redirect;
- rcu_read_unlock();
- local_bh_enable();
- return NULL;
- case XDP_TX:
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ if (act == XDP_REDIRECT || act == XDP_TX) {
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
- if (tun_xdp_tx(tun->dev, &xdp) < 0)
- goto err_redirect;
- rcu_read_unlock();
- local_bh_enable();
- return NULL;
- case XDP_PASS:
- delta = orig_data - xdp.data;
- len = xdp.data_end - xdp.data;
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- /* fall through */
- case XDP_ABORTED:
- trace_xdp_exception(tun->dev, xdp_prog, act);
- /* fall through */
- case XDP_DROP:
- goto err_xdp;
}
- }
+ err = tun_xdp_act(tun, xdp_prog, &xdp, act);
+ if (err < 0)
+ goto err_xdp;
+ if (err == XDP_REDIRECT)
+ xdp_do_flush_map();
+ if (err != XDP_PASS)
+ goto out;
- skb = build_skb(buf, buflen);
- if (!skb) {
- rcu_read_unlock();
- local_bh_enable();
- return ERR_PTR(-ENOMEM);
+ pad = xdp.data - xdp.data_hard_start;
+ len = xdp.data_end - xdp.data;
}
-
- skb_reserve(skb, pad - delta);
- skb_put(skb, len);
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
-
rcu_read_unlock();
local_bh_enable();
- return skb;
+ return __tun_build_skb(alloc_frag, buf, buflen, len, pad);
-err_redirect:
- put_page(alloc_frag->page);
err_xdp:
+ put_page(alloc_frag->page);
+out:
rcu_read_unlock();
local_bh_enable();
- this_cpu_inc(tun->pcpu_stats->rx_dropped);
return NULL;
}
@@ -1752,7 +1735,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
int err;
u32 rxhash = 0;
int skb_xdp = 1;
- bool frags = tun_napi_frags_enabled(tun);
+ bool frags = tun_napi_frags_enabled(tfile);
if (!(tun->dev->flags & IFF_UP))
return -EIO;
@@ -2306,6 +2289,8 @@ static void tun_setup(struct net_device *dev)
static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
+ if (!data)
+ return 0;
return -EINVAL;
}
@@ -2392,18 +2377,133 @@ static void tun_sock_write_space(struct sock *sk)
kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
}
+static int tun_xdp_one(struct tun_struct *tun,
+ struct tun_file *tfile,
+ struct xdp_buff *xdp, int *flush)
+{
+ struct tun_xdp_hdr *hdr = xdp->data_hard_start;
+ struct virtio_net_hdr *gso = &hdr->gso;
+ struct tun_pcpu_stats *stats;
+ struct bpf_prog *xdp_prog;
+ struct sk_buff *skb = NULL;
+ u32 rxhash = 0, act;
+ int buflen = hdr->buflen;
+ int err = 0;
+ bool skb_xdp = false;
+
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog) {
+ if (gso->gso_type) {
+ skb_xdp = true;
+ goto build;
+ }
+ xdp_set_data_meta_invalid(xdp);
+ xdp->rxq = &tfile->xdp_rxq;
+
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
+ err = tun_xdp_act(tun, xdp_prog, xdp, act);
+ if (err < 0) {
+ put_page(virt_to_head_page(xdp->data));
+ return err;
+ }
+
+ switch (err) {
+ case XDP_REDIRECT:
+ *flush = true;
+ /* fall through */
+ case XDP_TX:
+ return 0;
+ case XDP_PASS:
+ break;
+ default:
+ put_page(virt_to_head_page(xdp->data));
+ return 0;
+ }
+ }
+
+build:
+ skb = build_skb(xdp->data_hard_start, buflen);
+ if (!skb) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ skb_put(skb, xdp->data_end - xdp->data);
+
+ if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
+ this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
+ kfree_skb(skb);
+ err = -EINVAL;
+ goto out;
+ }
+
+ skb->protocol = eth_type_trans(skb, tun->dev);
+ skb_reset_network_header(skb);
+ skb_probe_transport_header(skb, 0);
+
+ if (skb_xdp) {
+ err = do_xdp_generic(xdp_prog, skb);
+ if (err != XDP_PASS)
+ goto out;
+ }
+
+ if (!rcu_dereference(tun->steering_prog))
+ rxhash = __skb_get_hash_symmetric(skb);
+
+ netif_receive_skb(skb);
+
+ stats = get_cpu_ptr(tun->pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ u64_stats_update_end(&stats->syncp);
+ put_cpu_ptr(stats);
+
+ if (rxhash)
+ tun_flow_update(tun, rxhash, tfile);
+
+out:
+ return err;
+}
+
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
- int ret;
+ int ret, i;
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = tun_get(tfile);
+ struct tun_msg_ctl *ctl = m->msg_control;
+ struct xdp_buff *xdp;
if (!tun)
return -EBADFD;
- ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
+ if (ctl && (ctl->type == TUN_MSG_PTR)) {
+ int n = ctl->num;
+ int flush = 0;
+
+ local_bh_disable();
+ rcu_read_lock();
+
+ for (i = 0; i < n; i++) {
+ xdp = &((struct xdp_buff *)ctl->ptr)[i];
+ tun_xdp_one(tun, tfile, xdp, &flush);
+ }
+
+ if (flush)
+ xdp_do_flush_map();
+
+ rcu_read_unlock();
+ local_bh_enable();
+
+ ret = total_len;
+ goto out;
+ }
+
+ ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
m->msg_flags & MSG_DONTWAIT,
m->msg_flags & MSG_MORE);
+out:
tun_put(tun);
return ret;
}
@@ -2577,7 +2677,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
return err;
err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
- ifr->ifr_flags & IFF_NAPI);
+ ifr->ifr_flags & IFF_NAPI,
+ ifr->ifr_flags & IFF_NAPI_FRAGS);
if (err < 0)
return err;
@@ -2675,7 +2776,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
(ifr->ifr_flags & TUN_FEATURES);
INIT_LIST_HEAD(&tun->disabled);
- err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI);
+ err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
+ ifr->ifr_flags & IFF_NAPI_FRAGS);
if (err < 0)
goto err_free_flow;
@@ -2824,7 +2926,8 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
ret = security_tun_dev_attach_queue(tun->security);
if (ret < 0)
goto unlock;
- ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI);
+ ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
+ tun->flags & IFF_NAPI_FRAGS);
} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
tun = rtnl_dereference(tfile->tun);
if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
@@ -3242,6 +3345,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
return -ENOMEM;
}
+ mutex_init(&tfile->napi_mutex);
RCU_INIT_POINTER(tfile->tun, NULL);
tfile->flags = 0;
tfile->ifindex = 0;