diff options
-rw-r--r-- | Documentation/networking/packet_mmap.txt | 21 | ||||
-rw-r--r-- | include/uapi/linux/if_packet.h | 1 | ||||
-rw-r--r-- | net/packet/af_packet.c | 91 | ||||
-rw-r--r-- | net/packet/internal.h | 1 |
4 files changed, 102 insertions, 12 deletions
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 8e48e3b14227..4288ffafba9f 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -953,6 +953,27 @@ int main(int argc, char **argp) } ------------------------------------------------------------------------------- ++ PACKET_QDISC_BYPASS +------------------------------------------------------------------------------- + +If there is a requirement to load the network with many packets in a similar +fashion as pktgen does, you might set the following option after socket +creation: + + int one = 1; + setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one)); + +This has the side-effect, that packets sent through PF_PACKET will bypass the +kernel's qdisc layer and are forcedly pushed to the driver directly. Meaning, +packet are not buffered, tc disciplines are ignored, increased loss can occur +and such packets are also not visible to other PF_PACKET sockets anymore. So, +you have been warned; generally, this can be useful for stress testing various +components of a system. + +On default, PACKET_QDISC_BYPASS is disabled and needs to be explicitly enabled +on PF_PACKET sockets. + +------------------------------------------------------------------------------- + PACKET_TIMESTAMP ------------------------------------------------------------------------------- diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index dbf06667394b..1e24aa701cbd 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -51,6 +51,7 @@ struct sockaddr_ll { #define PACKET_TIMESTAMP 17 #define PACKET_FANOUT 18 #define PACKET_TX_HAS_OFF 19 +#define PACKET_QDISC_BYPASS 20 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e4171dd98590..9d70f1349926 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -237,6 +237,48 @@ struct packet_skb_cb { static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); +static int packet_direct_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + const struct net_device_ops *ops = dev->netdev_ops; + netdev_features_t features; + struct netdev_queue *txq; + u16 queue_map; + int ret; + + if (unlikely(!netif_running(dev) || + !netif_carrier_ok(dev))) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + + features = netif_skb_features(skb); + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) { + kfree_skb(skb); + return NET_XMIT_DROP; + } + + queue_map = skb_get_queue_mapping(skb); + txq = netdev_get_tx_queue(dev, queue_map); + + __netif_tx_lock_bh(txq); + if (unlikely(netif_xmit_frozen_or_stopped(txq))) { + ret = NETDEV_TX_BUSY; + kfree_skb(skb); + goto out; + } + + ret = ops->ndo_start_xmit(skb, dev); + if (likely(dev_xmit_complete(ret))) + txq_trans_update(txq); + else + kfree_skb(skb); +out: + __netif_tx_unlock_bh(txq); + return ret; +} + static struct net_device *packet_cached_dev_get(struct packet_sock *po) { struct net_device *dev; @@ -261,6 +303,16 @@ static void packet_cached_dev_reset(struct packet_sock *po) RCU_INIT_POINTER(po->cached_dev, NULL); } +static bool packet_use_direct_xmit(const struct packet_sock *po) +{ + return po->xmit == packet_direct_xmit; +} + +static u16 packet_pick_tx_queue(struct net_device *dev) +{ + return (u16) smp_processor_id() % dev->real_num_tx_queues; +} + /* register_prot_hook must be invoked with the po->bind_lock held, * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). @@ -1994,9 +2046,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb_reserve(skb, hlen); skb_reset_network_header(skb); - skb_probe_transport_header(skb, 0); - if (po->tp_tx_has_off) { + if (!packet_use_direct_xmit(po)) + skb_probe_transport_header(skb, 0); + if (unlikely(po->tp_tx_has_off)) { int off_min, off_max, off; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); off_max = po->tx_ring.frame_size - tp_len; @@ -2166,12 +2219,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } } + skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); atomic_inc(&po->tx_ring.pending); status = TP_STATUS_SEND_REQUEST; - err = dev_queue_xmit(skb); + err = po->xmit(skb); if (unlikely(err > 0)) { err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == @@ -2230,8 +2284,7 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, return skb; } -static int packet_snd(struct socket *sock, - struct msghdr *msg, size_t len) +static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name; @@ -2376,6 +2429,7 @@ static int packet_snd(struct socket *sock, skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb_set_queue_mapping(skb, packet_pick_tx_queue(dev)); if (po->has_vnet_hdr) { if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { @@ -2396,16 +2450,12 @@ static int packet_snd(struct socket *sock, len += vnet_hdr_len; } - skb_probe_transport_header(skb, reserve); - + if (!packet_use_direct_xmit(po)) + skb_probe_transport_header(skb, reserve); if (unlikely(extra_len == 4)) skb->no_fcs = 1; - /* - * Now send it - */ - - err = dev_queue_xmit(skb); + err = po->xmit(skb); if (err > 0 && (err = net_xmit_errno(err)) != 0) goto out_unlock; @@ -2427,6 +2477,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); + if (po->tx_ring.pg_vec) return tpacket_snd(po, msg); else @@ -2641,6 +2692,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, po = pkt_sk(sk); sk->sk_family = PF_PACKET; po->num = proto; + po->xmit = dev_queue_xmit; packet_cached_dev_reset(po); @@ -3220,6 +3272,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv po->tp_tx_has_off = !!val; return 0; } + case PACKET_QDISC_BYPASS: + { + int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->xmit = val ? packet_direct_xmit : dev_queue_xmit; + return 0; + } default: return -ENOPROTOOPT; } @@ -3312,6 +3376,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, case PACKET_TX_HAS_OFF: val = po->tp_tx_has_off; break; + case PACKET_QDISC_BYPASS: + val = packet_use_direct_xmit(po); + break; default: return -ENOPROTOOPT; } diff --git a/net/packet/internal.h b/net/packet/internal.h index 1035fa2d909c..0a87d7b36c9e 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -114,6 +114,7 @@ struct packet_sock { unsigned int tp_tx_has_off:1; unsigned int tp_tstamp; struct net_device __rcu *cached_dev; + int (*xmit)(struct sk_buff *skb); struct packet_type prot_hook ____cacheline_aligned_in_smp; }; |