From d5a9e24afb4ab38110ebb777588ea0bd0eacbd0a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Jan 2009 16:22:11 -0800 Subject: net: Allow RX queue selection to seed TX queue hashing. The idea is that drivers which implement multiqueue RX pre-seed the SKB by recording the RX queue selected by the hardware. If such a seed is found on TX, we'll use that to select the outgoing TX queue. This helps get more consistent load balancing on router and firewall loads. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cf2cb50f77d1..a2c2378a9c58 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1904,6 +1904,21 @@ static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_bu to->queue_mapping = from->queue_mapping; } +static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) +{ + skb->queue_mapping = rx_queue + 1; +} + +static inline u16 skb_get_rx_queue(struct sk_buff *skb) +{ + return skb->queue_mapping - 1; +} + +static inline bool skb_rx_queue_recorded(struct sk_buff *skb) +{ + return (skb->queue_mapping != 0); +} + #ifdef CONFIG_XFRM static inline struct sec_path *skb_sec_path(struct sk_buff *skb) { -- cgit v1.2.3 From 86911732d3996a9da07914b280621450111bb6da Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Jan 2009 14:19:50 +0000 Subject: gro: Avoid copying headers of unmerged packets Unfortunately simplicity isn't always the best. The fraginfo interface turned out to be suboptimal. The problem was quite obvious. For every packet, we have to copy the headers from the frags structure into skb->head, even though for 99% of the packets this part is immediately thrown away after the merge. LRO didn't have this problem because it directly read the headers from the frags structure. This patch attempts to address this by creating an interface that allows GRO to access the headers in the first frag without having to copy it. Because all drivers that use frags place the headers in the first frag this optimisation should be enough. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 26 ++++++++++++++++++ include/linux/skbuff.h | 2 -- net/8021q/vlan_core.c | 2 ++ net/core/dev.c | 70 +++++++++++++++++++++++++++++++++++++++-------- net/core/skbuff.c | 23 ++++++++++------ net/ipv4/af_inet.c | 10 +++---- net/ipv4/tcp.c | 16 +++++------ net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/af_inet6.c | 30 +++++++++++++------- net/ipv6/tcp_ipv6.c | 2 +- 10 files changed, 137 insertions(+), 46 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 20419508eec1..7a5057fbb7cd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -984,6 +984,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, void netif_napi_del(struct napi_struct *napi); struct napi_gro_cb { + /* This indicates where we are processing relative to skb->data. */ + int data_offset; + /* This is non-zero if the packet may be of the same flow. 
*/ int same_flow; @@ -1087,6 +1090,29 @@ extern int dev_restart(struct net_device *dev); #ifdef CONFIG_NETPOLL_TRAP extern int netpoll_trap(void); #endif +extern void *skb_gro_header(struct sk_buff *skb, unsigned int hlen); +extern int skb_gro_receive(struct sk_buff **head, + struct sk_buff *skb); + +static inline unsigned int skb_gro_offset(const struct sk_buff *skb) +{ + return NAPI_GRO_CB(skb)->data_offset; +} + +static inline unsigned int skb_gro_len(const struct sk_buff *skb) +{ + return skb->len - NAPI_GRO_CB(skb)->data_offset; +} + +static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) +{ + NAPI_GRO_CB(skb)->data_offset += len; +} + +static inline void skb_gro_reset_offset(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->data_offset = 0; +} static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a2c2378a9c58..08670d017479 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1687,8 +1687,6 @@ extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); extern struct sk_buff *skb_segment(struct sk_buff *skb, int features); -extern int skb_gro_receive(struct sk_buff **head, - struct sk_buff *skb); static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 2eb057a74654..378fa69d625a 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -98,6 +98,8 @@ drop: int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct sk_buff *skb) { + skb_gro_reset_offset(skb); + return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); } EXPORT_SYMBOL(vlan_gro_receive); diff --git a/net/core/dev.c b/net/core/dev.c index cd23ae15a1d5..df406dcf7482 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -215,6 +215,13 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; } +static inline void *skb_gro_mac_header(struct sk_buff *skb) +{ + return skb_headlen(skb) ? skb_mac_header(skb) : + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset; +} + /* Device list insertion */ static int list_netdevice(struct net_device *dev) { @@ -2350,7 +2357,6 @@ static int napi_gro_complete(struct sk_buff *skb) out: skb_shinfo(skb)->gso_size = 0; - __skb_push(skb, -skb_network_offset(skb)); return netif_receive_skb(skb); } @@ -2368,6 +2374,25 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); +void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) +{ + unsigned int offset = skb_gro_offset(skb); + + hlen += offset; + if (hlen <= skb_headlen(skb)) + return skb->data + offset; + + if (unlikely(!skb_shinfo(skb)->nr_frags || + skb_shinfo(skb)->frags[0].size <= + hlen - skb_headlen(skb) || + PageHighMem(skb_shinfo(skb)->frags[0].page))) + return pskb_may_pull(skb, hlen) ? 
skb->data + offset : NULL; + + return page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset + offset; +} +EXPORT_SYMBOL(skb_gro_header); + int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; @@ -2388,11 +2413,13 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { struct sk_buff *p; + void *mac; if (ptype->type != type || ptype->dev || !ptype->gro_receive) continue; - skb_reset_network_header(skb); + skb_set_network_header(skb, skb_gro_offset(skb)); + mac = skb_gro_mac_header(skb); mac_len = skb->network_header - skb->mac_header; skb->mac_len = mac_len; NAPI_GRO_CB(skb)->same_flow = 0; @@ -2406,8 +2433,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) continue; if (p->mac_len != mac_len || - memcmp(skb_mac_header(p), skb_mac_header(skb), - mac_len)) + memcmp(skb_mac_header(p), mac, mac_len)) NAPI_GRO_CB(p)->same_flow = 0; } @@ -2434,13 +2460,11 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (same_flow) goto ok; - if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) { - __skb_push(skb, -skb_network_offset(skb)); + if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) goto normal; - } NAPI_GRO_CB(skb)->count = 1; - skb_shinfo(skb)->gso_size = skb->len; + skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; ret = GRO_HELD; @@ -2488,6 +2512,8 @@ EXPORT_SYMBOL(napi_skb_finish); int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + skb_gro_reset_offset(skb); + return napi_skb_finish(__napi_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); @@ -2506,6 +2532,7 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, { struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; + struct ethhdr *eth; napi->skb = NULL; @@ -2525,13 +2552,23 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, skb->len += info->len; skb->truesize += info->len; - if (!pskb_may_pull(skb, ETH_HLEN)) { + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + eth = skb_gro_header(skb, sizeof(*eth)); + if (!eth) { napi_reuse_skb(napi, skb); skb = NULL; goto out; } - skb->protocol = eth_type_trans(skb, dev); + skb_gro_pull(skb, sizeof(*eth)); + + /* + * This works because the only protocols we care about don't require + * special handling. We'll fix it up properly at the end. 
+ */ + skb->protocol = eth->h_proto; skb->ip_summed = info->ip_summed; skb->csum = info->csum; @@ -2544,10 +2581,21 @@ EXPORT_SYMBOL(napi_fraginfo_skb); int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { int err = NET_RX_SUCCESS; + int may; switch (ret) { case GRO_NORMAL: - return netif_receive_skb(skb); + case GRO_HELD: + may = pskb_may_pull(skb, skb_gro_offset(skb)); + BUG_ON(!may); + + skb->protocol = eth_type_trans(skb, napi->dev); + + if (ret == GRO_NORMAL) + return netif_receive_skb(skb); + + skb_gro_pull(skb, -ETH_HLEN); + break; case GRO_DROP: err = NET_RX_DROP; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2e5f2ca3bdcd..f9f4065a7e9b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2584,17 +2584,21 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct sk_buff *p = *head; struct sk_buff *nskb; unsigned int headroom; - unsigned int hlen = p->data - skb_mac_header(p); - unsigned int len = skb->len; + unsigned int len = skb_gro_len(skb); - if (hlen + p->len + len >= 65536) + if (p->len + len >= 65536) return -E2BIG; if (skb_shinfo(p)->frag_list) goto merge; - else if (!skb_headlen(p) && !skb_headlen(skb) && - skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags < + else if (skb_headlen(skb) <= skb_gro_offset(skb) && + skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags <= MAX_SKB_FRAGS) { + skb_shinfo(skb)->frags[0].page_offset += + skb_gro_offset(skb) - skb_headlen(skb); + skb_shinfo(skb)->frags[0].size -= + skb_gro_offset(skb) - skb_headlen(skb); + memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags, skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); @@ -2611,7 +2615,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } headroom = skb_headroom(p); - nskb = netdev_alloc_skb(p->dev, headroom); + nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); if (unlikely(!nskb)) return -ENOMEM; @@ -2619,12 +2623,15 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) nskb->mac_len = p->mac_len; skb_reserve(nskb, headroom); + __skb_put(nskb, skb_gro_offset(p)); - skb_set_mac_header(nskb, -hlen); + skb_set_mac_header(nskb, skb_mac_header(p) - p->data); skb_set_network_header(nskb, skb_network_offset(p)); skb_set_transport_header(nskb, skb_transport_offset(p)); - memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen); + __skb_pull(p, skb_gro_offset(p)); + memcpy(skb_mac_header(nskb), skb_mac_header(p), + p->data - skb_mac_header(p)); *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 743f5542d65a..d6770f295d5b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1253,10 +1253,10 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, int proto; int id; - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + iph = skb_gro_header(skb, sizeof(*iph)); + if (unlikely(!iph)) goto out; - iph = ip_hdr(skb); proto = iph->protocol & (MAX_INET_PROTOS - 1); rcu_read_lock(); @@ -1270,7 +1270,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto out_unlock; - flush = ntohs(iph->tot_len) != skb->len || + flush = ntohs(iph->tot_len) != skb_gro_len(skb) || iph->frag_off != htons(IP_DF); id = ntohs(iph->id); @@ -1298,8 +1298,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, } NAPI_GRO_CB(skb)->flush |= flush; - __skb_pull(skb, sizeof(*iph)); - skb_reset_transport_header(skb); + skb_gro_pull(skb, 
sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); pp = ops->gro_receive(head, skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0cd71b84e483..1cd608253940 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2481,19 +2481,19 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) unsigned int mss = 1; int flush = 1; - if (!pskb_may_pull(skb, sizeof(*th))) + th = skb_gro_header(skb, sizeof(*th)); + if (unlikely(!th)) goto out; - th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; - if (!pskb_may_pull(skb, thlen)) + th = skb_gro_header(skb, thlen); + if (unlikely(!th)) goto out; - th = tcp_hdr(skb); - __skb_pull(skb, thlen); + skb_gro_pull(skb, thlen); flags = tcp_flag_word(th); @@ -2521,10 +2521,10 @@ found: flush |= th->ack_seq != th2->ack_seq || th->window != th2->window; flush |= memcmp(th + 1, th2 + 1, thlen - sizeof(*th)); - total = p->len; + total = skb_gro_len(p); mss = skb_shinfo(p)->gso_size; - flush |= skb->len > mss || skb->len <= 0; + flush |= skb_gro_len(skb) > mss || !skb_gro_len(skb); flush |= ntohl(th2->seq) + total != ntohl(th->seq); if (flush || skb_gro_receive(head, skb)) { @@ -2537,7 +2537,7 @@ found: tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: - flush = skb->len < mss; + flush = skb_gro_len(skb) < mss; flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_FIN); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 19d7b429a262..f6b962f56ab4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2355,7 +2355,7 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) switch (skb->ip_summed) { case CHECKSUM_COMPLETE: - if (!tcp_v4_check(skb->len, iph->saddr, iph->daddr, + if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c802bc1658a8..bd91eadcbe3f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -799,24 +799,34 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, int proto; __wsum csum; - if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) + iph = skb_gro_header(skb, sizeof(*iph)); + if (unlikely(!iph)) goto out; - iph = ipv6_hdr(skb); - __skb_pull(skb, sizeof(*iph)); + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); - flush += ntohs(iph->payload_len) != skb->len; + flush += ntohs(iph->payload_len) != skb_gro_len(skb); rcu_read_lock(); - proto = ipv6_gso_pull_exthdrs(skb, iph->nexthdr); - iph = ipv6_hdr(skb); - IPV6_GRO_CB(skb)->proto = proto; + proto = iph->nexthdr; ops = rcu_dereference(inet6_protos[proto]); - if (!ops || !ops->gro_receive) - goto out_unlock; + if (!ops || !ops->gro_receive) { + __pskb_pull(skb, skb_gro_offset(skb)); + proto = ipv6_gso_pull_exthdrs(skb, proto); + skb_gro_pull(skb, -skb_transport_offset(skb)); + skb_reset_transport_header(skb); + __skb_push(skb, skb_gro_offset(skb)); + + if (!ops || !ops->gro_receive) + goto out_unlock; + + iph = ipv6_hdr(skb); + } + + IPV6_GRO_CB(skb)->proto = proto; flush--; - skb_reset_transport_header(skb); nlen = skb_network_header_len(skb); for (p = *head; p; p = p->next) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e5b85d45bee8..00f1269e11e9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -948,7 +948,7 @@ struct sk_buff **tcp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) switch (skb->ip_summed) { case 
CHECKSUM_COMPLETE: - if (!tcp_v6_check(skb->len, &iph->saddr, &iph->daddr, + if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; -- cgit v1.2.3 From d6301d3dd1c287b32132dda15272a50c11e92a14 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 8 Feb 2009 19:24:13 -0800 Subject: net: Increase default NET_SKB_PAD to 32. Several devices need to insert some "pre headers" in front of the main packet data when they transmit a packet. Currently we allocate only 16 bytes of pad room and this ends up not being enough for some types of hardware (NIU, usb-net, s390 qeth, etc.) So increase this to 32. Note that drivers still need to check in their transmit routine whether enough headroom exists, and if not use skb_realloc_headroom(). Tunneling, IPSEC, and other encapsulation methods can cause the padding area to be used up. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 08670d017479..5eba4007e07f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1287,7 +1287,7 @@ static inline int skb_network_offset(const struct sk_buff *skb) * The networking layer reserves some headroom in skb data (via * dev_alloc_skb). This is used to avoid having to reallocate skb data when * the header has to grow. In the default case, if the header has to grow - * 16 bytes or less we avoid the reallocation. + * 32 bytes or less we avoid the reallocation. * * Unfortunately this headroom changes the DMA alignment of the resulting * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive @@ -1295,11 +1295,11 @@ static inline int skb_network_offset(const struct sk_buff *skb) * perhaps setting it to a cacheline in size (since that will maintain * cacheline alignment of the DMA). It must be a power of 2. * - * Various parts of the networking layer expect at least 16 bytes of + * Various parts of the networking layer expect at least 32 bytes of * headroom, you should not reduce this. */ #ifndef NET_SKB_PAD -#define NET_SKB_PAD 16 +#define NET_SKB_PAD 32 #endif extern int ___pskb_trim(struct sk_buff *skb, unsigned int len); -- cgit v1.2.3 From d54e6d872767ae6512978f86a35d623a8ed948c5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 9 Feb 2009 23:45:29 -0800 Subject: net: Kill skbuff macros from the stone ages. This kills of HAVE_ALLOC_SKB and HAVE_ALIGNABLE_SKB. Nothing in-tree uses them and nothing in-tree has used them since 2.0.x times. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5eba4007e07f..924700844580 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -29,9 +29,6 @@ #include #include -#define HAVE_ALLOC_SKB /* For the drivers to know */ -#define HAVE_ALIGNABLE_SKB /* Ditto 8) */ - /* Don't change this without changing skb_csum_unnecessary! */ #define CHECKSUM_NONE 0 #define CHECKSUM_UNNECESSARY 1 -- cgit v1.2.3 From ac45f602ee3d1b6f326f68bc0c2591ceebf05ba4 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 12 Feb 2009 05:03:37 +0000 Subject: net: infrastructure for hardware time stamping The additional per-packet information (16 bytes for time stamps, 1 byte for flags) is stored for all packets in the skb_shared_info struct. 
This implementation detail is hidden from users of that information via skb_* accessor functions. A separate struct resp. union is used for the additional information so that it can be stored/copied easily outside of skb_shared_info. Compared to previous implementations (reusing the tstamp field depending on the context, optional additional structures) this is the simplest solution. It does not extend sk_buff itself. TX time stamping is implemented in software if the device driver doesn't support hardware time stamping. The new semantic for hardware/software time stamping around ndo_start_xmit() is based on two assumptions about existing network device drivers which don't support hardware time stamping and know nothing about it: - they leave the new skb_shared_tx unmodified - the keep the connection to the originating socket in skb->sk alive, i.e., don't call skb_orphan() Given that skb_shared_tx is new, the first assumption is safe. The second is only true for some drivers. As a result, software TX time stamping currently works with the bnx2 driver, but not with the unmodified igb driver (the two drivers this patch series was tested with). Signed-off-by: Patrick Ohly Signed-off-by: David S. Miller --- include/linux/skbuff.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++++- net/core/dev.c | 32 ++++++++++++++++-- net/core/skbuff.c | 41 +++++++++++++++++++++++ 3 files changed, 161 insertions(+), 3 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 924700844580..f96bc91bf0a3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -132,6 +132,57 @@ struct skb_frag_struct { __u32 size; }; +#define HAVE_HW_TIME_STAMP + +/** + * skb_shared_hwtstamps - hardware time stamps + * + * @hwtstamp: hardware time stamp transformed into duration + * since arbitrary point in time + * @syststamp: hwtstamp transformed to system time base + * + * Software time stamps generated by ktime_get_real() are stored in + * skb->tstamp. The relation between the different kinds of time + * stamps is as follows: + * + * syststamp and tstamp can be compared against each other in + * arbitrary combinations. The accuracy of a + * syststamp/tstamp/"syststamp from other device" comparison is + * limited by the accuracy of the transformation into system time + * base. This depends on the device driver and its underlying + * hardware. + * + * hwtstamps can only be compared against other hwtstamps from + * the same device. + * + * This structure is attached to packets as part of the + * &skb_shared_info. Use skb_hwtstamps() to get a pointer. + */ +struct skb_shared_hwtstamps { + ktime_t hwtstamp; + ktime_t syststamp; +}; + +/** + * skb_shared_tx - instructions for time stamping of outgoing packets + * + * @hardware: generate hardware time stamp + * @software: generate software time stamp + * @in_progress: device driver is going to provide + * hardware time stamp + * + * These flags are attached to packets as part of the + * &skb_shared_info. Use skb_tx() to get a pointer. + */ +union skb_shared_tx { + struct { + __u8 hardware:1, + software:1, + in_progress:1; + }; + __u8 flags; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. 
*/ @@ -143,10 +194,12 @@ struct skb_shared_info { unsigned short gso_segs; unsigned short gso_type; __be32 ip6_frag_id; + union skb_shared_tx tx_flags; #ifdef CONFIG_HAS_DMA unsigned int num_dma_maps; #endif struct sk_buff *frag_list; + struct skb_shared_hwtstamps hwtstamps; skb_frag_t frags[MAX_SKB_FRAGS]; #ifdef CONFIG_HAS_DMA dma_addr_t dma_maps[MAX_SKB_FRAGS + 1]; @@ -465,6 +518,16 @@ static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) +static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) +{ + return &skb_shinfo(skb)->hwtstamps; +} + +static inline union skb_shared_tx *skb_tx(struct sk_buff *skb) +{ + return &skb_shinfo(skb)->tx_flags; +} + /** * skb_queue_empty - check if a queue is empty * @list: queue head @@ -1730,6 +1793,11 @@ static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, extern void skb_init(void); +static inline ktime_t skb_get_ktime(const struct sk_buff *skb) +{ + return skb->tstamp; +} + /** * skb_get_timestamp - get timestamp from a skb * @skb: skb to get stamp from @@ -1739,11 +1807,18 @@ extern void skb_init(void); * This function converts the offset back to a struct timeval and stores * it in stamp. */ -static inline void skb_get_timestamp(const struct sk_buff *skb, struct timeval *stamp) +static inline void skb_get_timestamp(const struct sk_buff *skb, + struct timeval *stamp) { *stamp = ktime_to_timeval(skb->tstamp); } +static inline void skb_get_timestampns(const struct sk_buff *skb, + struct timespec *stamp) +{ + *stamp = ktime_to_timespec(skb->tstamp); +} + static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); @@ -1759,6 +1834,20 @@ static inline ktime_t net_invalid_timestamp(void) return ktime_set(0, 0); } +/** + * skb_tstamp_tx - queue clone of skb with send time stamps + * @orig_skb: the original outgoing packet + * @hwtstamps: hardware time stamps, may be NULL if not available + * + * If the skb has a socket associated, then this function clones the + * skb (thus sharing the actual data and optional structures), stores + * the optional hardware time stamping information (if non NULL) or + * generates a software time stamp (otherwise), then queues the clone + * to the error queue of the socket. Errors are silently ignored. 
+ */ +extern void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps); + extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); extern __sum16 __skb_checksum_complete(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1e27a67df242..d20c28e839d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1672,10 +1672,21 @@ static int dev_gso_segment(struct sk_buff *skb) return 0; } +static void tstamp_tx(struct sk_buff *skb) +{ + union skb_shared_tx *shtx = + skb_tx(skb); + if (unlikely(shtx->software && + !shtx->in_progress)) { + skb_tstamp_tx(skb, NULL); + } +} + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; + int rc; prefetch(&dev->netdev_ops->ndo_start_xmit); if (likely(!skb->next)) { @@ -1689,13 +1700,29 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } - return ops->ndo_start_xmit(skb, dev); + rc = ops->ndo_start_xmit(skb, dev); + /* + * TODO: if skb_orphan() was called by + * dev->hard_start_xmit() (for example, the unmodified + * igb driver does that; bnx2 doesn't), then + * skb_tx_software_timestamp() will be unable to send + * back the time stamp. + * + * How can this be prevented? Always create another + * reference to the socket before calling + * dev->hard_start_xmit()? Prevent that skb_orphan() + * does anything in dev->hard_start_xmit() by clearing + * the skb destructor before the call and restoring it + * afterwards, then doing the skb_orphan() ourselves? + */ + if (likely(!rc)) + tstamp_tx(skb); + return rc; } gso: do { struct sk_buff *nskb = skb->next; - int rc; skb->next = nskb->next; nskb->next = NULL; @@ -1705,6 +1732,7 @@ gso: skb->next = nskb; return rc; } + tstamp_tx(skb); if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ab7d2e9f02fa..e5a8351ff12d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -215,7 +216,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo->gso_segs = 0; shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; + shinfo->tx_flags.flags = 0; shinfo->frag_list = NULL; + memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); if (fclone) { struct sk_buff *child = skb + 1; @@ -2945,6 +2948,44 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) } EXPORT_SYMBOL_GPL(skb_cow_data); +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = orig_skb->sk; + struct sock_exterr_skb *serr; + struct sk_buff *skb; + int err; + + if (!sk) + return; + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + if (hwtstamps) { + *skb_hwtstamps(skb) = + *hwtstamps; + } else { + /* + * no hardware time stamps available, + * so keep the skb_shared_tx and only + * store software time stamp + */ + skb->tstamp = ktime_get_real(); + } + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + err = sock_queue_err_skb(sk, skb); + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_tstamp_tx); + + /** * skb_partial_csum_set - set up and verify partial csum values for packet * @skb: the skb to set -- cgit v1.2.3 From d3a21be86c178964167aa54c39a01260d33e7509 Mon Sep 17 00:00:00 2001 From: Randy Dunlap 
Date: Mon, 2 Mar 2009 03:15:58 -0800 Subject: skbuff.h: fix timestamps kernel-doc Fix skbuff.h kernel-doc for timestamps: must include "struct" keyword, otherwise there are kernel-doc errors: Error(linux-next-20090227//include/linux/skbuff.h:161): cannot understand prototype: 'struct skb_shared_hwtstamps ' Error(linux-next-20090227//include/linux/skbuff.h:177): cannot understand prototype: 'union skb_shared_tx ' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 61ce97a8b868..1f659e8c2b88 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -135,8 +135,7 @@ struct skb_frag_struct { #define HAVE_HW_TIME_STAMP /** - * skb_shared_hwtstamps - hardware time stamps - * + * struct skb_shared_hwtstamps - hardware time stamps * @hwtstamp: hardware time stamp transformed into duration * since arbitrary point in time * @syststamp: hwtstamp transformed to system time base @@ -164,8 +163,7 @@ struct skb_shared_hwtstamps { }; /** - * skb_shared_tx - instructions for time stamping of outgoing packets - * + * struct skb_shared_tx - instructions for time stamping of outgoing packets * @hardware: generate hardware time stamp * @software: generate software time stamp * @in_progress: device driver is going to provide -- cgit v1.2.3 From ead2ceb0ec9f85cff19c43b5cdb2f8a054484431 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 11 Mar 2009 09:49:55 +0000 Subject: Network Drop Monitor: Adding kfree_skb_clean for non-drops and modifying end-of-line points for skbs Signed-off-by: Neil Horman include/linux/skbuff.h | 4 +++- net/core/datagram.c | 2 +- net/core/skbuff.c | 22 ++++++++++++++++++++++ net/ipv4/arp.c | 2 +- net/ipv4/udp.c | 2 +- net/packet/af_packet.c | 2 +- 6 files changed, 29 insertions(+), 5 deletions(-) Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 4 +++- net/core/datagram.c | 2 +- net/core/skbuff.c | 22 ++++++++++++++++++++++ net/ipv4/arp.c | 2 +- net/ipv4/udp.c | 2 +- net/packet/af_packet.c | 2 +- 6 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1f659e8c2b88..1fbab2ae613c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -421,6 +421,7 @@ extern void skb_dma_unmap(struct device *dev, struct sk_buff *skb, #endif extern void kfree_skb(struct sk_buff *skb); +extern void consume_skb(struct sk_buff *skb); extern void __kfree_skb(struct sk_buff *skb); extern struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int fclone, int node); @@ -459,7 +460,8 @@ extern int skb_to_sgvec(struct sk_buff *skb, extern int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); extern int skb_pad(struct sk_buff *skb, int pad); -#define dev_kfree_skb(a) kfree_skb(a) +#define dev_kfree_skb(a) consume_skb(a) +#define dev_consume_skb(a) kfree_skb_clean(a) extern void skb_over_panic(struct sk_buff *skb, int len, void *here); extern void skb_under_panic(struct sk_buff *skb, int len, diff --git a/net/core/datagram.c b/net/core/datagram.c index 5e2ac0c4b07c..d0de644b378d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -208,7 +208,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { - kfree_skb(skb); + consume_skb(skb); sk_mem_reclaim_partial(sk); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e5e2111a397d..6acbf9e79eb1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -65,6 +65,7 @@ #include #include +#include #include "kmap_skb.h" @@ -442,10 +443,31 @@ void kfree_skb(struct sk_buff *skb) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + trace_kfree_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } EXPORT_SYMBOL(kfree_skb); +/** + * consume_skb - free an skbuff + * @skb: buffer to free + * + * Drop a ref to the buffer and free it if the usage count has hit zero + * Functions identically to kfree_skb, but kfree_skb assumes that the frame + * is being dropped after a failure and notes that + */ +void consume_skb(struct sk_buff *skb) +{ + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + __kfree_skb(skb); +} +EXPORT_SYMBOL(consume_skb); + /** * skb_recycle_check - check if skb can be reused for receive * @skb: buffer diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 3d67d1ffed77..9c220323f353 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -892,7 +892,7 @@ static int arp_process(struct sk_buff *skb) out: if (in_dev) in_dev_put(in_dev); - kfree_skb(skb); + consume_skb(skb); return 0; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4bd178a111d5..05b7abb99f69 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1184,7 +1184,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk = sknext; } while (sknext); } else - kfree_skb(skb); + consume_skb(skb); spin_unlock(&hslot->lock); return 0; } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d8cc006fac45..74776de523ec 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -584,7 +584,7 @@ drop_n_restore: skb->len = skb_len; } drop: - kfree_skb(skb); + consume_skb(skb); return 0; } -- cgit v1.2.3 From 
9247744e5eaa29aecee5342a0c8694187a6aadcd Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 21 Mar 2009 13:39:26 -0700 Subject: skb: expose and constify hash primitives Some minor changes to queue hashing: 1. Use const on accessor functions 2. Export skb_tx_hash for use in drivers (see ixgbe) Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 ++++++--- net/core/dev.c | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux/skbuff.h') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1fbab2ae613c..bb1981fd60f3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1969,7 +1969,7 @@ static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) skb->queue_mapping = queue_mapping; } -static inline u16 skb_get_queue_mapping(struct sk_buff *skb) +static inline u16 skb_get_queue_mapping(const struct sk_buff *skb) { return skb->queue_mapping; } @@ -1984,16 +1984,19 @@ static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) skb->queue_mapping = rx_queue + 1; } -static inline u16 skb_get_rx_queue(struct sk_buff *skb) +static inline u16 skb_get_rx_queue(const struct sk_buff *skb) { return skb->queue_mapping - 1; } -static inline bool skb_rx_queue_recorded(struct sk_buff *skb) +static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) { return (skb->queue_mapping != 0); } +extern u16 skb_tx_hash(const struct net_device *dev, + const struct sk_buff *skb); + #ifdef CONFIG_XFRM static inline struct sec_path *skb_sec_path(struct sk_buff *skb) { diff --git a/net/core/dev.c b/net/core/dev.c index ca212acd3348..fdb9973b82a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1725,7 +1725,7 @@ out_kfree_skb: static u32 skb_tx_hashrnd; -static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) +u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) { u32 hash; @@ -1740,6 +1740,7 @@ static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); } +EXPORT_SYMBOL(skb_tx_hash); static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) -- cgit v1.2.3
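Usage sketches for the interfaces added by the commits above follow. They are illustrative only: none of this code appears in the patches, and the mydrv_* names, structures, and constants are hypothetical.

First, how a multiqueue driver might combine the RX-queue seeding from the first commit with the skb_tx_hash() export from the last one, so that forwarded traffic stays on the queue the hardware picked on receive:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct mydrv_rx_ring {			/* hypothetical per-queue RX ring */
	struct net_device	*netdev;
	u16			queue_index;
};

/* RX completion path: record which hardware queue steered this packet. */
static void mydrv_receive(struct mydrv_rx_ring *ring, struct sk_buff *skb)
{
	skb_record_rx_queue(skb, ring->queue_index);
	skb->protocol = eth_type_trans(skb, ring->netdev);
	netif_receive_skb(skb);
}

/* TX queue selection: skb_tx_hash() prefers a recorded RX queue over the
 * socket/protocol hash, which is what gives routers and firewalls more
 * consistent load balancing. */
static u16 mydrv_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	return skb_tx_hash(dev, skb);
}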
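Next, the access pattern the GRO header commit enables. A gro_receive handler for a hypothetical protocol reads its header in place, whether it sits in the linear area or in frags[0], instead of pulling it into skb->head first:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct protohdr {			/* hypothetical protocol header */
	__be16	len;
	__be16	id;
};

static struct sk_buff **proto_gro_receive(struct sk_buff **head,
					  struct sk_buff *skb)
{
	struct protohdr *ph;

	/* Look at the header where it lives; no copy into skb->head. */
	ph = skb_gro_header(skb, sizeof(*ph));
	if (unlikely(!ph))
		goto flush;

	/* Lengths are now measured from the GRO cursor, not skb->data. */
	if (ntohs(ph->len) != skb_gro_len(skb))
		goto flush;

	/* Advance the cursor only; skb->data stays put until the packet is
	 * merged or handed up normally. */
	skb_gro_pull(skb, sizeof(*ph));

	/* A real handler would walk *head for a matching flow and call the
	 * next layer or skb_gro_receive(); elided here. */
	return NULL;

flush:
	NAPI_GRO_CB(skb)->flush = 1;
	return NULL;
}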
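The NET_SKB_PAD commit reminds driver authors that the larger default reserve does not remove the need to verify headroom on transmit. A hedged sketch of that check, with a hypothetical 28-byte hardware pre-header:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

#define MYDRV_TX_PAD	28		/* hypothetical pre-header length */

static int mydrv_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* NET_SKB_PAD (now 32) usually covers this, but tunnelling or IPsec
	 * may already have used up the reserve. */
	if (unlikely(skb_headroom(skb) < MYDRV_TX_PAD)) {
		struct sk_buff *nskb = skb_realloc_headroom(skb, MYDRV_TX_PAD);

		if (!nskb) {
			dev_kfree_skb(skb);
			dev->stats.tx_dropped++;
			return NETDEV_TX_OK;
		}
		dev_kfree_skb(skb);		/* realloc made a copy */
		skb = nskb;
	}

	skb_push(skb, MYDRV_TX_PAD);		/* build the pre-header here */
	/* ... fill the descriptor and kick the hardware ... */
	return NETDEV_TX_OK;
}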
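For the hardware time stamping infrastructure, a sketch of the driver-side contract the commit message describes. How the NIC actually captures and reports the stamp is omitted, and the mydrv_* functions are hypothetical:

#include <linux/ktime.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int mydrv_hw_xmit(struct sk_buff *skb, struct net_device *dev)
{
	union skb_shared_tx *shtx = skb_tx(skb);

	if (unlikely(shtx->hardware))
		/* Claim the stamp so the stack skips its software fallback. */
		shtx->in_progress = 1;

	/* Do not call skb_orphan() here, or skb_tstamp_tx() will find no
	 * socket to deliver the stamp to (the igb case noted above). */

	/* ... queue the frame to hardware ... */
	return NETDEV_TX_OK;
}

/* TX completion path, after the hardware has reported the stamp. */
static void mydrv_tx_hwtstamp(struct sk_buff *skb, ktime_t stamp)
{
	struct skb_shared_hwtstamps hwts = {
		.hwtstamp = stamp,
	};

	/* Clones the skb and queues the clone, stamp attached, on the
	 * originating socket's error queue. */
	skb_tstamp_tx(skb, &hwts);
}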
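Finally, the drop-monitor commit splits packet freeing into two paths: kfree_skb() for genuine drops (now traced) and consume_skb() for normal end-of-life. A minimal sketch of choosing between them in a hypothetical handler:

#include <linux/errno.h>
#include <linux/if_ether.h>
#include <linux/skbuff.h>

static int mydrv_handle(struct sk_buff *skb)
{
	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) {
		/* A real drop: kfree_skb() now fires the kfree_skb
		 * tracepoint, so drop-monitor tooling can count it. */
		kfree_skb(skb);
		return -EINVAL;
	}

	/* ... packet handled successfully ... */

	/* Normal end of life, not a drop: invisible to drop monitoring. */
	consume_skb(skb);
	return 0;
}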