Diffstat (limited to 'net/core/dev.c')
-rw-r--r--   net/core/dev.c | 641
1 file changed, 387 insertions, 254 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 1ae654391442..0dd54a69dace 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -128,7 +128,10 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
 #include <linux/pci.h>
+#include <linux/inetdevice.h>
 
 #include "net-sysfs.h"
@@ -371,6 +374,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  *							--ANK (980803)
  */
 
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+	if (pt->type == htons(ETH_P_ALL))
+		return &ptype_all;
+	else
+		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+}
+
 /**
  *	dev_add_pack - add packet handler
  *	@pt: packet type declaration
@@ -386,16 +397,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 
 void dev_add_pack(struct packet_type *pt)
 {
-	int hash;
+	struct list_head *head = ptype_head(pt);
 
-	spin_lock_bh(&ptype_lock);
-	if (pt->type == htons(ETH_P_ALL))
-		list_add_rcu(&pt->list, &ptype_all);
-	else {
-		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
-		list_add_rcu(&pt->list, &ptype_base[hash]);
-	}
-	spin_unlock_bh(&ptype_lock);
+	spin_lock(&ptype_lock);
+	list_add_rcu(&pt->list, head);
+	spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(dev_add_pack);
 
@@ -414,15 +420,10 @@ EXPORT_SYMBOL(dev_add_pack);
  */
 void __dev_remove_pack(struct packet_type *pt)
 {
-	struct list_head *head;
+	struct list_head *head = ptype_head(pt);
 	struct packet_type *pt1;
 
-	spin_lock_bh(&ptype_lock);
-
-	if (pt->type == htons(ETH_P_ALL))
-		head = &ptype_all;
-	else
-		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+	spin_lock(&ptype_lock);
 
 	list_for_each_entry(pt1, head, list) {
 		if (pt == pt1) {
@@ -433,7 +434,7 @@ void __dev_remove_pack(struct packet_type *pt)
 
 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 out:
-	spin_unlock_bh(&ptype_lock);
+	spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(__dev_remove_pack);
 
@@ -1484,8 +1485,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 	skb_orphan(skb);
 	nf_reset(skb);
 
-	if (!(dev->flags & IFF_UP) ||
-	    (skb->len > (dev->mtu + dev->hard_header_len))) {
+	if (unlikely(!(dev->flags & IFF_UP) ||
+		     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
+		atomic_long_inc(&dev->rx_dropped);
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
@@ -1553,21 +1555,56 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
  */
-void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 {
-	unsigned int real_num = dev->real_num_tx_queues;
+	if (txq < 1 || txq > dev->num_tx_queues)
+		return -EINVAL;
 
-	if (unlikely(txq > dev->num_tx_queues))
-		;
-	else if (txq > real_num)
-		dev->real_num_tx_queues = txq;
-	else if (txq < real_num) {
-		dev->real_num_tx_queues = txq;
-		qdisc_reset_all_tx_gt(dev, txq);
+	if (dev->reg_state == NETREG_REGISTERED) {
+		ASSERT_RTNL();
+
+		if (txq < dev->real_num_tx_queues)
+			qdisc_reset_all_tx_gt(dev, txq);
 	}
+
+	dev->real_num_tx_queues = txq;
+	return 0;
 }
 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 
+#ifdef CONFIG_RPS
+/**
+ *	netif_set_real_num_rx_queues - set actual number of RX queues used
+ *	@dev: Network device
+ *	@rxq: Actual number of RX queues
+ *
+ *	This must be called either with the rtnl_lock held or before
+ *	registration of the net device.  Returns 0 on success, or a
+ *	negative error code.  If called before registration, it always
+ *	succeeds.
+ */
+int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
+{
+	int rc;
+
+	if (rxq < 1 || rxq > dev->num_rx_queues)
+		return -EINVAL;
+
+	if (dev->reg_state == NETREG_REGISTERED) {
+		ASSERT_RTNL();
+
+		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
+						  rxq);
+		if (rc)
+			return rc;
+	}
+
+	dev->real_num_rx_queues = rxq;
+	return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_rx_queues);
+#endif
+
 static inline void __netif_reschedule(struct Qdisc *q)
 {
 	struct softnet_data *sd;
@@ -1648,10 +1685,10 @@ EXPORT_SYMBOL(netif_device_attach);
 
 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
 {
-	return ((features & NETIF_F_GEN_CSUM) ||
-		((features & NETIF_F_IP_CSUM) &&
+	return ((features & NETIF_F_NO_CSUM) ||
+		((features & NETIF_F_V4_CSUM) &&
 		 protocol == htons(ETH_P_IP)) ||
-		((features & NETIF_F_IPV6_CSUM) &&
+		((features & NETIF_F_V6_CSUM) &&
 		 protocol == htons(ETH_P_IPV6)) ||
 		((features & NETIF_F_FCOE_CRC) &&
 		 protocol == htons(ETH_P_FCOE)));
@@ -1659,17 +1696,18 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
 
 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
 {
-	if (can_checksum_protocol(dev->features, skb->protocol))
-		return true;
+	__be16 protocol = skb->protocol;
+	int features = dev->features;
 
-	if (skb->protocol == htons(ETH_P_8021Q)) {
+	if (vlan_tx_tag_present(skb)) {
+		features &= dev->vlan_features;
+	} else if (protocol == htons(ETH_P_8021Q)) {
 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
-		if (can_checksum_protocol(dev->features & dev->vlan_features,
-					  veh->h_vlan_encapsulated_proto))
-			return true;
+		protocol = veh->h_vlan_encapsulated_proto;
+		features &= dev->vlan_features;
 	}
 
-	return false;
+	return can_checksum_protocol(features, protocol);
 }
 
 /**
@@ -1758,6 +1796,16 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
 	__be16 type = skb->protocol;
 	int err;
 
+	if (type == htons(ETH_P_8021Q)) {
+		struct vlan_ethhdr *veh;
+
+		if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
+			return ERR_PTR(-EINVAL);
+
+		veh = (struct vlan_ethhdr *)skb->data;
+		type = veh->h_vlan_encapsulated_proto;
+	}
+
 	skb_reset_mac_header(skb);
 	skb->mac_len = skb->network_header - skb->mac_header;
 	__skb_pull(skb, skb->mac_len);
@@ -1902,14 +1950,14 @@ static int dev_gso_segment(struct sk_buff *skb)
 
 /*
  * Try to orphan skb early, right before transmission by the device.
- * We cannot orphan skb if tx timestamp is requested, since
- * drivers need to call skb_tstamp_tx() to send the timestamp.
+ * We cannot orphan skb if tx timestamp is requested or the sk-reference
+ * is needed on driver level for other reasons, e.g. see net/can/raw.c
  */
 static inline void skb_orphan_try(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 
-	if (sk && !skb_tx(skb)->flags) {
+	if (sk && !skb_shinfo(skb)->tx_flags) {
 		/* skb_tx_hash() wont be able to get sk.
 		 * We copy sk_hash into skb->rxhash
 		 */
@@ -1929,9 +1977,14 @@ static inline void skb_orphan_try(struct sk_buff *skb)
 static inline int skb_needs_linearize(struct sk_buff *skb,
 				      struct net_device *dev)
 {
+	int features = dev->features;
+
+	if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb))
+		features &= dev->vlan_features;
+
 	return skb_is_nonlinear(skb) &&
-	       ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
-	        (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
+	       ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) ||
+		(skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) ||
 					      illegal_highdma(dev, skb))));
 }
 
@@ -1954,6 +2007,15 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 
 		skb_orphan_try(skb);
 
+		if (vlan_tx_tag_present(skb) &&
+		    !(dev->features & NETIF_F_HW_VLAN_TX)) {
+			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+			if (unlikely(!skb))
+				goto out;
+
+			skb->vlan_tci = 0;
+		}
+
 		if (netif_needs_gso(dev, skb)) {
 			if (unlikely(dev_gso_segment(skb)))
 				goto out_kfree_skb;
@@ -1978,6 +2040,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		rc = ops->ndo_start_xmit(skb, dev);
+		trace_net_dev_xmit(skb, rc);
 		if (rc == NETDEV_TX_OK)
 			txq_trans_update(txq);
 		return rc;
@@ -1998,6 +2061,7 @@ gso:
 			skb_dst_drop(nskb);
 
 		rc = ops->ndo_start_xmit(nskb, dev);
+		trace_net_dev_xmit(nskb, rc);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			if (rc & ~NETDEV_TX_MASK)
 				goto out_kfree_gso_skb;
@@ -2015,6 +2079,7 @@ out_kfree_gso_skb:
 		skb->destructor = DEV_GSO_CB(skb)->destructor;
 out_kfree_skb:
 	kfree_skb(skb);
+out:
 	return rc;
 }
 
@@ -2058,16 +2123,16 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 					struct sk_buff *skb)
 {
 	int queue_index;
-	struct sock *sk = skb->sk;
+	const struct net_device_ops *ops = dev->netdev_ops;
 
-	queue_index = sk_tx_queue_get(sk);
-	if (queue_index < 0) {
-		const struct net_device_ops *ops = dev->netdev_ops;
+	if (ops->ndo_select_queue) {
+		queue_index = ops->ndo_select_queue(dev, skb);
+		queue_index = dev_cap_txqueue(dev, queue_index);
+	} else {
+		struct sock *sk = skb->sk;
+		queue_index = sk_tx_queue_get(sk);
+		if (queue_index < 0 || queue_index >= dev->real_num_tx_queues) {
 
-		if (ops->ndo_select_queue) {
-			queue_index = ops->ndo_select_queue(dev, skb);
-			queue_index = dev_cap_txqueue(dev, queue_index);
-		} else {
 			queue_index = 0;
 			if (dev->real_num_tx_queues > 1)
 				queue_index = skb_tx_hash(dev, skb);
@@ -2143,6 +2208,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	return rc;
 }
 
+static DEFINE_PER_CPU(int, xmit_recursion);
+#define RECURSION_LIMIT 10
+
 /**
  *	dev_queue_xmit - transmit a buffer
  *	@skb: buffer to transmit
@@ -2186,6 +2254,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
 #endif
+	trace_net_dev_queue(skb);
 	if (q->enqueue) {
 		rc = __dev_xmit_skb(skb, q, dev, txq);
 		goto out;
@@ -2208,10 +2277,15 @@ int dev_queue_xmit(struct sk_buff *skb)
 
 		if (txq->xmit_lock_owner != cpu) {
 
+			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+				goto recursion_alert;
+
 			HARD_TX_LOCK(dev, txq, cpu);
 
 			if (!netif_tx_queue_stopped(txq)) {
+				__this_cpu_inc(xmit_recursion);
 				rc = dev_hard_start_xmit(skb, dev, txq);
+				__this_cpu_dec(xmit_recursion);
 				if (dev_xmit_complete(rc)) {
 					HARD_TX_UNLOCK(dev, txq);
 					goto out;
@@ -2223,7 +2297,9 @@ int dev_queue_xmit(struct sk_buff *skb)
 				       "queue packet!\n", dev->name);
 		} else {
 			/* Recursion is detected! It is possible,
-			 * unfortunately */
+			 * unfortunately
+			 */
+recursion_alert:
 			if (net_ratelimit())
 				printk(KERN_CRIT "Dead loop on virtual device "
 				       "%s, fix it urgently!\n", dev->name);
@@ -2259,69 +2335,44 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
 
-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-
 /*
- * get_rps_cpu is called from netif_receive_skb and returns the target
- * CPU from the RPS map of the receiving queue for a given skb.
- * rcu_read_lock must be held on entry.
+ * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. Returns a non-zero hash number on success
+ * and 0 on failure.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
-		       struct rps_dev_flow **rflowp)
+__u32 __skb_get_rxhash(struct sk_buff *skb)
 {
+	int nhoff, hash = 0, poff;
 	struct ipv6hdr *ip6;
 	struct iphdr *ip;
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
-	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
-	int cpu = -1;
 	u8 ip_proto;
-	u16 tcpu;
 	u32 addr1, addr2, ihl;
 	union {
 		u32 v32;
 		u16 v16[2];
 	} ports;
 
-	if (skb_rx_queue_recorded(skb)) {
-		u16 index = skb_get_rx_queue(skb);
-		if (unlikely(index >= dev->num_rx_queues)) {
-			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
-				"on queue %u, but number of RX queues is %u\n",
-				dev->name, index, dev->num_rx_queues);
-			goto done;
-		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
-
-	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
-		goto done;
-
-	if (skb->rxhash)
-		goto got_hash; /* Skip hash computation on packet header */
+	nhoff = skb_network_offset(skb);
 
 	switch (skb->protocol) {
 	case __constant_htons(ETH_P_IP):
-		if (!pskb_may_pull(skb, sizeof(*ip)))
+		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
 			goto done;
 
-		ip = (struct iphdr *) skb->data;
-		ip_proto = ip->protocol;
+		ip = (struct iphdr *) (skb->data + nhoff);
+		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+			ip_proto = 0;
+		else
+			ip_proto = ip->protocol;
 		addr1 = (__force u32) ip->saddr;
 		addr2 = (__force u32) ip->daddr;
 		ihl = ip->ihl;
 		break;
 	case __constant_htons(ETH_P_IPV6):
-		if (!pskb_may_pull(skb, sizeof(*ip6)))
+		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
 			goto done;
 
-		ip6 = (struct ipv6hdr *) skb->data;
+		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
 		ip_proto = ip6->nexthdr;
 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
@@ -2330,33 +2381,81 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	default:
 		goto done;
 	}
-	switch (ip_proto) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_DCCP:
-	case IPPROTO_ESP:
-	case IPPROTO_AH:
-	case IPPROTO_SCTP:
-	case IPPROTO_UDPLITE:
-		if (pskb_may_pull(skb, (ihl * 4) + 4)) {
-			ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
+
+	ports.v32 = 0;
+	poff = proto_ports_offset(ip_proto);
+	if (poff >= 0) {
+		nhoff += ihl * 4 + poff;
+		if (pskb_may_pull(skb, nhoff + 4)) {
+			ports.v32 = * (__force u32 *) (skb->data + nhoff);
 			if (ports.v16[1] < ports.v16[0])
 				swap(ports.v16[0], ports.v16[1]);
-			break;
 		}
-	default:
-		ports.v32 = 0;
-		break;
 	}
 
 	/* get a consistent hash (same value on both flow directions) */
 	if (addr2 < addr1)
 		swap(addr1, addr2);
-	skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
-	if (!skb->rxhash)
-		skb->rxhash = 1;
 
-got_hash:
+	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+	if (!hash)
+		hash = 1;
+
+done:
+	return hash;
+}
+EXPORT_SYMBOL(__skb_get_rxhash);
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+		       struct rps_dev_flow **rflowp)
+{
+	struct netdev_rx_queue *rxqueue;
+	struct rps_map *map;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_sock_flow_table *sock_flow_table;
+	int cpu = -1;
+	u16 tcpu;
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+		if (unlikely(index >= dev->real_num_rx_queues)) {
+			WARN_ONCE(dev->real_num_rx_queues > 1,
+				  "%s received packet on queue %u, but number "
+				  "of RX queues is %u\n",
+				  dev->name, index, dev->real_num_rx_queues);
+			goto done;
+		}
+		rxqueue = dev->_rx + index;
+	} else
+		rxqueue = dev->_rx;
+
+	map = rcu_dereference(rxqueue->rps_map);
+	if (map) {
+		if (map->len == 1) {
+			tcpu = map->cpus[0];
+			if (cpu_online(tcpu))
+				cpu = tcpu;
+			goto done;
+		}
+	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
+		goto done;
+	}
+
+	skb_reset_network_header(skb);
+	if (!skb_get_rxhash(skb))
+		goto done;
+
 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
@@ -2396,7 +2495,6 @@ got_hash:
 		}
 	}
 
-	map = rcu_dereference(rxqueue->rps_map);
 	if (map) {
 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
 
@@ -2482,6 +2580,7 @@ enqueue:
 
 	local_irq_restore(flags);
 
+	atomic_long_inc(&skb->dev->rx_dropped);
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
@@ -2512,6 +2611,7 @@ int netif_rx(struct sk_buff *skb)
 	if (netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
+	trace_netif_rx(skb);
 #ifdef CONFIG_RPS
 	{
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -2571,6 +2671,7 @@ static void net_tx_action(struct softirq_action *h)
 			clist = clist->next;
 
 			WARN_ON(atomic_read(&skb->users));
+			trace_kfree_skb(skb, net_tx_action);
 			__kfree_skb(skb);
 		}
 	}
@@ -2636,11 +2737,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
  * the ingress scheduler, you just cant add policies on ingress.
  *
  */
-static int ing_filter(struct sk_buff *skb)
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
 {
 	struct net_device *dev = skb->dev;
 	u32 ttl = G_TC_RTTL(skb->tc_verd);
-	struct netdev_queue *rxq;
 	int result = TC_ACT_OK;
 	struct Qdisc *q;
 
@@ -2654,8 +2754,6 @@ static int ing_filter(struct sk_buff *skb)
 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 
-	rxq = &dev->rx_queue;
-
 	q = rxq->qdisc;
 	if (q != &noop_qdisc) {
 		spin_lock(qdisc_lock(q));
@@ -2671,7 +2769,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 					 struct packet_type **pt_prev,
 					 int *ret, struct net_device *orig_dev)
 {
-	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
+	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+
+	if (!rxq || rxq->qdisc == &noop_qdisc)
 		goto out;
 
 	if (*pt_prev) {
@@ -2679,7 +2779,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 		*pt_prev = NULL;
 	}
 
-	switch (ing_filter(skb)) {
+	switch (ing_filter(skb, rxq)) {
 	case TC_ACT_SHOT:
 	case TC_ACT_STOLEN:
 		kfree_skb(skb);
@@ -2692,33 +2792,6 @@ out:
 }
 #endif
 
-/*
- * 	netif_nit_deliver - deliver received packets to network taps
- * 	@skb: buffer
- *
- * 	This function is used to deliver incoming packets to network
- * 	taps. It should be used when the normal netif_receive_skb path
- * 	is bypassed, for example because of VLAN acceleration.
- */
-void netif_nit_deliver(struct sk_buff *skb)
-{
-	struct packet_type *ptype;
-
-	if (list_empty(&ptype_all))
-		return;
-
-	skb_reset_network_header(skb);
-	skb_reset_transport_header(skb);
-	skb->mac_len = skb->network_header - skb->mac_header;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (!ptype->dev || ptype->dev == skb->dev)
-			deliver_skb(skb, ptype, skb->dev);
-	}
-	rcu_read_unlock();
-}
-
 /**
  *	netdev_rx_handler_register - register receive handler
  *	@dev: device to register a handler for
@@ -2828,8 +2901,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (!netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
-	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
-		return NET_RX_SUCCESS;
+	trace_netif_receive_skb(skb);
 
 	/* if we've gotten here through NAPI, check netpoll */
 	if (netpoll_receive_skb(skb))
@@ -2843,8 +2915,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	 * be delivered to pkt handlers that are exact matches.  Also
 	 * the deliver_no_wcard flag will be set.  If packet handlers
 	 * are sensitive to duplicate packets these skbs will need to
-	 * be dropped at the handler.  The vlan accel path may have
-	 * already set the deliver_no_wcard flag.
+	 * be dropped at the handler.
 	 */
 	null_or_orig = NULL;
 	orig_dev = skb->dev;
@@ -2903,6 +2974,18 @@ ncls:
 			goto out;
 	}
 
+	if (vlan_tx_tag_present(skb)) {
+		if (pt_prev) {
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = NULL;
+		}
+		if (vlan_hwaccel_do_receive(&skb)) {
+			ret = __netif_receive_skb(skb);
+			goto out;
+		} else if (unlikely(!skb))
+			goto out;
+	}
+
 	/*
 	 * Make sure frames received on VLAN interfaces stacked on
 	 * bonding interfaces still make their way to any base bonding
@@ -2930,6 +3013,7 @@ ncls:
 	if (pt_prev) {
 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
+		atomic_long_inc(&skb->dev->rx_dropped);
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
@@ -3050,7 +3134,7 @@ out:
 	return netif_receive_skb(skb);
 }
 
-static void napi_gro_flush(struct napi_struct *napi)
+inline void napi_gro_flush(struct napi_struct *napi)
 {
 	struct sk_buff *skb, *next;
 
@@ -3063,6 +3147,7 @@ static void napi_gro_flush(struct napi_struct *napi)
 	napi->gro_count = 0;
 	napi->gro_list = NULL;
 }
+EXPORT_SYMBOL(napi_gro_flush);
 
 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
@@ -3077,7 +3162,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
 		goto normal;
 
-	if (skb_is_gso(skb) || skb_has_frags(skb))
+	if (skb_is_gso(skb) || skb_has_frag_list(skb))
 		goto normal;
 
 	rcu_read_lock();
@@ -3143,7 +3228,7 @@ pull:
 			put_page(skb_shinfo(skb)->frags[0].page);
 			memmove(skb_shinfo(skb)->frags,
 				skb_shinfo(skb)->frags + 1,
-				--skb_shinfo(skb)->nr_frags);
+				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
 		}
 	}
 
@@ -3156,16 +3241,19 @@ normal:
 }
 EXPORT_SYMBOL(dev_gro_receive);
 
-static gro_result_t
+static inline gro_result_t
 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff *p;
 
 	for (p = napi->gro_list; p; p = p->next) {
-		NAPI_GRO_CB(p)->same_flow =
-			(p->dev == skb->dev) &&
-			!compare_ether_header(skb_mac_header(p),
+		unsigned long diffs;
+
+		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+		diffs |= p->vlan_tci ^ skb->vlan_tci;
+		diffs |= compare_ether_header(skb_mac_header(p),
 					      skb_gro_mac_header(skb));
+		NAPI_GRO_CB(p)->same_flow = !diffs;
 		NAPI_GRO_CB(p)->flush = 0;
 	}
 
@@ -3218,14 +3306,14 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(napi_gro_receive);
 
-void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 {
 	__skb_pull(skb, skb_headlen(skb));
 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
+	skb->vlan_tci = 0;
 
 	napi->skb = skb;
 }
-EXPORT_SYMBOL(napi_reuse_skb);
 
 struct sk_buff *napi_get_frags(struct napi_struct *napi)
 {
@@ -4845,7 +4933,7 @@ static void rollback_registered_many(struct list_head *head)
 	dev = list_first_entry(head, struct net_device, unreg_list);
 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
 
-	synchronize_net();
+	rcu_barrier();
 
 	list_for_each_entry(dev, head, unreg_list)
 		dev_put(dev);
@@ -4859,21 +4947,6 @@ static void rollback_registered(struct net_device *dev)
 	rollback_registered_many(&single);
 }
 
-static void __netdev_init_queue_locks_one(struct net_device *dev,
-					  struct netdev_queue *dev_queue,
-					  void *_unused)
-{
-	spin_lock_init(&dev_queue->_xmit_lock);
-	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
-	dev_queue->xmit_lock_owner = -1;
-}
-
-static void netdev_init_queue_locks(struct net_device *dev)
-{
-	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
-	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
-}
-
 unsigned long netdev_fix_features(unsigned long features, const char *name)
 {
 	/* Fix illegal SG+CSUM combinations. */
@@ -4941,6 +5014,66 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
+static int netif_alloc_rx_queues(struct net_device *dev)
+{
+#ifdef CONFIG_RPS
+	unsigned int i, count = dev->num_rx_queues;
+	struct netdev_rx_queue *rx;
+
+	BUG_ON(count < 1);
+
+	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
+	if (!rx) {
+		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
+		return -ENOMEM;
+	}
+	dev->_rx = rx;
+
+	/*
+	 * Set a pointer to first element in the array which holds the
+	 * reference count.
+	 */
+	for (i = 0; i < count; i++)
+		rx[i].first = rx;
+#endif
+	return 0;
+}
+
+static int netif_alloc_netdev_queues(struct net_device *dev)
+{
+	unsigned int count = dev->num_tx_queues;
+	struct netdev_queue *tx;
+
+	BUG_ON(count < 1);
+
+	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
+	if (!tx) {
+		pr_err("netdev: Unable to allocate %u tx queues.\n",
+		       count);
+		return -ENOMEM;
+	}
+	dev->_tx = tx;
+	return 0;
+}
+
+static void netdev_init_one_queue(struct net_device *dev,
+				  struct netdev_queue *queue,
+				  void *_unused)
+{
+	queue->dev = dev;
+
+	/* Initialize queue lock */
+	spin_lock_init(&queue->_xmit_lock);
+	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+	queue->xmit_lock_owner = -1;
+}
+
+static void netdev_init_queues(struct net_device *dev)
+{
+	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+	spin_lock_init(&dev->tx_global_lock);
+}
+
 /**
  *	register_netdevice	- register a network device
  *	@dev: device to register
@@ -4974,28 +5107,19 @@ int register_netdevice(struct net_device *dev)
 
 	spin_lock_init(&dev->addr_list_lock);
 	netdev_set_addr_lockdep_class(dev);
-	netdev_init_queue_locks(dev);
 
 	dev->iflink = -1;
 
-#ifdef CONFIG_RPS
-	if (!dev->num_rx_queues) {
-		/*
-		 * Allocate a single RX queue if driver never called
-		 * alloc_netdev_mq
-		 */
+	ret = netif_alloc_rx_queues(dev);
+	if (ret)
+		goto out;
 
-		dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
-		if (!dev->_rx) {
-			ret = -ENOMEM;
-			goto out;
-		}
+	ret = netif_alloc_netdev_queues(dev);
+	if (ret)
+		goto out;
+
+	netdev_init_queues(dev);
 
-		dev->_rx->first = dev->_rx;
-		atomic_set(&dev->_rx->count, 1);
-		dev->num_rx_queues = 1;
-	}
-#endif
 	/* Init, if this function is available */
 	if (dev->netdev_ops->ndo_init) {
 		ret = dev->netdev_ops->ndo_init(dev);
@@ -5035,6 +5159,12 @@ int register_netdevice(struct net_device *dev)
 	if (dev->features & NETIF_F_SG)
 		dev->features |= NETIF_F_GSO;
 
+	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
+	 * vlan_dev_init() will do the dev->features check, so these features
+	 * are enabled only if supported by underlying device.
+	 */
+	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
+
 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
 	ret = notifier_to_errno(ret);
 	if (ret)
@@ -5105,9 +5235,6 @@ int init_dummy_netdev(struct net_device *dev)
 	 */
 	dev->reg_state = NETREG_DUMMY;
 
-	/* initialize the ref count */
-	atomic_set(&dev->refcnt, 1);
-
 	/* NAPI wants this */
 	INIT_LIST_HEAD(&dev->napi_list);
 
@@ -5115,6 +5242,11 @@ int init_dummy_netdev(struct net_device *dev)
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 	set_bit(__LINK_STATE_START, &dev->state);
 
+	/* Note : We dont allocate pcpu_refcnt for dummy devices,
+	 * because users of this 'device' dont need to change
+	 * its refcount.
+	 */
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5156,6 +5288,16 @@ out:
 }
 EXPORT_SYMBOL(register_netdev);
 
+int netdev_refcnt_read(const struct net_device *dev)
+{
+	int i, refcnt = 0;
+
+	for_each_possible_cpu(i)
+		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+	return refcnt;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
 /*
  * netdev_wait_allrefs - wait until all references are gone.
  *
@@ -5170,11 +5312,14 @@ EXPORT_SYMBOL(register_netdev);
 static void netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
+	int refcnt;
 
 	linkwatch_forget_dev(dev);
 
 	rebroadcast_time = warning_time = jiffies;
-	while (atomic_read(&dev->refcnt) != 0) {
+	refcnt = netdev_refcnt_read(dev);
+
+	while (refcnt != 0) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 			rtnl_lock();
 
@@ -5201,11 +5346,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 		msleep(250);
 
+		refcnt = netdev_refcnt_read(dev);
+
 		if (time_after(jiffies, warning_time + 10 * HZ)) {
 			printk(KERN_EMERG "unregister_netdevice: "
 			       "waiting for %s to become free. Usage "
 			       "count = %d\n",
-			       dev->name, atomic_read(&dev->refcnt));
+			       dev->name, refcnt);
 			warning_time = jiffies;
 		}
 	}
@@ -5263,9 +5410,9 @@ void netdev_run_todo(void)
 		netdev_wait_allrefs(dev);
 
 		/* paranoia */
-		BUG_ON(atomic_read(&dev->refcnt));
-		WARN_ON(dev->ip_ptr);
-		WARN_ON(dev->ip6_ptr);
+		BUG_ON(netdev_refcnt_read(dev));
+		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
+		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
 		WARN_ON(dev->dn_ptr);
 
 		if (dev->destructor)
@@ -5342,30 +5489,34 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 
 	if (ops->ndo_get_stats64) {
 		memset(storage, 0, sizeof(*storage));
-		return ops->ndo_get_stats64(dev, storage);
-	}
-	if (ops->ndo_get_stats) {
+		ops->ndo_get_stats64(dev, storage);
+	} else if (ops->ndo_get_stats) {
 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
-		return storage;
+	} else {
+		netdev_stats_to_stats64(storage, &dev->stats);
+		dev_txq_stats_fold(dev, storage);
 	}
-	netdev_stats_to_stats64(storage, &dev->stats);
-	dev_txq_stats_fold(dev, storage);
+	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
 	return storage;
 }
 EXPORT_SYMBOL(dev_get_stats);
 
-static void netdev_init_one_queue(struct net_device *dev,
-				  struct netdev_queue *queue,
-				  void *_unused)
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
 {
-	queue->dev = dev;
-}
+	struct netdev_queue *queue = dev_ingress_queue(dev);
 
-static void netdev_init_queues(struct net_device *dev)
-{
-	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
-	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
-	spin_lock_init(&dev->tx_global_lock);
+#ifdef CONFIG_NET_CLS_ACT
+	if (queue)
+		return queue;
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return NULL;
+	netdev_init_one_queue(dev, queue, NULL);
+	queue->qdisc = &noop_qdisc;
+	queue->qdisc_sleeping = &noop_qdisc;
+	rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+	return queue;
 }
 
 /**
@@ -5382,17 +5533,18 @@ static void netdev_init_queues(struct net_device *dev)
 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		void (*setup)(struct net_device *), unsigned int queue_count)
 {
-	struct netdev_queue *tx;
 	struct net_device *dev;
 	size_t alloc_size;
 	struct net_device *p;
-#ifdef CONFIG_RPS
-	struct netdev_rx_queue *rx;
-	int i;
-#endif
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
+	if (queue_count < 1) {
+		pr_err("alloc_netdev: Unable to allocate device "
+		       "with zero queues.\n");
+		return NULL;
+	}
+
 	alloc_size = sizeof(struct net_device);
 	if (sizeof_priv) {
 		/* ensure 32-byte alignment of private area */
@@ -5408,55 +5560,31 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
-	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
-	if (!tx) {
-		printk(KERN_ERR "alloc_netdev: Unable to allocate "
-		       "tx qdiscs.\n");
-		goto free_p;
-	}
-
-#ifdef CONFIG_RPS
-	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
-	if (!rx) {
-		printk(KERN_ERR "alloc_netdev: Unable to allocate "
-		       "rx queues.\n");
-		goto free_tx;
-	}
-
-	atomic_set(&rx->count, queue_count);
-
-	/*
-	 * Set a pointer to first element in the array which holds the
-	 * reference count.
-	 */
-	for (i = 0; i < queue_count; i++)
-		rx[i].first = rx;
-#endif
-
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
 
+	dev->pcpu_refcnt = alloc_percpu(int);
+	if (!dev->pcpu_refcnt)
+		goto free_p;
+
 	if (dev_addr_init(dev))
-		goto free_rx;
+		goto free_pcpu;
 
 	dev_mc_init(dev);
 	dev_uc_init(dev);
 
	dev_net_set(dev, &init_net);
 
-	dev->_tx = tx;
 	dev->num_tx_queues = queue_count;
 	dev->real_num_tx_queues = queue_count;
 
 #ifdef CONFIG_RPS
-	dev->_rx = rx;
 	dev->num_rx_queues = queue_count;
+	dev->real_num_rx_queues = queue_count;
 #endif
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
-	netdev_init_queues(dev);
-
 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
 	dev->ethtool_ntuple_list.count = 0;
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -5467,12 +5595,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	strcpy(dev->name, name);
 	return dev;
 
-free_rx:
-#ifdef CONFIG_RPS
-	kfree(rx);
-free_tx:
-#endif
-	kfree(tx);
+free_pcpu:
+	free_percpu(dev->pcpu_refcnt);
 free_p:
 	kfree(p);
 	return NULL;
@@ -5495,6 +5619,8 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	kfree(rcu_dereference_raw(dev->ingress_queue));
+
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
@@ -5504,6 +5630,9 @@ void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+	free_percpu(dev->pcpu_refcnt);
+	dev->pcpu_refcnt = NULL;
+
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		kfree((char *)dev - dev->padded);
@@ -5658,6 +5787,10 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 
 	/* Notify protocols, that we are about to destroy
 	   this device. They should clean all the things.
+
+	   Note that dev->reg_state stays at NETREG_REGISTERED.
+	   This is wanted because this way 8021q and macvlan know
+	   the device is just moving and can keep their slaves up.
 	*/
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
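
For reference, a minimal driver-side sketch of the calling convention introduced above: netif_set_real_num_tx_queues() now returns an int, and netif_set_real_num_rx_queues() is new. Per the kernel-doc in the diff, both must run under rtnl_lock (or before the device is registered) and return 0 or a negative errno. The my_set_channels() helper below is invented for illustration and is not part of the patch; note that the RX variant is built under CONFIG_RPS in this change.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Sketch only: shows how a driver might apply new queue counts using
 * the reworked setters. Both calls require rtnl_lock to be held when
 * the device is already registered, and both return 0 or -errno.
 */
static int my_set_channels(struct net_device *dev,
			   unsigned int txq, unsigned int rxq)
{
	int err;

	ASSERT_RTNL();

	/* must satisfy 1 <= txq <= dev->num_tx_queues */
	err = netif_set_real_num_tx_queues(dev, txq);
	if (err)
		return err;

#ifdef CONFIG_RPS
	/* must satisfy 1 <= rxq <= dev->num_rx_queues */
	err = netif_set_real_num_rx_queues(dev, rxq);
	if (err)
		return err;
#endif
	return 0;
}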
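Similarly, the new ptype_head() helper only changes how dev_add_pack() and __dev_remove_pack() locate their list; the registration interface itself is unchanged. A minimal packet-tap sketch for orientation (the my_tap names are made up; an ETH_P_ALL handler lands on the ptype_all list, any other type on the ptype_base hash):

#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Sketch only: a trivial tap that sees every received frame. */
static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	/* inspect the frame here, then release our reference */
	kfree_skb(skb);
	return 0;
}

static struct packet_type my_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* ptype_head() maps this to ptype_all */
	.func = my_tap_rcv,
};

/* dev_add_pack(&my_tap) links the handler in under ptype_lock;
 * dev_remove_pack(&my_tap) removes it again.
 */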
