Diffstat (limited to 'net/core/dev.c')
 net/core/dev.c | 276 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 177 insertions(+), 99 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 7969fddc94e3..b28ce68830b2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1267,33 +1267,32 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 
 /**
- *	__dev_get_by_flags - find any device with given flags
- *	@net: the applicable net namespace
- *	@if_flags: IFF_* values
- *	@mask: bitmask of bits in if_flags to check
+ * netdev_get_by_flags_rcu - find any device with given flags
+ * @net: the applicable net namespace
+ * @tracker: tracking object for the acquired reference
+ * @if_flags: IFF_* values
+ * @mask: bitmask of bits in if_flags to check
+ *
+ * Search for any interface with the given flags.
  *
- *	Search for any interface with the given flags. Returns NULL if a device
- *	is not found or a pointer to the device. Must be called inside
- *	rtnl_lock(), and result refcount is unchanged.
+ * Context: rcu_read_lock() must be held.
+ * Returns: NULL if a device is not found or a pointer to the device.
  */
-
-struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
-				      unsigned short mask)
+struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
+					   unsigned short if_flags, unsigned short mask)
 {
-	struct net_device *dev, *ret;
-
-	ASSERT_RTNL();
+	struct net_device *dev;
 
-	ret = NULL;
-	for_each_netdev(net, dev) {
-		if (((dev->flags ^ if_flags) & mask) == 0) {
-			ret = dev;
-			break;
+	for_each_netdev_rcu(net, dev) {
+		if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) {
+			netdev_hold(dev, tracker, GFP_ATOMIC);
+			return dev;
 		}
 	}
-	return ret;
+
+	return NULL;
 }
-EXPORT_SYMBOL(__dev_get_by_flags);
+EXPORT_IPV6_MOD(netdev_get_by_flags_rcu);
 
 /**
  *	dev_valid_name - check if name is okay for network device
@@ -1769,7 +1768,7 @@ static void __dev_close(struct net_device *dev)
 	list_del(&single);
 }
 
-void dev_close_many(struct list_head *head, bool unlink)
+void netif_close_many(struct list_head *head, bool unlink)
 {
 	struct net_device *dev, *tmp;
 
@@ -1787,7 +1786,7 @@ void dev_close_many(struct list_head *head, bool unlink)
 			list_del_init(&dev->close_list);
 	}
 }
-EXPORT_SYMBOL(dev_close_many);
+EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL");
 
 void netif_close(struct net_device *dev)
 {
@@ -1795,7 +1794,7 @@ void netif_close(struct net_device *dev)
 		LIST_HEAD(single);
 
 		list_add(&dev->close_list, &single);
-		dev_close_many(&single, true);
+		netif_close_many(&single, true);
 		list_del(&single);
 	}
 }
@@ -3179,7 +3178,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 
 	if (dev->reg_state == NETREG_REGISTERED ||
 	    dev->reg_state == NETREG_UNREGISTERING) {
-		ASSERT_RTNL();
 		netdev_ops_assert_locked(dev);
 
 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
@@ -3229,7 +3227,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 		return -EINVAL;
 
 	if (dev->reg_state == NETREG_REGISTERED) {
-		ASSERT_RTNL();
 		netdev_ops_assert_locked(dev);
 
 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
@@ -4028,7 +4025,10 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
 		unsigned int hdr_len;
 
 		/* mac layer + network layer */
-		hdr_len = skb_transport_offset(skb);
+		if (!skb->encapsulation)
+			hdr_len = skb_transport_offset(skb);
+		else
+			hdr_len = skb_inner_transport_offset(skb);
 
 		/* + transport layer */
 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
					       SKB_GSO_TCPV6))) {
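Note: netdev_get_by_flags_rcu() takes a reference via @tracker before returning, so the device stays valid after the RCU read section ends. A minimal usage sketch (the caller is hypothetical; netdev_put() is the matching release for netdev_hold()):

	/* Look up a running loopback-flagged device without holding the RTNL. */
	static struct net_device *find_up_loopback(struct net *net,
						   netdevice_tracker *tracker)
	{
		struct net_device *dev;

		rcu_read_lock();
		dev = netdev_get_by_flags_rcu(net, tracker,
					      IFF_UP | IFF_LOOPBACK,
					      IFF_UP | IFF_LOOPBACK);
		rcu_read_unlock();

		/* if non-NULL, release later with netdev_put(dev, tracker) */
		return dev;
	}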
@@ -4798,7 +4798,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 
 	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
 		/* Paired with smp_mb__before_atomic() in
-		 * napi_enable()/dev_set_threaded().
+		 * napi_enable()/netif_set_threaded().
 		 * Use READ_ONCE() to guarantee a complete
 		 * read on napi->thread. Only call
 		 * wake_up_process() when it's not NULL.
@@ -5749,6 +5749,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 				    struct packet_type **ppt_prev)
 {
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO;
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
 	struct sk_buff *skb = *pskb;
@@ -5840,8 +5841,10 @@ skip_taps:
 #endif
 	skb_reset_redirect(skb);
 skip_classify:
-	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
+	if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) {
+		drop_reason = SKB_DROP_REASON_PFMEMALLOC;
 		goto drop;
+	}
 
 	if (skb_vlan_tag_present(skb)) {
 		if (pt_prev) {
@@ -5939,8 +5942,6 @@ check_vlan_id:
 	}
 
 	if (pt_prev) {
-		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
-			goto drop;
 		*ppt_prev = pt_prev;
 	} else {
 drop:
@@ -5948,7 +5949,8 @@ drop:
 			dev_core_stats_rx_dropped_inc(skb->dev);
 		else
 			dev_core_stats_rx_nohandler_inc(skb->dev);
-		kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
+
+		kfree_skb_reason(skb, drop_reason);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
 		 */
@@ -6576,8 +6578,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 	 * it, we need to bound somehow the time packets are kept in
 	 * the GRO layer.
 	 */
-	gro_flush(&n->gro, !!timeout);
-	gro_normal_list(&n->gro);
+	gro_flush_normal(&n->gro, !!timeout);
 
 	if (unlikely(!list_empty(&n->poll_list))) {
 		/* If n->poll_list is not empty, we need to mask irqs */
@@ -6647,8 +6648,7 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
 	}
 
 	/* Flush too old packets. If HZ < 1000, flush all packets */
-	gro_flush(&napi->gro, HZ >= 1000);
-	gro_normal_list(&napi->gro);
+	gro_flush_normal(&napi->gro, HZ >= 1000);
 
 	clear_bit(NAPI_STATE_SCHED, &napi->state);
 }
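Note: the repeated gro_flush()/gro_normal_list() pairs are collapsed into a single call at every site. The helper itself is not part of this diff; a sketch consistent with the call sites, assuming it is a static inline alongside the other GRO helpers:

	static inline void gro_flush_normal(struct gro_node *gro, bool flush_old)
	{
		gro_flush(gro, flush_old);	/* flush (old) packets held by GRO */
		gro_normal_list(gro);		/* deliver the normal list up the stack */
	}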
@@ -6926,22 +6926,83 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-int dev_set_threaded(struct net_device *dev, bool threaded)
+static void napi_stop_kthread(struct napi_struct *napi)
+{
+	unsigned long val, new;
+
+	/* Wait until the napi STATE_THREADED is unset. */
+	while (true) {
+		val = READ_ONCE(napi->state);
+
+		/* If napi kthread own this napi or the napi is idle,
+		 * STATE_THREADED can be unset here.
+		 */
+		if ((val & NAPIF_STATE_SCHED_THREADED) ||
+		    !(val & NAPIF_STATE_SCHED)) {
+			new = val & (~NAPIF_STATE_THREADED);
+		} else {
+			msleep(20);
+			continue;
+		}
+
+		if (try_cmpxchg(&napi->state, &val, new))
+			break;
+	}
+
+	/* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by
+	 * the kthread.
+	 */
+	while (true) {
+		if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state))
+			break;
+
+		msleep(20);
+	}
+
+	kthread_stop(napi->thread);
+	napi->thread = NULL;
+}
+
+int napi_set_threaded(struct napi_struct *napi,
+		      enum netdev_napi_threaded threaded)
+{
+	if (threaded) {
+		if (!napi->thread) {
+			int err = napi_kthread_create(napi);
+
+			if (err)
+				return err;
+		}
+	}
+
+	if (napi->config)
+		napi->config->threaded = threaded;
+
+	if (!threaded && napi->thread) {
+		napi_stop_kthread(napi);
+	} else {
+		/* Make sure kthread is created before THREADED bit is set. */
+		smp_mb__before_atomic();
+		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+	}
+
+	return 0;
+}
+
+int netif_set_threaded(struct net_device *dev,
+		       enum netdev_napi_threaded threaded)
 {
 	struct napi_struct *napi;
 	int err = 0;
 
 	netdev_assert_locked_or_invisible(dev);
 
-	if (dev->threaded == threaded)
-		return 0;
-
 	if (threaded) {
 		list_for_each_entry(napi, &dev->napi_list, dev_list) {
 			if (!napi->thread) {
 				err = napi_kthread_create(napi);
 				if (err) {
-					threaded = false;
+					threaded = NETDEV_NAPI_THREADED_DISABLED;
 					break;
 				}
 			}
@@ -6961,12 +7022,32 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
 	 * softirq mode will happen in the next round of napi_schedule().
 	 * This should not cause hiccups/stalls to the live traffic.
 	 */
-	list_for_each_entry(napi, &dev->napi_list, dev_list)
-		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+		if (!threaded && napi->thread)
+			napi_stop_kthread(napi);
+		else
+			assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+	}
 
 	return err;
 }
-EXPORT_SYMBOL(dev_set_threaded);
+
+/**
+ * netif_threaded_enable() - enable threaded NAPIs
+ * @dev: net_device instance
+ *
+ * Enable threaded mode for the NAPI instances of the device. This may be useful
+ * for devices where multiple NAPI instances get scheduled by a single
+ * interrupt. Threaded NAPI allows moving the NAPI processing to cores other
+ * than the core where IRQ is mapped.
+ *
+ * This function should be called before @dev is registered.
+ */
+void netif_threaded_enable(struct net_device *dev)
+{
+	WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED));
+}
+EXPORT_SYMBOL(netif_threaded_enable);
 
 /**
  * netif_queue_set_napi - Associate queue with the napi
@@ -7182,6 +7263,8 @@ static void napi_restore_config(struct napi_struct *n)
 		napi_hash_add(n);
 		n->config->napi_id = n->napi_id;
 	}
+
+	WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded));
 }
 
 static void napi_save_config(struct napi_struct *n)
@@ -7279,7 +7362,7 @@ void netif_napi_add_weight_locked(struct net_device *dev,
 	 * threaded mode will not be enabled in napi_enable().
 	 */
 	if (dev->threaded && napi_kthread_create(napi))
-		dev->threaded = false;
+		dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
 	netif_napi_set_irq_locked(napi, -1);
 }
 EXPORT_SYMBOL(netif_napi_add_weight_locked);
@@ -7448,8 +7531,7 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
 	}
 
 	/* Flush too old packets. If HZ < 1000, flush all packets */
-	gro_flush(&n->gro, HZ >= 1000);
-	gro_normal_list(&n->gro);
+	gro_flush_normal(&n->gro, HZ >= 1000);
 
 	/* Some drivers may have called napi_schedule
 	 * prior to exhausting their budget.
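Note: with dev->threaded now an enum (enum netdev_napi_threaded) rather than a bool, drivers opt in through the new wrapper instead of the removed dev_set_threaded(). A sketch of a probe path (my_probe() is hypothetical; the kernel-doc above requires the call before registration):

	/* Enable threaded NAPI before the device becomes visible. */
	static int my_probe(struct net_device *netdev)
	{
		netif_threaded_enable(netdev);	/* must run before register_netdev() */
		return register_netdev(netdev);
	}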
@@ -9387,12 +9469,12 @@ void dev_set_rx_mode(struct net_device *dev)
 }
 
 /**
- *	dev_get_flags - get flags reported to userspace
- *	@dev: device
+ * netif_get_flags() - get flags reported to userspace
+ * @dev: device
  *
- *	Get the combination of flag bits exported through APIs to userspace.
+ * Get the combination of flag bits exported through APIs to userspace.
  */
-unsigned int dev_get_flags(const struct net_device *dev)
+unsigned int netif_get_flags(const struct net_device *dev)
 {
 	unsigned int flags;
 
@@ -9415,7 +9497,7 @@ unsigned int dev_get_flags(const struct net_device *dev)
 
 	return flags;
 }
-EXPORT_SYMBOL(dev_get_flags);
+EXPORT_SYMBOL(netif_get_flags);
 
 int __dev_change_flags(struct net_device *dev, unsigned int flags,
 		       struct netlink_ext_ack *extack)
@@ -9527,7 +9609,7 @@ int netif_change_flags(struct net_device *dev, unsigned int flags,
 	return ret;
 }
 
-int __dev_set_mtu(struct net_device *dev, int new_mtu)
+int __netif_set_mtu(struct net_device *dev, int new_mtu)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
@@ -9538,7 +9620,7 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
 	WRITE_ONCE(dev->mtu, new_mtu);
 	return 0;
 }
-EXPORT_SYMBOL(__dev_set_mtu);
+EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL");
 
 int dev_validate_mtu(struct net_device *dev, int new_mtu,
 		     struct netlink_ext_ack *extack)
@@ -9557,18 +9639,22 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu,
 }
 
 /**
- *	netif_set_mtu_ext - Change maximum transfer unit
- *	@dev: device
- *	@new_mtu: new transfer unit
- *	@extack: netlink extended ack
+ * netif_set_mtu_ext() - Change maximum transfer unit
+ * @dev: device
+ * @new_mtu: new transfer unit
+ * @extack: netlink extended ack
+ *
+ * Change the maximum transfer size of the network device.
  *
- *	Change the maximum transfer size of the network device.
+ * Return: 0 on success, -errno on failure.
  */
 int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
 		      struct netlink_ext_ack *extack)
 {
 	int err, orig_mtu;
 
+	netdev_ops_assert_locked(dev);
+
 	if (new_mtu == dev->mtu)
 		return 0;
@@ -9585,7 +9671,7 @@ int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
 		return err;
 
 	orig_mtu = dev->mtu;
-	err = __dev_set_mtu(dev, new_mtu);
+	err = __netif_set_mtu(dev, new_mtu);
 
 	if (!err) {
 		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
@@ -9595,7 +9681,7 @@ int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
 			/* setting mtu back and notifying everyone again,
 			 * so that they have a chance to revert changes.
 			 */
-			__dev_set_mtu(dev, orig_mtu);
+			__netif_set_mtu(dev, orig_mtu);
 			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 						     new_mtu);
 		}
@@ -9649,13 +9735,15 @@ void netif_set_group(struct net_device *dev, int new_group)
 }
 
 /**
- *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
- *	@dev: device
- *	@addr: new address
- *	@extack: netlink extended ack
+ * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR.
+ * @dev: device
+ * @addr: new address
+ * @extack: netlink extended ack
+ *
+ * Return: 0 on success, -errno on failure.
  */
-int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
-			      struct netlink_ext_ack *extack)
+int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr,
+				struct netlink_ext_ack *extack)
 {
 	struct netdev_notifier_pre_changeaddr_info info = {
 		.info.dev = dev,
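Note: netif_set_mtu_ext() now asserts the per-device ops lock rather than relying on the RTNL alone. A sketch of a conforming caller, assuming netdev_lock_ops()/netdev_unlock_ops() from the instance-locking infrastructure:

	static int change_mtu_locked(struct net_device *dev, int new_mtu)
	{
		int err;

		netdev_lock_ops(dev);	/* satisfies netdev_ops_assert_locked() */
		err = netif_set_mtu_ext(dev, new_mtu, NULL);
		netdev_unlock_ops(dev);

		return err;
	}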
@@ -9667,7 +9755,7 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 	return notifier_to_errno(rc);
 }
-EXPORT_SYMBOL(dev_pre_changeaddr_notify);
+EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL");
 
 int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 			  struct netlink_ext_ack *extack)
@@ -9681,7 +9769,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 		return -EINVAL;
 	if (!netif_device_present(dev))
 		return -ENODEV;
-	err = dev_pre_changeaddr_notify(dev, ss->__data, extack);
+	err = netif_pre_changeaddr_notify(dev, ss->__data, extack);
 	if (err)
 		return err;
 	if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) {
@@ -9698,7 +9786,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 DECLARE_RWSEM(dev_addr_sem);
 
 /* "sa" is a true struct sockaddr with limited "sa_data" member. */
-int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
+int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
 {
 	size_t size = sizeof(sa->sa_data_min);
 	struct net_device *dev;
@@ -9724,7 +9812,7 @@ unlock:
 	up_read(&dev_addr_sem);
 	return ret;
 }
-EXPORT_SYMBOL(dev_get_mac_address);
+EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL");
 
 int netif_change_carrier(struct net_device *dev, bool new_carrier)
 {
@@ -9777,16 +9865,17 @@ int dev_get_phys_port_name(struct net_device *dev,
 }
 
 /**
- *	dev_get_port_parent_id - Get the device's port parent identifier
- *	@dev: network device
- *	@ppid: pointer to a storage for the port's parent identifier
- *	@recurse: allow/disallow recursion to lower devices
+ * netif_get_port_parent_id() - Get the device's port parent identifier
+ * @dev: network device
+ * @ppid: pointer to a storage for the port's parent identifier
+ * @recurse: allow/disallow recursion to lower devices
+ *
+ * Get the devices's port parent identifier.
  *
- *	Get the devices's port parent identifier
+ * Return: 0 on success, -errno on failure.
  */
-int dev_get_port_parent_id(struct net_device *dev,
-			   struct netdev_phys_item_id *ppid,
-			   bool recurse)
+int netif_get_port_parent_id(struct net_device *dev,
+			     struct netdev_phys_item_id *ppid, bool recurse)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	struct netdev_phys_item_id first = { };
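Note: the notifier keeps its veto semantics under the new name: a NETDEV_PRE_CHANGEADDR listener may reject the address before it is committed, as netif_set_mac_address() does above. A condensed sketch of that pattern (hypothetical caller; real callers also hold the required device locks):

	static int try_set_hw_addr(struct net_device *dev, const u8 *addr,
				   struct netlink_ext_ack *extack)
	{
		int err;

		err = netif_pre_changeaddr_notify(dev, addr, extack);
		if (err)
			return err;	/* a listener vetoed the change */

		dev_addr_set(dev, addr);
		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
		return 0;
	}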
@@ -9805,7 +9894,7 @@ int dev_get_port_parent_id(struct net_device *dev,
 		return err;
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		err = dev_get_port_parent_id(lower_dev, ppid, true);
+		err = netif_get_port_parent_id(lower_dev, ppid, true);
 		if (err)
 			break;
 		if (!first.id_len)
@@ -9816,7 +9905,7 @@ int dev_get_port_parent_id(struct net_device *dev,
 
 	return err;
 }
-EXPORT_SYMBOL(dev_get_port_parent_id);
+EXPORT_SYMBOL(netif_get_port_parent_id);
 
 /**
  *	netdev_port_same_parent_id - Indicate if two network devices have
@@ -9829,8 +9918,8 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 	struct netdev_phys_item_id a_id = { };
 	struct netdev_phys_item_id b_id = { };
 
-	if (dev_get_port_parent_id(a, &a_id, true) ||
-	    dev_get_port_parent_id(b, &b_id, true))
+	if (netif_get_port_parent_id(a, &a_id, true) ||
+	    netif_get_port_parent_id(b, &b_id, true))
 		return false;
 
 	return netdev_phys_item_id_same(&a_id, &b_id);
@@ -10731,12 +10820,14 @@ sync_lower:
 			 * *before* calling udp_tunnel_get_rx_info,
 			 * but *after* calling udp_tunnel_drop_rx_info.
 			 */
+			udp_tunnel_nic_lock(dev);
 			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 				dev->features = features;
 				udp_tunnel_get_rx_info(dev);
 			} else {
 				udp_tunnel_drop_rx_info(dev);
 			}
+			udp_tunnel_nic_unlock(dev);
 		}
 
 		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
@@ -11716,7 +11807,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	dev->priv_len = sizeof_priv;
 
-	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
+	ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev");
 #ifdef CONFIG_PCPU_DEV_REFCNT
 	dev->pcpu_refcnt = alloc_percpu(int);
 	if (!dev->pcpu_refcnt)
@@ -11938,21 +12029,8 @@ static void netdev_rss_contexts_free(struct net_device *dev)
 
 	mutex_lock(&dev->ethtool->rss_lock);
 	xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
-		struct ethtool_rxfh_param rxfh;
-
-		rxfh.indir = ethtool_rxfh_context_indir(ctx);
-		rxfh.key = ethtool_rxfh_context_key(ctx);
-		rxfh.hfunc = ctx->hfunc;
-		rxfh.input_xfrm = ctx->input_xfrm;
-		rxfh.rss_context = context;
-		rxfh.rss_delete = true;
-
 		xa_erase(&dev->ethtool->rss_ctx, context);
-		if (dev->ethtool_ops->create_rxfh_context)
-			dev->ethtool_ops->remove_rxfh_context(dev, ctx,
-							      context, NULL);
-		else
-			dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
+		dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL);
 		kfree(ctx);
 	}
 	xa_destroy(&dev->ethtool->rss_ctx);
@@ -12037,7 +12115,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
 			netdev_lock(dev);
 		}
 	}
-	dev_close_many(&close_head, true);
+	netif_close_many(&close_head, true);
 	/* ... now unlock them and go over the rest. */
 	list_for_each_entry(dev, head, unreg_list) {
 		if (netdev_need_ops_lock(dev))
@@ -12045,7 +12123,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
 		else
 			list_add_tail(&dev->close_list, &close_head);
 	}
-	dev_close_many(&close_head, true);
+	netif_close_many(&close_head, true);
 
 	list_for_each_entry(dev, head, unreg_list) {
 		/* And unlink it from device chain. */
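Note: several of the renamed helpers (netif_close_many(), __netif_set_mtu(), netif_pre_changeaddr_notify(), netif_get_mac_address()) move into the "NETDEV_INTERNAL" symbol namespace, so a module outside the core stack that still calls them must import the namespace explicitly; for example (hypothetical module source):

	MODULE_IMPORT_NS("NETDEV_INTERNAL");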
