Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- | net/core/dev.c | 307
1 file changed, 139 insertions(+), 168 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 1baab07820f6..2d6771075720 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -216,18 +216,38 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 }
 
-static inline void rps_lock(struct softnet_data *sd)
+static inline void rps_lock_irqsave(struct softnet_data *sd,
+				    unsigned long *flags)
 {
-#ifdef CONFIG_RPS
-	spin_lock(&sd->input_pkt_queue.lock);
-#endif
+	if (IS_ENABLED(CONFIG_RPS))
+		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
+	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_save(*flags);
 }
 
-static inline void rps_unlock(struct softnet_data *sd)
+static inline void rps_lock_irq_disable(struct softnet_data *sd)
 {
-#ifdef CONFIG_RPS
-	spin_unlock(&sd->input_pkt_queue.lock);
-#endif
+	if (IS_ENABLED(CONFIG_RPS))
+		spin_lock_irq(&sd->input_pkt_queue.lock);
+	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_disable();
+}
+
+static inline void rps_unlock_irq_restore(struct softnet_data *sd,
+					  unsigned long *flags)
+{
+	if (IS_ENABLED(CONFIG_RPS))
+		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
+	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_restore(*flags);
+}
+
+static inline void rps_unlock_irq_enable(struct softnet_data *sd)
+{
+	if (IS_ENABLED(CONFIG_RPS))
+		spin_unlock_irq(&sd->input_pkt_queue.lock);
+	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_enable();
 }
 
 static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
@@ -1037,7 +1057,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 				/* avoid cases where sscanf is not exact inverse of printf */
 				snprintf(buf, IFNAMSIZ, name, i);
 				if (!strncmp(buf, name_node->name, IFNAMSIZ))
-					set_bit(i, inuse);
+					__set_bit(i, inuse);
 			}
 			if (!sscanf(d->name, name, &i))
 				continue;
@@ -1047,7 +1067,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 			/* avoid cases where sscanf is not exact inverse of printf */
 			snprintf(buf, IFNAMSIZ, name, i);
 			if (!strncmp(buf, d->name, IFNAMSIZ))
-				set_bit(i, inuse);
+				__set_bit(i, inuse);
 		}
 
 		i = find_first_zero_bit(inuse, max_netdevices);
@@ -4456,11 +4476,11 @@ static void rps_trigger_softirq(void *data)
  * If yes, queue it to our IPI list and return 1
  * If no, return 0
  */
-static int rps_ipi_queued(struct softnet_data *sd)
+static int napi_schedule_rps(struct softnet_data *sd)
 {
-#ifdef CONFIG_RPS
 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 
+#ifdef CONFIG_RPS
 	if (sd != mysd) {
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 		mysd->rps_ipi_list = sd;
@@ -4469,6 +4489,7 @@ static int rps_ipi_queued(struct softnet_data *sd)
 		return 1;
 	}
 #endif /* CONFIG_RPS */
+	__napi_schedule_irqoff(&mysd->backlog);
 	return 0;
 }
 
@@ -4525,9 +4546,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 
 	sd = &per_cpu(softnet_data, cpu);
 
-	local_irq_save(flags);
-
-	rps_lock(sd);
+	rps_lock_irqsave(sd, &flags);
 	if (!netif_running(skb->dev))
 		goto drop;
 	qlen = skb_queue_len(&sd->input_pkt_queue);
@@ -4536,26 +4555,21 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
 			input_queue_tail_incr_save(sd, qtail);
-			rps_unlock(sd);
-			local_irq_restore(flags);
+			rps_unlock_irq_restore(sd, &flags);
 			return NET_RX_SUCCESS;
 		}
 
 		/* Schedule NAPI for backlog device
 		 * We can use non atomic operation since we own the queue lock
 		 */
-		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
-			if (!rps_ipi_queued(sd))
-				____napi_schedule(sd, &sd->backlog);
-		}
+		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+			napi_schedule_rps(sd);
 		goto enqueue;
 	}
 
 drop:
 	sd->dropped++;
-	rps_unlock(sd);
-
-	local_irq_restore(flags);
+	rps_unlock_irq_restore(sd, &flags);
 
 	atomic_long_inc(&skb->dev->rx_dropped);
 	kfree_skb(skb);
@@ -4796,7 +4810,6 @@ static int netif_rx_internal(struct sk_buff *skb)
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu;
 
-		preempt_disable();
 		rcu_read_lock();
 
 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -4806,78 +4819,72 @@ static int netif_rx_internal(struct sk_buff *skb)
 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 
 		rcu_read_unlock();
-		preempt_enable();
 	} else
 #endif
 	{
 		unsigned int qtail;
 
-		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
-		put_cpu();
+		ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
 	}
 	return ret;
 }
 
 /**
+ * __netif_rx - Slightly optimized version of netif_rx
+ * @skb: buffer to post
+ *
+ * This behaves as netif_rx except that it does not disable bottom halves.
+ * As a result this function may only be invoked from the interrupt context
+ * (either hard or soft interrupt).
+ */
+int __netif_rx(struct sk_buff *skb)
+{
+	int ret;
+
+	lockdep_assert_once(hardirq_count() | softirq_count());
+
+	trace_netif_rx_entry(skb);
+	ret = netif_rx_internal(skb);
+	trace_netif_rx_exit(ret);
+	return ret;
+}
+EXPORT_SYMBOL(__netif_rx);
+
+/**
  * netif_rx - post buffer to the network code
  * @skb: buffer to post
 *
 * This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
+ * the upper (protocol) levels to process via the backlog NAPI device. It
+ * always succeeds. The buffer may be dropped during processing for
+ * congestion control or by the protocol layers.
+ * The network buffer is passed via the backlog NAPI device. Modern NIC
+ * driver should use NAPI and GRO.
+ * This function can used from interrupt and from process context. The
+ * caller from process context must not disable interrupts before invoking
+ * this function.
 *
 * return values:
 * NET_RX_SUCCESS (no congestion)
 * NET_RX_DROP (packet was dropped)
 *
 */
-
 int netif_rx(struct sk_buff *skb)
 {
+	bool need_bh_off = !(hardirq_count() | softirq_count());
 	int ret;
 
+	if (need_bh_off)
+		local_bh_disable();
 	trace_netif_rx_entry(skb);
-
 	ret = netif_rx_internal(skb);
 	trace_netif_rx_exit(ret);
-
+	if (need_bh_off)
+		local_bh_enable();
 	return ret;
 }
 EXPORT_SYMBOL(netif_rx);
 
-int netif_rx_ni(struct sk_buff *skb)
-{
-	int err;
-
-	trace_netif_rx_ni_entry(skb);
-
-	preempt_disable();
-	err = netif_rx_internal(skb);
-	if (local_softirq_pending())
-		do_softirq();
-	preempt_enable();
-	trace_netif_rx_ni_exit(err);
-
-	return err;
-}
-EXPORT_SYMBOL(netif_rx_ni);
-
-int netif_rx_any_context(struct sk_buff *skb)
-{
-	/*
-	 * If invoked from contexts which do not invoke bottom half
-	 * processing either at return from interrupt or when softrqs are
-	 * reenabled, use netif_rx_ni() which invokes bottomhalf processing
-	 * directly.
-	 */
-	if (in_interrupt())
-		return netif_rx(skb);
-	else
-		return netif_rx_ni(skb);
-}
-EXPORT_SYMBOL(netif_rx_any_context);
-
 static __latent_entropy void net_tx_action(struct softirq_action *h)
 {
 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -5650,8 +5657,7 @@ static void flush_backlog(struct work_struct *work)
 	local_bh_disable();
 	sd = this_cpu_ptr(&softnet_data);
 
-	local_irq_disable();
-	rps_lock(sd);
+	rps_lock_irq_disable(sd);
 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
@@ -5659,8 +5665,7 @@ static void flush_backlog(struct work_struct *work)
 			input_queue_head_incr(sd);
 		}
 	}
-	rps_unlock(sd);
-	local_irq_enable();
+	rps_unlock_irq_enable(sd);
 
 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
@@ -5678,16 +5683,14 @@ static bool flush_required(int cpu)
 	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
 	bool do_flush;
 
-	local_irq_disable();
-	rps_lock(sd);
+	rps_lock_irq_disable(sd);
 
 	/* as insertion into process_queue happens with the rps lock held,
 	 * process_queue access may race only with dequeue
 	 */
 	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
 		   !skb_queue_empty_lockless(&sd->process_queue);
-	rps_unlock(sd);
-	local_irq_enable();
+	rps_unlock_irq_enable(sd);
 
 	return do_flush;
 #endif
@@ -5802,8 +5805,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 
 		}
 
-		local_irq_disable();
-		rps_lock(sd);
+		rps_lock_irq_disable(sd);
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
 			/*
 			 * Inline a custom version of __napi_complete().
@@ -5819,8 +5821,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 						   &sd->process_queue);
 		}
-		rps_unlock(sd);
-		local_irq_enable();
+		rps_unlock_irq_enable(sd);
 	}
 
 	return work;
@@ -9143,7 +9144,7 @@ DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 static void net_set_todo(struct net_device *dev)
 {
 	list_add_tail(&dev->todo_list, &net_todo_list);
-	dev_net(dev)->dev_unreg_count++;
+	atomic_inc(&dev_net(dev)->dev_unreg_count);
 }
 
 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -9683,8 +9684,10 @@ int register_netdevice(struct net_device *dev)
 
 	linkwatch_init_dev(dev);
 	dev_init_scheduler(dev);
-	dev_hold(dev);
+
+	dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL);
 	list_netdevice(dev);
+
 	add_device_randomness(dev->dev_addr, dev->addr_len);
 
 	/* If the device has permanent device address, driver should
@@ -9813,8 +9816,8 @@ int netdev_unregister_timeout_secs __read_mostly = 10;
 #define WAIT_REFS_MIN_MSECS 1
 #define WAIT_REFS_MAX_MSECS 250
 /**
- * netdev_wait_allrefs - wait until all references are gone.
- * @dev: target net_device
+ * netdev_wait_allrefs_any - wait until all references are gone.
+ * @list: list of net_devices to wait on
 *
 * This is called when unregistering network devices.
 *
@@ -9824,37 +9827,42 @@ int netdev_unregister_timeout_secs __read_mostly = 10;
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
-static void netdev_wait_allrefs(struct net_device *dev)
+static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
 {
 	unsigned long rebroadcast_time, warning_time;
-	int wait = 0, refcnt;
-
-	linkwatch_forget_dev(dev);
+	struct net_device *dev;
+	int wait = 0;
 
 	rebroadcast_time = warning_time = jiffies;
 
-	refcnt = netdev_refcnt_read(dev);
-	while (refcnt != 1) {
+	list_for_each_entry(dev, list, todo_list)
+		if (netdev_refcnt_read(dev) == 1)
+			return dev;
+
+	while (true) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 			rtnl_lock();
 
 			/* Rebroadcast unregister notification */
-			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+			list_for_each_entry(dev, list, todo_list)
+				call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
 			__rtnl_unlock();
 			rcu_barrier();
 			rtnl_lock();
 
-			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
-				     &dev->state)) {
-				/* We must not have linkwatch events
-				 * pending on unregister. If this
-				 * happens, we simply run the queue
-				 * unscheduled, resulting in a noop
-				 * for this device.
-				 */
-				linkwatch_run_queue();
-			}
+			list_for_each_entry(dev, list, todo_list)
+				if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+					     &dev->state)) {
+					/* We must not have linkwatch events
+					 * pending on unregister. If this
+					 * happens, we simply run the queue
					 * unscheduled, resulting in a noop
+					 * for this device.
+					 */
+					linkwatch_run_queue();
+					break;
+				}
 
 			__rtnl_unlock();
 
@@ -9869,14 +9877,18 @@ static void netdev_wait_allrefs(struct net_device *dev)
 			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
 		}
 
-		refcnt = netdev_refcnt_read(dev);
+		list_for_each_entry(dev, list, todo_list)
+			if (netdev_refcnt_read(dev) == 1)
+				return dev;
 
-		if (refcnt != 1 &&
-		    time_after(jiffies, warning_time +
+		if (time_after(jiffies, warning_time +
 			       netdev_unregister_timeout_secs * HZ)) {
-			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
-				 dev->name, refcnt);
-			ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+			list_for_each_entry(dev, list, todo_list) {
+				pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+					 dev->name, netdev_refcnt_read(dev));
+				ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+			}
+
 			warning_time = jiffies;
 		}
 	}
@@ -9908,6 +9920,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
 */
 void netdev_run_todo(void)
 {
+	struct net_device *dev, *tmp;
 	struct list_head list;
 #ifdef CONFIG_LOCKDEP
 	struct list_head unlink_list;
@@ -9928,26 +9941,24 @@ void netdev_run_todo(void)
 
 	__rtnl_unlock();
 
-
 	/* Wait for rcu callbacks to finish before next phase */
 	if (!list_empty(&list))
 		rcu_barrier();
 
-	while (!list_empty(&list)) {
-		struct net_device *dev
-			= list_first_entry(&list, struct net_device, todo_list);
-		list_del(&dev->todo_list);
-
+	list_for_each_entry_safe(dev, tmp, &list, todo_list) {
 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
-			pr_err("network todo '%s' but state %d\n",
-			       dev->name, dev->reg_state);
-			dump_stack();
+			netdev_WARN(dev, "run_todo but not unregistering\n");
+			list_del(&dev->todo_list);
 			continue;
 		}
 
 		dev->reg_state = NETREG_UNREGISTERED;
+		linkwatch_forget_dev(dev);
+	}
 
-		netdev_wait_allrefs(dev);
+	while (!list_empty(&list)) {
+		dev = netdev_wait_allrefs_any(&list);
+		list_del(&dev->todo_list);
 
 		/* paranoia */
 		BUG_ON(netdev_refcnt_read(dev) != 1);
@@ -9963,11 +9974,8 @@ void netdev_run_todo(void)
 		if (dev->needs_free_netdev)
 			free_netdev(dev);
 
-		/* Report a network device has been unregistered */
-		rtnl_lock();
-		dev_net(dev)->dev_unreg_count--;
-		__rtnl_unlock();
-		wake_up(&netdev_unregistering_wq);
+		if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
+			wake_up(&netdev_unregistering_wq);
 
 		/* Free network device */
 		kobject_put(&dev->dev.kobj);
@@ -10172,7 +10180,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev->pcpu_refcnt = alloc_percpu(int);
 	if (!dev->pcpu_refcnt)
 		goto free_dev;
-	dev_hold(dev);
+	__dev_hold(dev);
 #else
 	refcount_set(&dev->dev_refcnt, 1);
 #endif
@@ -10449,7 +10457,7 @@ void unregister_netdevice_many(struct list_head *head)
 	synchronize_net();
 
 	list_for_each_entry(dev, head, unreg_list) {
-		dev_put(dev);
+		dev_put_track(dev, &dev->dev_registered_tracker);
 		net_set_todo(dev);
 	}
 
@@ -10732,8 +10740,7 @@ static int __net_init netdev_init(struct net *net)
 	BUILD_BUG_ON(GRO_HASH_BUCKETS > 8 *
 		     sizeof_field(struct napi_struct, gro_bitmask));
 
-	if (net != &init_net)
-		INIT_LIST_HEAD(&net->dev_base_head);
+	INIT_LIST_HEAD(&net->dev_base_head);
 
 	net->dev_name_head = netdev_create_hash();
 	if (net->dev_name_head == NULL)
@@ -10849,14 +10856,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
 	.exit = netdev_exit,
 };
 
-static void __net_exit default_device_exit(struct net *net)
+static void __net_exit default_device_exit_net(struct net *net)
 {
 	struct net_device *dev, *aux;
 	/*
 	 * Push all migratable network devices back to the
 	 * initial network namespace
 	 */
-	rtnl_lock();
+	ASSERT_RTNL();
 	for_each_netdev_safe(net, dev, aux) {
 		int err;
 		char fb_name[IFNAMSIZ];
@@ -10880,35 +10887,6 @@ static void __net_exit default_device_exit(struct net *net)
 			BUG();
 		}
 	}
-	rtnl_unlock();
-}
-
-static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
-{
-	/* Return with the rtnl_lock held when there are no network
-	 * devices unregistering in any network namespace in net_list.
-	 */
-	struct net *net;
-	bool unregistering;
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
-	add_wait_queue(&netdev_unregistering_wq, &wait);
-	for (;;) {
-		unregistering = false;
-		rtnl_lock();
-		list_for_each_entry(net, net_list, exit_list) {
-			if (net->dev_unreg_count > 0) {
-				unregistering = true;
-				break;
-			}
-		}
-		if (!unregistering)
-			break;
-		__rtnl_unlock();
-
-		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-	}
-	remove_wait_queue(&netdev_unregistering_wq, &wait);
 }
 
 static void __net_exit default_device_exit_batch(struct list_head *net_list)
@@ -10922,18 +10900,12 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
 	struct net *net;
 	LIST_HEAD(dev_kill_list);
 
-	/* To prevent network device cleanup code from dereferencing
-	 * loopback devices or network devices that have been freed
-	 * wait here for all pending unregistrations to complete,
-	 * before unregistring the loopback device and allowing the
-	 * network namespace be freed.
-	 *
-	 * The netdev todo list containing all network devices
-	 * unregistrations that happen in default_device_exit_batch
-	 * will run in the rtnl_unlock() at the end of
-	 * default_device_exit_batch.
-	 */
-	rtnl_lock_unregistering(net_list);
+	rtnl_lock();
+	list_for_each_entry(net, net_list, exit_list) {
+		default_device_exit_net(net);
+		cond_resched();
+	}
+
 	list_for_each_entry(net, net_list, exit_list) {
 		for_each_netdev_reverse(net, dev) {
 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
@@ -10947,7 +10919,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
 }
 
 static struct pernet_operations __net_initdata default_device_ops = {
-	.exit = default_device_exit,
 	.exit_batch = default_device_exit_batch,
 };
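The first hunk above replaces the #ifdef CONFIG_RPS bodies of rps_lock()/rps_unlock() with IS_ENABLED()-based helpers that also take over the interrupt disabling their callers used to do by hand. A minimal sketch of that idiom follows; it is not part of the patch, and the example_* names are invented for illustration:

#include <linux/kconfig.h>
#include <linux/spinlock.h>

struct example_queue {
	spinlock_t lock;
};

/* Both branches are always parsed and type-checked; the compiler simply
 * discards the disabled one, unlike code hidden behind #ifdef CONFIG_RPS.
 */
static inline void example_lock_irqsave(struct example_queue *q,
					unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_RPS))
		/* The queue can be touched from remote CPUs: take the lock. */
		spin_lock_irqsave(&q->lock, *flags);
	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		/* !RPS and !PREEMPT_RT: masking local interrupts is enough. */
		local_irq_save(*flags);
	/* On PREEMPT_RT without RPS nothing is done here; the callers are
	 * serialized by local_bh_disable() instead.
	 */
}

static inline void example_unlock_irqrestore(struct example_queue *q,
					     unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_RPS))
		spin_unlock_irqrestore(&q->lock, *flags);
	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}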
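The netif_rx() rework above removes netif_rx_ni() and netif_rx_any_context(): netif_rx() now disables bottom halves itself when called from process context, and the new __netif_rx() is reserved for callers already in hard or soft interrupt context. A hedged sketch of how a driver might use the two entry points after this change; it is not from the patch and the mydrv_* names are invented:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hard-IRQ or NAPI/softirq context: bottom halves are already disabled,
 * so the slightly cheaper __netif_rx() may be used.
 */
static void mydrv_rx_from_irq(struct net_device *dev, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);
	__netif_rx(skb);
}

/* Process context (e.g. a workqueue): plain netif_rx() is enough, since it
 * now wraps the queueing in local_bh_disable()/local_bh_enable() itself.
 * The caller must not have interrupts disabled here.
 */
static void mydrv_rx_from_task(struct net_device *dev, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);
}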