diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/dev.c | 307 | ||||
-rw-r--r-- | net/core/drop_monitor.c | 120 | ||||
-rw-r--r-- | net/core/filter.c | 273 | ||||
-rw-r--r-- | net/core/gro.c | 16 | ||||
-rw-r--r-- | net/core/gro_cells.c | 36 | ||||
-rw-r--r-- | net/core/link_watch.c | 6 | ||||
-rw-r--r-- | net/core/net_namespace.c | 20 | ||||
-rw-r--r-- | net/core/page_pool.c | 23 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 6 | ||||
-rw-r--r-- | net/core/skbuff.c | 51 | ||||
-rw-r--r-- | net/core/sock.c | 22 | ||||
-rw-r--r-- | net/core/sock_map.c | 77 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 14 | ||||
-rw-r--r-- | net/core/utils.c | 4 | ||||
-rw-r--r-- | net/core/xdp.c | 78 |
15 files changed, 735 insertions, 318 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 1baab07820f6..2d6771075720 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -216,18 +216,38 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock(struct softnet_data *sd) +static inline void rps_lock_irqsave(struct softnet_data *sd, + unsigned long *flags) { -#ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); -#endif + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(*flags); } -static inline void rps_unlock(struct softnet_data *sd) +static inline void rps_lock_irq_disable(struct softnet_data *sd) { -#ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); -#endif + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); +} + +static inline void rps_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +static inline void rps_unlock_irq_enable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); } static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, @@ -1037,7 +1057,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, name_node->name, IFNAMSIZ)) - set_bit(i, inuse); + __set_bit(i, inuse); } if (!sscanf(d->name, name, &i)) continue; @@ -1047,7 +1067,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) - set_bit(i, inuse); + __set_bit(i, inuse); } i = find_first_zero_bit(inuse, max_netdevices); @@ -4456,11 +4476,11 @@ static void rps_trigger_softirq(void *data) * If yes, queue it to our IPI list and return 1 * If no, return 0 */ -static int rps_ipi_queued(struct softnet_data *sd) +static int napi_schedule_rps(struct softnet_data *sd) { -#ifdef CONFIG_RPS struct softnet_data *mysd = this_cpu_ptr(&softnet_data); +#ifdef CONFIG_RPS if (sd != mysd) { sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; @@ -4469,6 +4489,7 @@ static int rps_ipi_queued(struct softnet_data *sd) return 1; } #endif /* CONFIG_RPS */ + __napi_schedule_irqoff(&mysd->backlog); return 0; } @@ -4525,9 +4546,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, sd = &per_cpu(softnet_data, cpu); - local_irq_save(flags); - - rps_lock(sd); + rps_lock_irqsave(sd, &flags); if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); @@ -4536,26 +4555,21 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); - rps_unlock(sd); - local_irq_restore(flags); + rps_unlock_irq_restore(sd, &flags); return NET_RX_SUCCESS; } /* Schedule NAPI for backlog device * We can use non atomic operation since we own the queue lock */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { - if (!rps_ipi_queued(sd)) - ____napi_schedule(sd, &sd->backlog); - } + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + napi_schedule_rps(sd); goto enqueue; } drop: sd->dropped++; - rps_unlock(sd); - - local_irq_restore(flags); + rps_unlock_irq_restore(sd, &flags); atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); @@ -4796,7 +4810,6 @@ static int netif_rx_internal(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -4806,78 +4819,72 @@ static int netif_rx_internal(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } /** + * __netif_rx - Slightly optimized version of netif_rx + * @skb: buffer to post + * + * This behaves as netif_rx except that it does not disable bottom halves. + * As a result this function may only be invoked from the interrupt context + * (either hard or soft interrupt). + */ +int __netif_rx(struct sk_buff *skb) +{ + int ret; + + lockdep_assert_once(hardirq_count() | softirq_count()); + + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + return ret; +} +EXPORT_SYMBOL(__netif_rx); + +/** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. + * the upper (protocol) levels to process via the backlog NAPI device. It + * always succeeds. The buffer may be dropped during processing for + * congestion control or by the protocol layers. + * The network buffer is passed via the backlog NAPI device. Modern NIC + * driver should use NAPI and GRO. + * This function can used from interrupt and from process context. The + * caller from process context must not disable interrupts before invoking + * this function. * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ - int netif_rx(struct sk_buff *skb) { + bool need_bh_off = !(hardirq_count() | softirq_count()); int ret; + if (need_bh_off) + local_bh_disable(); trace_netif_rx_entry(skb); - ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); - + if (need_bh_off) + local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); -int netif_rx_ni(struct sk_buff *skb) -{ - int err; - - trace_netif_rx_ni_entry(skb); - - preempt_disable(); - err = netif_rx_internal(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); - trace_netif_rx_ni_exit(err); - - return err; -} -EXPORT_SYMBOL(netif_rx_ni); - -int netif_rx_any_context(struct sk_buff *skb) -{ - /* - * If invoked from contexts which do not invoke bottom half - * processing either at return from interrupt or when softrqs are - * reenabled, use netif_rx_ni() which invokes bottomhalf processing - * directly. - */ - if (in_interrupt()) - return netif_rx(skb); - else - return netif_rx_ni(skb); -} -EXPORT_SYMBOL(netif_rx_any_context); - static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5650,8 +5657,7 @@ static void flush_backlog(struct work_struct *work) local_bh_disable(); sd = this_cpu_ptr(&softnet_data); - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); @@ -5659,8 +5665,7 @@ static void flush_backlog(struct work_struct *work) input_queue_head_incr(sd); } } - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { @@ -5678,16 +5683,14 @@ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); return do_flush; #endif @@ -5802,8 +5805,7 @@ static int process_backlog(struct napi_struct *napi, int quota) } - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). @@ -5819,8 +5821,7 @@ static int process_backlog(struct napi_struct *napi, int quota) skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); } return work; @@ -9143,7 +9144,7 @@ DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); - dev_net(dev)->dev_unreg_count++; + atomic_inc(&dev_net(dev)->dev_unreg_count); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -9683,8 +9684,10 @@ int register_netdevice(struct net_device *dev) linkwatch_init_dev(dev); dev_init_scheduler(dev); - dev_hold(dev); + + dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should @@ -9813,8 +9816,8 @@ int netdev_unregister_timeout_secs __read_mostly = 10; #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** - * netdev_wait_allrefs - wait until all references are gone. - * @dev: target net_device + * netdev_wait_allrefs_any - wait until all references are gone. + * @list: list of net_devices to wait on * * This is called when unregistering network devices. * @@ -9824,37 +9827,42 @@ int netdev_unregister_timeout_secs __read_mostly = 10; * We can get stuck here if buggy protocols don't correctly * call dev_put. */ -static void netdev_wait_allrefs(struct net_device *dev) +static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; - int wait = 0, refcnt; - - linkwatch_forget_dev(dev); + struct net_device *dev; + int wait = 0; rebroadcast_time = warning_time = jiffies; - refcnt = netdev_refcnt_read(dev); - while (refcnt != 1) { + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + list_for_each_entry(dev, list, todo_list) + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); - if (test_bit(__LINK_STATE_LINKWATCH_PENDING, - &dev->state)) { - /* We must not have linkwatch events - * pending on unregister. If this - * happens, we simply run the queue - * unscheduled, resulting in a noop - * for this device. - */ - linkwatch_run_queue(); - } + list_for_each_entry(dev, list, todo_list) + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + break; + } __rtnl_unlock(); @@ -9869,14 +9877,18 @@ static void netdev_wait_allrefs(struct net_device *dev) wait = min(wait << 1, WAIT_REFS_MAX_MSECS); } - refcnt = netdev_refcnt_read(dev); + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; - if (refcnt != 1 && - time_after(jiffies, warning_time + + if (time_after(jiffies, warning_time + netdev_unregister_timeout_secs * HZ)) { - pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", - dev->name, refcnt); - ref_tracker_dir_print(&dev->refcnt_tracker, 10); + list_for_each_entry(dev, list, todo_list) { + pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", + dev->name, netdev_refcnt_read(dev)); + ref_tracker_dir_print(&dev->refcnt_tracker, 10); + } + warning_time = jiffies; } } @@ -9908,6 +9920,7 @@ static void netdev_wait_allrefs(struct net_device *dev) */ void netdev_run_todo(void) { + struct net_device *dev, *tmp; struct list_head list; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; @@ -9928,26 +9941,24 @@ void netdev_run_todo(void) __rtnl_unlock(); - /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); - while (!list_empty(&list)) { - struct net_device *dev - = list_first_entry(&list, struct net_device, todo_list); - list_del(&dev->todo_list); - + list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { - pr_err("network todo '%s' but state %d\n", - dev->name, dev->reg_state); - dump_stack(); + netdev_WARN(dev, "run_todo but not unregistering\n"); + list_del(&dev->todo_list); continue; } dev->reg_state = NETREG_UNREGISTERED; + linkwatch_forget_dev(dev); + } - netdev_wait_allrefs(dev); + while (!list_empty(&list)) { + dev = netdev_wait_allrefs_any(&list); + list_del(&dev->todo_list); /* paranoia */ BUG_ON(netdev_refcnt_read(dev) != 1); @@ -9963,11 +9974,8 @@ void netdev_run_todo(void) if (dev->needs_free_netdev) free_netdev(dev); - /* Report a network device has been unregistered */ - rtnl_lock(); - dev_net(dev)->dev_unreg_count--; - __rtnl_unlock(); - wake_up(&netdev_unregistering_wq); + if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) + wake_up(&netdev_unregistering_wq); /* Free network device */ kobject_put(&dev->dev.kobj); @@ -10172,7 +10180,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; - dev_hold(dev); + __dev_hold(dev); #else refcount_set(&dev->dev_refcnt, 1); #endif @@ -10449,7 +10457,7 @@ void unregister_netdevice_many(struct list_head *head) synchronize_net(); list_for_each_entry(dev, head, unreg_list) { - dev_put(dev); + dev_put_track(dev, &dev->dev_registered_tracker); net_set_todo(dev); } @@ -10732,8 +10740,7 @@ static int __net_init netdev_init(struct net *net) BUILD_BUG_ON(GRO_HASH_BUCKETS > 8 * sizeof_field(struct napi_struct, gro_bitmask)); - if (net != &init_net) - INIT_LIST_HEAD(&net->dev_base_head); + INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) @@ -10849,14 +10856,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = { .exit = netdev_exit, }; -static void __net_exit default_device_exit(struct net *net) +static void __net_exit default_device_exit_net(struct net *net) { struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ - rtnl_lock(); + ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; @@ -10880,35 +10887,6 @@ static void __net_exit default_device_exit(struct net *net) BUG(); } } - rtnl_unlock(); -} - -static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) -{ - /* Return with the rtnl_lock held when there are no network - * devices unregistering in any network namespace in net_list. - */ - struct net *net; - bool unregistering; - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - add_wait_queue(&netdev_unregistering_wq, &wait); - for (;;) { - unregistering = false; - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) { - if (net->dev_unreg_count > 0) { - unregistering = true; - break; - } - } - if (!unregistering) - break; - __rtnl_unlock(); - - wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); - } - remove_wait_queue(&netdev_unregistering_wq, &wait); } static void __net_exit default_device_exit_batch(struct list_head *net_list) @@ -10922,18 +10900,12 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) struct net *net; LIST_HEAD(dev_kill_list); - /* To prevent network device cleanup code from dereferencing - * loopback devices or network devices that have been freed - * wait here for all pending unregistrations to complete, - * before unregistring the loopback device and allowing the - * network namespace be freed. - * - * The netdev todo list containing all network devices - * unregistrations that happen in default_device_exit_batch - * will run in the rtnl_unlock() at the end of - * default_device_exit_batch. - */ - rtnl_lock_unregistering(net_list); + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + default_device_exit_net(net); + cond_resched(); + } + list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) @@ -10947,7 +10919,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } static struct pernet_operations __net_initdata default_device_ops = { - .exit = default_device_exit, .exit_batch = default_device_exit_batch, }; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index d5dc6be2522c..b89e3e95bffc 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -48,10 +48,22 @@ static int trace_state = TRACE_OFF; static bool monitor_hw; +#undef EM +#undef EMe + +#define EM(a, b) [a] = #b, +#define EMe(a, b) [a] = #b + +/* drop_reasons is used to translate 'enum skb_drop_reason' to string, + * which is reported to user space. + */ +static const char * const drop_reasons[] = { + TRACE_SKB_DROP_REASON +}; + /* net_dm_mutex * * An overall lock guarding every operation coming from userspace. - * It also guards the global 'hw_stats_list' list. */ static DEFINE_MUTEX(net_dm_mutex); @@ -87,11 +99,9 @@ struct per_cpu_dm_data { }; struct dm_hw_stat_delta { - struct net_device *dev; unsigned long last_rx; - struct list_head list; - struct rcu_head rcu; unsigned long last_drop_val; + struct rcu_head rcu; }; static struct genl_family net_drop_monitor_family; @@ -102,7 +112,6 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; -static LIST_HEAD(hw_stats_list); static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; static u32 net_dm_trunc_len; @@ -126,6 +135,7 @@ struct net_dm_skb_cb { struct devlink_trap_metadata *hw_metadata; void *pc; }; + enum skb_drop_reason reason; }; #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) @@ -273,33 +283,27 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { - struct dm_hw_stat_delta *new_stat; - + struct net_device *dev = napi->dev; + struct dm_hw_stat_delta *stat; /* * Don't check napi structures with no associated device */ - if (!napi->dev) + if (!dev) return; rcu_read_lock(); - list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { - struct net_device *dev; - + stat = rcu_dereference(dev->dm_private); + if (stat) { /* * only add a note to our monitor buffer if: - * 1) this is the dev we received on - * 2) its after the last_rx delta - * 3) our rx_dropped count has gone up + * 1) its after the last_rx delta + * 2) our rx_dropped count has gone up */ - /* Paired with WRITE_ONCE() in dropmon_net_event() */ - dev = READ_ONCE(new_stat->dev); - if ((dev == napi->dev) && - (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && - (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) && + (dev->stats.rx_dropped != stat->last_drop_val)) { trace_drop_common(NULL, NULL); - new_stat->last_drop_val = napi->dev->stats.rx_dropped; - new_stat->last_rx = jiffies; - break; + stat->last_drop_val = dev->stats.rx_dropped; + stat->last_rx = jiffies; } } rcu_read_unlock(); @@ -502,6 +506,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; + struct net_dm_skb_cb *cb; struct sk_buff *nskb; unsigned long flags; @@ -512,7 +517,11 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, if (!nskb) return; - NET_DM_SKB_CB(nskb)->pc = location; + if ((unsigned int)reason >= SKB_DROP_REASON_MAX) + reason = SKB_DROP_REASON_NOT_SPECIFIED; + cb = NET_DM_SKB_CB(nskb); + cb->reason = reason; + cb->pc = location; /* Override the timestamp because we care about the time when the * packet was dropped. */ @@ -557,7 +566,8 @@ static size_t net_dm_in_port_size(void) #define NET_DM_MAX_SYMBOL_LEN 40 -static size_t net_dm_packet_report_size(size_t payload_len) +static size_t net_dm_packet_report_size(size_t payload_len, + enum skb_drop_reason reason) { size_t size; @@ -578,6 +588,8 @@ static size_t net_dm_packet_report_size(size_t payload_len) nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_REASON */ + nla_total_size(strlen(drop_reasons[reason]) + 1) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } @@ -610,7 +622,7 @@ nla_put_failure: static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { - u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc; + struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb); char buf[NET_DM_MAX_SYMBOL_LEN]; struct nlattr *attr; void *hdr; @@ -624,10 +636,15 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) goto nla_put_failure; - if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD)) + if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc, + NET_DM_ATTR_PAD)) goto nla_put_failure; - snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc); + if (nla_put_string(msg, NET_DM_ATTR_REASON, + drop_reasons[cb->reason])) + goto nla_put_failure; + + snprintf(buf, sizeof(buf), "%pS", cb->pc); if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) goto nla_put_failure; @@ -683,7 +700,9 @@ static void net_dm_packet_report(struct sk_buff *skb) if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); - msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); + msg = nlmsg_new(net_dm_packet_report_size(payload_len, + NET_DM_SKB_CB(skb)->reason), + GFP_KERNEL); if (!msg) goto out; @@ -1169,7 +1188,6 @@ err_module_put: static void net_dm_trace_off_set(void) { - struct dm_hw_stat_delta *new_stat, *temp; const struct net_dm_alert_ops *ops; int cpu; @@ -1193,13 +1211,6 @@ static void net_dm_trace_off_set(void) consume_skb(skb); } - list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { - if (new_stat->dev == NULL) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - } - } - module_put(THIS_MODULE); } @@ -1560,41 +1571,28 @@ static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dm_hw_stat_delta *new_stat = NULL; - struct dm_hw_stat_delta *tmp; + struct dm_hw_stat_delta *stat; switch (event) { case NETDEV_REGISTER: - new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) + break; + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) + break; - if (!new_stat) - goto out; + stat->last_rx = jiffies; + rcu_assign_pointer(dev->dm_private, stat); - new_stat->dev = dev; - new_stat->last_rx = jiffies; - mutex_lock(&net_dm_mutex); - list_add_rcu(&new_stat->list, &hw_stats_list); - mutex_unlock(&net_dm_mutex); break; case NETDEV_UNREGISTER: - mutex_lock(&net_dm_mutex); - list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { - if (new_stat->dev == dev) { - - /* Paired with READ_ONCE() in trace_napi_poll_hit() */ - WRITE_ONCE(new_stat->dev, NULL); - - if (trace_state == TRACE_OFF) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - break; - } - } + stat = rtnl_dereference(dev->dm_private); + if (stat) { + rcu_assign_pointer(dev->dm_private, NULL); + kfree_rcu(stat, rcu); } - mutex_unlock(&net_dm_mutex); break; } -out: return NOTIFY_DONE; } diff --git a/net/core/filter.c b/net/core/filter.c index 9eb785842258..65869fd510e8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2603,7 +2603,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len) + if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2812,7 +2812,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - __clear_bit(new, &msg->sg.copy); + __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); @@ -3786,6 +3786,28 @@ static const struct bpf_func_proto sk_skb_change_head_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) +{ + return xdp_get_buff_len(xdp); +} + +static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) + +const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], +}; + static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 0 : @@ -3820,11 +3842,208 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush) +{ + unsigned long ptr_len, ptr_off = 0; + skb_frag_t *next_frag, *end_frag; + struct skb_shared_info *sinfo; + void *src, *dst; + u8 *ptr_buf; + + if (likely(xdp->data_end - xdp->data >= off + len)) { + src = flush ? buf : xdp->data + off; + dst = flush ? xdp->data + off : buf; + memcpy(dst, src, len); + return; + } + + sinfo = xdp_get_shared_info_from_buff(xdp); + end_frag = &sinfo->frags[sinfo->nr_frags]; + next_frag = &sinfo->frags[0]; + + ptr_len = xdp->data_end - xdp->data; + ptr_buf = xdp->data; + + while (true) { + if (off < ptr_off + ptr_len) { + unsigned long copy_off = off - ptr_off; + unsigned long copy_len = min(len, ptr_len - copy_off); + + src = flush ? buf : ptr_buf + copy_off; + dst = flush ? ptr_buf + copy_off : buf; + memcpy(dst, src, copy_len); + + off += copy_len; + len -= copy_len; + buf += copy_len; + } + + if (!len || next_frag == end_frag) + break; + + ptr_off += ptr_len; + ptr_buf = skb_frag_address(next_frag); + ptr_len = skb_frag_size(next_frag); + next_frag++; + } +} + +static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + u32 size = xdp->data_end - xdp->data; + void *addr = xdp->data; + int i; + + if (unlikely(offset > 0xffff || len > 0xffff)) + return ERR_PTR(-EFAULT); + + if (offset + len > xdp_get_buff_len(xdp)) + return ERR_PTR(-EINVAL); + + if (offset < size) /* linear area */ + goto out; + + offset -= size; + for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ + u32 frag_size = skb_frag_size(&sinfo->frags[i]); + + if (offset < frag_size) { + addr = skb_frag_address(&sinfo->frags[i]); + size = frag_size; + break; + } + offset -= frag_size; + } +out: + return offset + len < size ? addr + offset : NULL; +} + +BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, false); + else + memcpy(buf, ptr, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { + .func = bpf_xdp_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, true); + else + memcpy(ptr, buf, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { + .func = bpf_xdp_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; + struct xdp_rxq_info *rxq = xdp->rxq; + unsigned int tailroom; + + if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) + return -EOPNOTSUPP; + + tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); + if (unlikely(offset > tailroom)) + return -EINVAL; + + memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); + skb_frag_size_add(frag, offset); + sinfo->xdp_frags_size += offset; + + return 0; +} + +static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, n_frags_free = 0, len_free = 0; + + if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) + return -EINVAL; + + for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) { + skb_frag_t *frag = &sinfo->frags[i]; + int shrink = min_t(int, offset, skb_frag_size(frag)); + + len_free += shrink; + offset -= shrink; + + if (skb_frag_size(frag) == shrink) { + struct page *page = skb_frag_page(frag); + + __xdp_return(page_address(page), &xdp->rxq->mem, + false, NULL); + n_frags_free++; + } else { + skb_frag_size_sub(frag, shrink); + break; + } + } + sinfo->nr_frags -= n_frags_free; + sinfo->xdp_frags_size -= len_free; + + if (unlikely(!sinfo->nr_frags)) { + xdp_buff_clear_frags_flag(xdp); + xdp->data_end -= offset; + } + + return 0; +} + BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; + if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ + if (offset < 0) + return bpf_xdp_frags_shrink_tail(xdp, -offset); + + return bpf_xdp_frags_increase_tail(xdp, offset); + } + /* Notice that xdp_data_hard_end have reserved some tailroom */ if (unlikely(data_end > data_hard_end)) return -EINVAL; @@ -4050,6 +4269,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; + /* XDP_REDIRECT is not fully supported yet for xdp frags since + * not all XDP capable drivers can map non-linear xdp_frame in + * ndo_xdp_xmit. + */ + if (unlikely(xdp_buff_has_frags(xdp) && + map_type != BPF_MAP_TYPE_CPUMAP)) + return -EOPNOTSUPP; + if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); @@ -4593,10 +4820,12 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { }; #endif -static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, +static unsigned long bpf_xdp_copy(void *dst, const void *ctx, unsigned long off, unsigned long len) { - memcpy(dst_buff, src_buff + off, len); + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + + bpf_xdp_copy_buf(xdp, off, dst, len, false); return 0; } @@ -4607,11 +4836,11 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(!xdp || - xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + + if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) return -EFAULT; - return bpf_event_output(map, flags, meta, meta_size, xdp->data, + return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, bpf_xdp_copy); } @@ -4865,6 +5094,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, case SO_REUSEPORT: sk->sk_reuseport = valbool; break; + case SO_TXREHASH: + if (val < -1 || val > 1) { + ret = -EINVAL; + break; + } + sk->sk_txrehash = (u8)val; + break; default: ret = -EINVAL; } @@ -5043,6 +5279,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, case SO_REUSEPORT: *((int *)optval) = sk->sk_reuseport; break; + case SO_TXREHASH: + *((int *)optval) = sk->sk_txrehash; + break; default: goto err_clear; } @@ -7536,6 +7775,12 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_xdp_get_buff_len: + return &bpf_xdp_get_buff_len_proto; + case BPF_FUNC_xdp_load_bytes: + return &bpf_xdp_load_bytes_proto; + case BPF_FUNC_xdp_store_bytes: + return &bpf_xdp_store_bytes_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: @@ -8033,6 +8278,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); + int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; @@ -8044,7 +8290,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): - case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): @@ -8053,6 +8298,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case bpf_ctx_range(struct bpf_sock, dst_port): + field_size = size == size_default ? + size_default : sizeof_field(struct bpf_sock, dst_port); + bpf_ctx_record_field_size(info, field_size); + return bpf_ctx_narrow_access_ok(off, size, field_size); + case offsetofend(struct bpf_sock, dst_port) ... + offsetof(struct bpf_sock, dst_ip4) - 1: + return false; } return size == size_default; @@ -10065,7 +10318,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, - .check_kfunc_call = bpf_prog_test_check_kfunc_call, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -10604,7 +10856,8 @@ static bool sk_lookup_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): - case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case offsetof(struct bpf_sk_lookup, remote_port) ... + offsetof(struct bpf_sk_lookup, local_ip4) - 1: case bpf_ctx_range(struct bpf_sk_lookup, local_port): case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): bpf_ctx_record_field_size(info, sizeof(__u32)); diff --git a/net/core/gro.c b/net/core/gro.c index a11b286d1495..ee5e7e889d8b 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -459,29 +459,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff skb_set_network_header(skb, skb_gro_offset(skb)); skb_reset_mac_len(skb); - NAPI_GRO_CB(skb)->same_flow = 0; + BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), + sizeof(u32))); /* Avoid slow unaligned acc */ + *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->free = 0; - NAPI_GRO_CB(skb)->encap_mark = 0; - NAPI_GRO_CB(skb)->recursion_counter = 0; - NAPI_GRO_CB(skb)->is_fou = 0; NAPI_GRO_CB(skb)->is_atomic = 1; - NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ switch (skb->ip_summed) { case CHECKSUM_COMPLETE: NAPI_GRO_CB(skb)->csum = skb->csum; NAPI_GRO_CB(skb)->csum_valid = 1; - NAPI_GRO_CB(skb)->csum_cnt = 0; break; case CHECKSUM_UNNECESSARY: NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; - NAPI_GRO_CB(skb)->csum_valid = 0; break; - default: - NAPI_GRO_CB(skb)->csum_cnt = 0; - NAPI_GRO_CB(skb)->csum_valid = 0; } pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, @@ -634,7 +627,6 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; - skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index 6eb2e5ec2c50..8462f926ab45 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -89,8 +89,23 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) } EXPORT_SYMBOL(gro_cells_init); +struct percpu_free_defer { + struct rcu_head rcu; + void __percpu *ptr; +}; + +static void percpu_free_defer_callback(struct rcu_head *head) +{ + struct percpu_free_defer *defer; + + defer = container_of(head, struct percpu_free_defer, rcu); + free_percpu(defer->ptr); + kfree(defer); +} + void gro_cells_destroy(struct gro_cells *gcells) { + struct percpu_free_defer *defer; int i; if (!gcells->cells) @@ -102,12 +117,23 @@ void gro_cells_destroy(struct gro_cells *gcells) __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } - /* This barrier is needed because netpoll could access dev->napi_list - * under rcu protection. + /* We need to observe an rcu grace period before freeing ->cells, + * because netpoll could access dev->napi_list under rcu protection. + * Try hard using call_rcu() instead of synchronize_rcu(), + * because we might be called from cleanup_net(), and we + * definitely do not want to block this critical task. */ - synchronize_net(); - - free_percpu(gcells->cells); + defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN); + if (likely(defer)) { + defer->ptr = gcells->cells; + call_rcu(&defer->rcu, percpu_free_defer_callback); + } else { + /* We do not hold RTNL at this point, synchronize_net() + * would not be able to expedite this sync. + */ + synchronize_rcu_expedited(); + free_percpu(gcells->cells); + } gcells->cells = NULL; } EXPORT_SYMBOL(gro_cells_destroy); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index b0f5344d1185..95098d1a49bd 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -166,10 +166,10 @@ static void linkwatch_do_dev(struct net_device *dev) netdev_state_change(dev); } - /* Note: our callers are responsible for - * calling netdev_tracker_free(). + /* Note: our callers are responsible for calling netdev_tracker_free(). + * This is the reason we use __dev_put() instead of dev_put(). */ - dev_put(dev); + __dev_put(dev); } static void __linkwatch_run_queue(int urgent_only) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a5b5bb99c644..0ec2f5906a27 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -44,13 +44,7 @@ EXPORT_SYMBOL_GPL(net_rwsem); static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; #endif -struct net init_net = { - .ns.count = REFCOUNT_INIT(1), - .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), -#ifdef CONFIG_KEYS - .key_domain = &init_net_key_domain, -#endif -}; +struct net init_net; EXPORT_SYMBOL(init_net); static bool init_net_initialized; @@ -301,6 +295,7 @@ struct net *get_net_ns_by_id(const struct net *net, int id) return peer; } +EXPORT_SYMBOL_GPL(get_net_ns_by_id); /* * setup_net runs the initializers for the network namespace object. @@ -363,6 +358,8 @@ out_undo: static int __net_init net_defaults_init_net(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; + net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + return 0; } @@ -1084,7 +1081,7 @@ out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } -static int __init net_ns_init(void) +void __init net_ns_init(void) { struct net_generic *ng; @@ -1105,6 +1102,9 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); +#ifdef CONFIG_KEYS + init_net.key_domain = &init_net_key_domain; +#endif down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); @@ -1119,12 +1119,8 @@ static int __init net_ns_init(void) RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, RTNL_FLAG_DOIT_UNLOCKED); - - return 0; } -pure_initcall(net_ns_init); - static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) { ops_pre_exit_list(ops, net_exit_list); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index bd62c01a2ec3..e25d359d84d9 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -423,11 +423,6 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { - /* It is not the last user for the page frag case */ - if (pool->p.flags & PP_FLAG_PAGE_FRAG && - page_pool_atomic_sub_frag_count_return(page, 1)) - return NULL; - /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -471,8 +466,8 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, return NULL; } -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); if (page && !page_pool_recycle_in_ring(pool, page)) { @@ -480,7 +475,7 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_return_page(pool, page); } } -EXPORT_SYMBOL(page_pool_put_page); +EXPORT_SYMBOL(page_pool_put_defragged_page); /* Caller must not use data area after call, as this function overwrites it */ void page_pool_put_page_bulk(struct page_pool *pool, void **data, @@ -491,6 +486,10 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, for (i = 0; i < count; i++) { struct page *page = virt_to_head_page(data[i]); + /* It is not the last user for the page frag case */ + if (!page_pool_is_last_frag(pool, page)) + continue; + page = __page_pool_put_page(pool, page, -1, false); /* Approved for bulk recycling in ptr_ring cache */ if (page) @@ -526,8 +525,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool, long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ - if (likely(page_pool_atomic_sub_frag_count_return(page, - drain_count))) + if (likely(page_pool_defrag_page(page, drain_count))) return NULL; if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { @@ -548,8 +546,7 @@ static void page_pool_free_frag(struct page_pool *pool) pool->frag_page = NULL; - if (!page || - page_pool_atomic_sub_frag_count_return(page, drain_count)) + if (!page || page_pool_defrag_page(page, drain_count)) return; page_pool_return_page(pool, page); @@ -588,7 +585,7 @@ frag_reset: pool->frag_users = 1; *offset = 0; pool->frag_offset = size; - page_pool_set_frag_count(page, BIAS_MAX); + page_pool_fragment_page(page, BIAS_MAX); return page; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2fb8eb6791e8..20a9e1686453 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -459,7 +459,7 @@ static void rtnl_lock_unregistering_all(void) * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { - if (net->dev_unreg_count > 0) { + if (atomic_read(&net->dev_unreg_count) > 0) { unregistering = true; break; } @@ -5407,8 +5407,10 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENODEV; filter_mask = ifsm->filter_mask; - if (!filter_mask) + if (!filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get"); return -EINVAL; + } nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL); if (!nskb) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b8138c372535..b32c5d782fe1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -201,7 +201,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; + skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; @@ -1736,11 +1736,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->head = data; skb->head_frag = 0; skb->data += off; + + skb_set_end_offset(skb, size); #ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; off = nhead; -#else - skb->end = skb->head + size; #endif skb->tail += off; skb_headers_offset_update(skb, nhead); @@ -1788,6 +1787,38 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } EXPORT_SYMBOL(skb_realloc_headroom); +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + unsigned int saved_end_offset, saved_truesize; + struct skb_shared_info *shinfo; + int res; + + saved_end_offset = skb_end_offset(skb); + saved_truesize = skb->truesize; + + res = pskb_expand_head(skb, 0, 0, pri); + if (res) + return res; + + skb->truesize = saved_truesize; + + if (likely(skb_end_offset(skb) == saved_end_offset)) + return 0; + + shinfo = skb_shinfo(skb); + + /* We are about to change back skb->end, + * we need to move skb_shinfo() to its new location. + */ + memmove(skb->head + saved_end_offset, + shinfo, + offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); + + skb_set_end_offset(skb, saved_end_offset); + + return 0; +} + /** * skb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate @@ -6044,11 +6075,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb->head = data; skb->data = data; skb->head_frag = 0; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_set_tail_pointer(skb, skb_headlen(skb)); skb_headers_offset_update(skb, 0); skb->cloned = 0; @@ -6186,11 +6213,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb->head = data; skb->head_frag = 0; skb->data = data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_reset_tail_pointer(skb); skb_headers_offset_update(skb, 0); skb->cloned = 0; diff --git a/net/core/sock.c b/net/core/sock.c index 6eb174805bf0..784c92eaded8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1447,6 +1447,15 @@ set_sndbuf: break; } + case SO_TXREHASH: + if (val < -1 || val > 1) { + ret = -EINVAL; + break; + } + /* Paired with READ_ONCE() in tcp_rtx_synack() */ + WRITE_ONCE(sk->sk_txrehash, (u8)val); + break; + default: ret = -ENOPROTOOPT; break; @@ -1834,6 +1843,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_reserved_mem; break; + case SO_TXREHASH: + v.val = sk->sk_txrehash; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -2266,6 +2279,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); + sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } @@ -2611,7 +2625,8 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, switch (cmsg->cmsg_type) { case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; @@ -3278,6 +3293,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; + sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; sk_rx_queue_clear(sk); /* @@ -3702,6 +3718,10 @@ int proto_register(struct proto *prot, int alloc_slab) { int ret = -ENOBUFS; + if (prot->memory_allocated && !prot->sysctl_mem) { + pr_err("%s: missing sysctl_mem\n", prot->name); + return -EINVAL; + } if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1827669eedd6..2d213c4011db 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1416,38 +1416,50 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); - struct bpf_prog **pprog; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - pprog = &progs->msg_parser; + *pprog = &progs->msg_parser; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->stream_parser; + *pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - pprog = &progs->stream_verdict; + *pprog = &progs->stream_verdict; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - pprog = &progs->skb_verdict; + *pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } + return 0; +} + +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) +{ + struct bpf_prog **pprog; + int ret; + + ret = sock_map_prog_lookup(map, &pprog, which); + if (ret) + return ret; + if (old) return psock_replace_prog(pprog, prog, old); @@ -1455,6 +1467,57 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, return 0; } +int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; + struct bpf_prog **pprog; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + u32 id = 0; + int ret; + + if (attr->query.query_flags) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + rcu_read_lock(); + + ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + if (ret) + goto end; + + prog = *pprog; + prog_cnt = !prog ? 0 : 1; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + goto end; + + /* we do not hold the refcnt, the bpf prog may be released + * asynchronously and the id would be set to 0. + */ + id = data_race(prog->aux->id); + if (id == 0) + prog_cnt = 0; + +end: + rcu_read_unlock(); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || + (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || + copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + ret = -EFAULT; + + fdput(f); + return ret; +} + static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7b4d485aac7a..dbeb8ecbcd98 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -593,6 +593,15 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, + { + .procname = "txrehash", + .data = &init_net.core.sysctl_txrehash, + .maxlen = sizeof(u8), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + .proc_handler = proc_dou8vec_minmax, + }, { } }; @@ -611,7 +620,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl; + struct ctl_table *tbl, *tmp; tbl = netns_core_table; if (!net_eq(net, &init_net)) { @@ -619,7 +628,8 @@ static __net_init int sysctl_core_net_init(struct net *net) if (tbl == NULL) goto err_dup; - tbl[0].data = &net->core.sysctl_somaxconn; + for (tmp = tbl; tmp->procname; tmp++) + tmp->data += (char *)net - (char *)&init_net; /* Don't export any sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { diff --git a/net/core/utils.c b/net/core/utils.c index 1f31a39236d5..938495bc1d34 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -476,9 +476,9 @@ void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); + csum_replace_by_diff(sum, diff); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_add(diff, ~skb->csum); + skb->csum = ~csum_sub(diff, skb->csum); } else if (pseudohdr) { *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); } diff --git a/net/core/xdp.c b/net/core/xdp.c index 7aba35504986..361df312ee7f 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -162,8 +162,9 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) } /* Returns 0 on success, negative on failure */ -int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index, unsigned int napi_id) +int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index, + unsigned int napi_id, u32 frag_size) { if (!dev) { WARN(1, "Missing net_device from driver"); @@ -185,11 +186,12 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; xdp_rxq->napi_id = napi_id; + xdp_rxq->frag_size = frag_size; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; } -EXPORT_SYMBOL_GPL(xdp_rxq_info_reg); +EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) { @@ -369,8 +371,8 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - struct xdp_buff *xdp) +void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -406,12 +408,38 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { + struct skb_shared_info *sinfo; + int i; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_return(page_address(page), &xdpf->mem, false, NULL); + } +out: __xdp_return(xdpf->data, &xdpf->mem, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { + struct skb_shared_info *sinfo; + int i; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_return(page_address(page), &xdpf->mem, true, NULL); + } +out: __xdp_return(xdpf->data, &xdpf->mem, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); @@ -447,7 +475,7 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_mem_allocator *xa; if (mem->type != MEM_TYPE_PAGE_POOL) { - __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + xdp_return_frame(xdpf); return; } @@ -466,12 +494,38 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf, bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); } + if (unlikely(xdp_frame_has_frags(xdpf))) { + struct skb_shared_info *sinfo; + int i; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + + bq->q[bq->count++] = skb_frag_address(frag); + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + } + } bq->q[bq->count++] = xdpf->data; } EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); void xdp_return_buff(struct xdp_buff *xdp) { + struct skb_shared_info *sinfo; + int i; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp); + } +out: __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); } @@ -561,8 +615,14 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev) { + struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf); unsigned int headroom, frame_size; void *hard_start; + u8 nr_frags; + + /* xdp frags frame */ + if (unlikely(xdp_frame_has_frags(xdpf))) + nr_frags = sinfo->nr_frags; /* Part of headroom was reserved to xdpf */ headroom = sizeof(*xdpf) + xdpf->headroom; @@ -582,6 +642,12 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, if (xdpf->metasize) skb_metadata_set(skb, xdpf->metasize); + if (unlikely(xdp_frame_has_frags(xdpf))) + xdp_update_skb_shared_info(skb, nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdpf->frame_sz, + xdp_frame_is_frag_pfmemalloc(xdpf)); + /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); |