diff options
-rw-r--r-- | include/linux/netdevice.h | 5 | ||||
-rw-r--r-- | include/linux/skbuff.h | 3 | ||||
-rw-r--r-- | include/net/sock.h | 2 | ||||
-rw-r--r-- | include/net/tcp.h | 12 | ||||
-rw-r--r-- | net/core/dev.c | 31 | ||||
-rw-r--r-- | net/core/skbuff.c | 51 | ||||
-rw-r--r-- | net/core/sock.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 25 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 1 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 | ||||
-rw-r--r-- | net/tls/tls_sw.c | 2 |
11 files changed, 90 insertions, 46 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7dccbfd1bf56..ac8a5f71220a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3081,6 +3081,11 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + /* Another possibly contended cache line */ + spinlock_t defer_lock ____cacheline_aligned_in_smp; + int defer_count; + struct sk_buff *defer_list; + call_single_data_t defer_csd; }; static inline void input_queue_head_incr(struct softnet_data *sd) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 84d78df60453..5cbc184ca685 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -888,6 +888,7 @@ typedef unsigned char *sk_buff_data_t; * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS + * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available @@ -1080,6 +1081,7 @@ struct sk_buff { unsigned int sender_cpu; }; #endif + u16 alloc_cpu; #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif @@ -1321,6 +1323,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); +void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); diff --git a/include/net/sock.h b/include/net/sock.h index a01d6c421aa2..f9f8ecae0f8d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -292,7 +292,6 @@ struct sk_filter; * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held - * @defer_list: head of llist storing skbs to be freed * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, @@ -417,7 +416,6 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; } sk_backlog; - struct llist_head defer_list; #define sk_rmem_alloc sk_backlog.rmem_alloc diff --git a/include/net/tcp.h b/include/net/tcp.h index 679b1964d494..94a52ad1101c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1375,18 +1375,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); -#ifdef CONFIG_INET -void __sk_defer_free_flush(struct sock *sk); - -static inline void sk_defer_free_flush(struct sock *sk) -{ - if (llist_empty(&sk->defer_list)) - return; - __sk_defer_free_flush(sk); -} -#else -static inline void sk_defer_free_flush(struct sock *sk) {} -#endif int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); diff --git a/net/core/dev.c b/net/core/dev.c index 4a77ebda4fb1..611bd7197064 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4545,6 +4545,12 @@ static void rps_trigger_softirq(void *data) #endif /* CONFIG_RPS */ +/* Called from hardirq (IPI) context */ +static void trigger_rx_softirq(void *data __always_unused) +{ + __raise_softirq_irqoff(NET_RX_SOFTIRQ); +} + /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 @@ -6571,6 +6577,28 @@ static int napi_threaded_poll(void *data) return 0; } +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + unsigned long flags; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock_irqsave(&sd->defer_lock, flags); + skb = sd->defer_list; + sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock_irqrestore(&sd->defer_lock, flags); + + while (skb != NULL) { + next = skb->next; + __kfree_skb(skb); + skb = next; + } +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -6616,6 +6644,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); + skb_defer_free_flush(sd); } struct netdev_adjacent { @@ -11326,6 +11355,8 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL); + spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 30b523fa4ad2..028a280fbabd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -204,7 +204,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; - + skb->alloc_cpu = raw_smp_processor_id(); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); @@ -1037,6 +1037,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #ifdef CONFIG_NET_RX_BUSY_POLL CHECK_SKB_FIELD(napi_id); #endif + CHECK_SKB_FIELD(alloc_cpu); #ifdef CONFIG_XPS CHECK_SKB_FIELD(sender_cpu); #endif @@ -6486,3 +6487,51 @@ free_now: } EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ + +/** + * skb_attempt_defer_free - queue skb for remote freeing + * @skb: buffer + * + * Put @skb in a per-cpu list, using the cpu which + * allocated the skb/pages to reduce false sharing + * and memory zone spinlock contention. + */ +void skb_attempt_defer_free(struct sk_buff *skb) +{ + int cpu = skb->alloc_cpu; + struct softnet_data *sd; + unsigned long flags; + bool kick; + + if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu) || + cpu == raw_smp_processor_id()) { + __kfree_skb(skb); + return; + } + + sd = &per_cpu(softnet_data, cpu); + /* We do not send an IPI or any signal. + * Remote cpu will eventually call skb_defer_free_flush() + */ + spin_lock_irqsave(&sd->defer_lock, flags); + skb->next = sd->defer_list; + /* Paired with READ_ONCE() in skb_defer_free_flush() */ + WRITE_ONCE(sd->defer_list, skb); + sd->defer_count++; + + /* kick every time queue length reaches 128. + * This should avoid blocking in smp_call_function_single_async(). + * This condition should hardly be bit under normal conditions, + * unless cpu suddenly stopped to receive NIC interrupts. + */ + kick = sd->defer_count == 128; + + spin_unlock_irqrestore(&sd->defer_lock, flags); + + /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU + * if we are unlucky enough (this seems very unlikely). + */ + if (unlikely(kick)) + smp_call_function_single_async(cpu, &sd->defer_csd); +} diff --git a/net/core/sock.c b/net/core/sock.c index 29abec3eabd8..a0f3989de3d6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2082,9 +2082,6 @@ void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); - WARN_ON_ONCE(!llist_empty(&sk->defer_list)); - sk_defer_free_flush(sk); - if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e20b87b3bf90..db55af9eb37b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -843,7 +843,6 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } release_sock(sk); - sk_defer_free_flush(sk); if (spliced) return spliced; @@ -1589,20 +1588,6 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } -void __sk_defer_free_flush(struct sock *sk) -{ - struct llist_node *head; - struct sk_buff *skb, *n; - - head = llist_del_all(&sk->defer_list); - llist_for_each_entry_safe(skb, n, head, ll_node) { - prefetch(n); - skb_mark_not_on_list(skb); - __kfree_skb(skb); - } -} -EXPORT_SYMBOL(__sk_defer_free_flush); - static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); @@ -1610,11 +1595,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; - if (!skb_queue_empty(&sk->sk_receive_queue) || - !llist_empty(&sk->defer_list)) { - llist_add(&skb->ll_node, &sk->defer_list); - return; - } + return skb_attempt_defer_free(skb); } __kfree_skb(skb); } @@ -2453,7 +2434,6 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); - sk_defer_free_flush(sk); sk_wait_data(sk, &timeo, last); } @@ -2571,7 +2551,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, lock_sock(sk); ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); release_sock(sk); - sk_defer_free_flush(sk); if (cmsg_flags && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) @@ -3096,7 +3075,6 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } - sk_defer_free_flush(sk); sk_error_report(sk); return 0; } @@ -4225,7 +4203,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level, err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); release_sock(sk); - sk_defer_free_flush(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2c2d42142555..918816ec5dd4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2065,7 +2065,6 @@ process: sk_incoming_cpu_update(sk); - sk_defer_free_flush(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 54277de7474b..60bdec257ba7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1728,7 +1728,6 @@ process: sk_incoming_cpu_update(sk); - sk_defer_free_flush(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index ddbe05ec5489..bc54f6c5b1a4 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1911,7 +1911,6 @@ recv_end: end: release_sock(sk); - sk_defer_free_flush(sk); if (psock) sk_psock_put(sk, psock); return copied ? : err; @@ -1983,7 +1982,6 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, splice_read_end: release_sock(sk); - sk_defer_free_flush(sk); return copied ? : err; } |