| field | value | date |
|---|---|---|
| author | David S. Miller <davem@davemloft.net> | 2017-03-25 06:49:31 +0300 |
| committer | David S. Miller <davem@davemloft.net> | 2017-03-25 06:49:31 +0300 |
| commit | 2239cc634395ecce69dd047be9104f71edc417b4 | |
| tree | c80a83afc1d175816d95920e9ea185f4f5f96650 | |
| parent | dcb421f4279f362c3eef7479616c76588b74d782 | |
| parent | 6d4339028b350efbf87c61e6d9e113e5373545c9 | |
| download | linux-2239cc634395ecce69dd047be9104f71edc417b4.tar.xz | |
Merge branch 'epoll-busypoll'
Alexander Duyck says:
====================
Add busy poll support for epoll
This patch set adds support for using busy polling with epoll. The main
idea is that we record the NAPI ID for the last event that is moved onto
the ready list for the epoll context; then, when we no longer have any
events on the ready list, we begin busy polling with that ID. If the
busy polling does not yield any events, we reset the NAPI ID to 0 and
wait until a new event with a valid NAPI ID is added to the ready list
before we resume busy polling.
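In outline, that flow lands in fs/eventpoll.c (full diff below). The following is a simplified sketch, not the verbatim kernel code — the real version guards everything with CONFIG_NET_RX_BUSY_POLL and reads the ID with READ_ONCE(); the sketch_ function names are illustrative, the rest are names from the patch:

```c
/* Simplified sketch of the epoll busy-poll flow described above. */

/* Wakeup path: remember the NAPI ID of the socket whose event was
 * just moved onto this epoll context's ready list.
 */
static void sketch_record_napi_id(struct eventpoll *ep, struct sock *sk)
{
        unsigned int napi_id = sk->sk_napi_id;

        /* ignore non-NAPI IDs; skip the store if nothing changed */
        if (napi_id >= MIN_NAPI_ID && napi_id != ep->napi_id)
                ep->napi_id = napi_id;
}

/* ep_poll() path: with an empty ready list, spin on the recorded NAPI
 * instance; if that produced nothing, drop the ID and wait for a new
 * event carrying a valid one.
 */
static void sketch_busy_poll(struct eventpoll *ep, int nonblock)
{
        if (net_busy_loop_on() && ep->napi_id >= MIN_NAPI_ID)
                napi_busy_loop(ep->napi_id,
                               nonblock ? NULL : ep_busy_loop_end, ep);

        if (!ep_events_available(ep))
                ep->napi_id = 0;
}
```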
Most of the changes I authored in this set are cleanups or fixes for
various things. For example, I am trying to make it so that we don't
perform hash lookups for the NAPI instance when we are only working
with sender_cpu and the like.
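Concretely, the series partitions the ID space so that a simple range check distinguishes sender_cpu values from real NAPI IDs, with no hash lookup needed. From the new include/net/busy_poll.h:

```c
/*		0 - Reserved to indicate value not set
 *     1..NR_CPUS - Reserved for sender_cpu
 *  NR_CPUS+1..~0 - Region available for NAPI IDs
 */
#define MIN_NAPI_ID	((unsigned int)(NR_CPUS + 1))
```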
At the heart of this set are the last 3 patches, which enable epoll
support and add support for obtaining the NAPI ID of a given socket.
With these it becomes possible for an application to make use of epoll
and get optimal busy poll utilization by stacking multiple sockets with
the same NAPI ID on the same epoll context.
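For illustration (not part of the patch set), a userspace sketch of that pattern: query each socket's NAPI ID via the new SO_INCOMING_NAPI_ID getsockopt() and file the fd into an epoll instance dedicated to that ID. The value 56 is the asm-generic constant added by this series (parisc and sparc use different values, and pre-series kernels fail with ENOPROTOOPT); ep_for_napi_id() is a hypothetical application-side lookup:

```c
#include <sys/epoll.h>
#include <sys/socket.h>

#ifndef SO_INCOMING_NAPI_ID
#define SO_INCOMING_NAPI_ID 56  /* asm-generic value from this series */
#endif

/* Hypothetical application helper: one epoll fd per NAPI ID. */
int ep_for_napi_id(unsigned int napi_id);

/* File a freshly accepted socket into the epoll instance that handles
 * its NAPI ID, so each epoll context busy polls a single NAPI queue.
 */
int assign_to_epoll(int fd)
{
        unsigned int napi_id = 0;
        socklen_t len = sizeof(napi_id);
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };

        if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID,
                       &napi_id, &len) < 0)
                return -1;      /* pre-series kernel or error */

        /* the kernel reports 0 for IDs below MIN_NAPI_ID ("no NAPI ID") */
        return epoll_ctl(ep_for_napi_id(napi_id), EPOLL_CTL_ADD, fd, &ev);
}
```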
v1: The first version of this series only allowed epoll to busy poll if all
of the sockets with a NAPI ID shared the same NAPI ID. I feel we were
too strict with this requirement, so I changed the behavior for v2.
v2: The second version was pretty much a full rewrite of the first set. The
main changes consisted of pulling apart several patches to better
address the need to clean up a few items and to make the code easier to
review. In that set, however, I went a bit overboard and tried to fix
an issue that would only occur with 500+ years of uptime, and in the
process unnecessarily limited the range for busy_poll/busy_read.
v3: Split off the code for limiting busy_poll and busy_read into a separate
patch for net.
Updated the patch that changed busy loop time tracking so that it uses
"local_clock() >> 10", as we originally did (see the illustration after
this list).
Tweaked the "Change return type.." patch by moving the declaration of
"work" inside the loop where it was accessed and always resetting it to 0.
Added "Acked-by" for patches that received acks.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | arch/alpha/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | arch/avr32/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | arch/frv/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | arch/ia64/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | arch/m32r/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | arch/mips/include/uapi/asm/socket.h | 1 |
| -rw-r--r-- | arch/mn10300/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | arch/parisc/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | arch/powerpc/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | arch/s390/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | arch/sparc/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | arch/xtensa/include/uapi/asm/socket.h | 2 |
| -rw-r--r-- | fs/eventpoll.c | 93 |
| -rw-r--r-- | fs/select.c | 16 |
| -rw-r--r-- | include/net/busy_poll.h | 102 |
| -rw-r--r-- | include/uapi/asm-generic/socket.h | 2 |
| -rw-r--r-- | net/core/datagram.c | 8 |
| -rw-r--r-- | net/core/dev.c | 41 |
| -rw-r--r-- | net/core/sock.c | 23 |
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 2 |
| -rw-r--r-- | net/ipv4/tcp_minisocks.c | 4 |
| -rw-r--r-- | net/ipv6/tcp_ipv6.c | 2 |
| -rw-r--r-- | net/sctp/socket.c | 9 |

23 files changed, 244 insertions, 81 deletions
```diff
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 089db42c1b40..1bb8cac61a28 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -101,4 +101,6 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 6eabcbd2f82a..f824eeb0f2e4 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
+
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index bd497f8356b9..a8ad9bebfc47 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -94,5 +94,7 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index f1bb54686168..6af3253e4209 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -103,4 +103,6 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 459c46076f6f..e98b6bb897c0 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 688c18dd62ef..ae2b62e39d4d 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -112,5 +112,6 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
 
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index 312d2c457a04..e4ac1843ee01 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index b98ec38f2083..f754c793e82a 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -93,4 +93,6 @@
 
 #define SO_MEMINFO		0x4030
 
+#define SO_INCOMING_NAPI_ID	0x4031
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index 099a889240f6..5f84af7dcb2e 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -101,4 +101,6 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
+
 #endif /* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index 6199bb34f7fa..25ac4960e707 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -100,4 +100,6 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 12cd8c2ec422..b05513acd589 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -90,6 +90,8 @@
 
 #define SO_MEMINFO		0x0039
 
+#define SO_INCOMING_NAPI_ID	0x003a
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index d0b85f6c1484..786606c81edd 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -105,4 +105,6 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
+
 #endif /* _XTENSA_SOCKET_H */
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 341251421ced..5420767c9b68 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -42,6 +42,7 @@
 #include <linux/seq_file.h>
 #include <linux/compat.h>
 #include <linux/rculist.h>
+#include <net/busy_poll.h>
 
 /*
  * LOCKING:
@@ -224,6 +225,11 @@ struct eventpoll {
 	/* used to optimize loop detection check */
 	int visited;
 	struct list_head visited_list_link;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	/* used to track busy poll napi_id */
+	unsigned int napi_id;
+#endif
 };
 
 /* Wait structure used by the poll hooks */
@@ -384,6 +390,77 @@ static inline int ep_events_available(struct eventpoll *ep)
 	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static bool ep_busy_loop_end(void *p, unsigned long start_time)
+{
+	struct eventpoll *ep = p;
+
+	return ep_events_available(ep) || busy_loop_timeout(start_time);
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+/*
+ * Busy poll if globally on and supporting sockets found && no events,
+ * busy loop will return if need_resched or ep_events_available.
+ *
+ * we must do our busy polling with irqs enabled
+ */
+static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
+#endif
+}
+
+static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (ep->napi_id)
+		ep->napi_id = 0;
+#endif
+}
+
+/*
+ * Set epoll busy poll NAPI ID from sk.
+ */
+static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	struct eventpoll *ep;
+	unsigned int napi_id;
+	struct socket *sock;
+	struct sock *sk;
+	int err;
+
+	if (!net_busy_loop_on())
+		return;
+
+	sock = sock_from_file(epi->ffd.file, &err);
+	if (!sock)
+		return;
+
+	sk = sock->sk;
+	if (!sk)
+		return;
+
+	napi_id = READ_ONCE(sk->sk_napi_id);
+	ep = epi->ep;
+
+	/* Non-NAPI IDs can be rejected
+	 *	or
+	 * Nothing to do if we already have this ID
+	 */
+	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
+		return;
+
+	/* record NAPI ID for use in next busy poll */
+	ep->napi_id = napi_id;
+#endif
+}
+
 /**
  * ep_call_nested - Perform a bound (possibly) nested call, by checking
  *                  that the recursion limit is not exceeded, and that
@@ -1022,6 +1099,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 
 	spin_lock_irqsave(&ep->lock, flags);
 
+	ep_set_busy_poll_napi_id(epi);
+
 	/*
 	 * If the event mask does not contain any poll(2) event, we consider the
 	 * descriptor to be disabled. This condition is likely the effect of the
@@ -1363,6 +1442,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* We have to drop the new item inside our item list to keep track of it */
 	spin_lock_irqsave(&ep->lock, flags);
 
+	/* record NAPI ID of new item if present */
+	ep_set_busy_poll_napi_id(epi);
+
 	/* If the file is already "ready" we drop it inside the ready list */
 	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1637,10 +1719,21 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	}
 
 fetch_events:
+
+	if (!ep_events_available(ep))
+		ep_busy_loop(ep, timed_out);
+
 	spin_lock_irqsave(&ep->lock, flags);
 
 	if (!ep_events_available(ep)) {
 		/*
+		 * Busy poll timed out. Drop NAPI ID for now, we can add
+		 * it back in when we have moved a socket with a valid NAPI
+		 * ID onto the ready list.
+		 */
+		ep_reset_busy_poll_napi_id(ep);
+
+		/*
 		 * We don't have any available event to return to the caller.
 		 * We need to sleep here, and we will be wake up by
 		 * ep_poll_callback() when events will become available.
diff --git a/fs/select.c b/fs/select.c
index e2112270d75a..9287d3a96e35 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -409,7 +409,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 	int retval, i, timed_out = 0;
 	u64 slack = 0;
 	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
-	unsigned long busy_end = 0;
+	unsigned long busy_start = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -512,11 +512,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 
 		/* only if found POLL_BUSY_LOOP sockets && not out of time */
 		if (can_busy_loop && !need_resched()) {
-			if (!busy_end) {
-				busy_end = busy_loop_end_time();
+			if (!busy_start) {
+				busy_start = busy_loop_current_time();
 				continue;
 			}
-			if (!busy_loop_timeout(busy_end))
+			if (!busy_loop_timeout(busy_start))
 				continue;
 		}
 		busy_flag = 0;
@@ -800,7 +800,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
 	int timed_out = 0, count = 0;
 	u64 slack = 0;
 	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
-	unsigned long busy_end = 0;
+	unsigned long busy_start = 0;
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -853,11 +853,11 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
 
 		/* only if found POLL_BUSY_LOOP sockets && not out of time */
 		if (can_busy_loop && !need_resched()) {
-			if (!busy_end) {
-				busy_end = busy_loop_end_time();
+			if (!busy_start) {
+				busy_start = busy_loop_current_time();
 				continue;
 			}
-			if (!busy_loop_timeout(busy_end))
+			if (!busy_loop_timeout(busy_start))
 				continue;
 		}
 		busy_flag = 0;
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index c0452de83086..8ffd434676b7 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -35,83 +35,101 @@ struct napi_struct;
 extern unsigned int sysctl_net_busy_read __read_mostly;
 extern unsigned int sysctl_net_busy_poll __read_mostly;
 
+/*		0 - Reserved to indicate value not set
+ *     1..NR_CPUS - Reserved for sender_cpu
+ *  NR_CPUS+1..~0 - Region available for NAPI IDs
+ */
+#define MIN_NAPI_ID	((unsigned int)(NR_CPUS + 1))
+
 static inline bool net_busy_loop_on(void)
 {
 	return sysctl_net_busy_poll;
 }
 
-static inline u64 busy_loop_us_clock(void)
+static inline bool sk_can_busy_loop(const struct sock *sk)
 {
-	return local_clock() >> 10;
+	return sk->sk_ll_usec && !signal_pending(current);
 }
 
-static inline unsigned long sk_busy_loop_end_time(struct sock *sk)
-{
-	return busy_loop_us_clock() + ACCESS_ONCE(sk->sk_ll_usec);
-}
+bool sk_busy_loop_end(void *p, unsigned long start_time);
 
-/* in poll/select we use the global sysctl_net_ll_poll value */
-static inline unsigned long busy_loop_end_time(void)
+void napi_busy_loop(unsigned int napi_id,
+		    bool (*loop_end)(void *, unsigned long),
+		    void *loop_end_arg);
+
+#else /* CONFIG_NET_RX_BUSY_POLL */
+static inline unsigned long net_busy_loop_on(void)
 {
-	return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_busy_poll);
+	return 0;
 }
 
-static inline bool sk_can_busy_loop(const struct sock *sk)
+static inline bool sk_can_busy_loop(struct sock *sk)
 {
-	return sk->sk_ll_usec && sk->sk_napi_id && !signal_pending(current);
+	return false;
 }
+#endif /* CONFIG_NET_RX_BUSY_POLL */
 
-static inline bool busy_loop_timeout(unsigned long end_time)
+static inline unsigned long busy_loop_current_time(void)
 {
-	unsigned long now = busy_loop_us_clock();
-
-	return time_after(now, end_time);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	return (unsigned long)(local_clock() >> 10);
+#else
+	return 0;
+#endif
 }
 
-bool sk_busy_loop(struct sock *sk, int nonblock);
-
-/* used in the NIC receive handler to mark the skb */
-static inline void skb_mark_napi_id(struct sk_buff *skb,
-				    struct napi_struct *napi)
+/* in poll/select we use the global sysctl_net_ll_poll value */
+static inline bool busy_loop_timeout(unsigned long start_time)
 {
-	skb->napi_id = napi->napi_id;
-}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll);
 
+	if (bp_usec) {
+		unsigned long end_time = start_time + bp_usec;
+		unsigned long now = busy_loop_current_time();
 
-#else /* CONFIG_NET_RX_BUSY_POLL */
-static inline unsigned long net_busy_loop_on(void)
-{
-	return 0;
+		return time_after(now, end_time);
+	}
+#endif
+	return true;
 }
 
-static inline unsigned long busy_loop_end_time(void)
+static inline bool sk_busy_loop_timeout(struct sock *sk,
+					unsigned long start_time)
 {
-	return 0;
-}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec);
 
-static inline bool sk_can_busy_loop(struct sock *sk)
-{
-	return false;
-}
+	if (bp_usec) {
+		unsigned long end_time = start_time + bp_usec;
+		unsigned long now = busy_loop_current_time();
 
-static inline void skb_mark_napi_id(struct sk_buff *skb,
-				    struct napi_struct *napi)
-{
+		return time_after(now, end_time);
+	}
+#endif
+	return true;
 }
 
-static inline bool busy_loop_timeout(unsigned long end_time)
+static inline void sk_busy_loop(struct sock *sk, int nonblock)
 {
-	return true;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
+
+	if (napi_id >= MIN_NAPI_ID)
+		napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
+#endif
 }
 
-static inline bool sk_busy_loop(struct sock *sk, int nonblock)
+/* used in the NIC receive handler to mark the skb */
+static inline void skb_mark_napi_id(struct sk_buff *skb,
+				    struct napi_struct *napi)
 {
-	return false;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	skb->napi_id = napi->napi_id;
+#endif
 }
 
-#endif /* CONFIG_NET_RX_BUSY_POLL */
-
 /* used in the protocol hanlder to propagate the napi_id to the socket */
 static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
 {
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 8313702c1eae..c98a52fb572a 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -96,4 +96,6 @@
 
 #define SO_MEMINFO		55
 
+#define SO_INCOMING_NAPI_ID	56
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/datagram.c b/net/core/datagram.c
index ea633342ab0d..4608aa245410 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -256,8 +256,12 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 		}
 
 		spin_unlock_irqrestore(&queue->lock, cpu_flags);
-	} while (sk_can_busy_loop(sk) &&
-		 sk_busy_loop(sk, flags & MSG_DONTWAIT));
+
+		if (!sk_can_busy_loop(sk))
+			break;
+
+		sk_busy_loop(sk, flags & MSG_DONTWAIT);
+	} while (!skb_queue_empty(&sk->sk_receive_queue));
 
 	error = -EAGAIN;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 7869ae3837ca..ef9fe60ee294 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5060,27 +5060,28 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 	do_softirq();
 }
 
-bool sk_busy_loop(struct sock *sk, int nonblock)
+void napi_busy_loop(unsigned int napi_id,
+		    bool (*loop_end)(void *, unsigned long),
+		    void *loop_end_arg)
 {
-	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 	int (*napi_poll)(struct napi_struct *napi, int budget);
 	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
-	int rc;
 
 restart:
-	rc = false;
 	napi_poll = NULL;
 
 	rcu_read_lock();
 
-	napi = napi_by_id(sk->sk_napi_id);
+	napi = napi_by_id(napi_id);
 	if (!napi)
 		goto out;
 
 	preempt_disable();
 	for (;;) {
-		rc = 0;
+		int work = 0;
+
 		local_bh_disable();
 		if (!napi_poll) {
 			unsigned long val = READ_ONCE(napi->state);
@@ -5098,16 +5099,15 @@ restart:
 			have_poll_lock = netpoll_poll_lock(napi);
 			napi_poll = napi->poll;
 		}
-		rc = napi_poll(napi, BUSY_POLL_BUDGET);
-		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+		work = napi_poll(napi, BUSY_POLL_BUDGET);
+		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
 count:
-		if (rc > 0)
-			__NET_ADD_STATS(sock_net(sk),
-					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
+		if (work > 0)
+			__NET_ADD_STATS(dev_net(napi->dev),
+					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 		local_bh_enable();
 
-		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
-		    busy_loop_timeout(end_time))
+		if (!loop_end || loop_end(loop_end_arg, start_time))
 			break;
 
 		if (unlikely(need_resched())) {
@@ -5116,9 +5116,8 @@ count:
 			preempt_enable();
 			rcu_read_unlock();
 			cond_resched();
-			rc = !skb_queue_empty(&sk->sk_receive_queue);
-			if (rc || busy_loop_timeout(end_time))
-				return rc;
+			if (loop_end(loop_end_arg, start_time))
+				return;
 			goto restart;
 		}
 		cpu_relax();
@@ -5126,12 +5125,10 @@ count:
 	if (napi_poll)
 		busy_poll_stop(napi, have_poll_lock);
 	preempt_enable();
-	rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
 	rcu_read_unlock();
-	return rc;
 }
-EXPORT_SYMBOL(sk_busy_loop);
+EXPORT_SYMBOL(napi_busy_loop);
 
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
@@ -5143,10 +5140,10 @@ static void napi_hash_add(struct napi_struct *napi)
 
 	spin_lock(&napi_hash_lock);
 
-	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
+	/* 0..NR_CPUS range is reserved for sender_cpu use */
 	do {
-		if (unlikely(++napi_gen_id < NR_CPUS + 1))
-			napi_gen_id = NR_CPUS + 1;
+		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
+			napi_gen_id = MIN_NAPI_ID;
 	} while (napi_by_id(napi_gen_id));
 	napi->napi_id = napi_gen_id;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 1b9030ee6f4b..1a58a9dc6888 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1328,6 +1328,18 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		goto lenout;
 	}
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	case SO_INCOMING_NAPI_ID:
+		v.val = READ_ONCE(sk->sk_napi_id);
+
+		/* aggregate non-NAPI IDs down to 0 */
+		if (v.val < MIN_NAPI_ID)
+			v.val = 0;
+
+		break;
+#endif
+
 	default:
 		/* We implement the SO_SNDLOWAT etc to not be settable
 		 * (1003.1g 7).
@@ -3237,3 +3249,14 @@ static int __init proto_init(void)
 subsys_initcall(proto_init);
 
 #endif /* PROC_FS */
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+bool sk_busy_loop_end(void *p, unsigned long start_time)
+{
+	struct sock *sk = p;
+
+	return !skb_queue_empty(&sk->sk_receive_queue) ||
+	       sk_busy_loop_timeout(sk, start_time);
+}
+EXPORT_SYMBOL(sk_busy_loop_end);
+#endif /* CONFIG_NET_RX_BUSY_POLL */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7482b5d11861..20cbd2f07f28 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1409,8 +1409,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		if (!nsk)
 			goto discard;
 		if (nsk != sk) {
-			sock_rps_save_rxhash(nsk, skb);
-			sk_mark_napi_id(nsk, skb);
 			if (tcp_child_process(sk, nsk, skb)) {
 				rsk = nsk;
 				goto reset;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1e217948be62..8f6373b0cd77 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -26,6 +26,7 @@
 #include <net/tcp.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>
+#include <net/busy_poll.h>
 
 int sysctl_tcp_abort_on_overflow __read_mostly;
 
@@ -799,6 +800,9 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 	int ret = 0;
 	int state = child->sk_state;
 
+	/* record NAPI ID of child */
+	sk_mark_napi_id(child, skb);
+
 	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
 		ret = tcp_rcv_state_process(child, skb);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 031a8c019f7a..8e42e8f54b70 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1293,8 +1293,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 			goto discard;
 
 		if (nsk != sk) {
-			sock_rps_save_rxhash(nsk, skb);
-			sk_mark_napi_id(nsk, skb);
 			if (tcp_child_process(sk, nsk, skb))
 				goto reset;
 			if (opt_skb)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 72cc3ecf6516..ccc08fc39722 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7518,9 +7518,12 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
 		if (sk->sk_shutdown & RCV_SHUTDOWN)
 			break;
 
-		if (sk_can_busy_loop(sk) &&
-		    sk_busy_loop(sk, noblock))
-			continue;
+		if (sk_can_busy_loop(sk)) {
+			sk_busy_loop(sk, noblock);
+
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				continue;
+		}
 
 		/* User doesn't want to wait.  */
 		error = -EAGAIN;
```
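Worth noting about the refactor above: the old sk_busy_loop(sk, nonblock) becomes the generic napi_busy_loop(napi_id, loop_end, loop_end_arg), with each caller supplying its own termination predicate (a NULL predicate means poll exactly once). The two predicates added by this series, condensed from the diff:

```c
/* Socket caller: stop once data is queued or the per-socket budget
 * (sk->sk_ll_usec) has elapsed.
 */
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
        struct sock *sk = p;

        return !skb_queue_empty(&sk->sk_receive_queue) ||
               sk_busy_loop_timeout(sk, start_time);
}

/* epoll caller: stop once the ready list is non-empty or the global
 * budget (sysctl_net_busy_poll) has elapsed.
 */
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
        struct eventpoll *ep = p;

        return ep_events_available(ep) || busy_loop_timeout(start_time);
}
```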
