summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorBjörn Töpel <bjorn.topel@intel.com>2020-11-30 21:51:56 +0300
committerDaniel Borkmann <daniel@iogearbox.net>2020-12-01 02:09:25 +0300
commit7fd3253a7de6a317a0683f83739479fb880bffc8 (patch)
tree8490a9756c6a4a7f5078fd856f23b568703e3680 /include
parent854055c0cf30d732b3514ce7956976f60496b1a1 (diff)
downloadlinux-7fd3253a7de6a317a0683f83739479fb880bffc8.tar.xz
net: Introduce preferred busy-polling
The existing busy-polling mode, enabled by the SO_BUSY_POLL socket option or system-wide using the /proc/sys/net/core/busy_read knob, is an opportunistic. That means that if the NAPI context is not scheduled, it will poll it. If, after busy-polling, the budget is exceeded the busy-polling logic will schedule the NAPI onto the regular softirq handling. One implication of the behavior above is that a busy/heavy loaded NAPI context will never enter/allow for busy-polling. Some applications prefer that most NAPI processing would be done by busy-polling. This series adds a new socket option, SO_PREFER_BUSY_POLL, that works in concert with the napi_defer_hard_irqs and gro_flush_timeout knobs. The napi_defer_hard_irqs and gro_flush_timeout knobs were introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral feature"), and allows for a user to defer interrupts to be enabled and instead schedule the NAPI context from a watchdog timer. When a user enables the SO_PREFER_BUSY_POLL, again with the other knobs enabled, and the NAPI context is being processed by a softirq, the softirq NAPI processing will exit early to allow the busy-polling to be performed. If the application stops performing busy-polling via a system call, the watchdog timer defined by gro_flush_timeout will timeout, and regular softirq handling will resume. In summary; Heavy traffic applications that prefer busy-polling over softirq processing should use this option. Example usage: $ echo 2 | sudo tee /sys/class/net/ens785f1/napi_defer_hard_irqs $ echo 200000 | sudo tee /sys/class/net/ens785f1/gro_flush_timeout Note that the timeout should be larger than the userspace processing window, otherwise the watchdog will timeout and fall back to regular softirq processing. Enable the SO_BUSY_POLL/SO_PREFER_BUSY_POLL options on your socket. Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Link: https://lore.kernel.org/bpf/20201130185205.196029-2-bjorn.topel@gmail.com
Diffstat (limited to 'include')
-rw-r--r--include/linux/netdevice.h35
-rw-r--r--include/net/busy_poll.h5
-rw-r--r--include/net/sock.h4
-rw-r--r--include/uapi/asm-generic/socket.h2
4 files changed, 30 insertions, 16 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7ce648a564f7..52d1cc2bd8a7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -350,23 +350,25 @@ struct napi_struct {
};
enum {
- NAPI_STATE_SCHED, /* Poll is scheduled */
- NAPI_STATE_MISSED, /* reschedule a napi */
- NAPI_STATE_DISABLE, /* Disable pending */
- NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
- NAPI_STATE_LISTED, /* NAPI added to system lists */
- NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
- NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_SCHED, /* Poll is scheduled */
+ NAPI_STATE_MISSED, /* reschedule a napi */
+ NAPI_STATE_DISABLE, /* Disable pending */
+ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
+ NAPI_STATE_LISTED, /* NAPI added to system lists */
+ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */
+ NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/
};
enum {
- NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED),
- NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED),
- NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE),
- NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC),
- NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
- NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
- NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED),
+ NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED),
+ NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE),
+ NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC),
+ NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
+ NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+ NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL),
};
enum gro_result {
@@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n)
return test_bit(NAPI_STATE_DISABLE, &n->state);
}
+static inline bool napi_prefer_busy_poll(struct napi_struct *n)
+{
+ return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
+}
+
bool napi_schedule_prep(struct napi_struct *n);
/**
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index b001fa91c14e..0292b8353d7e 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -43,7 +43,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time);
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
- void *loop_end_arg);
+ void *loop_end_arg, bool prefer_busy_poll);
#else /* CONFIG_NET_RX_BUSY_POLL */
static inline unsigned long net_busy_loop_on(void)
@@ -105,7 +105,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)
unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
if (napi_id >= MIN_NAPI_ID)
- napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
+ napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk,
+ READ_ONCE(sk->sk_prefer_busy_poll));
#endif
}
diff --git a/include/net/sock.h b/include/net/sock.h
index a5c6ae78df77..d49b89b071b6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -301,6 +301,7 @@ struct bpf_local_storage;
* @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner
+ * @sk_prefer_busy_poll: prefer busypolling over softirq processing
* @sk_priority: %SO_PRIORITY setting
* @sk_type: socket type (%SOCK_STREAM, etc)
* @sk_protocol: which protocol this socket belongs in this network family
@@ -479,6 +480,9 @@ struct sock {
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
kuid_t sk_uid;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ u8 sk_prefer_busy_poll;
+#endif
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 77f7c1638eb1..7dd02408b7ce 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -119,6 +119,8 @@
#define SO_DETACH_REUSEPORT_BPF 68
+#define SO_PREFER_BUSY_POLL 69
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))