From 2d48d67fa8cd129ea85ea02d91b4a793286866f8 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 24 Jun 2013 10:28:03 +0300 Subject: net: poll/select low latency socket support select/poll busy-poll support. Split sysctl value into two separate ones, one for read and one for poll. updated Documentation/sysctl/net.txt Add a new poll flag POLL_LL. When this flag is set, sock_poll will call sk_poll_ll if possible. sock_poll sets this flag in its return value to indicate to select/poll when a socket that can busy poll is found. When poll/select have nothing to report, call the low-level sock_poll again until we are out of time or we find something. Once the system call finds something, it stops setting POLL_LL, so it can return the result to the user ASAP. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- fs/select.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index 8c1c96c27062..79b876eb91da 100644 --- a/fs/select.c +++ b/fs/select.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -384,9 +385,10 @@ get_max: #define POLLEX_SET (POLLPRI) static inline void wait_key_set(poll_table *wait, unsigned long in, - unsigned long out, unsigned long bit) + unsigned long out, unsigned long bit, + unsigned int ll_flag) { - wait->_key = POLLEX_SET; + wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) @@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_table *wait; int retval, i, timed_out = 0; unsigned long slack = 0; + unsigned int ll_flag = POLL_LL; + u64 ll_time = ll_end_time(); rcu_read_lock(); retval = max_select_fd(n, fds); @@ -422,6 +426,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; + bool can_ll = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -449,7 +454,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) f_op = f.file->f_op; mask = DEFAULT_POLLMASK; if (f_op && f_op->poll) { - wait_key_set(wait, in, out, bit); + wait_key_set(wait, in, out, + bit, ll_flag); mask = (*f_op->poll)(f.file, wait); } fdput(f); @@ -468,6 +474,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval++; wait->_qproc = NULL; } + if (mask & POLL_LL) + can_ll = true; + /* got something, stop busy polling */ + if (retval) + ll_flag = 0; } } if (res_in) @@ -486,6 +497,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) break; } + if (can_ll && can_poll_ll(ll_time)) + continue; + /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to @@ -717,7 +731,8 @@ struct poll_list { * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. 
*/ -static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) +static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, + bool *can_ll, unsigned int ll_flag) { unsigned int mask; int fd; @@ -731,7 +746,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = DEFAULT_POLLMASK; if (f.file->f_op && f.file->f_op->poll) { pwait->_key = pollfd->events|POLLERR|POLLHUP; + pwait->_key |= ll_flag; mask = f.file->f_op->poll(f.file, pwait); + if (mask & POLL_LL) + *can_ll = true; } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; @@ -750,6 +768,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ktime_t expire, *to = NULL; int timed_out = 0, count = 0; unsigned long slack = 0; + unsigned int ll_flag = POLL_LL; + u64 ll_time = ll_end_time(); /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -762,6 +782,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (;;) { struct poll_list *walk; + bool can_ll = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -776,9 +797,10 @@ static int do_poll(unsigned int nfds, struct poll_list *list, * this. They'll get immediately deregistered * when we break out and return. */ - if (do_pollfd(pfd, pt)) { + if (do_pollfd(pfd, pt, &can_ll, ll_flag)) { count++; pt->_qproc = NULL; + ll_flag = 0; } } } @@ -795,6 +817,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, if (count || timed_out) break; + if (can_ll && can_poll_ll(ll_time)) + continue; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to -- cgit v1.2.3 From 91e2fd337839319c7745e2cb84cfbf8cf1426a1a Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Fri, 28 Jun 2013 15:59:35 +0300 Subject: net: avoid calling sched_clock when LLS is off Change Low Latency Sockets code for select and poll so that when LLS is disabled sched_clock() is never called. Also, avoid sending POLL_LL to sockets if disabled. Reported-by: Andi Kleen Signed-off-by: Eliezer Tamir Signed-off-by: David S. 
Miller --- fs/select.c | 11 +++++++---- include/net/ll_poll.h | 17 +++++++++++++++-- 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index 79b876eb91da..36540754bad7 100644 --- a/fs/select.c +++ b/fs/select.c @@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_table *wait; int retval, i, timed_out = 0; unsigned long slack = 0; - unsigned int ll_flag = POLL_LL; + unsigned int ll_flag = ll_get_flag(); u64 ll_time = ll_end_time(); rcu_read_lock(); @@ -497,7 +497,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) break; } - if (can_ll && can_poll_ll(ll_time)) + /* only if on, have sockets with POLL_LL and not out of time */ + if (ll_flag && can_ll && can_poll_ll(ll_time)) continue; /* @@ -768,7 +769,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ktime_t expire, *to = NULL; int timed_out = 0, count = 0; unsigned long slack = 0; - unsigned int ll_flag = POLL_LL; + unsigned int ll_flag = ll_get_flag(); u64 ll_time = ll_end_time(); /* Optimise the no-wait case */ @@ -817,8 +818,10 @@ static int do_poll(unsigned int nfds, struct poll_list *list, if (count || timed_out) break; - if (can_ll && can_poll_ll(ll_time)) + /* only if on, have sockets with POLL_LL and not out of time */ + if (ll_flag && can_ll && can_poll_ll(ll_time)) continue; + /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h index 6d45e6f6ff4c..6c06f7c0a22d 100644 --- a/include/net/ll_poll.h +++ b/include/net/ll_poll.h @@ -37,6 +37,11 @@ extern unsigned int sysctl_net_ll_poll __read_mostly; #define LL_FLUSH_FAILED -1 #define LL_FLUSH_BUSY -2 +static inline unsigned int ll_get_flag(void) +{ + return sysctl_net_ll_poll ? POLL_LL : 0; +} + /* a wrapper to make debug_smp_processor_id() happy * we can use sched_clock() because we don't care much about precision * we only care that the average is bounded @@ -67,10 +72,14 @@ static inline u64 ll_sk_end_time(struct sock *sk) return ((u64)ACCESS_ONCE(sk->sk_ll_usec) << 10) + ll_sched_clock(); } -/* in poll/select we use the global sysctl_net_ll_poll value */ +/* in poll/select we use the global sysctl_net_ll_poll value + * only call sched_clock() if enabled + */ static inline u64 ll_end_time(void) { - return ((u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10) + ll_sched_clock(); + u64 end_time = ACCESS_ONCE(sysctl_net_ll_poll); + + return end_time ? (end_time << 10) + ll_sched_clock() : 0; } static inline bool sk_valid_ll(struct sock *sk) @@ -141,6 +150,10 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) } #else /* CONFIG_NET_LL_RX_POLL */ +static inline unsigned long ll_get_flag(void) +{ + return 0; +} static inline u64 sk_ll_end_time(struct sock *sk) { -- cgit v1.2.3 From 1bc2774d866444c5b316fd1dc2b782f20c762cf4 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Tue, 2 Jul 2013 23:22:47 +0300 Subject: net: convert lls to use time_in_range() Time in range will fail safely if we move to a different cpu with an extremely large clock skew. Add time_in_range64() and convert lls to use it. changelog: v2 - fixed double call to sched_clock in can_poll_ll - fixed checkpatchisms Signed-off-by: Eliezer Tamir Signed-off-by: David S. 
Miller --- fs/select.c | 10 ++++++---- include/linux/jiffies.h | 4 ++++ include/net/ll_poll.h | 26 +++++++++++++++++--------- 3 files changed, 27 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index 36540754bad7..f28a58592725 100644 --- a/fs/select.c +++ b/fs/select.c @@ -403,7 +403,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) int retval, i, timed_out = 0; unsigned long slack = 0; unsigned int ll_flag = ll_get_flag(); - u64 ll_time = ll_end_time(); + u64 ll_start = ll_start_time(ll_flag); + u64 ll_time = ll_run_time(); rcu_read_lock(); retval = max_select_fd(n, fds); @@ -498,7 +499,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) } /* only if on, have sockets with POLL_LL and not out of time */ - if (ll_flag && can_ll && can_poll_ll(ll_time)) + if (ll_flag && can_ll && can_poll_ll(ll_start, ll_time)) continue; /* @@ -770,7 +771,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, int timed_out = 0, count = 0; unsigned long slack = 0; unsigned int ll_flag = ll_get_flag(); - u64 ll_time = ll_end_time(); + u64 ll_start = ll_start_time(ll_flag); + u64 ll_time = ll_run_time(); /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -819,7 +821,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, break; /* only if on, have sockets with POLL_LL and not out of time */ - if (ll_flag && can_ll && can_poll_ll(ll_time)) + if (ll_flag && can_ll && can_poll_ll(ll_start, ll_time)) continue; /* diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 8fb8edf12417..97ba4e78a37e 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -139,6 +139,10 @@ static inline u64 get_jiffies_64(void) ((__s64)(a) - (__s64)(b) >= 0)) #define time_before_eq64(a,b) time_after_eq64(b,a) +#define time_in_range64(a, b, c) \ + (time_after_eq64(a, b) && \ + time_before_eq64(a, c)) + /* * These four macros compare jiffies and 'a' for convenience. */ diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h index 6c06f7c0a22d..b76f004f18a9 100644 --- a/include/net/ll_poll.h +++ b/include/net/ll_poll.h @@ -67,19 +67,23 @@ static inline u64 ll_sched_clock(void) /* we don't mind a ~2.5% imprecision so <<10 instead of *1000 * sk->sk_ll_usec is a u_int so this can't overflow */ -static inline u64 ll_sk_end_time(struct sock *sk) +static inline u64 ll_sk_run_time(struct sock *sk) { - return ((u64)ACCESS_ONCE(sk->sk_ll_usec) << 10) + ll_sched_clock(); + return (u64)ACCESS_ONCE(sk->sk_ll_usec) << 10; } /* in poll/select we use the global sysctl_net_ll_poll value * only call sched_clock() if enabled */ -static inline u64 ll_end_time(void) +static inline u64 ll_run_time(void) { - u64 end_time = ACCESS_ONCE(sysctl_net_ll_poll); + return (u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10; +} - return end_time ? (end_time << 10) + ll_sched_clock() : 0; +/* if flag is not set we don't need to know the time */ +static inline u64 ll_start_time(unsigned int flag) +{ + return flag ? ll_sched_clock() : 0; } static inline bool sk_valid_ll(struct sock *sk) @@ -88,9 +92,12 @@ static inline bool sk_valid_ll(struct sock *sk) !need_resched() && !signal_pending(current); } -static inline bool can_poll_ll(u64 end_time) +/* careful! 
time_in_range64 will evaluate now twice */ +static inline bool can_poll_ll(u64 start_time, u64 run_time) { - return !time_after64(ll_sched_clock(), end_time); + u64 now = ll_sched_clock(); + + return time_in_range64(now, start_time, start_time + run_time); } /* when used in sock_poll() nonblock is known at compile time to be true @@ -98,7 +105,8 @@ static inline bool can_poll_ll(u64 end_time) */ static inline bool sk_poll_ll(struct sock *sk, int nonblock) { - u64 end_time = nonblock ? 0 : ll_sk_end_time(sk); + u64 start_time = ll_start_time(!nonblock); + u64 run_time = ll_sk_run_time(sk); const struct net_device_ops *ops; struct napi_struct *napi; int rc = false; @@ -129,7 +137,7 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock) LINUX_MIB_LOWLATENCYRXPACKETS, rc); } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - can_poll_ll(end_time)); + can_poll_ll(start_time, run_time)); rc = !skb_queue_empty(&sk->sk_receive_queue); out: -- cgit v1.2.3 From cbf55001b2ddb814329735641be5d29b08c82b08 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 8 Jul 2013 16:20:34 +0300 Subject: net: rename low latency sockets functions to busy poll Rename functions in include/net/ll_poll.h to busy wait. Clarify documentation about expected power use increase. Rename POLL_LL to POLL_BUSY_LOOP. Add need_resched() testing to poll/select busy loops. Note, that in select and poll can_busy_poll is dynamic and is updated continuously to reflect the existence of supported sockets with valid queue information. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 12 +++++---- fs/select.c | 60 +++++++++++++++++++++++++---------------- include/net/ll_poll.h | 46 ++++++++++++++++--------------- include/uapi/asm-generic/poll.h | 2 +- net/core/datagram.c | 3 ++- net/ipv4/tcp.c | 6 ++--- net/socket.c | 12 ++++----- 7 files changed, 80 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index e658bbfb641f..7323b88e26be 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -53,22 +53,24 @@ Default: 64 low_latency_read ---------------- Low latency busy poll timeout for socket reads. (needs CONFIG_NET_LL_RX_POLL) -Approximate time in us to spin waiting for packets on the device queue. +Approximate time in us to busy loop waiting for packets on the device queue. This sets the default value of the SO_LL socket option. -Can be set or overridden per socket by setting socket option SO_LL. -Recommended value is 50. May increase power usage. +Can be set or overridden per socket by setting socket option SO_LL, which is +the preferred method of enabling. +If you need to enable the feature globally via sysctl, a value of 50 is recommended. +Will increase power usage. Default: 0 (off) low_latency_poll ---------------- Low latency busy poll timeout for poll and select. (needs CONFIG_NET_LL_RX_POLL) -Approximate time in us to spin waiting for packets on the device queue. +Approximate time in us to busy loop waiting for events. Recommended value depends on the number of sockets you poll on. For several sockets 50, for several hundreds 100. For more than that you probably want to use epoll. Note that only sockets with SO_LL set will be busy polled, so you want to either selectively set SO_LL on those sockets or set sysctl.net.low_latency_read globally. -May increase power usage. +Will increase power usage. 
Default: 0 (off) rmem_default diff --git a/fs/select.c b/fs/select.c index f28a58592725..25cac5faf6d6 100644 --- a/fs/select.c +++ b/fs/select.c @@ -402,9 +402,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_table *wait; int retval, i, timed_out = 0; unsigned long slack = 0; - unsigned int ll_flag = ll_get_flag(); - u64 ll_start = ll_start_time(ll_flag); - u64 ll_time = ll_run_time(); + unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; + u64 busy_start = busy_loop_start_time(busy_flag); + u64 busy_end = busy_loop_end_time(); rcu_read_lock(); retval = max_select_fd(n, fds); @@ -427,7 +427,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; - bool can_ll = false; + bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -456,7 +456,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) mask = DEFAULT_POLLMASK; if (f_op && f_op->poll) { wait_key_set(wait, in, out, - bit, ll_flag); + bit, busy_flag); mask = (*f_op->poll)(f.file, wait); } fdput(f); @@ -475,11 +475,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval++; wait->_qproc = NULL; } - if (mask & POLL_LL) - can_ll = true; /* got something, stop busy polling */ - if (retval) - ll_flag = 0; + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + } } if (res_in) @@ -498,8 +505,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) break; } - /* only if on, have sockets with POLL_LL and not out of time */ - if (ll_flag && can_ll && can_poll_ll(ll_start, ll_time)) + /* only if found POLL_BUSY_LOOP sockets && not out of time */ + if (!need_resched() && can_busy_loop && + busy_loop_range(busy_start, busy_end)) continue; /* @@ -734,7 +742,8 @@ struct poll_list { * if pwait->_qproc is non-NULL. */ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, - bool *can_ll, unsigned int ll_flag) + bool *can_busy_poll, + unsigned int busy_flag) { unsigned int mask; int fd; @@ -748,10 +757,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, mask = DEFAULT_POLLMASK; if (f.file->f_op && f.file->f_op->poll) { pwait->_key = pollfd->events|POLLERR|POLLHUP; - pwait->_key |= ll_flag; + pwait->_key |= busy_flag; mask = f.file->f_op->poll(f.file, pwait); - if (mask & POLL_LL) - *can_ll = true; + if (mask & busy_flag) + *can_busy_poll = true; } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; @@ -770,9 +779,10 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ktime_t expire, *to = NULL; int timed_out = 0, count = 0; unsigned long slack = 0; - unsigned int ll_flag = ll_get_flag(); - u64 ll_start = ll_start_time(ll_flag); - u64 ll_time = ll_run_time(); + unsigned int busy_flag = net_busy_loop_on() ? 
POLL_BUSY_LOOP : 0; + u64 busy_start = busy_loop_start_time(busy_flag); + u64 busy_end = busy_loop_end_time(); + /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -785,7 +795,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (;;) { struct poll_list *walk; - bool can_ll = false; + bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -800,10 +810,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list, * this. They'll get immediately deregistered * when we break out and return. */ - if (do_pollfd(pfd, pt, &can_ll, ll_flag)) { + if (do_pollfd(pfd, pt, &can_busy_loop, + busy_flag)) { count++; pt->_qproc = NULL; - ll_flag = 0; + /* found something, stop busy polling */ + busy_flag = 0; + can_busy_loop = false; } } } @@ -820,8 +833,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list, if (count || timed_out) break; - /* only if on, have sockets with POLL_LL and not out of time */ - if (ll_flag && can_ll && can_poll_ll(ll_start, ll_time)) + /* only if found POLL_BUSY_LOOP sockets && not out of time */ + if (!need_resched() && can_busy_loop && + busy_loop_range(busy_start, busy_end)) continue; /* diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h index 0d620ba19bc5..f14dd88dafc8 100644 --- a/include/net/ll_poll.h +++ b/include/net/ll_poll.h @@ -37,9 +37,9 @@ extern unsigned int sysctl_net_ll_poll __read_mostly; #define LL_FLUSH_FAILED -1 #define LL_FLUSH_BUSY -2 -static inline unsigned int ll_get_flag(void) +static inline bool net_busy_loop_on(void) { - return sysctl_net_ll_poll ? POLL_LL : 0; + return sysctl_net_ll_poll; } /* a wrapper to make debug_smp_processor_id() happy @@ -47,7 +47,7 @@ static inline unsigned int ll_get_flag(void) * we only care that the average is bounded */ #ifdef CONFIG_DEBUG_PREEMPT -static inline u64 ll_sched_clock(void) +static inline u64 busy_loop_sched_clock(void) { u64 rc; @@ -58,7 +58,7 @@ static inline u64 ll_sched_clock(void) return rc; } #else /* CONFIG_DEBUG_PREEMPT */ -static inline u64 ll_sched_clock(void) +static inline u64 busy_loop_sched_clock(void) { return sched_clock(); } @@ -67,7 +67,7 @@ static inline u64 ll_sched_clock(void) /* we don't mind a ~2.5% imprecision so <<10 instead of *1000 * sk->sk_ll_usec is a u_int so this can't overflow */ -static inline u64 ll_sk_run_time(struct sock *sk) +static inline u64 sk_busy_loop_end_time(struct sock *sk) { return (u64)ACCESS_ONCE(sk->sk_ll_usec) << 10; } @@ -75,27 +75,29 @@ static inline u64 ll_sk_run_time(struct sock *sk) /* in poll/select we use the global sysctl_net_ll_poll value * only call sched_clock() if enabled */ -static inline u64 ll_run_time(void) +static inline u64 busy_loop_end_time(void) { return (u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10; } -/* if flag is not set we don't need to know the time */ -static inline u64 ll_start_time(unsigned int flag) +/* if flag is not set we don't need to know the time + * so we want to avoid a potentially expensive sched_clock() + */ +static inline u64 busy_loop_start_time(unsigned int flag) { - return flag ? ll_sched_clock() : 0; + return flag ? busy_loop_sched_clock() : 0; } -static inline bool sk_valid_ll(struct sock *sk) +static inline bool sk_can_busy_loop(struct sock *sk) { return sk->sk_ll_usec && sk->sk_napi_id && !need_resched() && !signal_pending(current); } /* careful! 
time_in_range64 will evaluate now twice */ -static inline bool can_poll_ll(u64 start_time, u64 run_time) +static inline bool busy_loop_range(u64 start_time, u64 run_time) { - u64 now = ll_sched_clock(); + u64 now = busy_loop_sched_clock(); return time_in_range64(now, start_time, start_time + run_time); } @@ -103,10 +105,10 @@ static inline bool can_poll_ll(u64 start_time, u64 run_time) /* when used in sock_poll() nonblock is known at compile time to be true * so the loop and end_time will be optimized out */ -static inline bool sk_poll_ll(struct sock *sk, int nonblock) +static inline bool sk_busy_loop(struct sock *sk, int nonblock) { - u64 start_time = ll_start_time(!nonblock); - u64 run_time = ll_sk_run_time(sk); + u64 start_time = busy_loop_start_time(!nonblock); + u64 end_time = sk_busy_loop_end_time(sk); const struct net_device_ops *ops; struct napi_struct *napi; int rc = false; @@ -137,7 +139,7 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock) LINUX_MIB_LOWLATENCYRXPACKETS, rc); } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - can_poll_ll(start_time, run_time)); + busy_loop_range(start_time, end_time)); rc = !skb_queue_empty(&sk->sk_receive_queue); out: @@ -158,27 +160,27 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) } #else /* CONFIG_NET_LL_RX_POLL */ -static inline unsigned long ll_get_flag(void) +static inline unsigned long net_busy_loop_on(void) { return 0; } -static inline u64 ll_start_time(unsigned int flag) +static inline u64 busy_loop_start_time(unsigned int flag) { return 0; } -static inline u64 ll_run_time(void) +static inline u64 busy_loop_end_time(void) { return 0; } -static inline bool sk_valid_ll(struct sock *sk) +static inline bool sk_can_busy_loop(struct sock *sk) { return false; } -static inline bool sk_poll_ll(struct sock *sk, int nonblock) +static inline bool sk_busy_poll(struct sock *sk, int nonblock) { return false; } @@ -191,7 +193,7 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) { } -static inline bool can_poll_ll(u64 start_time, u64 run_time) +static inline bool busy_loop_range(u64 start_time, u64 run_time) { return false; } diff --git a/include/uapi/asm-generic/poll.h b/include/uapi/asm-generic/poll.h index 4aee586979ca..a9694982689f 100644 --- a/include/uapi/asm-generic/poll.h +++ b/include/uapi/asm-generic/poll.h @@ -30,7 +30,7 @@ #define POLLFREE 0x4000 /* currently only for epoll */ -#define POLL_LL 0x8000 +#define POLL_BUSY_LOOP 0x8000 struct pollfd { int fd; diff --git a/net/core/datagram.c b/net/core/datagram.c index 9cbaba98ce4c..6e9ab31e457e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -208,7 +208,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, } spin_unlock_irqrestore(&queue->lock, cpu_flags); - if (sk_valid_ll(sk) && sk_poll_ll(sk, flags & MSG_DONTWAIT)) + if (sk_can_busy_loop(sk) && + sk_busy_loop(sk, flags & MSG_DONTWAIT)) continue; /* User doesn't want to wait */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 46ed9afd1f5e..15cbfa94bd8e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1554,9 +1554,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct sk_buff *skb; u32 urg_hole = 0; - if (sk_valid_ll(sk) && skb_queue_empty(&sk->sk_receive_queue) - && (sk->sk_state == TCP_ESTABLISHED)) - sk_poll_ll(sk, nonblock); + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && + (sk->sk_state == TCP_ESTABLISHED)) + sk_busy_loop(sk, nonblock); lock_sock(sk); diff --git 
a/net/socket.c b/net/socket.c index 4da14cbd49b6..45afa648364a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1148,7 +1148,7 @@ EXPORT_SYMBOL(sock_create_lite); /* No kernel lock held - perfect */ static unsigned int sock_poll(struct file *file, poll_table *wait) { - unsigned int ll_flag = 0; + unsigned int busy_flag = 0; struct socket *sock; /* @@ -1156,16 +1156,16 @@ static unsigned int sock_poll(struct file *file, poll_table *wait) */ sock = file->private_data; - if (sk_valid_ll(sock->sk)) { + if (sk_can_busy_loop(sock->sk)) { /* this socket can poll_ll so tell the system call */ - ll_flag = POLL_LL; + busy_flag = POLL_BUSY_LOOP; /* once, only if requested by syscall */ - if (wait && (wait->_key & POLL_LL)) - sk_poll_ll(sock->sk, 1); + if (wait && (wait->_key & POLL_BUSY_LOOP)) + sk_busy_loop(sock->sk, 1); } - return ll_flag | sock->ops->poll(file, sock, wait); + return busy_flag | sock->ops->poll(file, sock, wait); } static int sock_mmap(struct file *file, struct vm_area_struct *vma) -- cgit v1.2.3 From 76b1e9b9813e412bde7446525f6c299803713545 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Tue, 9 Jul 2013 13:09:21 +0300 Subject: net/fs: change busy poll time accounting Suggested by Linus: Changed time accounting for busy-poll: - Make it microsecond based. - Use unsigned longs. - Revert back to use time_after instead of time_in_range. Reorder poll/select busy loop conditions: - Clear busy_flag after one time we can't busy-poll. - Only init busy_end if we actually are going to busy-poll. Added one more missing need_resched() test. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- fs/select.c | 31 ++++++++++++++++++----------- include/net/ll_poll.h | 55 +++++++++++++++++---------------------------------- 2 files changed, 38 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index 25cac5faf6d6..50a804b6839f 100644 --- a/fs/select.c +++ b/fs/select.c @@ -403,8 +403,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) int retval, i, timed_out = 0; unsigned long slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; - u64 busy_start = busy_loop_start_time(busy_flag); - u64 busy_end = busy_loop_end_time(); + unsigned long busy_end = 0; rcu_read_lock(); retval = max_select_fd(n, fds); @@ -506,9 +505,15 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) } /* only if found POLL_BUSY_LOOP sockets && not out of time */ - if (!need_resched() && can_busy_loop && - busy_loop_range(busy_start, busy_end)) - continue; + if (can_busy_loop && !need_resched()) { + if (!busy_end) { + busy_end = busy_loop_end_time(); + continue; + } + if (!busy_loop_timeout(busy_end)) + continue; + } + busy_flag = 0; /* * If this is the first loop and we have a timeout @@ -780,9 +785,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, int timed_out = 0, count = 0; unsigned long slack = 0; unsigned int busy_flag = net_busy_loop_on() ? 
POLL_BUSY_LOOP : 0; - u64 busy_start = busy_loop_start_time(busy_flag); - u64 busy_end = busy_loop_end_time(); - + unsigned long busy_end = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -834,9 +837,15 @@ static int do_poll(unsigned int nfds, struct poll_list *list, break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ - if (!need_resched() && can_busy_loop && - busy_loop_range(busy_start, busy_end)) - continue; + if (can_busy_loop && !need_resched()) { + if (!busy_end) { + busy_end = busy_loop_end_time(); + continue; + } + if (!busy_loop_timeout(busy_end)) + continue; + } + busy_flag = 0; /* * If this is the first loop and we have a timeout diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h index f14dd88dafc8..76f034087743 100644 --- a/include/net/ll_poll.h +++ b/include/net/ll_poll.h @@ -47,7 +47,7 @@ static inline bool net_busy_loop_on(void) * we only care that the average is bounded */ #ifdef CONFIG_DEBUG_PREEMPT -static inline u64 busy_loop_sched_clock(void) +static inline u64 busy_loop_us_clock(void) { u64 rc; @@ -55,37 +55,24 @@ static inline u64 busy_loop_sched_clock(void) rc = sched_clock(); preempt_enable_no_resched_notrace(); - return rc; + return rc >> 10; } #else /* CONFIG_DEBUG_PREEMPT */ -static inline u64 busy_loop_sched_clock(void) +static inline u64 busy_loop_us_clock(void) { - return sched_clock(); + return sched_clock() >> 10; } #endif /* CONFIG_DEBUG_PREEMPT */ -/* we don't mind a ~2.5% imprecision so <<10 instead of *1000 - * sk->sk_ll_usec is a u_int so this can't overflow - */ -static inline u64 sk_busy_loop_end_time(struct sock *sk) +static inline unsigned long sk_busy_loop_end_time(struct sock *sk) { - return (u64)ACCESS_ONCE(sk->sk_ll_usec) << 10; + return busy_loop_us_clock() + ACCESS_ONCE(sk->sk_ll_usec); } -/* in poll/select we use the global sysctl_net_ll_poll value - * only call sched_clock() if enabled - */ -static inline u64 busy_loop_end_time(void) -{ - return (u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10; -} - -/* if flag is not set we don't need to know the time - * so we want to avoid a potentially expensive sched_clock() - */ -static inline u64 busy_loop_start_time(unsigned int flag) +/* in poll/select we use the global sysctl_net_ll_poll value */ +static inline unsigned long busy_loop_end_time(void) { - return flag ? busy_loop_sched_clock() : 0; + return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_ll_poll); } static inline bool sk_can_busy_loop(struct sock *sk) @@ -94,12 +81,12 @@ static inline bool sk_can_busy_loop(struct sock *sk) !need_resched() && !signal_pending(current); } -/* careful! time_in_range64 will evaluate now twice */ -static inline bool busy_loop_range(u64 start_time, u64 run_time) + +static inline bool busy_loop_timeout(unsigned long end_time) { - u64 now = busy_loop_sched_clock(); + unsigned long now = busy_loop_us_clock(); - return time_in_range64(now, start_time, start_time + run_time); + return time_after(now, end_time); } /* when used in sock_poll() nonblock is known at compile time to be true @@ -107,8 +94,7 @@ static inline bool busy_loop_range(u64 start_time, u64 run_time) */ static inline bool sk_busy_loop(struct sock *sk, int nonblock) { - u64 start_time = busy_loop_start_time(!nonblock); - u64 end_time = sk_busy_loop_end_time(sk); + unsigned long end_time = !nonblock ? 
sk_busy_loop_end_time(sk) : 0; const struct net_device_ops *ops; struct napi_struct *napi; int rc = false; @@ -139,7 +125,7 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock) LINUX_MIB_LOWLATENCYRXPACKETS, rc); } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - busy_loop_range(start_time, end_time)); + !need_resched() && !busy_loop_timeout(end_time)); rc = !skb_queue_empty(&sk->sk_receive_queue); out: @@ -165,12 +151,7 @@ static inline unsigned long net_busy_loop_on(void) return 0; } -static inline u64 busy_loop_start_time(unsigned int flag) -{ - return 0; -} - -static inline u64 busy_loop_end_time(void) +static inline unsigned long busy_loop_end_time(void) { return 0; } @@ -193,9 +174,9 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) { } -static inline bool busy_loop_range(u64 start_time, u64 run_time) +static inline bool busy_loop_timeout(unsigned long end_time) { - return false; + return true; } #endif /* CONFIG_NET_LL_RX_POLL */ -- cgit v1.2.3
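The last patch's changelog lists the reordered busy-loop conditions only tersely. As a reading aid, the stand-alone C sketch below models the control flow that do_select()/do_poll() converge on after this series: the deadline is computed lazily on the first pass that is allowed to busy-poll, the loop keeps spinning only while busy_loop_timeout() has not fired and need_resched() stays false, and busy_flag is cleared once busy polling is abandoned so sockets are no longer asked for POLL_BUSY_LOOP. Everything named fake_now, need_resched_model() or poll_all_fds() is an invented stand-in for illustration; only POLL_BUSY_LOOP (0x8000) and the busy_loop_* helper names come from the patches above.

/*
 * Compact model of the reordered loop from the last patch -- illustrative
 * stubs only, not kernel code.  fake_now, poll_all_fds() and
 * need_resched_model() are invented stand-ins; POLL_BUSY_LOOP 0x8000 is the
 * value added to include/uapi/asm-generic/poll.h above.
 */
#include <stdbool.h>

#define POLL_BUSY_LOOP 0x8000

static unsigned long fake_now;                    /* pretend microsecond clock */

static unsigned long busy_loop_end_time(void)     /* now + sysctl budget (us) */
{
	return fake_now + 50;
}

static bool busy_loop_timeout(unsigned long end)  /* like time_after(now, end) */
{
	return ++fake_now > end;
}

static bool need_resched_model(void)              /* stands in for need_resched() */
{
	return false;
}

/* Models one ->poll() sweep over all fds: returns the number of ready fds and
 * reports whether any socket echoed POLL_BUSY_LOOP back in its mask. */
static int poll_all_fds(unsigned int busy_flag, bool *can_busy_loop)
{
	*can_busy_loop = busy_flag != 0;          /* a busy-pollable socket exists */
	return fake_now > 40;                     /* data shows up after ~40 "us" */
}

int poll_loop_model(void)
{
	unsigned int busy_flag = POLL_BUSY_LOOP;  /* as if net_busy_loop_on() */
	unsigned long busy_end = 0;               /* budget clock not started yet */
	int count;

	for (;;) {
		bool can_busy_loop = false;

		count = poll_all_fds(busy_flag, &can_busy_loop);
		if (count)
			break;                    /* found events, return ASAP */

		if (can_busy_loop && !need_resched_model()) {
			if (!busy_end) {          /* first busy pass: start clock */
				busy_end = busy_loop_end_time();
				continue;
			}
			if (!busy_loop_timeout(busy_end))
				continue;         /* budget left: spin again */
		}
		busy_flag = 0;                    /* stop asking for busy poll */
		break;      /* the real code falls through to the sleeping path */
	}
	return count;
}

Compared with the earlier time_in_range64() version, the final code only ever compares the clock against a single end time, which is why busy_loop_end_time() can return a plain microsecond deadline.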
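From user space nothing in the poll(2)/select(2) API changes: busy polling engages transparently once the sysctls documented in the Documentation/sysctl/net.txt hunk are non-zero and the polled sockets carry a busy-poll budget (globally via low_latency_read, or per socket via the SO_LL option this series refers to). The fragment below is purely illustrative, assumes an already-connected socket fd, and uses the sysctl names as they appear in this series (later kernels renamed them).

/*
 * Illustrative user-space fragment, not part of the series.  Assumes busy
 * polling was enabled roughly as the documentation hunk above describes, e.g.:
 *
 *     sysctl -w net.core.low_latency_read=50    # default per-socket budget
 *     sysctl -w net.core.low_latency_poll=50    # lets poll()/select() busy loop
 *
 * (names as used by this series; later kernels renamed them).
 */
#include <poll.h>
#include <stdio.h>

/* Wait for data on an already-connected socket.  With the sysctls above set,
 * do_poll() spins for up to ~50us on the device queue before sleeping,
 * trading CPU (and power, as the documentation warns) for latency. */
int wait_readable(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int rc = poll(&pfd, 1, timeout_ms);

	if (rc < 0) {
		perror("poll");
		return 0;
	}
	return rc > 0 && (pfd.revents & POLLIN);
}

The same applies to select(2); epoll is not touched by this series, which is consistent with the documentation's advice to prefer epoll once the number of polled sockets grows large.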