From a11e1d432b51f63ba698d044441284a661f01144 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Jun 2018 09:43:44 -0700 Subject: Revert changes to convert to ->poll_mask() and aio IOCB_CMD_POLL The poll() changes were not well thought out, and completely unexplained. They also caused a huge performance regression, because "->poll()" was no longer a trivial file operation that just called down to the underlying file operations, but instead did at least two indirect calls. Indirect calls are sadly slow now with the Spectre mitigation, but the performance problem could at least be largely mitigated by changing the "->get_poll_head()" operation to just have a per-file-descriptor pointer to the poll head instead. That gets rid of one of the new indirections. But that doesn't fix the new complexity that is completely unwarranted for the regular case. The (undocumented) reason for the poll() changes was some alleged AIO poll race fixing, but we don't make the common case slower and more complex for some uncommon special case, so this all really needs way more explanations and most likely a fundamental redesign. [ This revert is a revert of about 30 different commits, not reverted individually because that would just be unnecessarily messy - Linus ] Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Linus Torvalds --- net/unix/af_unix.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 95b02a71fd47..e5473c03d667 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -638,8 +638,9 @@ static int unix_stream_connect(struct socket *, struct sockaddr *, static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); static int unix_getname(struct socket *, struct sockaddr *, int); -static __poll_t unix_poll_mask(struct socket *, __poll_t); -static __poll_t unix_dgram_poll_mask(struct socket *, __poll_t); +static __poll_t unix_poll(struct file *, struct socket *, poll_table *); +static __poll_t unix_dgram_poll(struct file *, struct socket *, + poll_table *); static int unix_ioctl(struct socket *, unsigned int, unsigned long); static int unix_shutdown(struct socket *, int); static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); @@ -680,7 +681,7 @@ static const struct proto_ops unix_stream_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll_mask = unix_poll_mask, + .poll = unix_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -703,7 +704,7 @@ static const struct proto_ops unix_dgram_ops = { .socketpair = unix_socketpair, .accept = sock_no_accept, .getname = unix_getname, - .poll_mask = unix_dgram_poll_mask, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = sock_no_listen, .shutdown = unix_shutdown, @@ -725,7 +726,7 @@ static const struct proto_ops unix_seqpacket_ops = { .socketpair = unix_socketpair, .accept = unix_accept, .getname = unix_getname, - .poll_mask = unix_dgram_poll_mask, + .poll = unix_dgram_poll, .ioctl = unix_ioctl, .listen = unix_listen, .shutdown = unix_shutdown, @@ -2629,10 +2630,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) +static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err) @@ -2661,11 +2665,15 @@ static __poll_t unix_poll_mask(struct socket *sock, __poll_t events) return mask; } -static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) +static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk, *other; - int writable; - __poll_t mask = 0; + unsigned int writable; + __poll_t mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; /* exceptional events? */ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -2691,7 +2699,7 @@ static __poll_t unix_dgram_poll_mask(struct socket *sock, __poll_t events) } /* No write status requested, avoid expensive OUT tests. */ - if (!(events & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) + if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) return mask; writable = unix_writable(sk); -- cgit v1.2.3 From dd979b4df817e9976f18fb6f9d134d6bc4a3c317 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:10 +0200 Subject: net: simplify sock_poll_wait The wait_address argument is always directly derived from the filp argument, so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- crypto/af_alg.c | 2 +- include/net/sock.h | 11 ++++++----- net/atm/common.c | 2 +- net/caif/caif_socket.c | 2 +- net/core/datagram.c | 2 +- net/dccp/proto.c | 2 +- net/ipv4/tcp.c | 2 +- net/iucv/af_iucv.c | 2 +- net/nfc/llcp_sock.c | 2 +- net/rxrpc/af_rxrpc.c | 2 +- net/smc/af_smc.c | 2 +- net/tipc/socket.c | 2 +- net/unix/af_unix.c | 4 ++-- 13 files changed, 19 insertions(+), 18 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index c166f424871c..b053179e0bc5 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1071,7 +1071,7 @@ __poll_t af_alg_poll(struct file *file, struct socket *sock, struct af_alg_ctx *ctx = ask->private; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; if (!ctx->more || ctx->used) diff --git a/include/net/sock.h b/include/net/sock.h index 83b747538bd0..0518f61926ec 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2057,16 +2057,17 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) /** * sock_poll_wait - place memory barrier behind the poll_wait call. * @filp: file - * @wait_address: socket wait queue * @p: poll_table * * See the comments in the wq_has_sleeper function. */ -static inline void sock_poll_wait(struct file *filp, - wait_queue_head_t *wait_address, poll_table *p) +static inline void sock_poll_wait(struct file *filp, poll_table *p) { - if (!poll_does_not_wait(p) && wait_address) { - poll_wait(filp, wait_address, p); + struct socket *sock = filp->private_data; + wait_queue_head_t *wq = sk_sleep(sock->sk); + + if (!poll_does_not_wait(p) && wq) { + poll_wait(filp, wq, p); /* We need to be sure we are in sync with the * socket flags modification. * diff --git a/net/atm/common.c b/net/atm/common.c index a7a68e509628..9f8cb0d2e71e 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -653,7 +653,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) struct atm_vcc *vcc; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; vcc = ATM_SD(sock); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index a6fb1b3bcad9..d18965f3291f 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -941,7 +941,7 @@ static __poll_t caif_poll(struct file *file, __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ diff --git a/net/core/datagram.c b/net/core/datagram.c index 9938952c5c78..9aac0d63d53e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -837,7 +837,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0d56e36a6db7..875858c8b059 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -325,7 +325,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, __poll_t mask; struct sock *sk = sock->sk; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 514aaac1626f..f3bfb9f29520 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) const struct tcp_sock *tp = tcp_sk(sk); int state; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); state = inet_sk_state_load(sk); if (state == TCP_LISTEN) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 8d1c43f8fed4..92ee91e34395 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1494,7 +1494,7 @@ __poll_t iucv_sock_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __poll_t mask = 0; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ea0c0c6f1874..dd4adf8b1167 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -556,7 +556,7 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, pr_debug("%p\n", sk); - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2b463047dd7b..ac44d8afffb1 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -741,7 +741,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock, struct rxrpc_sock *rx = rxrpc_sk(sk); __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* the socket is readable if there are any messages waiting on the Rx diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index fce7e4751151..0fc94f296e54 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1535,7 +1535,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, mask |= EPOLLERR; } else { if (sk->sk_state != SMC_CLOSED) - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3d21414ba357..3763bedecf5f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -716,7 +716,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e5473c03d667..1772a0e32665 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2635,7 +2635,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa struct sock *sk = sock->sk; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ @@ -2672,7 +2672,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, unsigned int writable; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ -- cgit v1.2.3 From 51f7e95187f127d5eadf50541943813ff57f12ba Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 3 Aug 2018 17:24:53 -0400 Subject: af_unix: ensure POLLOUT on remote close() for connected dgram socket Applications use -ECONNREFUSED as returned from write() in order to determine that a socket should be closed. However, when using connected dgram unix sockets in a poll/write loop, a final POLLOUT event can be missed when the remote end closes. Thus, the poll is stuck forever: thread 1 (client) thread 2 (server) connect() to server write() returns -EAGAIN unix_dgram_poll() -> unix_recvq_full() is true close() ->unix_release_sock() ->wake_up_interruptible_all() unix_dgram_poll() (due to the wake_up_interruptible_all) -> unix_recvq_full() still is true ->free all skbs Now thread 1 is stuck and will not receive anymore wakeups. In this case, when thread 1 gets the -EAGAIN, it has not queued any skbs otherwise the 'free all skbs' step would in fact cause a wakeup and a POLLOUT return. So the race here is probably fairly rare because it means there are no skbs that thread 1 queued and that thread 1 schedules before the 'free all skbs' step. This issue was reported as a hang when /dev/log is closed. The fix is to signal POLLOUT if the socket is marked as SOCK_DEAD, which means a subsequent write() will get -ECONNREFUSED. Reported-by: Ian Lance Taylor Cc: David Rientjes Cc: Rainer Weikusat Cc: Eric Dumazet Signed-off-by: Jason Baron Signed-off-by: David S. Miller --- net/unix/af_unix.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1772a0e32665..d1edfa3cad61 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -430,7 +430,12 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) connected = unix_dgram_peer_wake_connect(sk, other); - if (unix_recvq_full(other)) + /* If other is SOCK_DEAD, we want to make sure we signal + * POLLOUT, such that a subsequent write() can get a + * -ECONNREFUSED. Otherwise, if we haven't queued any skbs + * to other and its full, we will hang waiting for POLLOUT. + */ + if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD)) return 1; if (connected) -- cgit v1.2.3