summaryrefslogtreecommitdiff
path: root/net/smc/smc_close.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-02-22 21:15:09 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-22 21:15:09 +0300
commit3051bf36c25d5153051704291782f8d44e744d36 (patch)
tree72dfc8a1d12675c6f2981d13102df954b678f11b /net/smc/smc_close.c
parent1e74a2eb1f5cc7f2f2b5aa9c9eeecbcf352220a3 (diff)
parent005c3490e9db23738d91e02788606c0fe4734723 (diff)
downloadlinux-3051bf36c25d5153051704291782f8d44e744d36.tar.xz
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Highlights: 1) Support TX_RING in AF_PACKET TPACKET_V3 mode, from Sowmini Varadhan. 2) Simplify classifier state on sk_buff in order to shrink it a bit. From Willem de Bruijn. 3) Introduce SIPHASH and it's usage for secure sequence numbers and syncookies. From Jason A. Donenfeld. 4) Reduce CPU usage for ICMP replies we are going to limit or suppress, from Jesper Dangaard Brouer. 5) Introduce Shared Memory Communications socket layer, from Ursula Braun. 6) Add RACK loss detection and allow it to actually trigger fast recovery instead of just assisting after other algorithms have triggered it. From Yuchung Cheng. 7) Add xmit_more and BQL support to mvneta driver, from Simon Guinot. 8) skb_cow_data avoidance in esp4 and esp6, from Steffen Klassert. 9) Export MPLS packet stats via netlink, from Robert Shearman. 10) Significantly improve inet port bind conflict handling, especially when an application is restarted and changes it's setting of reuseport. From Josef Bacik. 11) Implement TX batching in vhost_net, from Jason Wang. 12) Extend the dummy device so that VF (virtual function) features, such as configuration, can be more easily tested. From Phil Sutter. 13) Avoid two atomic ops per page on x86 in bnx2x driver, from Eric Dumazet. 14) Add new bpf MAP, implementing a longest prefix match trie. From Daniel Mack. 15) Packet sample offloading support in mlxsw driver, from Yotam Gigi. 16) Add new aquantia driver, from David VomLehn. 17) Add bpf tracepoints, from Daniel Borkmann. 18) Add support for port mirroring to b53 and bcm_sf2 drivers, from Florian Fainelli. 19) Remove custom busy polling in many drivers, it is done in the core networking since 4.5 times. From Eric Dumazet. 20) Support XDP adjust_head in virtio_net, from John Fastabend. 21) Fix several major holes in neighbour entry confirmation, from Julian Anastasov. 22) Add XDP support to bnxt_en driver, from Michael Chan. 23) VXLAN offloads for enic driver, from Govindarajulu Varadarajan. 24) Add IPVTAP driver (IP-VLAN based tap driver) from Sainath Grandhi. 25) Support GRO in IPSEC protocols, from Steffen Klassert" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1764 commits) Revert "ath10k: Search SMBIOS for OEM board file extension" net: socket: fix recvmmsg not returning error from sock_error bnxt_en: use eth_hw_addr_random() bpf: fix unlocking of jited image when module ronx not set arch: add ARCH_HAS_SET_MEMORY config net: napi_watchdog() can use napi_schedule_irqoff() tcp: Revert "tcp: tcp_probe: use spin_lock_bh()" net/hsr: use eth_hw_addr_random() net: mvpp2: enable building on 64-bit platforms net: mvpp2: switch to build_skb() in the RX path net: mvpp2: simplify MVPP2_PRS_RI_* definitions net: mvpp2: fix indentation of MVPP2_EXT_GLOBAL_CTRL_DEFAULT net: mvpp2: remove unused register definitions net: mvpp2: simplify mvpp2_bm_bufs_add() net: mvpp2: drop useless fields in mvpp2_bm_pool and related code net: mvpp2: remove unused 'tx_skb' field of 'struct mvpp2_tx_queue' net: mvpp2: release reference to txq_cpu[] entry after unmapping net: mvpp2: handle too large value in mvpp2_rx_time_coal_set() net: mvpp2: handle too large value handling in mvpp2_rx_pkts_coal_set() net: mvpp2: remove useless arguments in mvpp2_rx_{pkts, time}_coal_set ...
Diffstat (limited to 'net/smc/smc_close.c')
-rw-r--r--net/smc/smc_close.c442
1 files changed, 442 insertions, 0 deletions
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 000000000000..03dfcc6b7661
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,442 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing - normal and abnormal
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/workqueue.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_tx.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+#define SMC_CLOSE_WAIT_TX_PENDS_TIME (5 * HZ)
+
+static void smc_close_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ /* Close non-accepted connections */
+ while ((sk = smc_accept_dequeue(parent, NULL)))
+ smc_close_non_accepted(sk);
+}
+
+static void smc_close_wait_tx_pends(struct smc_sock *smc)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = &smc->sk;
+ signed long timeout;
+
+ timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME;
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (!signal_pending(current) && timeout) {
+ int rc;
+
+ rc = sk_wait_event(sk, &timeout,
+ !smc_cdc_tx_has_pending(&smc->conn),
+ &wait);
+ if (rc)
+ break;
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+}
+
+/* wait for sndbuf data being transmitted */
+static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = &smc->sk;
+
+ if (!timeout)
+ return;
+
+ if (!smc_tx_prepared_sends(&smc->conn))
+ return;
+
+ smc->wait_close_tx_prepared = 1;
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (!signal_pending(current) && timeout) {
+ int rc;
+
+ rc = sk_wait_event(sk, &timeout,
+ !smc_tx_prepared_sends(&smc->conn) ||
+ (sk->sk_err == ECONNABORTED) ||
+ (sk->sk_err == ECONNRESET),
+ &wait);
+ if (rc)
+ break;
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+ smc->wait_close_tx_prepared = 0;
+}
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc)
+{
+ if (smc->wait_close_tx_prepared)
+ /* wake up socket closing */
+ smc->sk.sk_state_change(&smc->sk);
+}
+
+static int smc_close_wr(struct smc_connection *conn)
+{
+ conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_final(struct smc_connection *conn)
+{
+ if (atomic_read(&conn->bytes_to_rcv))
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ else
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_abort(struct smc_connection *conn)
+{
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+/* terminate smc socket abnormally - active abort
+ * RDMA communication no longer possible
+ */
+void smc_close_active_abort(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+
+ bh_lock_sock(&smc->sk);
+ smc->sk.sk_err = ECONNABORTED;
+ if (smc->clcsock && smc->clcsock->sk) {
+ smc->clcsock->sk->sk_err = ECONNABORTED;
+ smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
+ }
+ switch (smc->sk.sk_state) {
+ case SMC_INIT:
+ smc->sk.sk_state = SMC_PEERABORTWAIT;
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ txflags->peer_conn_abort = 1;
+ sock_release(smc->clcsock);
+ if (!smc_cdc_rxed_any_close(&smc->conn))
+ smc->sk.sk_state = SMC_PEERABORTWAIT;
+ else
+ smc->sk.sk_state = SMC_CLOSED;
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ if (!txflags->peer_conn_closed) {
+ smc->sk.sk_state = SMC_PEERABORTWAIT;
+ txflags->peer_conn_abort = 1;
+ sock_release(smc->clcsock);
+ } else {
+ smc->sk.sk_state = SMC_CLOSED;
+ }
+ break;
+ case SMC_PROCESSABORT:
+ case SMC_APPFINCLOSEWAIT:
+ if (!txflags->peer_conn_closed) {
+ txflags->peer_conn_abort = 1;
+ sock_release(smc->clcsock);
+ }
+ smc->sk.sk_state = SMC_CLOSED;
+ break;
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ case SMC_CLOSED:
+ break;
+ }
+
+ sock_set_flag(&smc->sk, SOCK_DEAD);
+ bh_unlock_sock(&smc->sk);
+ smc->sk.sk_state_change(&smc->sk);
+}
+
+int smc_close_active(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+ long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ int rc = 0;
+
+ if (sock_flag(sk, SOCK_LINGER) &&
+ !(current->flags & PF_EXITING))
+ timeout = sk->sk_lingertime;
+
+again:
+ old_state = sk->sk_state;
+ switch (old_state) {
+ case SMC_INIT:
+ sk->sk_state = SMC_CLOSED;
+ if (smc->smc_listen_work.func)
+ flush_work(&smc->smc_listen_work);
+ sock_put(sk);
+ break;
+ case SMC_LISTEN:
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk); /* wake up accept */
+ if (smc->clcsock && smc->clcsock->sk) {
+ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+ /* wake up kernel_accept of smc_tcp_listen_worker */
+ smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
+ }
+ release_sock(sk);
+ smc_close_cleanup_listen(sk);
+ flush_work(&smc->tcp_listen_work);
+ lock_sock(sk);
+ break;
+ case SMC_ACTIVE:
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state == SMC_ACTIVE) {
+ /* send close request */
+ rc = smc_close_final(conn);
+ sk->sk_state = SMC_PEERCLOSEWAIT1;
+ } else {
+ /* peer event has changed the state */
+ goto again;
+ }
+ break;
+ case SMC_APPFINCLOSEWAIT:
+ /* socket already shutdown wr or both (active close) */
+ if (txflags->peer_done_writing &&
+ !txflags->peer_conn_closed) {
+ /* just shutdown wr done, send close request */
+ rc = smc_close_final(conn);
+ }
+ sk->sk_state = SMC_CLOSED;
+ smc_close_wait_tx_pends(smc);
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ if (!smc_cdc_rxed_any_close(conn))
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_err != ECONNABORTED) {
+ /* confirm close from peer */
+ rc = smc_close_final(conn);
+ if (rc)
+ break;
+ }
+ if (smc_cdc_rxed_any_close(conn))
+ /* peer has closed the socket already */
+ sk->sk_state = SMC_CLOSED;
+ else
+ /* peer has just issued a shutdown write */
+ sk->sk_state = SMC_PEERFINCLOSEWAIT;
+ smc_close_wait_tx_pends(smc);
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ /* peer sending PeerConnectionClosed will cause transition */
+ break;
+ case SMC_PROCESSABORT:
+ cancel_work_sync(&conn->tx_work);
+ smc_close_abort(conn);
+ sk->sk_state = SMC_CLOSED;
+ smc_close_wait_tx_pends(smc);
+ break;
+ case SMC_PEERABORTWAIT:
+ case SMC_CLOSED:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(&smc->sk);
+ return rc;
+}
+
+static void smc_close_passive_abort_received(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+ struct sock *sk = &smc->sk;
+
+ switch (sk->sk_state) {
+ case SMC_ACTIVE:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ smc_close_abort(&smc->conn);
+ sk->sk_state = SMC_PROCESSABORT;
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ if (txflags->peer_done_writing &&
+ !txflags->peer_conn_closed) {
+ /* just shutdown, but not yet closed locally */
+ smc_close_abort(&smc->conn);
+ sk->sk_state = SMC_PROCESSABORT;
+ } else {
+ sk->sk_state = SMC_CLOSED;
+ }
+ break;
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_INIT:
+ case SMC_PROCESSABORT:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+}
+
+/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort,
+ * or peer_done_writing.
+ * Called under tasklet context.
+ */
+void smc_close_passive_received(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *rxflags =
+ &smc->conn.local_rx_ctrl.conn_state_flags;
+ struct sock *sk = &smc->sk;
+ int old_state;
+
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ if (smc->clcsock && smc->clcsock->sk)
+ smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
+ sock_set_flag(&smc->sk, SOCK_DONE);
+
+ old_state = sk->sk_state;
+
+ if (rxflags->peer_conn_abort) {
+ smc_close_passive_abort_received(smc);
+ goto wakeup;
+ }
+
+ switch (sk->sk_state) {
+ case SMC_INIT:
+ if (atomic_read(&smc->conn.bytes_to_rcv) ||
+ (rxflags->peer_done_writing &&
+ !rxflags->peer_conn_closed))
+ sk->sk_state = SMC_APPCLOSEWAIT1;
+ else
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_ACTIVE:
+ sk->sk_state = SMC_APPCLOSEWAIT1;
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ if (rxflags->peer_done_writing)
+ sk->sk_state = SMC_PEERCLOSEWAIT2;
+ /* fall through to check for closing */
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ if (!smc_cdc_rxed_any_close(&smc->conn))
+ break;
+ if (sock_flag(sk, SOCK_DEAD) &&
+ (sk->sk_shutdown == SHUTDOWN_MASK)) {
+ /* smc_release has already been called locally */
+ sk->sk_state = SMC_CLOSED;
+ } else {
+ /* just shutdown, but not yet closed locally */
+ sk->sk_state = SMC_APPFINCLOSEWAIT;
+ }
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ case SMC_PROCESSABORT:
+ case SMC_CLOSED:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+wakeup:
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(sk);
+ sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
+ sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
+
+ if ((sk->sk_state == SMC_CLOSED) &&
+ (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) {
+ smc_conn_free(&smc->conn);
+ schedule_delayed_work(&smc->sock_put_work,
+ SMC_CLOSE_SOCK_PUT_DELAY);
+ }
+}
+
+void smc_close_sock_put_work(struct work_struct *work)
+{
+ struct smc_sock *smc = container_of(to_delayed_work(work),
+ struct smc_sock,
+ sock_put_work);
+
+ smc->sk.sk_prot->unhash(&smc->sk);
+ sock_put(&smc->sk);
+}
+
+int smc_close_shutdown_write(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ int rc = 0;
+
+ if (sock_flag(sk, SOCK_LINGER))
+ timeout = sk->sk_lingertime;
+
+again:
+ old_state = sk->sk_state;
+ switch (old_state) {
+ case SMC_ACTIVE:
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ /* send close wr request */
+ rc = smc_close_wr(conn);
+ if (sk->sk_state == SMC_ACTIVE)
+ sk->sk_state = SMC_PEERCLOSEWAIT1;
+ else
+ goto again;
+ break;
+ case SMC_APPCLOSEWAIT1:
+ /* passive close */
+ if (!smc_cdc_rxed_any_close(conn))
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ /* confirm close from peer */
+ rc = smc_close_wr(conn);
+ sk->sk_state = SMC_APPCLOSEWAIT2;
+ break;
+ case SMC_APPCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_PROCESSABORT:
+ case SMC_PEERABORTWAIT:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(&smc->sk);
+ return rc;
+}