diff options
Diffstat (limited to 'net/smc')
| -rw-r--r-- | net/smc/Kconfig | 1 | ||||
| -rw-r--r-- | net/smc/Makefile | 1 | ||||
| -rw-r--r-- | net/smc/af_smc.c | 541 | ||||
| -rw-r--r-- | net/smc/smc.h | 12 | ||||
| -rw-r--r-- | net/smc/smc_cdc.c | 7 | ||||
| -rw-r--r-- | net/smc/smc_clc.c | 23 | ||||
| -rw-r--r-- | net/smc/smc_clc.h | 20 | ||||
| -rw-r--r-- | net/smc/smc_close.c | 120 | ||||
| -rw-r--r-- | net/smc/smc_close.h | 3 | ||||
| -rw-r--r-- | net/smc/smc_core.c | 547 | ||||
| -rw-r--r-- | net/smc/smc_core.h | 41 | ||||
| -rw-r--r-- | net/smc/smc_diag.c | 1 | ||||
| -rw-r--r-- | net/smc/smc_ib.c | 40 | ||||
| -rw-r--r-- | net/smc/smc_ib.h | 4 | ||||
| -rw-r--r-- | net/smc/smc_ism.c | 32 | ||||
| -rw-r--r-- | net/smc/smc_llc.c | 11 | ||||
| -rw-r--r-- | net/smc/smc_pnet.c | 68 | ||||
| -rw-r--r-- | net/smc/smc_pnet.h | 7 | ||||
| -rw-r--r-- | net/smc/smc_rx.c | 37 | ||||
| -rw-r--r-- | net/smc/smc_tx.c | 34 | ||||
| -rw-r--r-- | net/smc/smc_wr.c | 45 | ||||
| -rw-r--r-- | net/smc/smc_wr.h | 10 |
22 files changed, 1036 insertions, 569 deletions
diff --git a/net/smc/Kconfig b/net/smc/Kconfig index c717ef0896aa..f54a70b8da82 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config SMC tristate "SMC socket protocol family" depends on INET && INFINIBAND diff --git a/net/smc/Makefile b/net/smc/Makefile index 4df96b4b8130..cb1254541f37 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 77ef53596d18..b997072c72e5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * @@ -24,6 +25,7 @@ #include <linux/in.h> #include <linux/sched/signal.h> #include <linux/if_vlan.h> +#include <linux/rcupdate_wait.h> #include <net/sock.h> #include <net/tcp.h> @@ -122,31 +124,16 @@ struct proto smc_proto6 = { }; EXPORT_SYMBOL_GPL(smc_proto6); -static int smc_release(struct socket *sock) +static void smc_restore_fallback_changes(struct smc_sock *smc) { - struct sock *sk = sock->sk; - struct smc_sock *smc; - int rc = 0; - - if (!sk) - goto out; - - smc = smc_sk(sk); - - /* cleanup for a dangling non-blocking connect */ - if (smc->connect_info && sk->sk_state == SMC_INIT) - tcp_abort(smc->clcsock->sk, ECONNABORTED); - flush_work(&smc->connect_work); - kfree(smc->connect_info); - smc->connect_info = NULL; + smc->clcsock->file->private_data = smc->sk.sk_socket; + smc->clcsock->file = NULL; +} - if (sk->sk_state == SMC_LISTEN) - /* smc_close_non_accepted() is called and acquires - * sock lock for child sockets again - */ - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); - else - lock_sock(sk); +static int __smc_release(struct smc_sock *smc) +{ + struct sock *sk = &smc->sk; + int rc = 0; if (!smc->use_fallback) { rc = smc_close_active(smc); @@ -161,26 +148,57 @@ static int smc_release(struct socket *sock) } sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); + smc_restore_fallback_changes(smc); } sk->sk_prot->unhash(sk); if (sk->sk_state == SMC_CLOSED) { if (smc->clcsock) { - mutex_lock(&smc->clcsock_release_lock); - sock_release(smc->clcsock); - smc->clcsock = NULL; - mutex_unlock(&smc->clcsock_release_lock); + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); } if (!smc->use_fallback) smc_conn_free(&smc->conn); } + return rc; +} + +static int smc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = 0; + + if (!sk) + goto out; + + sock_hold(sk); /* sock_put below */ + smc = smc_sk(sk); + + /* cleanup for a dangling non-blocking connect */ + if (smc->connect_nonblock && sk->sk_state == SMC_INIT) + tcp_abort(smc->clcsock->sk, ECONNABORTED); + flush_work(&smc->connect_work); + + if (sk->sk_state == SMC_LISTEN) + /* smc_close_non_accepted() is called and acquires + * sock lock for child sockets again + */ + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + else + lock_sock(sk); + + rc = __smc_release(smc); + /* detach socket */ sock_orphan(sk); sock->sk = NULL; release_sock(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ out: return rc; @@ -255,7 +273,7 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, /* Check if socket is already active */ rc = -EINVAL; - if (sk->sk_state != SMC_INIT) + if (sk->sk_state != SMC_INIT || smc->connect_nonblock) goto out_rel; smc->clcsock->sk->sk_reuse = sk->sk_reuse; @@ -446,12 +464,22 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->qp_mtu; } +static void smc_switch_to_fallback(struct smc_sock *smc) +{ + smc->use_fallback = true; + if (smc->sk.sk_socket && smc->sk.sk_socket->file) { + smc->clcsock->file = smc->sk.sk_socket->file; + smc->clcsock->file->private_data = smc->clcsock; + } +} + /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc->use_fallback = true; + smc_switch_to_fallback(smc); smc->fallback_rsn = reason_code; smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; @@ -491,46 +519,41 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code, mutex_unlock(&smc_client_lgr_pending); smc_conn_free(&smc->conn); + smc->connect_nonblock = 0; return reason_code; } /* check if there is a rdma device available for this connection. */ /* called for connect and listen */ -static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, - u8 *ibport, unsigned short vlan_id, u8 gid[]) +static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) { - int reason_code = 0; - /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ - smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id, - gid); - if (!(*ibdev)) - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - - return reason_code; + smc_pnet_find_roce_resource(smc->clcsock->sk, ini); + if (!ini->ib_dev) + return SMC_CLC_DECL_NOSMCRDEV; + return 0; } /* check if there is an ISM device available for this connection. */ /* called for connect and listen */ -static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev) +static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) { /* Find ISM device with same PNETID as connecting interface */ - smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev); - if (!(*ismdev)) - return SMC_CLC_DECL_CNFERR; /* configuration error */ + smc_pnet_find_ism_resource(smc->clcsock->sk, ini); + if (!ini->ism_dev) + return SMC_CLC_DECL_NOSMCDDEV; return 0; } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, - struct smcd_dev *ismdev, - unsigned short vlan_id) + struct smc_init_info *ini) { - if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id)) - return SMC_CLC_DECL_CNFERR; + if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) + return SMC_CLC_DECL_ISMVLANERR; return 0; } @@ -538,12 +561,11 @@ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, * used, the VLAN ID will be registered again during the connection setup. */ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, - struct smcd_dev *ismdev, - unsigned short vlan_id) + struct smc_init_info *ini) { if (!is_smcd) return 0; - if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id)) + if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id)) return SMC_CLC_DECL_CNFERR; return 0; } @@ -551,13 +573,12 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, /* CLC handshake during connect */ static int smc_connect_clc(struct smc_sock *smc, int smc_type, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport, - u8 gid[], struct smcd_dev *ismdev) + struct smc_init_info *ini) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev); + rc = smc_clc_send_proposal(smc, smc_type, ini); if (rc) return rc; /* receive SMC Accept CLC message */ @@ -568,23 +589,19 @@ static int smc_connect_clc(struct smc_sock *smc, int smc_type, /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport) + struct smc_init_info *ini) { - int local_contact = SMC_FIRST_CONTACT; struct smc_link *link; int reason_code = 0; + ini->is_smcd = false; + ini->ib_lcl = &aclc->lcl; + ini->ib_clcqpn = ntoh24(aclc->qpn); + ini->srv_first_contact = aclc->hdr.flag; + mutex_lock(&smc_client_lgr_pending); - local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, - ibport, ntoh24(aclc->qpn), &aclc->lcl, - NULL, 0); - if (local_contact < 0) { - if (local_contact == -ENOMEM) - reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (local_contact == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ - else - reason_code = SMC_CLC_DECL_INTERR; /* other error */ + reason_code = smc_conn_create(smc, ini); + if (reason_code) { mutex_unlock(&smc_client_lgr_pending); return reason_code; } @@ -594,45 +611,48 @@ static int smc_connect_rdma(struct smc_sock *smc, /* create send buffer and rmb */ if (smc_buf_create(smc, false)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, + ini->cln_first_contact); - if (local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); if (smc_rmb_rtoken_handling(&smc->conn, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, - local_contact); + ini->cln_first_contact); smc_close_init(smc); smc_rx_init(smc); - if (local_contact == SMC_FIRST_CONTACT) { + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { if (smc_ib_ready_link(link)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, - local_contact); + ini->cln_first_contact); } else { if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, - local_contact); + ini->cln_first_contact); } smc_rmb_sync_sg_for_device(&smc->conn); reason_code = smc_clc_send_confirm(smc); if (reason_code) - return smc_connect_abort(smc, reason_code, local_contact); + return smc_connect_abort(smc, reason_code, + ini->cln_first_contact); smc_tx_init(smc); - if (local_contact == SMC_FIRST_CONTACT) { + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ reason_code = smc_clnt_conf_first_link(smc); if (reason_code) return smc_connect_abort(smc, reason_code, - local_contact); + ini->cln_first_contact); } mutex_unlock(&smc_client_lgr_pending); smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -642,23 +662,26 @@ static int smc_connect_rdma(struct smc_sock *smc, /* setup for ISM connection of client */ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, - struct smcd_dev *ismdev) + struct smc_init_info *ini) { - int local_contact = SMC_FIRST_CONTACT; int rc = 0; + ini->is_smcd = true; + ini->ism_gid = aclc->gid; + ini->srv_first_contact = aclc->hdr.flag; + /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); - local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0, - NULL, ismdev, aclc->gid); - if (local_contact < 0) { + rc = smc_conn_create(smc, ini); + if (rc) { mutex_unlock(&smc_server_lgr_pending); - return SMC_CLC_DECL_MEM; + return rc; } /* Create send and receive buffers */ if (smc_buf_create(smc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, + ini->cln_first_contact); smc_conn_save_peer_info(smc, aclc); smc_close_init(smc); @@ -667,10 +690,11 @@ static int smc_connect_ism(struct smc_sock *smc, rc = smc_clc_send_confirm(smc); if (rc) - return smc_connect_abort(smc, rc, local_contact); + return smc_connect_abort(smc, rc, ini->cln_first_contact); mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -682,15 +706,9 @@ static int __smc_connect(struct smc_sock *smc) { bool ism_supported = false, rdma_supported = false; struct smc_clc_msg_accept_confirm aclc; - struct smc_ib_device *ibdev; - struct smcd_dev *ismdev; - u8 gid[SMC_GID_SIZE]; - unsigned short vlan; + struct smc_init_info ini = {0}; int smc_type; int rc = 0; - u8 ibport; - - sock_hold(&smc->sk); /* sock put in passive closing */ if (smc->use_fallback) return smc_connect_fallback(smc, smc->fallback_rsn); @@ -703,20 +721,21 @@ static int __smc_connect(struct smc_sock *smc) if (using_ipsec(smc)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); - /* check for VLAN ID */ - if (smc_vlan_by_tcpsk(smc->clcsock, &vlan)) - return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(smc->clcsock, &ini)) + return smc_connect_decline_fallback(smc, + SMC_CLC_DECL_GETVLANERR); /* check if there is an ism device available */ - if (!smc_check_ism(smc, &ismdev) && - !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) { + if (!smc_find_ism_device(smc, &ini) && + !smc_connect_ism_vlan_setup(smc, &ini)) { /* ISM is supported for this connection */ ism_supported = true; smc_type = SMC_TYPE_D; } /* check if there is a rdma device available */ - if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) { + if (!smc_find_rdma_device(smc, &ini)) { /* RDMA is supported for this connection */ rdma_supported = true; if (ism_supported) @@ -730,25 +749,25 @@ static int __smc_connect(struct smc_sock *smc) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV); /* perform CLC handshake */ - rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev); + rc = smc_connect_clc(smc, smc_type, &aclc, &ini); if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return smc_connect_decline_fallback(smc, rc); } /* depending on previous steps, connect using rdma or ism */ if (rdma_supported && aclc.hdr.path == SMC_TYPE_R) - rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); + rc = smc_connect_rdma(smc, &aclc, &ini); else if (ism_supported && aclc.hdr.path == SMC_TYPE_D) - rc = smc_connect_ism(smc, &aclc, ismdev); + rc = smc_connect_ism(smc, &aclc, &ini); else rc = SMC_CLC_DECL_MODEUNSUPP; if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return smc_connect_decline_fallback(smc, rc); } - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return 0; } @@ -756,17 +775,31 @@ static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct smc_sock, connect_work); - int rc; + long timeo = smc->sk.sk_sndtimeo; + int rc = 0; - lock_sock(&smc->sk); - rc = kernel_connect(smc->clcsock, &smc->connect_info->addr, - smc->connect_info->alen, smc->connect_info->flags); + if (!timeo) + timeo = MAX_SCHEDULE_TIMEOUT; + lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; - goto out; - } - if (rc < 0) { - smc->sk.sk_err = -rc; + } else if ((1 << smc->clcsock->sk->sk_state) & + (TCPF_SYN_SENT | TCP_SYN_RECV)) { + rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); + if ((rc == -EPIPE) && + ((1 << smc->clcsock->sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))) + rc = 0; + } + release_sock(smc->clcsock->sk); + lock_sock(&smc->sk); + if (rc != 0 || smc->sk.sk_err) { + smc->sk.sk_state = SMC_CLOSED; + if (rc == -EPIPE || rc == -EAGAIN) + smc->sk.sk_err = EPIPE; + else if (signal_pending(current)) + smc->sk.sk_err = -sock_intr_errno(timeo); + sock_put(&smc->sk); /* passive closing */ goto out; } @@ -775,12 +808,14 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_err = -rc; out: - if (smc->sk.sk_err) - smc->sk.sk_state_change(&smc->sk); - else - smc->sk.sk_write_space(&smc->sk); - kfree(smc->connect_info); - smc->connect_info = NULL; + if (!sock_flag(&smc->sk, SOCK_DEAD)) { + if (smc->sk.sk_err) { + smc->sk.sk_state_change(&smc->sk); + } else { /* allow polling before and after fallback decision */ + smc->clcsock->sk->sk_write_space(smc->clcsock->sk); + smc->sk.sk_write_space(&smc->sk); + } + } release_sock(&smc->sk); } @@ -813,26 +848,20 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; + if (smc->connect_nonblock) { + rc = -EALREADY; + goto out; + } + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc && rc != -EINPROGRESS) + goto out; + + sock_hold(&smc->sk); /* sock put in passive closing */ if (flags & O_NONBLOCK) { - if (smc->connect_info) { - rc = -EALREADY; - goto out; - } - smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL); - if (!smc->connect_info) { - rc = -ENOMEM; - goto out; - } - smc->connect_info->alen = alen; - smc->connect_info->flags = flags ^ O_NONBLOCK; - memcpy(&smc->connect_info->addr, addr, alen); - schedule_work(&smc->connect_work); + if (schedule_work(&smc->connect_work)) + smc->connect_nonblock = 1; rc = -EINPROGRESS; } else { - rc = kernel_connect(smc->clcsock, addr, alen, flags); - if (rc) - goto out; - rc = __smc_connect(smc); if (rc < 0) goto out; @@ -872,11 +901,11 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) if (rc < 0) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { + new_sk->sk_prot->unhash(new_sk); if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); - new_sk->sk_prot->unhash(new_sk); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; @@ -927,16 +956,21 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { + new_sk->sk_prot->unhash(new_sk); if (isk->clcsock) { sock_release(isk->clcsock); isk->clcsock = NULL; } - new_sk->sk_prot->unhash(new_sk); sock_put(new_sk); /* final */ continue; } - if (new_sock) + if (new_sock) { sock_graft(new_sk, new_sock); + if (isk->use_fallback) { + smc_sk(new_sk)->clcsock->file = new_sock->file; + isk->clcsock->file->private_data = isk->clcsock; + } + } return new_sk; } return NULL; @@ -947,31 +981,14 @@ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); + sock_hold(sk); /* sock_put below */ lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; - if (!smc->use_fallback) { - smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); - sk->sk_shutdown |= SHUTDOWN_MASK; - } - if (smc->clcsock) { - struct socket *tcp; - - tcp = smc->clcsock; - smc->clcsock = NULL; - sock_release(tcp); - } - if (smc->use_fallback) { - sock_put(sk); /* passive closing */ - sk->sk_state = SMC_CLOSED; - } else { - if (sk->sk_state == SMC_CLOSED) - smc_conn_free(&smc->conn); - } + __smc_release(smc); release_sock(sk); - sk->sk_prot->unhash(sk); + sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ } @@ -1037,13 +1054,13 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); if (lsmc->sk.sk_state == SMC_LISTEN) { + lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); + release_sock(&lsmc->sk); } else { /* no longer listening */ smc_close_non_accepted(newsmcsk); } - release_sock(&lsmc->sk); /* Wake up accept */ lsmc->sk.sk_data_ready(&lsmc->sk); @@ -1087,7 +1104,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, return; } smc_conn_free(&new_smc->conn); - new_smc->use_fallback = true; + smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = reason_code; if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code) < 0) { @@ -1099,7 +1116,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, } /* listen worker: check prefixes */ -static int smc_listen_rdma_check(struct smc_sock *new_smc, +static int smc_listen_prfx_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc) { struct smc_clc_msg_proposal_prefix *pclc_prfx; @@ -1107,25 +1124,21 @@ static int smc_listen_rdma_check(struct smc_sock *new_smc, pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (smc_clc_prfx_match(newclcsock, pclc_prfx)) - return SMC_CLC_DECL_CNFERR; + return SMC_CLC_DECL_DIFFPREFIX; return 0; } /* listen worker: initialize connection and buffers */ static int smc_listen_rdma_init(struct smc_sock *new_smc, - struct smc_clc_msg_proposal *pclc, - struct smc_ib_device *ibdev, u8 ibport, - int *local_contact) + struct smc_init_info *ini) { + int rc; + /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0, - &pclc->lcl, NULL, 0); - if (*local_contact < 0) { - if (*local_contact == -ENOMEM) - return SMC_CLC_DECL_MEM;/* insufficient memory*/ - return SMC_CLC_DECL_INTERR; /* other error */ - } + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; /* create send buffer and rmb */ if (smc_buf_create(new_smc, false)) @@ -1137,33 +1150,30 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, /* listen worker: initialize connection and buffers for SMC-D */ static int smc_listen_ism_init(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, - struct smcd_dev *ismdev, - int *local_contact) + struct smc_init_info *ini) { struct smc_clc_msg_smcd *pclc_smcd; + int rc; pclc_smcd = smc_get_clc_msg_smcd(pclc); - *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL, - ismdev, pclc_smcd->gid); - if (*local_contact < 0) { - if (*local_contact == -ENOMEM) - return SMC_CLC_DECL_MEM;/* insufficient memory*/ - return SMC_CLC_DECL_INTERR; /* other error */ - } + ini->ism_gid = pclc_smcd->gid; + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; /* Check if peer can be reached via ISM device */ if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid, new_smc->conn.lgr->vlan_id, new_smc->conn.lgr->smcd)) { - if (*local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_lgr_forget(new_smc->conn.lgr); smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_CNFERR; + return SMC_CLC_DECL_SMCDNOTALK; } /* Create send and receive buffers */ if (smc_buf_create(new_smc, true)) { - if (*local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_lgr_forget(new_smc->conn.lgr); smc_conn_free(&new_smc->conn); return SMC_CLC_DECL_MEM; @@ -1227,15 +1237,13 @@ static void smc_listen_work(struct work_struct *work) struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm cclc; struct smc_clc_msg_proposal *pclc; - struct smc_ib_device *ibdev; + struct smc_init_info ini = {0}; bool ism_supported = false; - struct smcd_dev *ismdev; u8 buf[SMC_CLC_MAX_LEN]; - int local_contact = 0; - unsigned short vlan; - int reason_code = 0; int rc = 0; - u8 ibport; + + if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) + return smc_listen_out_err(new_smc); if (new_smc->use_fallback) { smc_listen_out_connected(new_smc); @@ -1244,7 +1252,7 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - new_smc->use_fallback = true; + smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; smc_listen_out_connected(new_smc); return; @@ -1254,17 +1262,26 @@ static void smc_listen_work(struct work_struct *work) * wait for and receive SMC Proposal CLC message */ pclc = (struct smc_clc_msg_proposal *)&buf; - reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, - SMC_CLC_PROPOSAL, CLC_WAIT_TIME); - if (reason_code) { - smc_listen_decline(new_smc, reason_code, 0); - return; - } + rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, + SMC_CLC_PROPOSAL, CLC_WAIT_TIME); + if (rc) + goto out_decl; /* IPSec connections opt out of SMC-R optimizations */ if (using_ipsec(new_smc)) { - smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0); - return; + rc = SMC_CLC_DECL_IPSEC; + goto out_decl; + } + + /* check for matching IP prefix and subnet length */ + rc = smc_listen_prfx_check(new_smc, pclc); + if (rc) + goto out_decl; + + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) { + rc = SMC_CLC_DECL_GETVLANERR; + goto out_decl; } mutex_lock(&smc_server_lgr_pending); @@ -1273,59 +1290,73 @@ static void smc_listen_work(struct work_struct *work) smc_tx_init(new_smc); /* check if ISM is available */ - if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) && - !smc_check_ism(new_smc, &ismdev) && - !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) { - ism_supported = true; + if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) { + ini.is_smcd = true; /* prepare ISM check */ + rc = smc_find_ism_device(new_smc, &ini); + if (!rc) + rc = smc_listen_ism_init(new_smc, pclc, &ini); + if (!rc) + ism_supported = true; + else if (pclc->hdr.path == SMC_TYPE_D) + goto out_unlock; /* skip RDMA and decline */ } /* check if RDMA is available */ - if (!ism_supported && - ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || - smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) || - smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) || - smc_listen_rdma_check(new_smc, pclc) || - smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, - &local_contact) || - smc_listen_rdma_reg(new_smc, local_contact))) { - /* SMC not supported, decline */ - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP, - local_contact); - return; + if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */ + /* prepare RDMA check */ + ini.is_smcd = false; + ini.ism_dev = NULL; + ini.ib_lcl = &pclc->lcl; + rc = smc_find_rdma_device(new_smc, &ini); + if (rc) { + /* no RDMA device found */ + if (pclc->hdr.path == SMC_TYPE_B) + /* neither ISM nor RDMA device found */ + rc = SMC_CLC_DECL_NOSMCDEV; + goto out_unlock; + } + rc = smc_listen_rdma_init(new_smc, &ini); + if (rc) + goto out_unlock; + rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact); + if (rc) + goto out_unlock; } /* send SMC Accept CLC message */ - rc = smc_clc_send_accept(new_smc, local_contact); - if (rc) { - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, rc, local_contact); - return; - } + rc = smc_clc_send_accept(new_smc, ini.cln_first_contact); + if (rc) + goto out_unlock; /* SMC-D does not need this lock any more */ if (ism_supported) mutex_unlock(&smc_server_lgr_pending); /* receive SMC Confirm CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM, CLC_WAIT_TIME); - if (reason_code) { + rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + SMC_CLC_CONFIRM, CLC_WAIT_TIME); + if (rc) { if (!ism_supported) - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, reason_code, local_contact); - return; + goto out_unlock; + goto out_decl; } /* finish worker */ if (!ism_supported) { - rc = smc_listen_rdma_finish(new_smc, &cclc, local_contact); + rc = smc_listen_rdma_finish(new_smc, &cclc, + ini.cln_first_contact); mutex_unlock(&smc_server_lgr_pending); if (rc) return; } smc_conn_save_peer_info(new_smc, &cclc); smc_listen_out_connected(new_smc); + return; + +out_unlock: + mutex_unlock(&smc_server_lgr_pending); +out_decl: + smc_listen_decline(new_smc, rc, ini.cln_first_contact); } static void smc_tcp_listen_work(struct work_struct *work) @@ -1372,7 +1403,8 @@ static int smc_listen(struct socket *sock, int backlog) lock_sock(sk); rc = -EINVAL; - if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN)) + if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || + smc->connect_nonblock) goto out; rc = 0; @@ -1500,8 +1532,8 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) goto out; if (msg->msg_flags & MSG_FASTOPEN) { - if (sk->sk_state == SMC_INIT) { - smc->use_fallback = true; + if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { + smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { rc = -EINVAL; @@ -1571,8 +1603,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; struct smc_sock *smc; + __poll_t mask = 0; if (!sk) return EPOLLNVAL; @@ -1582,8 +1614,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; - if (sk->sk_err) - mask |= EPOLLERR; } else { if (sk->sk_state != SMC_CLOSED) sock_poll_wait(file, sock, wait); @@ -1594,9 +1624,14 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; if (sk->sk_state == SMC_LISTEN) { /* woken up by sk_data_ready in smc_listen_work() */ - mask = smc_accept_poll(sk); + mask |= smc_accept_poll(sk); + } else if (smc->use_fallback) { /* as result of connect_work()*/ + mask |= smc->clcsock->ops->poll(file, smc->clcsock, + wait); + sk->sk_err = smc->clcsock->sk->sk_err; } else { - if (atomic_read(&smc->conn.sndbuf_space) || + if ((sk->sk_state != SMC_INIT && + atomic_read(&smc->conn.sndbuf_space)) || sk->sk_shutdown & SEND_SHUTDOWN) { mask |= EPOLLOUT | EPOLLWRNORM; } else { @@ -1702,8 +1737,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ - if (sk->sk_state == SMC_INIT) { - smc->use_fallback = true; + if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { + smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { if (!smc->use_fallback) @@ -1711,14 +1746,18 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, } break; case TCP_NODELAY: - if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { if (val && !smc->use_fallback) mod_delayed_work(system_wq, &smc->conn.tx_work, 0); } break; case TCP_CORK: - if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { if (!val && !smc->use_fallback) mod_delayed_work(system_wq, &smc->conn.tx_work, 0); @@ -1999,24 +2038,30 @@ static int __init smc_init(void) rc = smc_pnet_init(); if (rc) - return rc; + goto out_pernet_subsys; + + rc = smc_core_init(); + if (rc) { + pr_err("%s: smc_core_init fails with %d\n", __func__, rc); + goto out_pnet; + } rc = smc_llc_init(); if (rc) { pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = smc_cdc_init(); if (rc) { pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = proto_register(&smc_proto, 1); if (rc) { pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); - goto out_pnet; + goto out_core; } rc = proto_register(&smc_proto6, 1); @@ -2048,21 +2093,27 @@ out_proto6: proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); +out_core: + smc_core_exit(); out_pnet: smc_pnet_exit(); +out_pernet_subsys: + unregister_pernet_subsys(&smc_net_ops); + return rc; } static void __exit smc_exit(void) { - smc_core_exit(); static_branch_disable(&tcp_have_smc); - smc_ib_unregister_client(); sock_unregister(PF_SMC); + smc_core_exit(); + smc_ib_unregister_client(); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); unregister_pernet_subsys(&smc_net_ops); + rcu_barrier(); } module_init(smc_init); diff --git a/net/smc/smc.h b/net/smc/smc.h index adbdf195eb08..be11ba41190f 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -188,12 +188,7 @@ struct smc_connection { * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ -}; - -struct smc_connect_info { - int flags; - int alen; - struct sockaddr addr; + u8 killed : 1; /* abnormal termination */ }; struct smc_sock { /* smc sock container */ @@ -201,7 +196,6 @@ struct smc_sock { /* smc sock container */ struct socket *clcsock; /* internal tcp socket */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ - struct smc_connect_info *connect_info; /* connect address & flags */ struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ @@ -219,6 +213,10 @@ struct smc_sock { /* smc sock container */ * started, waiting for unsent * data to be sent */ + u8 connect_nonblock : 1; + /* non-blocking connect in + * flight + */ struct mutex clcsock_release_lock; /* protects clcsock of a listen * socket diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index d0b0f4c865b4..164f1584861b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -63,7 +63,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, wr_rdma_buf, (struct smc_wr_tx_pend_priv **)pend); - if (!conn->alert_token_local) + if (conn->killed) /* abnormal termination */ rc = -EPIPE; return rc; @@ -131,6 +131,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) { int rc; + if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) + return -EPIPE; + if (conn->lgr->is_smcd) { spin_lock_bh(&conn->send_lock); rc = smcd_cdc_msg_send(conn); @@ -328,7 +331,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data) struct smcd_cdc_msg cdc; struct smc_sock *smc; - if (!conn) + if (!conn || conn->killed) return; data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index d53fd588d1f5..0879f7bed967 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -97,17 +97,19 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; - for_ifa(in_dev) { + + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (!inet_ifa_match(ipv4, ifa)) continue; prop->prefix_len = inet_mask_len(ifa->ifa_mask); prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ return 0; - } endfor_ifa(in_dev); + } return -ENOENT; } @@ -190,14 +192,15 @@ static int smc_clc_prfx_match4_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop) { struct in_device *in_dev = __in_dev_get_rcu(dev); + const struct in_ifaddr *ifa; if (!in_dev) return -ENODEV; - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && inet_ifa_match(prop->outgoing_subnet, ifa)) return 0; - } endfor_ifa(in_dev); + } return -ENOENT; } @@ -346,7 +349,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = 1; - smc_lgr_terminate(smc->conn.lgr); + smc_lgr_terminate(smc->conn.lgr, true); } } @@ -385,8 +388,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) /* send CLC PROPOSAL message across internal TCP socket */ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_ib_device *ibdev, u8 ibport, u8 gid[], - struct smcd_dev *ismdev) + struct smc_init_info *ini) { struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_msg_proposal_prefix pclc_prfx; @@ -416,8 +418,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, /* add SMC-R specifics */ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&pclc.lcl.gid, gid, SMC_GID_SIZE); - memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN); + memcpy(&pclc.lcl.gid, ini->ib_gid, SMC_GID_SIZE); + memcpy(&pclc.lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], + ETH_ALEN); pclc.iparea_offset = htons(0); } if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { @@ -425,7 +428,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, memset(&pclc_smcd, 0, sizeof(pclc_smcd)); plen += sizeof(pclc_smcd); pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET); - pclc_smcd.gid = ismdev->local_gid; + pclc_smcd.gid = ini->ism_dev->local_gid; } pclc.hdr.length = htons(plen); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 24658e8c0de4..ca209272e5fa 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -34,16 +34,22 @@ #define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ #define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */ #define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */ -#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found */ +#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */ +#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */ +#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */ +#define SMC_CLC_DECL_SMCDNOTALK 0x03030003 /* SMC-D dev can't talk to peer */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ +#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ +#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ +#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ -#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ -#define SMC_CLC_DECL_ERR_RTOK 0x99990001 /* rtoken handling failed */ -#define SMC_CLC_DECL_ERR_RDYLNK 0x99990002 /* ib ready link failed */ -#define SMC_CLC_DECL_ERR_REGRMB 0x99990003 /* reg rmb failed */ +#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ +#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ +#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ +#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ struct smc_clc_msg_hdr { /* header1 of clc messages */ u8 eyecatcher[4]; /* eye catcher */ @@ -179,6 +185,7 @@ smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) } struct smcd_dev; +struct smc_init_info; int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); @@ -186,8 +193,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type, unsigned long timeout); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_ib_device *smcibdev, u8 ibport, u8 gid[], - struct smcd_dev *ismdev); + struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc); int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 2ad37e998509..290270c821ca 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -13,13 +13,28 @@ #include <linux/sched/signal.h> #include <net/sock.h> +#include <net/tcp.h> #include "smc.h" #include "smc_tx.h" #include "smc_cdc.h" #include "smc_close.h" -#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ) +/* release the clcsock that is assigned to the smc_sock */ +void smc_clcsock_release(struct smc_sock *smc) +{ + struct socket *tcp; + + if (smc->listen_smc && current_work() != &smc->smc_listen_work) + cancel_work_sync(&smc->smc_listen_work); + mutex_lock(&smc->clcsock_release_lock); + if (smc->clcsock) { + tcp = smc->clcsock; + smc->clcsock = NULL; + sock_release(tcp); + } + mutex_unlock(&smc->clcsock_release_lock); +} static void smc_close_cleanup_listen(struct sock *parent) { @@ -49,8 +64,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || - (sk->sk_err == ECONNABORTED) || - (sk->sk_err == ECONNRESET), + sk->sk_err == ECONNABORTED || + sk->sk_err == ECONNRESET || + smc->conn.killed, &wait); if (rc) break; @@ -79,68 +95,73 @@ static int smc_close_final(struct smc_connection *conn) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; else conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1; + if (conn->killed) + return -EPIPE; return smc_cdc_get_slot_and_msg_send(conn); } -static int smc_close_abort(struct smc_connection *conn) +int smc_close_abort(struct smc_connection *conn) { conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; return smc_cdc_get_slot_and_msg_send(conn); } +static void smc_close_cancel_work(struct smc_sock *smc) +{ + struct sock *sk = &smc->sk; + + release_sock(sk); + cancel_work_sync(&smc->conn.close_work); + cancel_delayed_work_sync(&smc->conn.tx_work); + lock_sock(sk); + sk->sk_state = SMC_CLOSED; +} + /* terminate smc socket abnormally - active abort * link group is terminated, i.e. RDMA communication no longer possible */ -static void smc_close_active_abort(struct smc_sock *smc) +void smc_close_active_abort(struct smc_sock *smc) { struct sock *sk = &smc->sk; - - struct smc_cdc_conn_state_flags *txflags = - &smc->conn.local_tx_ctrl.conn_state_flags; + bool release_clcsock = false; if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { sk->sk_err = ECONNABORTED; - if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_err = ECONNABORTED; - smc->clcsock->sk->sk_state_change(smc->clcsock->sk); - } + if (smc->clcsock && smc->clcsock->sk) + tcp_abort(smc->clcsock->sk, ECONNABORTED); } switch (sk->sk_state) { case SMC_ACTIVE: sk->sk_state = SMC_PEERABORTWAIT; - release_sock(sk); - cancel_delayed_work_sync(&smc->conn.tx_work); - lock_sock(sk); + smc_close_cancel_work(smc); + sk->sk_state = SMC_CLOSED; sock_put(sk); /* passive closing */ break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: - if (!smc_cdc_rxed_any_close(&smc->conn)) - sk->sk_state = SMC_PEERABORTWAIT; - else - sk->sk_state = SMC_CLOSED; - release_sock(sk); - cancel_delayed_work_sync(&smc->conn.tx_work); - lock_sock(sk); + smc_close_cancel_work(smc); + sk->sk_state = SMC_CLOSED; + sock_put(sk); /* postponed passive closing */ break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: - if (!txflags->peer_conn_closed) { - /* just SHUTDOWN_SEND done */ - sk->sk_state = SMC_PEERABORTWAIT; - } else { - sk->sk_state = SMC_CLOSED; - } + case SMC_PEERFINCLOSEWAIT: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); + sk->sk_state = SMC_CLOSED; + smc_conn_free(&smc->conn); + release_clcsock = true; sock_put(sk); /* passive closing */ break; case SMC_PROCESSABORT: case SMC_APPFINCLOSEWAIT: + sk->sk_state = SMC_PEERABORTWAIT; + smc_close_cancel_work(smc); sk->sk_state = SMC_CLOSED; - break; - case SMC_PEERFINCLOSEWAIT: - sock_put(sk); /* passive closing */ + smc_conn_free(&smc->conn); + release_clcsock = true; break; case SMC_INIT: case SMC_PEERABORTWAIT: @@ -150,6 +171,12 @@ static void smc_close_active_abort(struct smc_sock *smc) sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); + + if (release_clcsock) { + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); + } } static inline bool smc_close_sent_any_close(struct smc_connection *conn) @@ -199,8 +226,6 @@ again: if (sk->sk_state == SMC_ACTIVE) { /* send close request */ rc = smc_close_final(conn); - if (rc) - break; sk->sk_state = SMC_PEERCLOSEWAIT1; } else { /* peer event has changed the state */ @@ -213,8 +238,6 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); - if (rc) - break; } sk->sk_state = SMC_CLOSED; break; @@ -230,8 +253,6 @@ again: goto again; /* confirm close from peer */ rc = smc_close_final(conn); - if (rc) - break; if (smc_cdc_rxed_any_close(conn)) { /* peer has closed the socket already */ sk->sk_state = SMC_CLOSED; @@ -247,8 +268,6 @@ again: !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); - if (rc) - break; } /* peer sending PeerConnectionClosed will cause transition */ break; @@ -256,10 +275,12 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - smc_close_abort(conn); + rc = smc_close_abort(conn); sk->sk_state = SMC_CLOSED; break; case SMC_PEERABORTWAIT: + sk->sk_state = SMC_CLOSED; + break; case SMC_CLOSED: /* nothing to do, add tracing in future patch */ break; @@ -321,18 +342,13 @@ static void smc_close_passive_work(struct work_struct *work) close_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); struct smc_cdc_conn_state_flags *rxflags; + bool release_clcsock = false; struct sock *sk = &smc->sk; int old_state; lock_sock(sk); old_state = sk->sk_state; - if (!conn->alert_token_local) { - /* abnormal termination */ - smc_close_active_abort(smc); - goto wakeup; - } - rxflags = &conn->local_rx_ctrl.conn_state_flags; if (rxflags->peer_conn_abort) { /* peer has not received all data */ @@ -400,13 +416,13 @@ wakeup: if ((sk->sk_state == SMC_CLOSED) && (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { smc_conn_free(conn); - if (smc->clcsock) { - sock_release(smc->clcsock); - smc->clcsock = NULL; - } + if (smc->clcsock) + release_clcsock = true; } } release_sock(sk); + if (release_clcsock) + smc_clcsock_release(smc); sock_put(sk); /* sock_hold done by schedulers of close_work */ } @@ -434,8 +450,6 @@ again: goto again; /* send close wr request */ rc = smc_close_wr(conn); - if (rc) - break; sk->sk_state = SMC_PEERCLOSEWAIT1; break; case SMC_APPCLOSEWAIT1: @@ -449,8 +463,6 @@ again: goto again; /* confirm close from peer */ rc = smc_close_wr(conn); - if (rc) - break; sk->sk_state = SMC_APPCLOSEWAIT2; break; case SMC_APPCLOSEWAIT2: diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index 19eb6a211c23..634fea2b7c95 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -23,5 +23,8 @@ void smc_close_wake_tx_prepared(struct smc_sock *smc); int smc_close_active(struct smc_sock *smc); int smc_close_shutdown_write(struct smc_sock *smc); void smc_close_init(struct smc_sock *smc); +void smc_clcsock_release(struct smc_sock *smc); +int smc_close_abort(struct smc_connection *conn); +void smc_close_active_abort(struct smc_sock *smc); #endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 53a17cfa61af..bb92c7c6214c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -13,6 +13,8 @@ #include <linux/if_vlan.h> #include <linux/random.h> #include <linux/workqueue.h> +#include <linux/wait.h> +#include <linux/reboot.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> @@ -39,23 +41,46 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ .num = 0, }; +static atomic_t lgr_cnt; /* number of existing link groups */ +static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); + static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); +/* return head of link group list and its lock for a given link group */ +static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, + spinlock_t **lgr_lock) +{ + if (lgr->is_smcd) { + *lgr_lock = &lgr->smcd->lgr_lock; + return &lgr->smcd->lgr_list; + } + + *lgr_lock = &smc_lgr_list.lock; + return &smc_lgr_list.list; +} + static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ - mod_delayed_work(system_wq, &lgr->free_work, - (!lgr->is_smcd && lgr->role == SMC_CLNT) ? - SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); + if (!lgr->freeing && !lgr->freefast) { + mod_delayed_work(system_wq, &lgr->free_work, + (!lgr->is_smcd && lgr->role == SMC_CLNT) ? + SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); + } } void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr) { - mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST); + if (!lgr->freeing && !lgr->freefast) { + lgr->freefast = 1; + mod_delayed_work(system_wq, &lgr->free_work, + SMC_LGR_FREE_DELAY_FAST); + } } /* Register connection's alert token in our lookup structure. @@ -134,16 +159,17 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); + conn->lgr = NULL; } /* Send delete link, either as client to request the initiation * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). */ -static int smc_link_send_delete(struct smc_link *lnk) +static int smc_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) { + !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { smc_llc_link_deleting(lnk); return 0; } @@ -157,69 +183,84 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group *lgr = container_of(to_delayed_work(work), struct smc_link_group, free_work); + spinlock_t *lgr_lock; + struct smc_link *lnk; bool conns; - spin_lock_bh(&smc_lgr_list.lock); + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->freeing) { + spin_unlock_bh(lgr_lock); + return; + } read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); if (!conns) { /* number of lgr connections is no longer zero */ - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(lgr_lock); return; } - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); /* remove from smc_lgr_list */ - spin_unlock_bh(&smc_lgr_list.lock); + list_del_init(&lgr->list); /* remove from smc_lgr_list */ + lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (!lgr->is_smcd && !lgr->terminating) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* try to send del link msg, on error free lgr immediately */ if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk)) { + !smc_link_send_delete(lnk, true)) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); + spin_unlock_bh(lgr_lock); return; } } + lgr->freeing = 1; /* this instance does the freeing, no new schedule */ + spin_unlock_bh(lgr_lock); + cancel_delayed_work(&lgr->free_work); + + if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(lnk); + if (lgr->is_smcd && !lgr->terminating) + smc_ism_signal_shutdown(lgr); + smc_lgr_free(lgr); +} - if (!delayed_work_pending(&lgr->free_work)) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; +static void smc_lgr_terminate_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + terminate_work); - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); - if (lgr->is_smcd) - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); - } + smc_lgr_terminate(lgr, true); } /* create a new SMC link group */ -static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, - struct smc_ib_device *smcibdev, u8 ibport, - char *peer_systemid, unsigned short vlan_id, - struct smcd_dev *smcismdev, u64 peer_gid) +static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; + struct list_head *lgr_list; struct smc_link *lnk; + spinlock_t *lgr_lock; u8 rndvec[3]; int rc = 0; int i; - if (is_smcd && vlan_id) { - rc = smc_ism_get_vlan(smcismdev, vlan_id); - if (rc) + if (ini->is_smcd && ini->vlan_id) { + if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) { + rc = SMC_CLC_DECL_ISMVLANERR; goto out; + } } lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { - rc = -ENOMEM; - goto out; + rc = SMC_CLC_DECL_MEM; + goto ism_put_vlan; } - lgr->is_smcd = is_smcd; + lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; - lgr->vlan_id = vlan_id; + lgr->terminating = 0; + lgr->freefast = 0; + lgr->freeing = 0; + lgr->vlan_id = ini->vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); @@ -230,30 +271,42 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); + INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work); lgr->conns_all = RB_ROOT; - if (is_smcd) { + if (ini->is_smcd) { /* SMC-D specific settings */ - lgr->peer_gid = peer_gid; - lgr->smcd = smcismdev; + get_device(&ini->ism_dev->dev); + lgr->peer_gid = ini->ism_gid; + lgr->smcd = ini->ism_dev; + lgr_list = &ini->ism_dev->lgr_list; + lgr_lock = &lgr->smcd->lgr_lock; + lgr->peer_shutdown = 0; + atomic_inc(&ini->ism_dev->lgr_cnt); } else { /* SMC-R specific settings */ + get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; - memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); + memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, + SMC_SYSTEMID_LEN); lnk = &lgr->lnk[SMC_SINGLE_LINK]; /* initialize link */ lnk->state = SMC_LNK_ACTIVATING; lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = smcibdev; - lnk->ibport = ibport; - lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; - if (!smcibdev->initialized) - smc_ib_setup_per_ibdev(smcibdev); + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; + lnk->path_mtu = + ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + if (!ini->ib_dev->initialized) + smc_ib_setup_per_ibdev(ini->ib_dev); get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - vlan_id, lnk->gid, &lnk->sgid_index); + ini->vlan_id, lnk->gid, + &lnk->sgid_index); if (rc) goto free_lgr; rc = smc_llc_link_init(lnk); @@ -271,11 +324,13 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, rc = smc_wr_create_link(lnk); if (rc) goto destroy_qp; + atomic_inc(&lgr_cnt); + atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; - spin_lock_bh(&smc_lgr_list.lock); - list_add(&lgr->list, &smc_lgr_list.list); - spin_unlock_bh(&smc_lgr_list.lock); + spin_lock_bh(lgr_lock); + list_add(&lgr->list, lgr_list); + spin_unlock_bh(lgr_lock); return 0; destroy_qp: @@ -288,7 +343,16 @@ clear_llc_lnk: smc_llc_link_clear(lnk); free_lgr: kfree(lgr); +ism_put_vlan: + if (ini->is_smcd && ini->vlan_id) + smc_ism_put_vlan(ini->ism_dev, ini->vlan_id); out: + if (rc < 0) { + if (rc == -ENOMEM) + rc = SMC_CLC_DECL_MEM; + else + rc = SMC_CLC_DECL_INTERR; + } return rc; } @@ -299,7 +363,7 @@ static void smc_buf_unuse(struct smc_connection *conn, conn->sndbuf_desc->used = 0; if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd) { + if (!lgr->is_smcd && !list_empty(&lgr->list)) { /* unregister rmb with peer */ smc_llc_do_delete_rkey( &lgr->lnk[SMC_SINGLE_LINK], @@ -325,14 +389,16 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr) return; if (lgr->is_smcd) { - smc_ism_unset_conn(conn); + if (!list_empty(&lgr->list)) + smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); } - smc_lgr_unregister_conn(conn); - smc_buf_unuse(conn, lgr); /* allow buffer reuse */ - conn->lgr = NULL; + if (!list_empty(&lgr->list)) { + smc_lgr_unregister_conn(conn); + smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + } if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); @@ -347,6 +413,8 @@ static void smc_link_clear(struct smc_link *lnk) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); + if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) + wake_up(&lnk->smcibdev->lnks_deleted); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, @@ -423,24 +491,101 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) static void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); - if (lgr->is_smcd) - smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - else + if (lgr->is_smcd) { + if (!lgr->terminating) { + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(&lgr->smcd->dev); + } + if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) + wake_up(&lgr->smcd->lgrs_deleted); + } else { smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + if (!atomic_dec_return(&lgr_cnt)) + wake_up(&lgrs_deleted); + } kfree(lgr); } void smc_lgr_forget(struct smc_link_group *lgr) { - spin_lock_bh(&smc_lgr_list.lock); + struct list_head *lgr_list; + spinlock_t *lgr_lock; + + lgr_list = smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); - spin_unlock_bh(&smc_lgr_list.lock); + if (!list_empty(lgr_list)) + list_del_init(lgr_list); + spin_unlock_bh(lgr_lock); } -/* terminate linkgroup abnormally */ -static void __smc_lgr_terminate(struct smc_link_group *lgr) +static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + struct smc_buf_desc *buf_desc; + + list_for_each_entry(buf_desc, &lgr->rmbs[i], list) { + buf_desc->len += sizeof(struct smcd_cdc_msg); + smc_ism_unregister_dmb(lgr->smcd, buf_desc); + } + } +} + +static void smc_sk_wake_ups(struct smc_sock *smc) +{ + smc->sk.sk_write_space(&smc->sk); + smc->sk.sk_data_ready(&smc->sk); + smc->sk.sk_state_change(&smc->sk); +} + +/* kill a connection */ +static void smc_conn_kill(struct smc_connection *conn, bool soft) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + if (conn->lgr->is_smcd && conn->lgr->peer_shutdown) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + else + smc_close_abort(conn); + conn->killed = 1; + smc->sk.sk_err = ECONNABORTED; + smc_sk_wake_ups(smc); + if (conn->lgr->is_smcd) { + smc_ism_unset_conn(conn); + if (soft) + tasklet_kill(&conn->rx_tsklet); + else + tasklet_unlock_wait(&conn->rx_tsklet); + } else { + smc_cdc_tx_dismiss_slots(conn); + } + smc_lgr_unregister_conn(conn); + smc_close_active_abort(smc); +} + +static void smc_lgr_cleanup(struct smc_link_group *lgr) +{ + if (lgr->is_smcd) { + smc_ism_signal_shutdown(lgr); + smcd_unregister_all_dmbs(lgr); + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(&lgr->smcd->dev); + } else { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + + wake_up(&lnk->wr_reg_wait); + if (lnk->state != SMC_LNK_INACTIVE) { + smc_link_send_delete(lnk, false); + smc_llc_link_inactive(lnk); + } + } +} + +/* terminate link group */ +static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { struct smc_connection *conn; struct smc_sock *smc; @@ -448,93 +593,174 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) if (lgr->terminating) return; /* lgr already terminating */ + if (!soft) + cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; - if (!list_empty(&lgr->list)) /* forget lgr */ - list_del_init(&lgr->list); if (!lgr->is_smcd) smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); - write_lock_bh(&lgr->conns_lock); + /* kill remaining link group connections */ + read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); while (node) { + read_unlock_bh(&lgr->conns_lock); conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); - sock_hold(&smc->sk); /* sock_put in close work */ - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; - __smc_lgr_unregister_conn(conn); - conn->lgr = NULL; - write_unlock_bh(&lgr->conns_lock); - if (!schedule_work(&conn->close_work)) - sock_put(&smc->sk); - write_lock_bh(&lgr->conns_lock); + sock_hold(&smc->sk); /* sock_put below */ + lock_sock(&smc->sk); + smc_conn_kill(conn, soft); + release_sock(&smc->sk); + sock_put(&smc->sk); /* sock_hold above */ + read_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } - write_unlock_bh(&lgr->conns_lock); - if (!lgr->is_smcd) - wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); - smc_lgr_schedule_free_work(lgr); + read_unlock_bh(&lgr->conns_lock); + smc_lgr_cleanup(lgr); + if (soft) + smc_lgr_schedule_free_work_fast(lgr); + else + smc_lgr_free(lgr); } -void smc_lgr_terminate(struct smc_link_group *lgr) +/* unlink and terminate link group + * @soft: true if link group shutdown can take its time + * false if immediate link group shutdown is required + */ +void smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { - spin_lock_bh(&smc_lgr_list.lock); - __smc_lgr_terminate(lgr); - spin_unlock_bh(&smc_lgr_list.lock); + spinlock_t *lgr_lock; + + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + if (lgr->terminating) { + spin_unlock_bh(lgr_lock); + return; /* lgr already terminating */ + } + if (!soft) + lgr->freeing = 1; + list_del_init(&lgr->list); + spin_unlock_bh(lgr_lock); + __smc_lgr_terminate(lgr, soft); } /* Called when IB port is terminated */ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *l; + LIST_HEAD(lgr_free_list); spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { if (!lgr->is_smcd && lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) - __smc_lgr_terminate(lgr); + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } } spin_unlock_bh(&smc_lgr_list.lock); + + list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } } -/* Called when SMC-D device is terminated or peer is lost */ +/* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); /* run common cleanup function and build free list */ - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->is_smcd && lgr->smcd == dev && - (!peer_gid || lgr->peer_gid == peer_gid) && + spin_lock_bh(&dev->lgr_lock); + list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { + if ((!peer_gid || lgr->peer_gid == peer_gid) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { - __smc_lgr_terminate(lgr); + if (peer_gid) /* peer triggered termination */ + lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); } } - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(&dev->lgr_lock); /* cancel the regular free workers and actually free lgrs */ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); - cancel_delayed_work_sync(&lgr->free_work); - if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */ - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); + schedule_work(&lgr->terminate_work); + } +} + +/* Called when an SMCD device is removed or the smc module is unloaded */ +void smc_smcd_terminate_all(struct smcd_dev *smcd) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_free_list); + + spin_lock_bh(&smcd->lgr_lock); + list_splice_init(&smcd->lgr_list, &lgr_free_list); + list_for_each_entry(lgr, &lgr_free_list, list) + lgr->freeing = 1; + spin_unlock_bh(&smcd->lgr_lock); + + list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } + + if (atomic_read(&smcd->lgr_cnt)) + wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt)); +} + +/* Called when an SMCR device is removed or the smc module is unloaded. + * If smcibdev is given, all SMCR link groups using this device are terminated. + * If smcibdev is NULL, all SMCR link groups are terminated. + */ +void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_free_list); + + spin_lock_bh(&smc_lgr_list.lock); + if (!smcibdev) { + list_splice_init(&smc_lgr_list.list, &lgr_free_list); + list_for_each_entry(lgr, &lgr_free_list, list) + lgr->freeing = 1; + } else { + list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { + if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } + } + } + spin_unlock_bh(&smc_lgr_list.lock); + + list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { + list_del_init(&lgr->list); + __smc_lgr_terminate(lgr, false); + } + + if (smcibdev) { + if (atomic_read(&smcibdev->lnk_cnt)) + wait_event(smcibdev->lnks_deleted, + !atomic_read(&smcibdev->lnk_cnt)); + } else { + if (atomic_read(&lgr_cnt)) + wait_event(lgrs_deleted, !atomic_read(&lgr_cnt)); } } /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ -int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) +int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct net_device *ndev; int i, nest_lvl, rc = 0; - *vlan_id = 0; + ini->vlan_id = 0; if (!dst) { rc = -ENOTCONN; goto out; @@ -546,12 +772,12 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) ndev = dst->dev; if (is_vlan_dev(ndev)) { - *vlan_id = vlan_dev_vlan_id(ndev); + ini->vlan_id = vlan_dev_vlan_id(ndev); goto out_rel; } rtnl_lock(); - nest_lvl = dev_get_nest_level(ndev); + nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; @@ -560,7 +786,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) lower = lower->next; ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower); if (is_vlan_dev(ndev)) { - *vlan_id = vlan_dev_vlan_id(ndev); + ini->vlan_id = vlan_dev_vlan_id(ndev); break; } } @@ -594,39 +820,36 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, } /* create a new SMC connection (and a new link group if necessary) */ -int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, - struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, - u64 peer_gid) +int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; - int local_contact = SMC_FIRST_CONTACT; + struct list_head *lgr_list; struct smc_link_group *lgr; - unsigned short vlan_id; enum smc_lgr_role role; + spinlock_t *lgr_lock; int rc = 0; + lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list; + lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock; + ini->cln_first_contact = SMC_FIRST_CONTACT; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; - rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id); - if (rc) - return rc; - - if ((role == SMC_CLNT) && srv_first_contact) + if (role == SMC_CLNT && ini->srv_first_contact) /* create new link group as well */ goto create; /* determine if an existing link group can be reused */ - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry(lgr, &smc_lgr_list.list, list) { + spin_lock_bh(lgr_lock); + list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); - if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) : - smcr_lgr_match(lgr, lcl, role, clcqpn)) && + if ((ini->is_smcd ? + smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) : + smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) && !lgr->sync_err && - lgr->vlan_id == vlan_id && + lgr->vlan_id == ini->vlan_id && (role == SMC_CLNT || lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ - local_contact = SMC_REUSE_CONTACT; + ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; smc_lgr_register_conn(conn); /* add smc conn to lgr */ if (delayed_work_pending(&lgr->free_work)) @@ -636,29 +859,31 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, } write_unlock_bh(&lgr->conns_lock); } - spin_unlock_bh(&smc_lgr_list.lock); + spin_unlock_bh(lgr_lock); - if (role == SMC_CLNT && !srv_first_contact && - (local_contact == SMC_FIRST_CONTACT)) { + if (role == SMC_CLNT && !ini->srv_first_contact && + ini->cln_first_contact == SMC_FIRST_CONTACT) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. error */ - return -ENOLINK; + return SMC_CLC_DECL_SYNCERR; } create: - if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport, - lcl->id_for_peer, vlan_id, smcd, peer_gid); + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + rc = smc_lgr_create(smc, ini); if (rc) goto out; + lgr = conn->lgr; + write_lock_bh(&lgr->conns_lock); smc_lgr_register_conn(conn); /* add smc conn to lgr */ + write_unlock_bh(&lgr->conns_lock); } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; - if (is_smcd) { + if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ } @@ -667,7 +892,7 @@ create: #endif out: - return rc ? rc : local_contact; + return rc; } /* convert the RMB size into the compressed notation - minimum 16K. @@ -1022,29 +1247,63 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, return 0; } -/* Called (from smc_exit) when module is removed */ -void smc_core_exit(void) +static void smc_core_going_away(void) { - struct smc_link_group *lgr, *lg; - LIST_HEAD(lgr_freeing_list); + struct smc_ib_device *smcibdev; + struct smcd_dev *smcd; - spin_lock_bh(&smc_lgr_list.lock); - if (!list_empty(&smc_lgr_list.list)) - list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); - spin_unlock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { - list_del_init(&lgr->list); - if (!lgr->is_smcd) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { + int i; - if (lnk->state == SMC_LNK_ACTIVE) - smc_llc_send_delete_link(lnk, SMC_LLC_REQ, - false); - smc_llc_link_inactive(lnk); - } - cancel_delayed_work_sync(&lgr->free_work); - if (lgr->is_smcd) - smc_ism_signal_shutdown(lgr); - smc_lgr_free(lgr); /* free link group */ + for (i = 0; i < SMC_MAX_PORTS; i++) + set_bit(i, smcibdev->ports_going_away); } + spin_unlock(&smc_ib_devices.lock); + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + smcd->going_away = 1; + } + spin_unlock(&smcd_dev_list.lock); +} + +/* Clean up all SMC link groups */ +static void smc_lgrs_shutdown(void) +{ + struct smcd_dev *smcd; + + smc_core_going_away(); + + smc_smcr_terminate_all(NULL); + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(smcd, &smcd_dev_list.list, list) + smc_smcd_terminate_all(smcd); + spin_unlock(&smcd_dev_list.lock); +} + +static int smc_core_reboot_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + smc_lgrs_shutdown(); + + return 0; +} + +static struct notifier_block smc_reboot_notifier = { + .notifier_call = smc_core_reboot_event, +}; + +int __init smc_core_init(void) +{ + atomic_set(&lgr_cnt, 0); + return register_reboot_notifier(&smc_reboot_notifier); +} + +/* Called (from smc_exit) when module is removed */ +void smc_core_exit(void) +{ + unregister_reboot_notifier(&smc_reboot_notifier); + smc_lgrs_shutdown(); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8806d2afa6ed..c472e12951d1 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -202,8 +202,11 @@ struct smc_link_group { u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ + struct work_struct terminate_work; /* abnormal lgr termination */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ + u8 freefast : 1; /* free worker scheduled fast */ + u8 freeing : 1; /* lgr is being freed */ bool is_smcd; /* SMC-R or SMC-D */ union { @@ -225,10 +228,30 @@ struct smc_link_group { /* Peer GID (remote) */ struct smcd_dev *smcd; /* ISM device for VLAN reg. */ + u8 peer_shutdown : 1; + /* peer triggered shutdownn */ }; }; }; +struct smc_clc_msg_local; + +struct smc_init_info { + u8 is_smcd; + unsigned short vlan_id; + int srv_first_contact; + int cln_first_contact; + /* SMC-R */ + struct smc_clc_msg_local *ib_lcl; + struct smc_ib_device *ib_dev; + u8 ib_gid[SMC_GID_SIZE]; + u8 ib_port; + u32 ib_clcqpn; + /* SMC-D */ + u64 ism_gid; + struct smcd_dev *ism_dev; +}; + /* Find the connection associated with the given alert token in the link group. * To use rbtrees we have to implement our own search core. * Requires @conns_lock @@ -262,15 +285,23 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr) +{ + if (!lgr->terminating && !lgr->freeing) + schedule_work(&lgr->terminate_work); +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; void smc_lgr_forget(struct smc_link_group *lgr); -void smc_lgr_terminate(struct smc_link_group *lgr); +void smc_lgr_terminate(struct smc_link_group *lgr, bool soft); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); +void smc_smcd_terminate_all(struct smcd_dev *dev); +void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, @@ -281,15 +312,13 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); -int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id); +int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, - struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, - u64 peer_gid); +int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); void smcd_conn_free(struct smc_connection *conn); void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); +int smc_core_init(void); void smc_core_exit(void); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 371b4cf31fcd..f38727ecf8b2 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 53f429c04843..548632621f4b 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -15,6 +15,7 @@ #include <linux/random.h> #include <linux/workqueue.h> #include <linux/scatterlist.h> +#include <linux/wait.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> @@ -146,18 +147,13 @@ out: static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport) { const struct ib_gid_attr *attr; - int rc = 0; + int rc; attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0); if (IS_ERR(attr)) return -ENODEV; - if (attr->ndev) - memcpy(smcibdev->mac[ibport - 1], attr->ndev->dev_addr, - ETH_ALEN); - else - rc = -ENODEV; - + rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]); rdma_put_gid_attr(attr); return rc; } @@ -185,6 +181,7 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, unsigned short vlan_id, u8 gid[], u8 *sgid_index) { const struct ib_gid_attr *attr; + const struct net_device *ndev; int i; for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { @@ -192,11 +189,14 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, if (IS_ERR(attr)) continue; - if (attr->ndev && + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(attr); + if (!IS_ERR(ndev) && ((!vlan_id && !is_vlan_dev(attr->ndev)) || (vlan_id && is_vlan_dev(attr->ndev) && vlan_dev_vlan_id(attr->ndev) == vlan_id)) && attr->gid_type == IB_GID_TYPE_ROCE) { + rcu_read_unlock(); if (gid) memcpy(gid, &attr->gid, SMC_GID_SIZE); if (sgid_index) @@ -204,6 +204,7 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, rdma_put_gid_attr(attr); return 0; } + rcu_read_unlock(); rdma_put_gid_attr(attr); } return -ENODEV; @@ -242,8 +243,12 @@ static void smc_ib_port_event_work(struct work_struct *work) for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); - if (!smc_ib_port_active(smcibdev, port_idx + 1)) + if (!smc_ib_port_active(smcibdev, port_idx + 1)) { + set_bit(port_idx, smcibdev->ports_going_away); smc_port_terminate(smcibdev, port_idx + 1); + } else { + clear_bit(port_idx, smcibdev->ports_going_away); + } } } @@ -259,8 +264,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, switch (ibevent->event) { case IB_EVENT_DEVICE_FATAL: /* terminate all ports on device */ - for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) + for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) { set_bit(port_idx, &smcibdev->port_event_mask); + set_bit(port_idx, smcibdev->ports_going_away); + } schedule_work(&smcibdev->port_event_work); break; case IB_EVENT_PORT_ERR: @@ -269,6 +276,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, port_idx = ibevent->element.port_num - 1; if (port_idx < SMC_MAX_PORTS) { set_bit(port_idx, &smcibdev->port_event_mask); + if (ibevent->event == IB_EVENT_PORT_ERR) + set_bit(port_idx, smcibdev->ports_going_away); + else if (ibevent->event == IB_EVENT_PORT_ACTIVE) + clear_bit(port_idx, smcibdev->ports_going_away); schedule_work(&smcibdev->port_event_work); } break; @@ -307,6 +318,7 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) port_idx = ibevent->element.qp->port - 1; if (port_idx < SMC_MAX_PORTS) { set_bit(port_idx, &smcibdev->port_event_mask); + set_bit(port_idx, smcibdev->ports_going_away); schedule_work(&smcibdev->port_event_work); } break; @@ -509,9 +521,9 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) return; smcibdev->initialized = 0; - smc_wr_remove_dev(smcibdev); ib_destroy_cq(smcibdev->roce_cq_recv); ib_destroy_cq(smcibdev->roce_cq_send); + smc_wr_remove_dev(smcibdev); } static struct ib_client smc_ib_client; @@ -532,7 +544,8 @@ static void smc_ib_add_dev(struct ib_device *ibdev) smcibdev->ibdev = ibdev; INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); - + atomic_set(&smcibdev->lnk_cnt, 0); + init_waitqueue_head(&smcibdev->lnks_deleted); spin_lock(&smc_ib_devices.lock); list_add_tail(&smcibdev->list, &smc_ib_devices.list); spin_unlock(&smc_ib_devices.lock); @@ -554,7 +567,7 @@ static void smc_ib_add_dev(struct ib_device *ibdev) schedule_work(&smcibdev->port_event_work); } -/* callback function for ib_register_client() */ +/* callback function for ib_unregister_client() */ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) { struct smc_ib_device *smcibdev; @@ -564,6 +577,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); + smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); kfree(smcibdev); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index da60ab9e8d70..255db87547d3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/if_ether.h> +#include <linux/wait.h> #include <rdma/ib_verbs.h> #include <net/smc.h> @@ -47,6 +48,9 @@ struct smc_ib_device { /* ib-device infos for smc */ u8 initialized : 1; /* ib dev CQ, evthdl done */ struct work_struct port_event_work; unsigned long port_event_mask; + DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS); + atomic_t lnk_cnt; /* number of links on ibdev */ + wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/ }; struct smc_buf_desc; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 2fff79db1a59..5c4727d5066e 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -146,6 +146,10 @@ out: int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; + int rc = 0; + + if (!dmb_desc->dma_addr) + return rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; @@ -153,7 +157,13 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - return smcd->ops->unregister_dmb(smcd, &dmb); + rc = smcd->ops->unregister_dmb(smcd, &dmb); + if (!rc || rc == ISM_ERROR) { + dmb_desc->cpu_addr = NULL; + dmb_desc->dma_addr = 0; + } + + return rc; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, @@ -226,6 +236,9 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr) int rc; union smcd_sw_event_info ev_info; + if (lgr->peer_shutdown) + return 0; + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; @@ -286,9 +299,17 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); spin_lock_init(&smcd->lock); + spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); + INIT_LIST_HEAD(&smcd->lgr_list); + init_waitqueue_head(&smcd->lgrs_deleted); smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); + if (!smcd->event_wq) { + kfree(smcd->conn); + kfree(smcd); + return NULL; + } return smcd; } EXPORT_SYMBOL_GPL(smcd_alloc_dev); @@ -306,11 +327,12 @@ EXPORT_SYMBOL_GPL(smcd_register_dev); void smcd_unregister_dev(struct smcd_dev *smcd) { spin_lock(&smcd_dev_list.lock); - list_del(&smcd->list); + list_del_init(&smcd->list); spin_unlock(&smcd_dev_list.lock); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); - smc_smcd_terminate(smcd, 0, VLAN_VID_MASK); device_del(&smcd->dev); } @@ -337,6 +359,8 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) { struct smc_ism_event_work *wrk; + if (smcd->going_away) + return; /* copy event to event work queue, and let it be handled there */ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); if (!wrk) @@ -362,7 +386,7 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) spin_lock_irqsave(&smcd->lock, flags); conn = smcd->conn[dmbno]; - if (conn) + if (conn && !conn->killed) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 4fd60c522802..a9f6431dd69a 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -475,7 +475,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_terminate_sched(lgr); } } @@ -614,7 +614,7 @@ static void smc_llc_testlink_work(struct work_struct *work) rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); if (rc <= 0) { - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate(smc_get_lgr(link), true); return; } next_interval = link->llc_testlink_time; @@ -656,6 +656,7 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time) void smc_llc_link_deleting(struct smc_link *link) { link->state = SMC_LNK_DELETING; + smc_wr_wakeup_tx_wait(link); } /* called in tasklet context */ @@ -663,6 +664,8 @@ void smc_llc_link_inactive(struct smc_link *link) { link->state = SMC_LNK_INACTIVE; cancel_delayed_work(&link->llc_testlink_wrk); + smc_wr_wakeup_reg_wait(link); + smc_wr_wakeup_tx_wait(link); } /* called in worker context */ @@ -695,9 +698,11 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, int smc_llc_do_delete_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc) { - int rc; + int rc = 0; mutex_lock(&link->llc_delete_rkey_mutex); + if (link->state != SMC_LNK_ACTIVE) + goto out; reinit_completion(&link->llc_delete_rkey); rc = smc_llc_send_delete_rkey(link, rmb_desc); if (rc) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 8d2f6296279c..82dedf052d86 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -26,6 +26,7 @@ #include "smc_pnet.h" #include "smc_ib.h" #include "smc_ism.h" +#include "smc_core.h" #define SMC_ASCII_BLANK 32 @@ -375,8 +376,6 @@ static int smc_pnet_fill_entry(struct net *net, return 0; error: - if (pnetelem->ndev) - dev_put(pnetelem->ndev); return rc; } @@ -603,35 +602,36 @@ static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - return smc_pnet_remove_by_pnetid(net, NULL); + smc_pnet_remove_by_pnetid(net, NULL); + return 0; } /* SMC_PNETID generic netlink operation definition */ static const struct genl_ops smc_pnet_ops[] = { { .cmd = SMC_PNETID_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = smc_pnet_policy, .doit = smc_pnet_get, .dumpit = smc_pnet_dump, .start = smc_pnet_dump_start }, { .cmd = SMC_PNETID_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = smc_pnet_policy, .doit = smc_pnet_add }, { .cmd = SMC_PNETID_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = smc_pnet_policy, .doit = smc_pnet_del }, { .cmd = SMC_PNETID_FLUSH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = smc_pnet_policy, .doit = smc_pnet_flush } }; @@ -642,6 +642,7 @@ static struct genl_family smc_pnet_nl_family __ro_after_init = { .name = SMCR_GENL_FAMILY_NAME, .version = SMCR_GENL_FAMILY_VERSION, .maxattr = SMC_PNETID_MAX, + .policy = smc_pnet_policy, .netnsok = true, .module = THIS_MODULE, .ops = smc_pnet_ops, @@ -715,7 +716,7 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev) int i, nest_lvl; rtnl_lock(); - nest_lvl = dev_get_nest_level(ndev); + nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; @@ -758,8 +759,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, * IB device and port */ static void smc_pnet_find_rdma_dev(struct net_device *netdev, - struct smc_ib_device **smcibdev, - u8 *ibport, unsigned short vlan_id, u8 gid[]) + struct smc_init_info *ini) { struct smc_ib_device *ibdev; @@ -779,10 +779,11 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && - !smc_ib_determine_gid(ibdev, i, vlan_id, gid, - NULL)) { - *smcibdev = ibdev; - *ibport = i; + !test_bit(i - 1, ibdev->ports_going_away) && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; break; } } @@ -797,9 +798,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, * If nothing found, try to use handshake device */ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, - struct smc_ib_device **smcibdev, - u8 *ibport, unsigned short vlan_id, - u8 gid[]) + struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smc_ib_device *ibdev; @@ -809,7 +808,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { - smc_pnet_find_rdma_dev(ndev, smcibdev, ibport, vlan_id, gid); + smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } @@ -820,10 +819,11 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, continue; if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && smc_ib_port_active(ibdev, i) && - !smc_ib_determine_gid(ibdev, i, vlan_id, gid, - NULL)) { - *smcibdev = ibdev; - *ibport = i; + !test_bit(i - 1, ibdev->ports_going_away) && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; goto out; } } @@ -833,7 +833,7 @@ out: } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, - struct smcd_dev **smcismdev) + struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smcd_dev *ismdev; @@ -846,8 +846,9 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, spin_lock(&smcd_dev_list.lock); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { - if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) { - *smcismdev = ismdev; + if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && + !ismdev->going_away) { + ini->ism_dev = ismdev; break; } } @@ -858,21 +859,18 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, * determine ib_device and port belonging to used internal TCP socket * ethernet interface. */ -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport, - unsigned short vlan_id, u8 gid[]) +void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - *smcibdev = NULL; - *ibport = 0; - + ini->ib_dev = NULL; + ini->ib_port = 0; if (!dst) goto out; if (!dst->dev) goto out_rel; - smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport, vlan_id, gid); + smc_pnet_find_roce_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); @@ -880,17 +878,17 @@ out: return; } -void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev) +void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - *smcismdev = NULL; + ini->ism_dev = NULL; if (!dst) goto out; if (!dst->dev) goto out_rel; - smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev); + smc_pnet_find_ism_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 5eac42fb45d0..4564e4d69c2e 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -18,6 +18,7 @@ struct smc_ib_device; struct smcd_dev; +struct smc_init_info; /** * struct smc_pnettable - SMC PNET table anchor @@ -43,9 +44,7 @@ int smc_pnet_init(void) __init; int smc_pnet_net_init(struct net *net); void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport, - unsigned short vlan_id, u8 gid[]); -void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev); +void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); +void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); #endif diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 413a6abf227e..39d7b34d06d2 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -201,6 +201,8 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; + struct smc_cdc_conn_state_flags *cflags = + &conn->local_tx_ctrl.conn_state_flags; struct sock *sk = &smc->sk; int rc; @@ -210,9 +212,10 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, sk->sk_err || + cflags->peer_conn_abort || sk->sk_shutdown & RCV_SHUTDOWN || - fcrit(conn) || - smc_cdc_rxed_any_close_or_senddone(conn), + conn->killed || + fcrit(conn), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); @@ -262,6 +265,18 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, return -EAGAIN; } +static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_rx_data_available(conn)) + return true; + else if (conn->urg_state == SMC_URG_VALID) + /* we received a single urgent Byte - skip */ + smc_rx_update_cons(smc, 0); + return false; +} + /* smc_rx_recvmsg - receive data from RMBE * @msg: copy data to receive buffer * @pipe: copy data to pipe if set - indicates splice() call @@ -303,16 +318,20 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (read_done >= target || (pipe && read_done)) break; - if (atomic_read(&conn->bytes_to_rcv)) + if (conn->killed) + break; + + if (smc_rx_recvmsg_data_available(smc)) goto copy; - else if (conn->urg_state == SMC_URG_VALID) - /* we received a single urgent Byte - skip */ - smc_rx_update_cons(smc, 0); - if (sk->sk_shutdown & RCV_SHUTDOWN || - smc_cdc_rxed_any_close_or_senddone(conn) || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + if (sk->sk_shutdown & RCV_SHUTDOWN) { + /* smc_cdc_msg_recv_action() could have run after + * above smc_rx_recvmsg_data_available() + */ + if (smc_rx_recvmsg_data_available(smc)) + goto copy; break; + } if (read_done) { if (sk->sk_err || diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index f0de323d15d6..0d42e7716b91 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -76,18 +76,17 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; - bool noblock; long timeo; int rc = 0; /* similar to sk_stream_wait_memory */ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); - noblock = timeo ? false : true; add_wait_queue(sk_sleep(sk), &wait); while (1) { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || + conn->killed || conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { rc = -EPIPE; break; @@ -97,8 +96,8 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) break; } if (!timeo) { - if (noblock) - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + /* ensure EPOLLOUT is subsequently generated */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); rc = -EAGAIN; break; } @@ -157,7 +156,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) return -ENOTCONN; if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + conn->killed) return -EPIPE; if (smc_cdc_rxed_any_close(conn)) return send_done ?: -ECONNRESET; @@ -284,10 +283,8 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); - if (rc) { - conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; - smc_lgr_terminate(lgr); - } + if (rc) + smc_lgr_terminate(lgr, true); return rc; } @@ -497,10 +494,11 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (smc->sk.sk_err == ECONNABORTED) return sock_error(&smc->sk); + if (conn->killed) + return -EPIPE; rc = 0; - if (conn->alert_token_local) /* connection healthy */ - mod_delayed_work(system_wq, &conn->tx_work, - SMC_TX_WORK_DELAY); + mod_delayed_work(system_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); } return rc; } @@ -549,6 +547,9 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { int rc; + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return -EPIPE; /* connection being aborted */ if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -575,9 +576,7 @@ void smc_tx_work(struct work_struct *work) int rc; lock_sock(&smc->sk); - if (smc->sk.sk_err || - !conn->alert_token_local || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + if (smc->sk.sk_err) goto out; rc = smc_tx_sndbuf_nonempty(conn); @@ -610,8 +609,11 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) ((to_confirm > conn->rmbe_update_limit) && ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { + if (conn->killed || + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) + return; if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && - conn->alert_token_local) { /* connection healthy */ + !conn->killed) { schedule_delayed_work(&conn->tx_work, SMC_TX_WORK_DELAY); return; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 253aa75dc2b6..337ee52ad3d3 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -50,6 +50,26 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ /*------------------------------- completion --------------------------------*/ +/* returns true if at least one tx work request is pending on the given link */ +static inline bool smc_wr_is_tx_pend(struct smc_link *link) +{ + if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) != + link->wr_tx_cnt) { + return true; + } + return false; +} + +/* wait till all pending tx work requests on the given link are completed */ +static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +{ + if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), + SMC_WR_TX_WAIT_PENDING_TIME)) + return 0; + else /* timeout */ + return -EPIPE; +} + static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) { u32 i; @@ -75,7 +95,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) link->wr_reg_state = FAILED; else link->wr_reg_state = CONFIRMED; - wake_up(&link->wr_reg_wait); + smc_wr_wakeup_reg_wait(link); return; } @@ -101,7 +121,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) clear_bit(i, link->wr_tx_mask); } /* terminate connections of this link group abnormally */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -171,6 +191,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, struct smc_rdma_wr **wr_rdma_buf, struct smc_wr_tx_pend_priv **wr_pend_priv) { + struct smc_link_group *lgr = smc_get_lgr(link); struct smc_wr_tx_pend *wr_pend; u32 idx = link->wr_tx_cnt; struct ib_send_wr *wr_ib; @@ -179,19 +200,20 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, *wr_buf = NULL; *wr_pend_priv = NULL; - if (in_softirq()) { + if (in_softirq() || lgr->terminating) { rc = smc_wr_tx_get_free_slot_index(link, &idx); if (rc) return rc; } else { - rc = wait_event_timeout( + rc = wait_event_interruptible_timeout( link->wr_tx_wait, link->state == SMC_LNK_INACTIVE || + lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(lgr); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -227,6 +249,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); + wake_up(&link->wr_tx_wait); return 1; } @@ -247,7 +270,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); } return rc; } @@ -272,7 +295,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) SMC_WR_REG_MR_WAIT_TIME); if (!rc) { /* timeout - terminate connections */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -373,7 +396,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) /* terminate connections of this link group * abnormally */ - smc_lgr_terminate(smc_get_lgr(link)); + smc_lgr_terminate_sched(smc_get_lgr(link)); break; default: smc_wr_rx_post(link); /* refill WR RX */ @@ -510,8 +533,10 @@ void smc_wr_free_link(struct smc_link *lnk) { struct ib_device *ibdev; - memset(lnk->wr_tx_mask, 0, - BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + if (smc_wr_tx_wait_no_pending_sends(lnk)) + memset(lnk->wr_tx_mask, 0, + BITS_TO_LONGS(SMC_WR_BUF_CNT) * + sizeof(*lnk->wr_tx_mask)); if (!lnk->smcibdev) return; diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 09bf32fd3959..3ac99c898418 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -60,6 +60,16 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) atomic_long_set(wr_tx_id, val); } +static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) +{ + wake_up_all(&lnk->wr_tx_wait); +} + +static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) +{ + wake_up(&lnk->wr_reg_wait); +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { |
