Diffstat (limited to 'net/smc')
-rw-r--r--  net/smc/Makefile       |   1
-rw-r--r--  net/smc/af_smc.c       | 441
-rw-r--r--  net/smc/smc.h          |  37
-rw-r--r--  net/smc/smc_cdc.c      |  24
-rw-r--r--  net/smc/smc_close.c    |   3
-rw-r--r--  net/smc/smc_core.c     |   7
-rw-r--r--  net/smc/smc_diag.c     |   2
-rw-r--r--  net/smc/smc_netlink.c  |  15
-rw-r--r--  net/smc/smc_pnet.c     |  53
-rw-r--r--  net/smc/smc_pnet.h     |   2
-rw-r--r--  net/smc/smc_sysctl.c   |  70
-rw-r--r--  net/smc/smc_sysctl.h   |  33
-rw-r--r--  net/smc/smc_tx.c       | 154
-rw-r--r--  net/smc/smc_tx.h       |   3
14 files changed, 749 insertions, 96 deletions
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 196fb6f01b14..875efcd126a2 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_SMC_DIAG) += smc_diag.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
smc-y += smc_tracepoint.o
+smc-$(CONFIG_SYSCTL) += smc_sysctl.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 961854e56736..f0d118e9f155 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -51,6 +51,7 @@
#include "smc_close.h"
#include "smc_stats.h"
#include "smc_tracepoint.h"
+#include "smc_sysctl.h"
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
* creation on server
@@ -59,12 +60,52 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
* creation on client
*/
+static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */
struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
struct workqueue_struct *smc_close_wq; /* wq for close work */
static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ void *hdr;
+
+ if (cb_ctx->pos[0])
+ goto out;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_DUMP_HS_LIMITATION);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
+ sock_net(skb->sk)->smc.limit_smc_hs))
+ goto err;
+
+ genlmsg_end(skb, hdr);
+ cb_ctx->pos[0] = 1;
+out:
+ return skb->len;
+err:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+ sock_net(skb->sk)->smc.limit_smc_hs = true;
+ return 0;
+}
+
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+ sock_net(skb->sk)->smc.limit_smc_hs = false;
+ return 0;
+}
+
static void smc_set_keepalive(struct sock *sk, int val)
{
struct smc_sock *smc = smc_sk(sk);
@@ -72,6 +113,51 @@ static void smc_set_keepalive(struct sock *sk, int val)
smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}
+static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
+{
+ struct smc_sock *smc;
+
+ smc = smc_clcsock_user_data(sk);
+
+ if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
+ sk->sk_max_ack_backlog)
+ goto drop;
+
+ if (sk_acceptq_is_full(&smc->sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ goto drop;
+ }
+
+ /* passthrough to original syn recv sock fct */
+ return smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash,
+ own_req);
+
+drop:
+ dst_release(dst);
+ tcp_listendrop(sk);
+ return NULL;
+}
+
+static bool smc_hs_congested(const struct sock *sk)
+{
+ const struct smc_sock *smc;
+
+ smc = smc_clcsock_user_data(sk);
+
+ if (!smc)
+ return true;
+
+ if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
+ return true;
+
+ return false;
+}
+
static struct smc_hashinfo smc_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};
@@ -107,12 +193,27 @@ void smc_unhash_sk(struct sock *sk)
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);
+/* This is called just before the user finally releases the sock lock. Do
+ * the work here that we could not do in BH context because the user was
+ * holding the sock lock.
+ */
+static void smc_release_cb(struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ if (smc->conn.tx_in_release_sock) {
+ smc_tx_pending(&smc->conn);
+ smc->conn.tx_in_release_sock = false;
+ }
+}
+
struct proto smc_proto = {
.name = "SMC",
.owner = THIS_MODULE,
.keepalive = smc_set_keepalive,
.hash = smc_hash_sk,
.unhash = smc_unhash_sk,
+ .release_cb = smc_release_cb,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v4_hashinfo,
.slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -125,6 +226,7 @@ struct proto smc_proto6 = {
.keepalive = smc_set_keepalive,
.hash = smc_hash_sk,
.unhash = smc_unhash_sk,
+ .release_cb = smc_release_cb,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v6_hashinfo,
.slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -183,7 +285,7 @@ static int smc_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
- int rc = 0;
+ int old_state, rc = 0;
if (!sk)
goto out;
@@ -191,8 +293,10 @@ static int smc_release(struct socket *sock)
sock_hold(sk); /* sock_put below */
smc = smc_sk(sk);
+ old_state = sk->sk_state;
+
/* cleanup for a dangling non-blocking connect */
- if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
+ if (smc->connect_nonblock && old_state == SMC_INIT)
tcp_abort(smc->clcsock->sk, ECONNABORTED);
if (cancel_work_sync(&smc->connect_work))
@@ -206,6 +310,10 @@ static int smc_release(struct socket *sock)
else
lock_sock(sk);
+ if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE &&
+ !smc->use_fallback)
+ smc_close_active_abort(smc);
+
rc = __smc_release(smc);
/* detach socket */
@@ -566,12 +674,118 @@ static void smc_stat_fallback(struct smc_sock *smc)
mutex_unlock(&net->smc.mutex_fback_rsn);
}
-static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
+/* must be called under rcu read lock */
+static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key)
+{
+ struct socket_wq *wq;
+ __poll_t flags;
+
+ wq = rcu_dereference(smc->sk.sk_wq);
+ if (!skwq_has_sleeper(wq))
+ return;
+
+ /* wake up smc sk->sk_wq */
+ if (!key) {
+ /* sk_state_change */
+ wake_up_interruptible_all(&wq->wait);
+ } else {
+ flags = key_to_poll(key);
+ if (flags & (EPOLLIN | EPOLLOUT))
+ /* sk_data_ready or sk_write_space */
+ wake_up_interruptible_sync_poll(&wq->wait, flags);
+ else if (flags & EPOLLERR)
+ /* sk_error_report */
+ wake_up_interruptible_poll(&wq->wait, flags);
+ }
+}
+
+static int smc_fback_mark_woken(wait_queue_entry_t *wait,
+ unsigned int mode, int sync, void *key)
+{
+ struct smc_mark_woken *mark =
+ container_of(wait, struct smc_mark_woken, wait_entry);
+
+ mark->woken = true;
+ mark->key = key;
+ return 0;
+}
+
+static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk,
+ void (*clcsock_callback)(struct sock *sk))
+{
+ struct smc_mark_woken mark = { .woken = false };
+ struct socket_wq *wq;
+
+ init_waitqueue_func_entry(&mark.wait_entry,
+ smc_fback_mark_woken);
+ rcu_read_lock();
+ wq = rcu_dereference(clcsk->sk_wq);
+ if (!wq)
+ goto out;
+ add_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
+ clcsock_callback(clcsk);
+ remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
+
+ if (mark.woken)
+ smc_fback_wakeup_waitqueue(smc, mark.key);
+out:
+ rcu_read_unlock();
+}
+
+static void smc_fback_state_change(struct sock *clcsk)
+{
+ struct smc_sock *smc =
+ smc_clcsock_user_data(clcsk);
+
+ if (!smc)
+ return;
+ smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change);
+}
+
+static void smc_fback_data_ready(struct sock *clcsk)
{
- wait_queue_head_t *smc_wait = sk_sleep(&smc->sk);
- wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk);
- unsigned long flags;
+ struct smc_sock *smc =
+ smc_clcsock_user_data(clcsk);
+ if (!smc)
+ return;
+ smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready);
+}
+
+static void smc_fback_write_space(struct sock *clcsk)
+{
+ struct smc_sock *smc =
+ smc_clcsock_user_data(clcsk);
+
+ if (!smc)
+ return;
+ smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space);
+}
+
+static void smc_fback_error_report(struct sock *clcsk)
+{
+ struct smc_sock *smc =
+ smc_clcsock_user_data(clcsk);
+
+ if (!smc)
+ return;
+ smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report);
+}
+
+static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
+{
+ struct sock *clcsk;
+ int rc = 0;
+
+ mutex_lock(&smc->clcsock_release_lock);
+ if (!smc->clcsock) {
+ rc = -EBADF;
+ goto out;
+ }
+ clcsk = smc->clcsock->sk;
+
+ if (smc->use_fallback)
+ goto out;
smc->use_fallback = true;
smc->fallback_rsn = reason_code;
smc_stat_fallback(smc);
@@ -582,22 +796,41 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
smc->clcsock->wq.fasync_list =
smc->sk.sk_socket->wq.fasync_list;
- /* There may be some entries remaining in
- * smc socket->wq, which should be removed
- * to clcsocket->wq during the fallback.
+ /* There might be some wait entries remaining
+ * in smc sk->sk_wq and they should be woken up
+ * as clcsock's wait queue is woken up.
*/
- spin_lock_irqsave(&smc_wait->lock, flags);
- spin_lock_nested(&clc_wait->lock, SINGLE_DEPTH_NESTING);
- list_splice_init(&smc_wait->head, &clc_wait->head);
- spin_unlock(&clc_wait->lock);
- spin_unlock_irqrestore(&smc_wait->lock, flags);
+ smc->clcsk_state_change = clcsk->sk_state_change;
+ smc->clcsk_data_ready = clcsk->sk_data_ready;
+ smc->clcsk_write_space = clcsk->sk_write_space;
+ smc->clcsk_error_report = clcsk->sk_error_report;
+
+ clcsk->sk_state_change = smc_fback_state_change;
+ clcsk->sk_data_ready = smc_fback_data_ready;
+ clcsk->sk_write_space = smc_fback_write_space;
+ clcsk->sk_error_report = smc_fback_error_report;
+
+ smc->clcsock->sk->sk_user_data =
+ (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
}
+out:
+ mutex_unlock(&smc->clcsock_release_lock);
+ return rc;
}
/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
- smc_switch_to_fallback(smc, reason_code);
+ struct net *net = sock_net(&smc->sk);
+ int rc = 0;
+
+ rc = smc_switch_to_fallback(smc, reason_code);
+ if (rc) { /* fallback fails */
+ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
+ if (smc->sk.sk_state == SMC_INIT)
+ sock_put(&smc->sk); /* passive closing */
+ return rc;
+ }
smc_copy_sock_settings_to_clc(smc);
smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
@@ -1166,8 +1399,14 @@ static int __smc_connect(struct smc_sock *smc)
/* perform CLC handshake */
rc = smc_connect_clc(smc, aclc2, ini);
- if (rc)
+ if (rc) {
+ /* -EAGAIN on timeout, see tcp_recvmsg() */
+ if (rc == -EAGAIN) {
+ rc = -ETIMEDOUT;
+ smc->sk.sk_err = ETIMEDOUT;
+ }
goto vlan_cleanup;
+ }
/* check if smc modes and versions of CLC proposal and accept match */
rc = smc_connect_check_aclc(ini, aclc);
@@ -1473,6 +1712,9 @@ static void smc_listen_out(struct smc_sock *new_smc)
struct smc_sock *lsmc = new_smc->listen_smc;
struct sock *newsmcsk = &new_smc->sk;
+ if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
+ atomic_dec(&lsmc->queued_smc_hs);
+
if (lsmc->sk.sk_state == SMC_LISTEN) {
lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
smc_accept_enqueue(&lsmc->sk, newsmcsk);
@@ -1518,11 +1760,12 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
{
/* RDMA setup failed, switch back to TCP */
smc_conn_abort(new_smc, local_first);
- if (reason_code < 0) { /* error, no fallback possible */
+ if (reason_code < 0 ||
+ smc_switch_to_fallback(new_smc, reason_code)) {
+ /* error, no fallback possible */
smc_listen_out_err(new_smc);
return;
}
- smc_switch_to_fallback(new_smc, reason_code);
if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
smc_listen_out_err(new_smc);
@@ -1964,8 +2207,11 @@ static void smc_listen_work(struct work_struct *work)
/* check if peer is smc capable */
if (!tcp_sk(newclcsock->sk)->syn_smc) {
- smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC);
- smc_listen_out_connected(new_smc);
+ rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC);
+ if (rc)
+ smc_listen_out_err(new_smc);
+ else
+ smc_listen_out_connected(new_smc);
return;
}
@@ -2074,6 +2320,9 @@ static void smc_tcp_listen_work(struct work_struct *work)
if (!new_smc)
continue;
+ if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
+ atomic_inc(&lsmc->queued_smc_hs);
+
new_smc->listen_smc = lsmc;
new_smc->use_fallback = lsmc->use_fallback;
new_smc->fallback_rsn = lsmc->fallback_rsn;
@@ -2094,16 +2343,15 @@ out:
static void smc_clcsock_data_ready(struct sock *listen_clcsock)
{
- struct smc_sock *lsmc;
+ struct smc_sock *lsmc =
+ smc_clcsock_user_data(listen_clcsock);
- lsmc = (struct smc_sock *)
- ((uintptr_t)listen_clcsock->sk_user_data & ~SK_USER_DATA_NOCOPY);
if (!lsmc)
return;
lsmc->clcsk_data_ready(listen_clcsock);
if (lsmc->sk.sk_state == SMC_LISTEN) {
sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
- if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work))
+ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
sock_put(&lsmc->sk);
}
}
@@ -2141,6 +2389,18 @@ static int smc_listen(struct socket *sock, int backlog)
smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready;
smc->clcsock->sk->sk_user_data =
(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
+
+ /* save original ops */
+ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;
+
+ smc->af_ops = *smc->ori_af_ops;
+ smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;
+
+ inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
+
+ if (smc->limit_smc_hs)
+ tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
+
rc = kernel_listen(smc->clcsock, backlog);
if (rc) {
smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
@@ -2254,7 +2514,9 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
if (msg->msg_flags & MSG_FASTOPEN) {
if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
- smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
+ rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
+ if (rc)
+ goto out;
} else {
rc = -EINVAL;
goto out;
@@ -2432,6 +2694,71 @@ out:
return rc ? rc : rc1;
}
+static int __smc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct smc_sock *smc;
+ int val, len;
+
+ smc = smc_sk(sock->sk);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case SMC_LIMIT_HS:
+ val = smc->limit_smc_hs;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int __smc_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int val, rc;
+
+ smc = smc_sk(sk);
+
+ lock_sock(sk);
+ switch (optname) {
+ case SMC_LIMIT_HS:
+ if (optlen < sizeof(int)) {
+ rc = -EINVAL;
+ break;
+ }
+ if (copy_from_sockptr(&val, optval, sizeof(int))) {
+ rc = -EFAULT;
+ break;
+ }
+
+ smc->limit_smc_hs = !!val;
+ rc = 0;
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ release_sock(sk);
+
+ return rc;
+}
+
static int smc_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
@@ -2441,12 +2768,19 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
if (level == SOL_TCP && optname == TCP_ULP)
return -EOPNOTSUPP;
+ else if (level == SOL_SMC)
+ return __smc_setsockopt(sock, level, optname, optval, optlen);
smc = smc_sk(sk);
/* generic setsockopts reaching us here always apply to the
* CLC socket
*/
+ mutex_lock(&smc->clcsock_release_lock);
+ if (!smc->clcsock) {
+ mutex_unlock(&smc->clcsock_release_lock);
+ return -EBADF;
+ }
if (unlikely(!smc->clcsock->ops->setsockopt))
rc = -EOPNOTSUPP;
else
@@ -2456,6 +2790,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_err = smc->clcsock->sk->sk_err;
sk_error_report(sk);
}
+ mutex_unlock(&smc->clcsock_release_lock);
if (optlen < sizeof(int))
return -EINVAL;
@@ -2472,7 +2807,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
case TCP_FASTOPEN_NO_COOKIE:
/* option not supported by SMC */
if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
- smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
+ rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
} else {
rc = -EINVAL;
}
@@ -2483,8 +2818,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_state != SMC_CLOSED) {
if (val) {
SMC_STAT_INC(smc, ndly_cnt);
- mod_delayed_work(smc->conn.lgr->tx_wq,
- &smc->conn.tx_work, 0);
+ smc_tx_pending(&smc->conn);
+ cancel_delayed_work(&smc->conn.tx_work);
}
}
break;
@@ -2494,8 +2829,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_state != SMC_CLOSED) {
if (!val) {
SMC_STAT_INC(smc, cork_cnt);
- mod_delayed_work(smc->conn.lgr->tx_wq,
- &smc->conn.tx_work, 0);
+ smc_tx_pending(&smc->conn);
+ cancel_delayed_work(&smc->conn.tx_work);
}
}
break;
@@ -2515,13 +2850,26 @@ static int smc_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct smc_sock *smc;
+ int rc;
+
+ if (level == SOL_SMC)
+ return __smc_getsockopt(sock, level, optname, optval, optlen);
smc = smc_sk(sock->sk);
+ mutex_lock(&smc->clcsock_release_lock);
+ if (!smc->clcsock) {
+ mutex_unlock(&smc->clcsock_release_lock);
+ return -EBADF;
+ }
/* socket options apply to the CLC socket */
- if (unlikely(!smc->clcsock->ops->getsockopt))
+ if (unlikely(!smc->clcsock->ops->getsockopt)) {
+ mutex_unlock(&smc->clcsock_release_lock);
return -EOPNOTSUPP;
- return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
- optval, optlen);
+ }
+ rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+ mutex_unlock(&smc->clcsock_release_lock);
+ return rc;
}
static int smc_ioctl(struct socket *sock, unsigned int cmd,
@@ -2623,8 +2971,10 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page,
rc = kernel_sendpage(smc->clcsock, page, offset,
size, flags);
} else {
+ lock_sock(sk);
+ rc = smc_tx_sendpage(smc, page, offset, size, flags);
+ release_sock(sk);
SMC_STAT_INC(smc, sendpage_cnt);
- rc = sock_no_sendpage(sock, page, offset, size, flags);
}
out:
@@ -2733,6 +3083,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol,
smc->use_fallback = false; /* assume rdma capability first */
smc->fallback_rsn = 0;
+ /* default behavior is taken from limit_smc_hs of the net namespace */
+ smc->limit_smc_hs = net->smc.limit_smc_hs;
+
rc = 0;
if (!clcsock) {
rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
@@ -2826,11 +3179,17 @@ unsigned int smc_net_id;
static __net_init int smc_net_init(struct net *net)
{
+ int rc;
+
+ rc = smc_sysctl_net_init(net);
+ if (rc)
+ return rc;
return smc_pnet_net_init(net);
}
static void __net_exit smc_net_exit(struct net *net)
{
+ smc_sysctl_net_exit(net);
smc_pnet_net_exit(net);
}
@@ -2880,9 +3239,14 @@ static int __init smc_init(void)
goto out_nl;
rc = -ENOMEM;
+
+ smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
+ if (!smc_tcp_ls_wq)
+ goto out_pnet;
+
smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
if (!smc_hs_wq)
- goto out_pnet;
+ goto out_alloc_tcp_ls_wq;
smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
if (!smc_close_wq)
@@ -2935,12 +3299,14 @@ static int __init smc_init(void)
rc = tcp_register_ulp(&smc_ulp_ops);
if (rc) {
pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
- goto out_sock;
+ goto out_ib;
}
static_branch_enable(&tcp_have_smc);
return 0;
+out_ib:
+ smc_ib_unregister_client();
out_sock:
sock_unregister(PF_SMC);
out_proto6:
@@ -2953,6 +3319,8 @@ out_alloc_wqs:
destroy_workqueue(smc_close_wq);
out_alloc_hs_wq:
destroy_workqueue(smc_hs_wq);
+out_alloc_tcp_ls_wq:
+ destroy_workqueue(smc_tcp_ls_wq);
out_pnet:
smc_pnet_exit();
out_nl:
@@ -2971,6 +3339,7 @@ static void __exit smc_exit(void)
smc_core_exit();
smc_ib_unregister_client();
destroy_workqueue(smc_close_wq);
+ destroy_workqueue(smc_tcp_ls_wq);
destroy_workqueue(smc_hs_wq);
proto_unregister(&smc_proto6);
proto_unregister(&smc_proto);
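The af_smc.c changes above also expose the per-socket handshake limitation through setsockopt()/getsockopt() at the SOL_SMC level (__smc_setsockopt()/__smc_getsockopt()). Below is a minimal userspace sketch of toggling it on a socket before listen(); the numeric fallbacks for AF_SMC, SOL_SMC and SMC_LIMIT_HS are assumptions based on the uapi headers that accompany this series and are guarded with #ifndef so the real definitions take precedence.

#include <stdio.h>
#include <sys/socket.h>

#ifndef AF_SMC
#define AF_SMC		43	/* assumed value, see include/linux/socket.h */
#endif
#ifndef SOL_SMC
#define SOL_SMC		286	/* assumed value, see include/linux/socket.h */
#endif
#ifndef SMC_LIMIT_HS
#define SMC_LIMIT_HS	1	/* assumed value, see include/uapi/linux/smc.h */
#endif

int main(void)
{
	int fd = socket(AF_SMC, SOCK_STREAM, 0);
	int val = 1;
	socklen_t len = sizeof(val);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* ask __smc_setsockopt() to limit SMC handshakes for this socket */
	if (setsockopt(fd, SOL_SMC, SMC_LIMIT_HS, &val, sizeof(val)))
		perror("setsockopt SMC_LIMIT_HS");
	/* read the value back through __smc_getsockopt() */
	if (!getsockopt(fd, SOL_SMC, SMC_LIMIT_HS, &val, &len))
		printf("SMC_LIMIT_HS = %d\n", val);
	return 0;
}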
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 3d0b8e300deb..ea0620529ebe 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -14,6 +14,7 @@
#include <linux/socket.h>
#include <linux/types.h>
#include <linux/compiler.h> /* __aligned */
+#include <net/genetlink.h>
#include <net/sock.h>
#include "smc_ib.h"
@@ -28,6 +29,7 @@
#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM
* devices
*/
+#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */
extern struct proto smc_proto;
extern struct proto smc_proto6;
@@ -139,6 +141,12 @@ enum smc_urg_state {
SMC_URG_READ = 3, /* data was already read */
};
+struct smc_mark_woken {
+ bool woken;
+ void *key;
+ wait_queue_entry_t wait_entry;
+};
+
struct smc_connection {
struct rb_node alert_node;
struct smc_link_group *lgr; /* link group of connection */
@@ -185,6 +193,7 @@ struct smc_connection {
* - dec on polled tx cqe
*/
wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
+ atomic_t tx_pushing; /* nr_threads trying tx push */
struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
u32 tx_off; /* base offset in peer rmb */
@@ -204,6 +213,10 @@ struct smc_connection {
* data still pending
*/
char urg_rx_byte; /* urgent byte */
+ bool tx_in_release_sock;
+ /* flush pending tx data in
+ * sock release_cb()
+ */
atomic_t bytes_to_rcv; /* arrived data,
* not yet received
*/
@@ -228,8 +241,14 @@ struct smc_connection {
struct smc_sock { /* smc sock container */
struct sock sk;
struct socket *clcsock; /* internal tcp socket */
+ void (*clcsk_state_change)(struct sock *sk);
+ /* original stat_change fct. */
void (*clcsk_data_ready)(struct sock *sk);
- /* original data_ready fct. **/
+ /* original data_ready fct. */
+ void (*clcsk_write_space)(struct sock *sk);
+ /* original write_space fct. */
+ void (*clcsk_error_report)(struct sock *sk);
+ /* original error_report fct. */
struct smc_connection conn; /* smc connection */
struct smc_sock *listen_smc; /* listen parent */
struct work_struct connect_work; /* handle non-blocking connect*/
@@ -237,9 +256,14 @@ struct smc_sock { /* smc sock container */
struct work_struct smc_listen_work;/* prepare new accept socket */
struct list_head accept_q; /* sockets to be accepted */
spinlock_t accept_q_lock; /* protects accept_q */
+ bool limit_smc_hs; /* put constraint on handshake */
bool use_fallback; /* fallback to tcp */
int fallback_rsn; /* reason for fallback */
u32 peer_diagnosis; /* decline reason from peer */
+ atomic_t queued_smc_hs; /* queued smc handshakes */
+ struct inet_connection_sock_af_ops af_ops;
+ const struct inet_connection_sock_af_ops *ori_af_ops;
+ /* original af ops */
int sockopt_defer_accept;
/* sockopt TCP_DEFER_ACCEPT
* value
@@ -264,6 +288,12 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
return (struct smc_sock *)sk;
}
+static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk)
+{
+ return (struct smc_sock *)
+ ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY);
+}
+
extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
extern struct workqueue_struct *smc_close_wq; /* wq for close work */
@@ -313,4 +343,9 @@ void smc_fill_gid_list(struct smc_link_group *lgr,
struct smc_gidlist *gidlist,
struct smc_ib_device *known_dev, u8 *known_gid);
+/* smc handshake limitation interface for netlink */
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+
#endif /* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 9d5a97168969..5c731f27996e 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -48,9 +48,19 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
}
- if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
- unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
- wake_up(&conn->cdc_pend_tx_wq);
+ if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) {
+ /* If the user owns the sock lock, mark the connection as needing a
+ * send. The user context will try to send later when it releases the
+ * sock lock in smc_release_cb().
+ */
+ if (sock_owned_by_user(&smc->sk))
+ conn->tx_in_release_sock = true;
+ else
+ smc_tx_pending(conn);
+
+ if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
+ wake_up(&conn->cdc_pend_tx_wq);
+ }
WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
smc_tx_sndbuf_nonfull(smc);
@@ -350,8 +360,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
if ((diff_cons && smc_tx_prepared_sends(conn)) ||
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
- conn->local_rx_ctrl.prod_flags.urg_data_pending)
- smc_tx_sndbuf_nonempty(conn);
+ conn->local_rx_ctrl.prod_flags.urg_data_pending) {
+ if (!sock_owned_by_user(&smc->sk))
+ smc_tx_pending(conn);
+ else
+ conn->tx_in_release_sock = true;
+ }
if (diff_cons && conn->urg_tx_pend &&
atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 292e4d904ab6..676cb2333d3c 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -57,6 +57,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
if (!smc_tx_prepared_sends(&smc->conn))
return;
+ /* Send out corked data remaining in sndbuf */
+ smc_tx_pending(&smc->conn);
+
smc->wait_close_tx_prepared = 1;
add_wait_queue(sk_sleep(sk), &wait);
while (!signal_pending(current) && timeout) {
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 29525d03b253..f40f6ed0fbdb 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1161,8 +1161,8 @@ void smc_conn_free(struct smc_connection *conn)
cancel_work_sync(&conn->abort_work);
}
if (!list_empty(&lgr->list)) {
- smc_lgr_unregister_conn(conn);
smc_buf_unuse(conn, lgr); /* allow buffer reuse */
+ smc_lgr_unregister_conn(conn);
}
if (!lgr->conns_num)
@@ -1864,7 +1864,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
(ini->smcd_version == SMC_V2 ||
lgr->vlan_id == ini->vlan_id) &&
(role == SMC_CLNT || ini->is_smcd ||
- lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
+ (lgr->conns_num < SMC_RMBS_PER_LGR_MAX &&
+ !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) {
/* link group found */
ini->first_contact_local = 0;
conn->lgr = lgr;
@@ -1988,7 +1989,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
*/
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
- return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
+ return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}
/* map an rmb buf to a link */
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index b8898c787d23..1fca2f90a9c7 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -146,13 +146,11 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
(req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) &&
!list_empty(&smc->conn.lgr->list)) {
struct smc_link *link = smc->conn.lnk;
- struct net *net = read_pnet(&link->smcibdev->ibdev->coredev.rdma_net);
struct smc_diag_lgrinfo linfo = {
.role = smc->conn.lgr->role,
.lnk[0].ibport = link->ibport,
.lnk[0].link_id = link->link_id,
- .lnk[0].net_cookie = net->net_cookie,
};
memcpy(linfo.lnk[0].ibname,
diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c
index f13ab0661ed5..c5a62f6f52ba 100644
--- a/net/smc/smc_netlink.c
+++ b/net/smc/smc_netlink.c
@@ -111,6 +111,21 @@ static const struct genl_ops smc_gen_nl_ops[] = {
.flags = GENL_ADMIN_PERM,
.doit = smc_nl_disable_seid,
},
+ {
+ .cmd = SMC_NETLINK_DUMP_HS_LIMITATION,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_dump_hs_limitation,
+ },
+ {
+ .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_enable_hs_limitation,
+ },
+ {
+ .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_disable_hs_limitation,
+ },
};
static const struct nla_policy smc_gen_nl_policy[2] = {
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 291f1484a1b7..7984f8883472 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -113,7 +113,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name)
pnettable = &sn->pnettable;
/* remove table entry */
- write_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist,
list) {
if (!pnet_name ||
@@ -131,7 +131,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name)
rc = 0;
}
}
- write_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
/* if this is not the initial namespace, stop here */
if (net != &init_net)
@@ -192,7 +192,7 @@ static int smc_pnet_add_by_ndev(struct net_device *ndev)
sn = net_generic(net, smc_net_id);
pnettable = &sn->pnettable;
- write_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev &&
!strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) {
@@ -206,7 +206,7 @@ static int smc_pnet_add_by_ndev(struct net_device *ndev)
break;
}
}
- write_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
return rc;
}
@@ -224,7 +224,7 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev)
sn = net_generic(net, smc_net_id);
pnettable = &sn->pnettable;
- write_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) {
dev_put_track(pnetelem->ndev, &pnetelem->dev_tracker);
@@ -237,7 +237,7 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev)
break;
}
}
- write_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
return rc;
}
@@ -368,12 +368,9 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net,
new_pe->type = SMC_PNET_ETH;
memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
strncpy(new_pe->eth_name, eth_name, IFNAMSIZ);
- new_pe->ndev = ndev;
- if (ndev)
- netdev_tracker_alloc(ndev, &new_pe->dev_tracker, GFP_KERNEL);
rc = -EEXIST;
new_netdev = true;
- write_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
if (tmp_pe->type == SMC_PNET_ETH &&
!strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) {
@@ -382,10 +379,15 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net,
}
}
if (new_netdev) {
+ if (ndev) {
+ new_pe->ndev = ndev;
+ netdev_tracker_alloc(ndev, &new_pe->dev_tracker,
+ GFP_ATOMIC);
+ }
list_add_tail(&new_pe->list, &pnettable->pnetlist);
- write_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
} else {
- write_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
kfree(new_pe);
goto out_put;
}
@@ -446,7 +448,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
new_pe->ib_port = ib_port;
new_ibdev = true;
- write_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
if (tmp_pe->type == SMC_PNET_IB &&
!strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) {
@@ -456,9 +458,9 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
}
if (new_ibdev) {
list_add_tail(&new_pe->list, &pnettable->pnetlist);
- write_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
} else {
- write_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
kfree(new_pe);
}
return (new_ibdev) ? 0 : -EEXIST;
@@ -603,7 +605,7 @@ static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid,
pnettable = &sn->pnettable;
/* dump pnettable entries */
- read_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry(pnetelem, &pnettable->pnetlist, list) {
if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid))
continue;
@@ -618,7 +620,7 @@ static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid,
break;
}
}
- read_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
return idx;
}
@@ -862,12 +864,15 @@ int smc_pnet_net_init(struct net *net)
struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev;
INIT_LIST_HEAD(&pnettable->pnetlist);
- rwlock_init(&pnettable->lock);
+ mutex_init(&pnettable->lock);
INIT_LIST_HEAD(&pnetids_ndev->list);
rwlock_init(&pnetids_ndev->lock);
smc_pnet_create_pnetids_list(net);
+ /* disable handshake limitation by default */
+ net->smc.limit_smc_hs = 0;
+
return 0;
}
@@ -942,7 +947,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev,
sn = net_generic(net, smc_net_id);
pnettable = &sn->pnettable;
- read_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry(pnetelem, &pnettable->pnetlist, list) {
if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) {
/* get pnetid of netdev device */
@@ -951,7 +956,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev,
break;
}
}
- read_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
return rc;
}
@@ -1154,7 +1159,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port)
sn = net_generic(&init_net, smc_net_id);
pnettable = &sn->pnettable;
- read_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
if (tmp_pe->type == SMC_PNET_IB &&
!strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) &&
@@ -1164,7 +1169,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port)
break;
}
}
- read_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
return rc;
}
@@ -1183,7 +1188,7 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev)
sn = net_generic(&init_net, smc_net_id);
pnettable = &sn->pnettable;
- read_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
if (tmp_pe->type == SMC_PNET_IB &&
!strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) {
@@ -1192,7 +1197,7 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev)
break;
}
}
- read_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
return rc;
}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
index 14039272f7e4..80a88eea4949 100644
--- a/net/smc/smc_pnet.h
+++ b/net/smc/smc_pnet.h
@@ -29,7 +29,7 @@ struct smc_link_group;
* @pnetlist: List of PNETIDs
*/
struct smc_pnettable {
- rwlock_t lock;
+ struct mutex lock;
struct list_head pnetlist;
};
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
new file mode 100644
index 000000000000..cf3ab1334c00
--- /dev/null
+++ b/net/smc/smc_sysctl.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * smc_sysctl.c: sysctl interface to SMC subsystem.
+ *
+ * Copyright (c) 2022, Alibaba Inc.
+ *
+ * Author: Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+
+#include "smc.h"
+#include "smc_sysctl.h"
+
+static struct ctl_table smc_table[] = {
+ {
+ .procname = "autocorking_size",
+ .data = &init_net.smc.sysctl_autocorking_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+int __net_init smc_sysctl_net_init(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = smc_table;
+ if (!net_eq(net, &init_net)) {
+ int i;
+
+ table = kmemdup(table, sizeof(smc_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++)
+ table[i].data += (void *)net - (void *)&init_net;
+ }
+
+ net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table);
+ if (!net->smc.smc_hdr)
+ goto err_reg;
+
+ net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
+
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+void __net_exit smc_sysctl_net_exit(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->smc.smc_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->smc.smc_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
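smc_sysctl_net_init() above registers the table under "net/smc", so the autocorking threshold is exposed per net namespace as /proc/sys/net/smc/autocorking_size and defaults to SMC_AUTOCORKING_DEFAULT_SIZE (64K). A small, hypothetical userspace check of the knob:

#include <stdio.h>

int main(void)
{
	unsigned int size;
	FILE *f = fopen("/proc/sys/net/smc/autocorking_size", "r");

	if (!f) {
		perror("open autocorking_size");
		return 1;
	}
	if (fscanf(f, "%u", &size) == 1)
		printf("autocorking_size = %u bytes\n", size);
	fclose(f);
	return 0;
}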
diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h
new file mode 100644
index 000000000000..0becc11bd2f4
--- /dev/null
+++ b/net/smc/smc_sysctl.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * smc_sysctl.c: sysctl interface to SMC subsystem.
+ *
+ * Copyright (c) 2022, Alibaba Inc.
+ *
+ * Author: Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#ifndef _SMC_SYSCTL_H
+#define _SMC_SYSCTL_H
+
+#ifdef CONFIG_SYSCTL
+
+int __net_init smc_sysctl_net_init(struct net *net);
+void __net_exit smc_sysctl_net_exit(struct net *net);
+
+#else
+
+static inline int smc_sysctl_net_init(struct net *net)
+{
+ net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
+ return 0;
+}
+
+static inline void smc_sysctl_net_exit(struct net *net) { }
+
+#endif /* CONFIG_SYSCTL */
+
+#endif /* _SMC_SYSCTL_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index be241d53020f..98ca9229fe87 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -31,7 +31,6 @@
#include "smc_tracepoint.h"
#define SMC_TX_WORK_DELAY 0
-#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */
/***************************** sndbuf producer *******************************/
@@ -132,6 +131,51 @@ static bool smc_tx_is_corked(struct smc_sock *smc)
return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
}
+/* If we have pending CDC messages, do not send:
+ * the CQE of the outstanding CDC message will arrive shortly, which gives
+ * us a chance to coalesce future sendmsg() payloads into one RDMA write,
+ * without the need for a timer and with no latency trade-off.
+ * Algorithm:
+ * 1. The first message should never cork.
+ * 2. If we have pending Tx CDC messages, wait for the first CDC
+ * message's completion.
+ * 3. Don't cork too much data into a single RDMA write, to prevent bursty
+ * traffic; the total corked data should not exceed sndbuf/2.
+ */
+static bool smc_should_autocork(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ int corking_size;
+
+ corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1,
+ sock_net(&smc->sk)->smc.sysctl_autocorking_size);
+
+ if (atomic_read(&conn->cdc_pend_tx_wr) == 0 ||
+ smc_tx_prepared_sends(conn) > corking_size)
+ return false;
+ return true;
+}
+
+static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg)
+{
+ struct smc_connection *conn = &smc->conn;
+
+ if (smc_should_autocork(smc))
+ return true;
+
+ /* for a corked socket defer the RDMA writes if
+ * sndbuf_space is still available. The application
+ * should know how/when to uncork it.
+ */
+ if ((msg->msg_flags & MSG_MORE ||
+ smc_tx_is_corked(smc) ||
+ msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
+ atomic_read(&conn->sndbuf_space))
+ return true;
+
+ return false;
+}
+
/* sndbuf producer: main API called by socket layer.
* called under sock lock.
*/
@@ -236,15 +280,10 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
*/
if ((msg->msg_flags & MSG_OOB) && !send_remaining)
conn->urg_tx_pend = true;
- if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
- (atomic_read(&conn->sndbuf_space) >
- (conn->sndbuf_desc->len >> 1)))
- /* for a corked socket defer the RDMA writes if there
- * is still sufficient sndbuf_space available
- */
- queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
- SMC_TX_CORK_DELAY);
- else
+ /* If we need to cork, do nothing and wait for the next
+ * sendmsg() call or push on tx completion
+ */
+ if (!smc_tx_should_cork(smc, msg))
smc_tx_sndbuf_nonempty(conn);
trace_smc_tx_sendmsg(smc, copylen);
@@ -260,6 +299,22 @@ out_err:
return rc;
}
+int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct msghdr msg = {.msg_flags = flags};
+ char *kaddr = kmap(page);
+ struct kvec iov;
+ int rc;
+
+ iov.iov_base = kaddr + offset;
+ iov.iov_len = size;
+ iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size);
+ rc = smc_tx_sendmsg(smc, &msg, size);
+ kunmap(page);
+ return rc;
+}
+
/***************************** sndbuf consumer *******************************/
/* sndbuf consumer: actual data transfer of one target chunk with ISM write */
@@ -576,13 +631,26 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
return rc;
}
-int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn)
{
- int rc;
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ int rc = 0;
+
+ /* No data in the send queue */
+ if (unlikely(smc_tx_prepared_sends(conn) <= 0))
+ goto out;
+
+ /* Peer doesn't have RMBE space */
+ if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) {
+ SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
+ goto out;
+ }
if (conn->killed ||
- conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
- return -EPIPE; /* connection being aborted */
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+ rc = -EPIPE; /* connection being aborted */
+ goto out;
+ }
if (conn->lgr->is_smcd)
rc = smcd_tx_sndbuf_nonempty(conn);
else
@@ -590,34 +658,72 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
if (!rc) {
/* trigger socket release if connection is closing */
- struct smc_sock *smc = container_of(conn, struct smc_sock,
- conn);
smc_close_wake_tx_prepared(smc);
}
+
+out:
+ return rc;
+}
+
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ int rc;
+
+ /* This makes sure that only one sender runs at a time, to avoid wasting
+ * CPU and CDC slots.
+ * Record whether someone has tried to push while we are pushing.
+ */
+ if (atomic_inc_return(&conn->tx_pushing) > 1)
+ return 0;
+
+again:
+ atomic_set(&conn->tx_pushing, 1);
+ smp_wmb(); /* Make sure tx_pushing is 1 before real send */
+ rc = __smc_tx_sndbuf_nonempty(conn);
+
+ /* Check whether someone else has added data to the send queue and
+ * tried to push, but failed after our atomic_set() while we were
+ * pushing.
+ * If so, push again so that this data does not hang in the send
+ * queue.
+ */
+ if (unlikely(!atomic_dec_and_test(&conn->tx_pushing)))
+ goto again;
+
return rc;
}
/* Wakeup sndbuf consumers from process context
- * since there is more data to transmit
+ * since there is more data to transmit. The caller
+ * must hold the sock lock.
*/
-void smc_tx_work(struct work_struct *work)
+void smc_tx_pending(struct smc_connection *conn)
{
- struct smc_connection *conn = container_of(to_delayed_work(work),
- struct smc_connection,
- tx_work);
struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
int rc;
- lock_sock(&smc->sk);
if (smc->sk.sk_err)
- goto out;
+ return;
rc = smc_tx_sndbuf_nonempty(conn);
if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
!atomic_read(&conn->bytes_to_rcv))
conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
-out:
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit in locked
+ * sock.
+ */
+void smc_tx_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(to_delayed_work(work),
+ struct smc_connection,
+ tx_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ lock_sock(&smc->sk);
+ smc_tx_pending(conn);
release_sock(&smc->sk);
}
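For reference, the tx_pushing counter added to smc_tx_sndbuf_nonempty() implements a single-pusher-with-retry idiom: the first caller to raise the counter above zero performs the push, while concurrent callers only increment the counter, which makes the active pusher loop once more. A stand-alone sketch of the same idiom, using C11 atomics rather than kernel primitives and purely for illustration:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int tx_pushing;

/* stand-in for __smc_tx_sndbuf_nonempty() */
static int do_push(void)
{
	return 0;
}

static int push(void)
{
	int rc;

	/* someone is already pushing; our increment tells it to push again */
	if (atomic_fetch_add(&tx_pushing, 1) + 1 > 1)
		return 0;
again:
	atomic_store(&tx_pushing, 1);
	rc = do_push();
	/* a racer bumped the counter while we pushed, so push once more */
	if (atomic_fetch_sub(&tx_pushing, 1) - 1 != 0)
		goto again;
	return rc;
}

int main(void)
{
	printf("push rc = %d\n", push());
	return 0;
}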
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 07e6ad76224a..34b578498b1f 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -27,9 +27,12 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
}
+void smc_tx_pending(struct smc_connection *conn);
void smc_tx_work(struct work_struct *work);
void smc_tx_init(struct smc_sock *smc);
int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
+int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
+ size_t size, int flags);
int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
void smc_tx_consumer_update(struct smc_connection *conn, bool force);