-rw-r--r--  Documentation/networking/smc-sysctl.rst    23
-rw-r--r--  include/net/netns/smc.h                     4
-rw-r--r--  net/smc/Makefile                            2
-rw-r--r--  net/smc/af_smc.c                           30
-rw-r--r--  net/smc/smc.h                               6
-rw-r--r--  net/smc/smc_cdc.c                          24
-rw-r--r--  net/smc/smc_core.c                          2
-rw-r--r--  net/smc/smc_sysctl.c                       80
-rw-r--r--  net/smc/smc_sysctl.h                       32
-rw-r--r--  net/smc/smc_tx.c                          107
-rw-r--r--  net/smc/smc_wr.c                           49
11 files changed, 317 insertions(+), 42 deletions(-)
diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst
new file mode 100644
index 000000000000..c53f8c61c9e4
--- /dev/null
+++ b/Documentation/networking/smc-sysctl.rst
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+SMC Sysctl
+==========
+
+/proc/sys/net/smc/* Variables
+==============================
+
+autocorking_size - INTEGER
+	Setting SMC auto corking size:
+	SMC auto corking is like TCP auto corking from the application's
+	point of view. When an application makes consecutive small
+	write()/sendmsg() system calls, we try to coalesce these small writes
+	as much as possible, to lower the total number of CDC messages and
+	RDMA Writes sent.
+	autocorking_size limits the maximum number of corked bytes that can
+	be sent to the underlying device in a single send. If set to 0, SMC
+	auto corking is disabled.
+	Applications can still use TCP_CORK for optimal behavior when they
+	know how/when to uncork their sockets.
+
+ Default: 64K
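
For illustration, a minimal userspace sketch of driving this knob through
procfs; the path follows the documentation above, and the 128K value is an
arbitrary example:

/* Read, then raise, /proc/sys/net/smc/autocorking_size. */
#include <stdio.h>

int main(void)
{
	unsigned int size;
	FILE *f = fopen("/proc/sys/net/smc/autocorking_size", "r+");

	if (!f)
		return 1;
	if (fscanf(f, "%u", &size) == 1)
		printf("autocorking_size: %u\n", size);
	rewind(f);
	fprintf(f, "%u\n", 128 * 1024);	/* e.g. cork up to 128K per send */
	fclose(f);
	return 0;
}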
diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index 47b166684fd8..e5389eeaf8bd 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -14,5 +14,9 @@ struct netns_smc {
struct smc_stats_rsn *fback_rsn;
bool limit_smc_hs; /* constraint on handshake */
+#ifdef CONFIG_SYSCTL
+ struct ctl_table_header *smc_hdr;
+#endif
+ unsigned int sysctl_autocorking_size;
};
#endif
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 196fb6f01b14..640af9a39f9c 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o
obj-$(CONFIG_SMC_DIAG) += smc_diag.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
-smc-y += smc_tracepoint.o
+smc-y += smc_tracepoint.o smc_sysctl.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 30acc31b2c45..6447607675fa 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -51,6 +51,7 @@
#include "smc_close.h"
#include "smc_stats.h"
#include "smc_tracepoint.h"
+#include "smc_sysctl.h"
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
* creation on server
@@ -192,12 +193,27 @@ void smc_unhash_sk(struct sock *sk)
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);
+/* This will be called before the user really releases the sock lock.
+ * Do the work here that the BH context could not do because the user
+ * held the sock lock
+ */
+static void smc_release_cb(struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ if (smc->conn.tx_in_release_sock) {
+ smc_tx_pending(&smc->conn);
+ smc->conn.tx_in_release_sock = false;
+ }
+}
+
struct proto smc_proto = {
.name = "SMC",
.owner = THIS_MODULE,
.keepalive = smc_set_keepalive,
.hash = smc_hash_sk,
.unhash = smc_unhash_sk,
+ .release_cb = smc_release_cb,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v4_hashinfo,
.slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -210,6 +226,7 @@ struct proto smc_proto6 = {
.keepalive = smc_set_keepalive,
.hash = smc_hash_sk,
.unhash = smc_unhash_sk,
+ .release_cb = smc_release_cb,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v6_hashinfo,
.slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -2795,8 +2812,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_state != SMC_CLOSED) {
if (val) {
SMC_STAT_INC(smc, ndly_cnt);
- mod_delayed_work(smc->conn.lgr->tx_wq,
- &smc->conn.tx_work, 0);
+ smc_tx_pending(&smc->conn);
+ cancel_delayed_work(&smc->conn.tx_work);
}
}
break;
@@ -3273,9 +3290,17 @@ static int __init smc_init(void)
goto out_sock;
}
+ rc = smc_sysctl_init();
+ if (rc) {
+ pr_err("%s: sysctl_init fails with %d\n", __func__, rc);
+ goto out_ulp;
+ }
+
static_branch_enable(&tcp_have_smc);
return 0;
+out_ulp:
+ tcp_unregister_ulp(&smc_ulp_ops);
out_sock:
sock_unregister(PF_SMC);
out_proto6:
@@ -3303,6 +3328,7 @@ out_pernet_subsys:
static void __exit smc_exit(void)
{
static_branch_disable(&tcp_have_smc);
+ smc_sysctl_exit();
tcp_unregister_ulp(&smc_ulp_ops);
sock_unregister(PF_SMC);
smc_core_exit();
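
For context on where smc_release_cb() runs: release_sock() invokes the
protocol's release_cb after flushing the backlog that accumulated while the
lock was owned. A simplified sketch of that path, from memory of
net/core/sock.c in this era (see the real source for details):

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);	/* run packets queued while owned */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);	/* -> smc_release_cb() */
	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}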
diff --git a/net/smc/smc.h b/net/smc/smc.h
index a096d8af21a0..ea0620529ebe 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -29,6 +29,7 @@
#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM
* devices
*/
+#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */
extern struct proto smc_proto;
extern struct proto smc_proto6;
@@ -192,6 +193,7 @@ struct smc_connection {
* - dec on polled tx cqe
*/
wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
+ atomic_t tx_pushing; /* nr_threads trying tx push */
struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
u32 tx_off; /* base offset in peer rmb */
@@ -211,6 +213,10 @@ struct smc_connection {
* data still pending
*/
char urg_rx_byte; /* urgent byte */
+ bool tx_in_release_sock;
+ /* flush pending tx data in
+ * sock release_cb()
+ */
atomic_t bytes_to_rcv; /* arrived data,
* not yet received
*/
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 9d5a97168969..5c731f27996e 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -48,9 +48,19 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
}
- if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
- unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
- wake_up(&conn->cdc_pend_tx_wq);
+ if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) {
+ /* If the user owns the sock lock, mark that the connection needs
+ * sending. The user context will send later, when it releases the
+ * sock lock, in smc_release_cb()
+ */
+ if (sock_owned_by_user(&smc->sk))
+ conn->tx_in_release_sock = true;
+ else
+ smc_tx_pending(conn);
+
+ if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
+ wake_up(&conn->cdc_pend_tx_wq);
+ }
WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
smc_tx_sndbuf_nonfull(smc);
@@ -350,8 +360,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
if ((diff_cons && smc_tx_prepared_sends(conn)) ||
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
- conn->local_rx_ctrl.prod_flags.urg_data_pending)
- smc_tx_sndbuf_nonempty(conn);
+ conn->local_rx_ctrl.prod_flags.urg_data_pending) {
+ if (!sock_owned_by_user(&smc->sk))
+ smc_tx_pending(conn);
+ else
+ conn->tx_in_release_sock = true;
+ }
if (diff_cons && conn->urg_tx_pend &&
atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 29525d03b253..1674b2549f8b 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1988,7 +1988,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
*/
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
- return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
+ return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}
/* map an rmb buf to a link */
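
For intuition on the min_t() to max_t() change: the window-update limit is
meant to be a floor, not a cap. A worked sketch, assuming SOCK_MIN_SNDBUF is
4608 bytes (its usual value in kernels of this era; treat the exact number as
an assumption):

/* Worked example for smc_rmb_wnd_update_limit(), assuming
 * SOCK_MIN_SNDBUF == 4608, so SOCK_MIN_SNDBUF / 2 == 2304:
 *
 *   16K RMBE: rmbe_size / 10 = 1638 -> max(1638, 2304) = 2304
 *   64K RMBE: rmbe_size / 10 = 6553 -> max(6553, 2304) = 6553
 *
 * With the old min_t(), the limit never exceeded 2304, so large RMBEs
 * sent consumer cursor updates far more often than the intended
 * rmbe_size / 10.
 */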
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
new file mode 100644
index 000000000000..3b59876aaac9
--- /dev/null
+++ b/net/smc/smc_sysctl.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * smc_sysctl.c: sysctl interface to SMC subsystem.
+ *
+ * Copyright (c) 2022, Alibaba Inc.
+ *
+ * Author: Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+
+#include "smc.h"
+#include "smc_sysctl.h"
+
+static struct ctl_table smc_table[] = {
+ {
+ .procname = "autocorking_size",
+ .data = &init_net.smc.sysctl_autocorking_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+static __net_init int smc_sysctl_init_net(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = smc_table;
+ if (!net_eq(net, &init_net)) {
+ int i;
+
+ table = kmemdup(table, sizeof(smc_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++)
+ table[i].data += (void *)net - (void *)&init_net;
+ }
+
+ net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table);
+ if (!net->smc.smc_hdr)
+ goto err_reg;
+
+ net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
+
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static __net_exit void smc_sysctl_exit_net(struct net *net)
+{
+ unregister_net_sysctl_table(net->smc.smc_hdr);
+}
+
+static struct pernet_operations smc_sysctl_ops __net_initdata = {
+ .init = smc_sysctl_init_net,
+ .exit = smc_sysctl_exit_net,
+};
+
+int __init smc_sysctl_init(void)
+{
+ return register_pernet_subsys(&smc_sysctl_ops);
+}
+
+void smc_sysctl_exit(void)
+{
+ unregister_pernet_subsys(&smc_sysctl_ops);
+}
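
The kmemdup() plus pointer-offset idiom in smc_sysctl_init_net() is the usual
way a per-netns copy of a ctl_table is pointed at its own netns storage: .data
initially references the field inside init_net, and adding the byte distance
between the two struct net instances rebases it. A sketch of just that
arithmetic:

/* Rebase a ctl_table .data pointer from init_net into 'net'.
 * The byte offset that separates the two struct net instances also
 * separates the corresponding fields inside them.
 */
ptrdiff_t off = (void *)net - (void *)&init_net;

table[i].data += off;	/* &init_net.smc.sysctl_autocorking_size
			 * becomes &net->smc.sysctl_autocorking_size
			 */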
diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h
new file mode 100644
index 000000000000..49553ac236b6
--- /dev/null
+++ b/net/smc/smc_sysctl.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * smc_sysctl.h: sysctl interface to SMC subsystem.
+ *
+ * Copyright (c) 2022, Alibaba Inc.
+ *
+ * Author: Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#ifndef _SMC_SYSCTL_H
+#define _SMC_SYSCTL_H
+
+#ifdef CONFIG_SYSCTL
+
+int smc_sysctl_init(void);
+void smc_sysctl_exit(void);
+
+#else
+
+static inline int smc_sysctl_init(void)
+{
+ return 0;
+}
+
+static inline void smc_sysctl_exit(void) { }
+
+#endif /* CONFIG_SYSCTL */
+
+#endif /* _SMC_SYSCTL_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 436ac836f363..257dc0d0aeb1 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -131,6 +131,51 @@ static bool smc_tx_is_corked(struct smc_sock *smc)
return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
}
+/* If we have pending CDC messages, do not send:
+ * Because the CQE of this CDC message will arrive shortly, it gives
+ * us a chance to coalesce future sendmsg() payloads into one RDMA
+ * Write, without needing a timer and with no latency trade-off.
+ * The algorithm here:
+ * 1. The first message should never be corked
+ * 2. If we have pending Tx CDC messages, wait for the first CDC
+ * message's completion
+ * 3. Don't cork too much data into a single RDMA Write, to prevent
+ * burst traffic; the total corked data should not exceed sendbuf/2
+ */
+static bool smc_should_autocork(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ int corking_size;
+
+ corking_size = min(sock_net(&smc->sk)->smc.sysctl_autocorking_size,
+ conn->sndbuf_desc->len >> 1);
+
+ if (atomic_read(&conn->cdc_pend_tx_wr) == 0 ||
+ smc_tx_prepared_sends(conn) > corking_size)
+ return false;
+ return true;
+}
+
+static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg)
+{
+ struct smc_connection *conn = &smc->conn;
+
+ if (smc_should_autocork(smc))
+ return true;
+
+ /* for a corked socket defer the RDMA writes if
+ * sndbuf_space is still available. The applications
+ * should know how/when to uncork their sockets.
+ */
+ if ((msg->msg_flags & MSG_MORE ||
+ smc_tx_is_corked(smc) ||
+ msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
+ atomic_read(&conn->sndbuf_space))
+ return true;
+
+ return false;
+}
+
/* sndbuf producer: main API called by socket layer.
* called under sock lock.
*/
@@ -235,13 +280,10 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
*/
if ((msg->msg_flags & MSG_OOB) && !send_remaining)
conn->urg_tx_pend = true;
- /* for a corked socket defer the RDMA writes if
- * sndbuf_space is still available. The applications
- * should known how/when to uncork it.
+ /* If we need to cork, do nothing and wait for the next
+ * sendmsg() call or a push on tx completion
*/
- if (!((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) ||
- msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
- atomic_read(&conn->sndbuf_space)))
+ if (!smc_tx_should_cork(smc, msg))
smc_tx_sndbuf_nonempty(conn);
trace_smc_tx_sendmsg(smc, copylen);
@@ -589,13 +631,26 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
return rc;
}
-int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn)
{
- int rc;
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ int rc = 0;
+
+ /* No data in the send queue */
+ if (unlikely(smc_tx_prepared_sends(conn) <= 0))
+ goto out;
+
+ /* Peer doesn't have RMBE space */
+ if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) {
+ SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
+ goto out;
+ }
if (conn->killed ||
- conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
- return -EPIPE; /* connection being aborted */
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+ rc = -EPIPE; /* connection being aborted */
+ goto out;
+ }
if (conn->lgr->is_smcd)
rc = smcd_tx_sndbuf_nonempty(conn);
else
@@ -603,10 +658,38 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
if (!rc) {
/* trigger socket release if connection is closing */
- struct smc_sock *smc = container_of(conn, struct smc_sock,
- conn);
smc_close_wake_tx_prepared(smc);
}
+
+out:
+ return rc;
+}
+
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ int rc;
+
+ /* This makes sure that only one thread sends at a time, to prevent
+ * wasting CPU and CDC slots.
+ * Record whether someone has tried to push while we are pushing.
+ */
+ if (atomic_inc_return(&conn->tx_pushing) > 1)
+ return 0;
+
+again:
+ atomic_set(&conn->tx_pushing, 1);
+ smp_wmb(); /* Make sure tx_pushing is 1 before real send */
+ rc = __smc_tx_sndbuf_nonempty(conn);
+
+ /* We need to check whether someone else has added some data into
+ * the send queue and tried to push, but failed after the atomic_set()
+ * above, while we were pushing.
+ * If so, we need to push again, to prevent that data from hanging in
+ * the send queue.
+ */
+ if (unlikely(!atomic_dec_and_test(&conn->tx_pushing)))
+ goto again;
+
return rc;
}
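
From the application side, autocorking needs no opt-in: back-to-back small
sends issued while a CDC completion is outstanding may be merged into one
RDMA Write. A hypothetical userspace sketch of the pattern that benefits
('fd' is assumed to be a connected SMC socket; setup omitted):

#include <string.h>
#include <sys/socket.h>

static void send_small_chunks(int fd)
{
	char buf[64];

	memset(buf, 'x', sizeof(buf));
	/* No MSG_MORE, no TCP_CORK: the kernel may still coalesce
	 * these while a previous CDC message is in flight.
	 */
	for (int i = 0; i < 1000; i++)
		send(fd, buf, sizeof(buf), 0);
}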
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 24be1d03fef9..34d616406d51 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -137,25 +137,28 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
{
struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
- int i = 0, rc;
- int polled = 0;
+ int i, rc;
again:
- polled++;
do {
memset(&wc, 0, sizeof(wc));
rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
- if (polled == 1) {
- ib_req_notify_cq(dev->roce_cq_send,
- IB_CQ_NEXT_COMP |
- IB_CQ_REPORT_MISSED_EVENTS);
- }
- if (!rc)
- break;
for (i = 0; i < rc; i++)
smc_wr_tx_process_cqe(&wc[i]);
+ if (rc < SMC_WR_MAX_POLL_CQE)
+ /* If fewer than SMC_WR_MAX_POLL_CQE completions were
+ * returned, the CQ has been drained; no need to poll again
+ */
+ break;
} while (rc > 0);
- if (polled == 1)
+
+ /* With IB_CQ_REPORT_MISSED_EVENTS, a return of 0 from
+ * ib_req_notify_cq() means it is safe to wait for the next event.
+ * Otherwise we must poll the CQ again to avoid missing any events
+ */
+ if (ib_req_notify_cq(dev->roce_cq_send,
+ IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS))
goto again;
}
@@ -478,24 +481,28 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
{
struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
- int polled = 0;
int rc;
again:
- polled++;
do {
memset(&wc, 0, sizeof(wc));
rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
- if (polled == 1) {
- ib_req_notify_cq(dev->roce_cq_recv,
- IB_CQ_SOLICITED_MASK
- | IB_CQ_REPORT_MISSED_EVENTS);
- }
- if (!rc)
+ if (rc > 0)
+ smc_wr_rx_process_cqes(&wc[0], rc);
+ if (rc < SMC_WR_MAX_POLL_CQE)
+ /* If fewer than SMC_WR_MAX_POLL_CQE completions were
+ * returned, the CQ has been drained; no need to poll again
+ */
break;
- smc_wr_rx_process_cqes(&wc[0], rc);
} while (rc > 0);
- if (polled == 1)
+
+ /* With IB_CQ_REPORT_MISSED_EVENTS, a return of 0 from
+ * ib_req_notify_cq() means it is safe to wait for the next event.
+ * Otherwise we must poll the CQ again to avoid missing any events
+ */
+ if (ib_req_notify_cq(dev->roce_cq_recv,
+ IB_CQ_SOLICITED_MASK |
+ IB_CQ_REPORT_MISSED_EVENTS))
goto again;
}
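
The drain-then-re-arm structure now shared by both tasklets is the standard
way to close the race between the last poll and arming the next notification.
In generic form (a sketch with hypothetical BATCH and process() placeholders,
not the SMC code itself):

static void drain_cq(struct ib_cq *cq)
{
	struct ib_wc wc[BATCH];
	int n;

again:
	while ((n = ib_poll_cq(cq, BATCH, wc)) > 0) {
		process(wc, n);		/* handle completed work requests */
		if (n < BATCH)
			break;		/* short batch: CQ is drained */
	}
	/* With IB_CQ_REPORT_MISSED_EVENTS, a positive return means
	 * completions slipped in before re-arming: poll again.
	 */
	if (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				 IB_CQ_REPORT_MISSED_EVENTS) > 0)
		goto again;
}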