author     Linus Torvalds <torvalds@linux-foundation.org>  2018-06-07 04:39:49 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-06-07 04:39:49 +0300
commit     1c8c5a9d38f607c0b6fd12c91cbe1a4418762a21 (patch)
tree       dcc97181d4d187252e0cc8fdf29d9b365fa3ffd0 /net/smc/smc_tx.c
parent     285767604576148fc1be7fcd112e4a90eb0d6ad2 (diff)
parent     7170e6045a6a8b33f4fa5753589dc77b16198e2d (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:

 1) Add Maglev hashing scheduler to IPVS, from Inju Song.

 2) Lots of new TC subsystem tests from Roman Mashak.

 3) Add TCP zero copy receive and fix delayed acks and autotuning with SO_RCVLOWAT, from Eric Dumazet.

 4) Add XDP_REDIRECT support to mlx5 driver, from Jesper Dangaard Brouer.

 5) Add ttl inherit support to vxlan, from Hangbin Liu.

 6) Properly separate ipv6 routes into their logically independent components. fib6_info for the routing table, and fib6_nh for sets of nexthops, which thus can be shared. From David Ahern.

 7) Add bpf_xdp_adjust_tail helper, which can be used to generate ICMP messages from XDP programs. From Nikita V. Shirokov.

 8) Lots of long overdue cleanups to the r8169 driver, from Heiner Kallweit.

 9) Add BTF ("BPF Type Format"), from Martin KaFai Lau.

10) Add traffic condition monitoring to iwlwifi, from Luca Coelho.

11) Plumb extack down into fib_rules, from Roopa Prabhu.

12) Add Flower classifier offload support to igb, from Vinicius Costa Gomes.

13) Add UDP GSO support, from Willem de Bruijn.

14) Add documentation for eBPF helpers, from Quentin Monnet.

15) Add TLS tx offload to mlx5, from Ilya Lesokhin.

16) Allow applications to be given the number of bytes available to read on a socket via a control message returned from recvmsg(), from Soheil Hassas Yeganeh.

17) Add x86_32 eBPF JIT compiler, from Wang YanQing.

18) Add AF_XDP sockets, with zerocopy support infrastructure as well. From Björn Töpel.

19) Remove indirect load support from all of the BPF JITs and handle these operations in the verifier by translating them into native BPF instead. From Daniel Borkmann.

20) Add GRO support to ipv6 gre tunnels, from Eran Ben Elisha.

21) Allow XDP programs to do lookups in the main kernel routing tables for forwarding. From David Ahern.

22) Allow drivers to store hardware state into an ELF section of kernel dump vmcore files, and use it in cxgb4. From Rahul Lakkireddy.

23) Various RACK and loss detection improvements in TCP, from Yuchung Cheng.

24) Add TCP SACK compression, from Eric Dumazet.

25) Add User Mode Helper support and basic bpfilter infrastructure, from Alexei Starovoitov.

26) Support ports and protocol values in RTM_GETROUTE, from Roopa Prabhu.

27) Support bulking in ->ndo_xdp_xmit() API, from Jesper Dangaard Brouer.

28) Add lots of forwarding selftests, from Petr Machata.

29) Add generic network device failover driver, from Sridhar Samudrala.

* ra.kernel.org:/pub/scm/linux/kernel/git/davem/net-next: (1959 commits)
  strparser: Add __strp_unpause and use it in ktls.
  rxrpc: Fix terminal retransmission connection ID to include the channel
  net: hns3: Optimize PF CMDQ interrupt switching process
  net: hns3: Fix for VF mailbox receiving unknown message
  net: hns3: Fix for VF mailbox cannot receiving PF response
  bnx2x: use the right constant
  Revert "net: sched: cls: Fix offloading when ingress dev is vxlan"
  net: dsa: b53: Fix for brcm tag issue in Cygnus SoC
  enic: fix UDP rss bits
  netdev-FAQ: clarify DaveM's position for stable backports
  rtnetlink: validate attributes in do_setlink()
  mlxsw: Add extack messages for port_{un, }split failures
  netdevsim: Add extack error message for devlink reload
  devlink: Add extack to reload and port_{un, }split operations
  net: metrics: add proper netlink validation
  ipmr: fix error path when ipmr_new_table fails
  ip6mr: only set ip6mr_table from setsockopt when ip6mr_new_table succeeds
  net: hns3: remove unused hclgevf_cfg_func_mta_filter
  netfilter: provide udp*_lib_lookup for nf_tproxy
  qed*: Utilize FW 8.37.2.0
  ...
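The smc_tx.c diff below wires urgent data (MSG_OOB) into SMC's transmit path. For orientation, a minimal userspace sketch of urgent data on a stream socket; the calls are plain POSIX and nothing here is SMC-specific:

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

/* sender: one urgent byte, then ordinary in-band data */
static void send_urgent(int fd)
{
	send(fd, "!", 1, MSG_OOB);
	send(fd, "normal", 6, 0);
}

/* receiver: drain in-band data up to the mark, then fetch the OOB byte */
static void recv_urgent(int fd)
{
	char buf[64], urg;
	int at_mark;

	for (;;) {
		if (ioctl(fd, SIOCATMARK, &at_mark) < 0 || at_mark)
			break;
		if (read(fd, buf, sizeof(buf)) <= 0)
			break;
	}
	recv(fd, &urg, 1, MSG_OOB);
}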
Diffstat (limited to 'net/smc/smc_tx.c')
-rw-r--r--  net/smc/smc_tx.c | 111
1 file changed, 74 insertions(+), 37 deletions(-)
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 72f004c9c9b1..cee666400752 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -19,6 +19,7 @@
#include <linux/sched/signal.h>
#include <net/sock.h>
+#include <net/tcp.h>
#include "smc.h"
#include "smc_wr.h"
@@ -26,11 +27,12 @@
#include "smc_tx.h"
#define SMC_TX_WORK_DELAY HZ
+#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */
/***************************** sndbuf producer *******************************/
/* callback implementation for sk.sk_write_space()
- * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * to wakeup sndbuf producers that blocked with smc_tx_wait().
* called under sk_socket lock.
*/
static void smc_tx_write_space(struct sock *sk)
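(HZ >> 2) is a quarter of a second in jiffies whatever HZ is configured to. An equivalent, more self-documenting spelling would use the jiffies helpers; this is illustrative only, the patch itself keeps the shift:

#include <linux/jiffies.h>

#define SMC_TX_CORK_DELAY	msecs_to_jiffies(250)	/* same as HZ >> 2 */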
@@ -54,7 +56,7 @@ static void smc_tx_write_space(struct sock *sk)
}
}
-/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+/* Wakeup sndbuf producers that blocked with smc_tx_wait().
* Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
*/
void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
@@ -64,8 +66,10 @@ void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
smc->sk.sk_write_space(&smc->sk);
}
-/* blocks sndbuf producer until at least one byte of free space available */
-static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
+/* blocks sndbuf producer until at least one byte of free space available
+ * or urgent Byte was consumed
+ */
+static int smc_tx_wait(struct smc_sock *smc, int flags)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct smc_connection *conn = &smc->conn;
@@ -101,20 +105,28 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
break;
}
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- if (atomic_read(&conn->sndbuf_space))
- break; /* at least 1 byte of free space available */
+ if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
+ break; /* at least 1 byte of free & no urgent data */
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk_wait_event(sk, &timeo,
sk->sk_err ||
(sk->sk_shutdown & SEND_SHUTDOWN) ||
smc_cdc_rxed_any_close(conn) ||
- atomic_read(&conn->sndbuf_space),
+ (atomic_read(&conn->sndbuf_space) &&
+ !conn->urg_tx_pend),
&wait);
}
remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
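With MSG_DONTWAIT the send timeout is zero, so smc_tx_wait() falls out of this loop with -EAGAIN instead of sleeping. A sketch of the matching pattern on the userspace side, again plain POSIX with illustrative names:

#include <errno.h>
#include <poll.h>
#include <sys/socket.h>

static ssize_t send_nonblock(int fd, const void *buf, size_t len)
{
	ssize_t n = send(fd, buf, len, MSG_DONTWAIT);

	if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
		struct pollfd pfd = { .fd = fd, .events = POLLOUT };

		poll(&pfd, 1, -1);	/* wait until sndbuf space opens up */
		n = send(fd, buf, len, MSG_DONTWAIT);
	}
	return n;
}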
+static bool smc_tx_is_corked(struct smc_sock *smc)
+{
+ struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);
+
+ return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
+}
+
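The TCP_NAGLE_CORK bit that smc_tx_is_corked() tests is set on the internal CLC/TCP socket when the application uses the standard TCP_CORK option, which SMC passes through to that socket. A minimal sketch of the userspace side:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void send_batched(int fd, const void *hdr, size_t hlen,
			 const void *body, size_t blen)
{
	int on = 1, off = 0;

	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	send(fd, hdr, hlen, 0);		/* held back */
	send(fd, body, blen, 0);	/* still held back */
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));	/* flush */
}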
/* sndbuf producer: main API called by socket layer.
* called under sock lock.
*/
@@ -148,8 +160,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
if (smc_cdc_rxed_any_close(conn))
return send_done ?: -ECONNRESET;
- if (!atomic_read(&conn->sndbuf_space)) {
- rc = smc_tx_wait_memory(smc, msg->msg_flags);
+ if (msg->msg_flags & MSG_OOB)
+ conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;
+
+ if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
+ rc = smc_tx_wait(smc, msg->msg_flags);
if (rc) {
if (send_done)
return send_done;
@@ -159,7 +174,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
}
/* initialize variables for 1st iteration of subsequent loop */
- /* could be just 1 byte, even after smc_tx_wait_memory above */
+ /* could be just 1 byte, even after smc_tx_wait above */
writespace = atomic_read(&conn->sndbuf_space);
/* not more than what user space asked for */
copylen = min_t(size_t, send_remaining, writespace);
@@ -171,8 +186,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
tx_cnt_prep = prep.count;
/* determine chunks where to write into sndbuf */
/* either unwrapped case, or 1st chunk of wrapped case */
- chunk_len = min_t(size_t,
- copylen, conn->sndbuf_size - tx_cnt_prep);
+ chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len -
+ tx_cnt_prep);
chunk_len_sum = chunk_len;
chunk_off = tx_cnt_prep;
smc_sndbuf_sync_sg_for_cpu(conn);
@@ -197,19 +212,30 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
}
smc_sndbuf_sync_sg_for_device(conn);
/* update cursors */
- smc_curs_add(conn->sndbuf_size, &prep, copylen);
+ smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
smc_curs_write(&conn->tx_curs_prep,
smc_curs_read(&prep, conn),
conn);
/* increased in send tasklet smc_cdc_tx_handler() */
smp_mb__before_atomic();
atomic_sub(copylen, &conn->sndbuf_space);
- /* guarantee 0 <= sndbuf_space <= sndbuf_size */
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
smp_mb__after_atomic();
/* since we just produced more new data into sndbuf,
* trigger sndbuf consumer: RDMA write into peer RMBE and CDC
*/
- smc_tx_sndbuf_nonempty(conn);
+ if ((msg->msg_flags & MSG_OOB) && !send_remaining)
+ conn->urg_tx_pend = true;
+ if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
+ (atomic_read(&conn->sndbuf_space) >
+ (conn->sndbuf_desc->len >> 1)))
+ /* for a corked socket defer the RDMA writes if there
+ * is still sufficient sndbuf_space available
+ */
+ schedule_delayed_work(&conn->tx_work,
+ SMC_TX_CORK_DELAY);
+ else
+ smc_tx_sndbuf_nonempty(conn);
} /* while (msg_data_left(msg)) */
return send_done;
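Net effect of the new tail of the loop: a send is deferred by up to SMC_TX_CORK_DELAY when the application hinted that more data follows (MSG_MORE, or a corked socket) and more than half of the sndbuf is still free; otherwise the RDMA write is triggered right away. The userspace hint, sketched:

/* batch two pieces so they can go out in one RDMA write */
send(fd, hdr, hdr_len, MSG_MORE);	/* may be deferred up to 250 ms */
send(fd, payload, payload_len, 0);	/* no hint: pushed out now */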
@@ -243,7 +269,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
rdma_wr.remote_addr =
lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
/* RMBE within RMB */
- ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) +
+ conn->tx_off +
/* offset within RMBE */
peer_rmbe_offset;
rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
@@ -268,7 +294,7 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn,
atomic_sub(len, &conn->peer_rmbe_space);
/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
smp_mb__after_atomic();
- smc_curs_add(conn->sndbuf_size, sent, len);
+ smc_curs_add(conn->sndbuf_desc->len, sent, len);
}
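smc_curs_add() advances a (wrap, count) cursor through the ring buffer modulo its size; the change above merely switches the modulus from the retired sndbuf_size field to sndbuf_desc->len. A minimal model of the arithmetic, with illustrative names rather than the kernel's exact types:

struct curs {
	unsigned int wrap;	/* times the cursor lapped the ring */
	unsigned int count;	/* byte offset within the ring */
};

static void curs_add(unsigned int size, struct curs *c, unsigned int len)
{
	c->count += len;
	if (c->count >= size) {		/* crossed the end of the ring */
		c->wrap++;
		c->count -= size;
	}
}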
/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
@@ -281,6 +307,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
union smc_host_cursor sent, prep, prod, cons;
struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
struct smc_link_group *lgr = conn->lgr;
+ struct smc_cdc_producer_flags *pflags;
int to_send, rmbespace;
struct smc_link *link;
dma_addr_t dma_addr;
@@ -291,7 +318,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
/* cf. wmem_alloc - (snd_max - snd_una) */
- to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+ to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
if (to_send <= 0)
return 0;
@@ -308,7 +335,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
conn);
/* if usable snd_wnd closes ask peer to advertise once it opens again */
- conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
+ pflags = &conn->local_tx_ctrl.prod_flags;
+ pflags->write_blocked = (to_send >= rmbespace);
/* cf. usable snd_wnd */
len = min(to_send, rmbespace);
@@ -333,12 +361,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
dst_len_sum = dst_len;
src_off = sent.count;
/* dst_len determines the maximum src_len */
- if (sent.count + dst_len <= conn->sndbuf_size) {
+ if (sent.count + dst_len <= conn->sndbuf_desc->len) {
/* unwrapped src case: single chunk of entire dst_len */
src_len = dst_len;
} else {
/* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
- src_len = conn->sndbuf_size - sent.count;
+ src_len = conn->sndbuf_desc->len - sent.count;
}
src_len_sum = src_len;
dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
@@ -350,8 +378,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
num_sges++;
src_off += src_len;
- if (src_off >= conn->sndbuf_size)
- src_off -= conn->sndbuf_size;
+ if (src_off >= conn->sndbuf_desc->len)
+ src_off -= conn->sndbuf_desc->len;
/* modulo in send ring */
if (src_len_sum == dst_len)
break; /* either on 1st or 2nd iteration */
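The chunk handling here is the usual two-segment split of a wrapped ring buffer, applied independently to the source sndbuf and the destination RMBE: a transfer of len bytes starting at offset off either fits contiguously or is cut at the ring's end. In isolation, as a sketch:

/* split (off, len) in a ring of 'size' bytes into <= 2 contiguous chunks */
unsigned int first = len < size - off ? len : size - off;
unsigned int second = len - first;	/* 0 in the unwrapped case */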
@@ -369,10 +397,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
dst_len = len - dst_len; /* remainder */
dst_len_sum += dst_len;
src_len = min_t(int,
- dst_len, conn->sndbuf_size - sent.count);
+ dst_len, conn->sndbuf_desc->len - sent.count);
src_len_sum = src_len;
}
+ if (conn->urg_tx_pend && len == to_send)
+ pflags->urg_data_present = 1;
smc_tx_advance_cursors(conn, &prod, &sent, len);
/* update connection's cursors with advanced local cursors */
smc_curs_write(&conn->local_tx_ctrl.prod,
@@ -392,6 +422,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
*/
int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
{
+ struct smc_cdc_producer_flags *pflags;
struct smc_cdc_tx_pend *pend;
struct smc_wr_buf *wr_buf;
int rc;
@@ -409,20 +440,27 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
}
rc = 0;
if (conn->alert_token_local) /* connection healthy */
- schedule_delayed_work(&conn->tx_work,
- SMC_TX_WORK_DELAY);
+ mod_delayed_work(system_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
}
goto out_unlock;
}
- rc = smc_tx_rdma_writes(conn);
- if (rc) {
- smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
- (struct smc_wr_tx_pend_priv *)pend);
- goto out_unlock;
+ if (!conn->local_tx_ctrl.prod_flags.urg_data_present) {
+ rc = smc_tx_rdma_writes(conn);
+ if (rc) {
+ smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+ (struct smc_wr_tx_pend_priv *)pend);
+ goto out_unlock;
+ }
}
rc = smc_cdc_msg_send(conn, wr_buf, pend);
+ pflags = &conn->local_tx_ctrl.prod_flags;
+ if (!rc && pflags->urg_data_present) {
+ pflags->urg_data_pending = 0;
+ pflags->urg_data_present = 0;
+ }
out_unlock:
spin_unlock_bh(&conn->send_lock);
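Note the switch from schedule_delayed_work() to mod_delayed_work() in the retry path above: schedule_delayed_work() is a no-op when the work is already pending, whereas mod_delayed_work() re-arms the timer, so the retry always fires within SMC_TX_WORK_DELAY of the most recent failure even if an expiry was already queued. The contrast, sketched:

/* no-op if tx_work is already queued; keeps the old expiry */
schedule_delayed_work(&conn->tx_work, SMC_TX_WORK_DELAY);

/* pending or not, (re)arms the timer to fire SMC_TX_WORK_DELAY from now */
mod_delayed_work(system_wq, &conn->tx_work, SMC_TX_WORK_DELAY);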
@@ -432,7 +470,7 @@ out_unlock:
/* Wakeup sndbuf consumers from process context
* since there is more data to transmit
*/
-static void smc_tx_work(struct work_struct *work)
+void smc_tx_work(struct work_struct *work)
{
struct smc_connection *conn = container_of(to_delayed_work(work),
struct smc_connection,
@@ -455,7 +493,7 @@ out:
release_sock(&smc->sk);
}
-void smc_tx_consumer_update(struct smc_connection *conn)
+void smc_tx_consumer_update(struct smc_connection *conn, bool force)
{
union smc_host_cursor cfed, cons;
int to_confirm;
@@ -466,11 +504,12 @@ void smc_tx_consumer_update(struct smc_connection *conn)
smc_curs_write(&cfed,
smc_curs_read(&conn->rx_curs_confirmed, conn),
conn);
- to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
+ to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+ force ||
((to_confirm > conn->rmbe_update_limit) &&
- ((to_confirm > (conn->rmbe_size / 2)) ||
+ ((to_confirm > (conn->rmb_desc->len / 2)) ||
conn->local_rx_ctrl.prod_flags.write_blocked))) {
if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
conn->alert_token_local) { /* connection healthy */
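The update condition has grown dense; spelled out as a predicate (an illustrative paraphrase with renamed variables, not kernel code):

bool do_update =
	peer_requested_update ||		/* cons_curs_upd_req set  */
	force ||				/* e.g. urgent data cases */
	(to_confirm > rmbe_update_limit &&
	 (to_confirm > rmb_len / 2 ||		/* half the RMB was freed */
	  peer_write_blocked));			/* peer producer is stuck */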
@@ -494,6 +533,4 @@ void smc_tx_consumer_update(struct smc_connection *conn)
void smc_tx_init(struct smc_sock *smc)
{
smc->sk.sk_write_space = smc_tx_write_space;
- INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
- spin_lock_init(&smc->conn.send_lock);
}