diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-22 21:15:09 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-22 21:15:09 +0300 |
commit | 3051bf36c25d5153051704291782f8d44e744d36 (patch) | |
tree | 72dfc8a1d12675c6f2981d13102df954b678f11b /net/smc/smc_rx.c | |
parent | 1e74a2eb1f5cc7f2f2b5aa9c9eeecbcf352220a3 (diff) | |
parent | 005c3490e9db23738d91e02788606c0fe4734723 (diff) | |
download | linux-3051bf36c25d5153051704291782f8d44e744d36.tar.xz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
"Highlights:
1) Support TX_RING in AF_PACKET TPACKET_V3 mode, from Sowmini
Varadhan.
2) Simplify classifier state on sk_buff in order to shrink it a bit.
From Willem de Bruijn.
3) Introduce SIPHASH and it's usage for secure sequence numbers and
syncookies. From Jason A. Donenfeld.
4) Reduce CPU usage for ICMP replies we are going to limit or
suppress, from Jesper Dangaard Brouer.
5) Introduce Shared Memory Communications socket layer, from Ursula
Braun.
6) Add RACK loss detection and allow it to actually trigger fast
recovery instead of just assisting after other algorithms have
triggered it. From Yuchung Cheng.
7) Add xmit_more and BQL support to mvneta driver, from Simon Guinot.
8) skb_cow_data avoidance in esp4 and esp6, from Steffen Klassert.
9) Export MPLS packet stats via netlink, from Robert Shearman.
10) Significantly improve inet port bind conflict handling, especially
when an application is restarted and changes it's setting of
reuseport. From Josef Bacik.
11) Implement TX batching in vhost_net, from Jason Wang.
12) Extend the dummy device so that VF (virtual function) features,
such as configuration, can be more easily tested. From Phil
Sutter.
13) Avoid two atomic ops per page on x86 in bnx2x driver, from Eric
Dumazet.
14) Add new bpf MAP, implementing a longest prefix match trie. From
Daniel Mack.
15) Packet sample offloading support in mlxsw driver, from Yotam Gigi.
16) Add new aquantia driver, from David VomLehn.
17) Add bpf tracepoints, from Daniel Borkmann.
18) Add support for port mirroring to b53 and bcm_sf2 drivers, from
Florian Fainelli.
19) Remove custom busy polling in many drivers, it is done in the core
networking since 4.5 times. From Eric Dumazet.
20) Support XDP adjust_head in virtio_net, from John Fastabend.
21) Fix several major holes in neighbour entry confirmation, from
Julian Anastasov.
22) Add XDP support to bnxt_en driver, from Michael Chan.
23) VXLAN offloads for enic driver, from Govindarajulu Varadarajan.
24) Add IPVTAP driver (IP-VLAN based tap driver) from Sainath Grandhi.
25) Support GRO in IPSEC protocols, from Steffen Klassert"
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1764 commits)
Revert "ath10k: Search SMBIOS for OEM board file extension"
net: socket: fix recvmmsg not returning error from sock_error
bnxt_en: use eth_hw_addr_random()
bpf: fix unlocking of jited image when module ronx not set
arch: add ARCH_HAS_SET_MEMORY config
net: napi_watchdog() can use napi_schedule_irqoff()
tcp: Revert "tcp: tcp_probe: use spin_lock_bh()"
net/hsr: use eth_hw_addr_random()
net: mvpp2: enable building on 64-bit platforms
net: mvpp2: switch to build_skb() in the RX path
net: mvpp2: simplify MVPP2_PRS_RI_* definitions
net: mvpp2: fix indentation of MVPP2_EXT_GLOBAL_CTRL_DEFAULT
net: mvpp2: remove unused register definitions
net: mvpp2: simplify mvpp2_bm_bufs_add()
net: mvpp2: drop useless fields in mvpp2_bm_pool and related code
net: mvpp2: remove unused 'tx_skb' field of 'struct mvpp2_tx_queue'
net: mvpp2: release reference to txq_cpu[] entry after unmapping
net: mvpp2: handle too large value in mvpp2_rx_time_coal_set()
net: mvpp2: handle too large value handling in mvpp2_rx_pkts_coal_set()
net: mvpp2: remove useless arguments in mvpp2_rx_{pkts, time}_coal_set
...
Diffstat (limited to 'net/smc/smc_rx.c')
-rw-r--r-- | net/smc/smc_rx.c | 217 |
1 files changed, 217 insertions, 0 deletions
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c new file mode 100644 index 000000000000..5d1878732f46 --- /dev/null +++ b/net/smc/smc_rx.c @@ -0,0 +1,217 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage RMBE + * copy new RMBE data into user space + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/net.h> +#include <linux/rcupdate.h> +#include <net/sock.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_cdc.h" +#include "smc_tx.h" /* smc_tx_consumer_update() */ +#include "smc_rx.h" + +/* callback implementation for sk.sk_data_ready() + * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data(). + * indirectly called by smc_cdc_msg_recv_action(). + */ +static void smc_rx_data_ready(struct sock *sk) +{ + struct socket_wq *wq; + + /* derived from sock_def_readable() */ + /* called already in smc_listen_work() */ + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | + POLLRDNORM | POLLRDBAND); + if ((sk->sk_shutdown == SHUTDOWN_MASK) || + (sk->sk_state == SMC_CLOSED)) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + rcu_read_unlock(); +} + +/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted + * @smc smc socket + * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout + * Returns: + * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. + * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). + */ +static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + int rc; + + if (atomic_read(&conn->bytes_to_rcv)) + return 1; + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + add_wait_queue(sk_sleep(sk), &wait); + rc = sk_wait_event(sk, timeo, + sk->sk_err || + sk->sk_shutdown & RCV_SHUTDOWN || + sock_flag(sk, SOCK_DONE) || + atomic_read(&conn->bytes_to_rcv) || + smc_cdc_rxed_any_close_or_senddone(conn), + &wait); + remove_wait_queue(sk_sleep(sk), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + return rc; +} + +/* rcvbuf consumer: main API called by socket layer. + * called under sk lock. + */ +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, + int flags) +{ + size_t copylen, read_done = 0, read_remaining = len; + size_t chunk_len, chunk_off, chunk_len_sum; + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons; + int readable, chunk; + char *rcvbuf_base; + struct sock *sk; + long timeo; + int target; /* Read at least these many bytes */ + int rc; + + if (unlikely(flags & MSG_ERRQUEUE)) + return -EINVAL; /* future work for sk.sk_family == AF_SMC */ + if (flags & MSG_OOB) + return -EINVAL; /* future work */ + + sk = &smc->sk; + if (sk->sk_state == SMC_LISTEN) + return -ENOTCONN; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + msg->msg_namelen = 0; + /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ + rcvbuf_base = conn->rmb_desc->cpu_addr; + + do { /* while (read_remaining) */ + if (read_done >= target) + break; + + if (atomic_read(&conn->bytes_to_rcv)) + goto copy; + + if (read_done) { + if (sk->sk_err || + sk->sk_state == SMC_CLOSED || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current) || + smc_cdc_rxed_any_close_or_senddone(conn) || + conn->local_tx_ctrl.conn_state_flags. + peer_conn_abort) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + if (sk->sk_err) { + read_done = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN || + smc_cdc_rxed_any_close_or_senddone(conn) || + conn->local_tx_ctrl.conn_state_flags. + peer_conn_abort) + break; + if (sk->sk_state == SMC_CLOSED) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + read_done = -ENOTCONN; + break; + } + break; + } + if (signal_pending(current)) { + read_done = sock_intr_errno(timeo); + break; + } + } + + if (!atomic_read(&conn->bytes_to_rcv)) { + smc_rx_wait_data(smc, &timeo); + continue; + } + +copy: + /* initialize variables for 1st iteration of subsequent loop */ + /* could be just 1 byte, even after smc_rx_wait_data above */ + readable = atomic_read(&conn->bytes_to_rcv); + /* not more than what user space asked for */ + copylen = min_t(size_t, read_remaining, readable); + smc_curs_write(&cons, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + /* determine chunks where to read from rcvbuf */ + /* either unwrapped case, or 1st chunk of wrapped case */ + chunk_len = min_t(size_t, + copylen, conn->rmbe_size - cons.count); + chunk_len_sum = chunk_len; + chunk_off = cons.count; + for (chunk = 0; chunk < 2; chunk++) { + if (!(flags & MSG_TRUNC)) { + rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off, + chunk_len); + if (rc) { + if (!read_done) + read_done = -EFAULT; + goto out; + } + } + read_remaining -= chunk_len; + read_done += chunk_len; + + if (chunk_len_sum == copylen) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + chunk_len = copylen - chunk_len; /* remainder */ + chunk_len_sum += chunk_len; + chunk_off = 0; /* modulo offset in recv ring buffer */ + } + + /* update cursors */ + if (!(flags & MSG_PEEK)) { + smc_curs_add(conn->rmbe_size, &cons, copylen); + /* increased in recv tasklet smc_cdc_msg_rcv() */ + smp_mb__before_atomic(); + atomic_sub(copylen, &conn->bytes_to_rcv); + /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + smp_mb__after_atomic(); + smc_curs_write(&conn->local_tx_ctrl.cons, + smc_curs_read(&cons, conn), + conn); + /* send consumer cursor update if required */ + /* similar to advertising new TCP rcv_wnd if required */ + smc_tx_consumer_update(conn); + } + } while (read_remaining); +out: + return read_done; +} + +/* Initialize receive properties on connection establishment. NB: not __init! */ +void smc_rx_init(struct smc_sock *smc) +{ + smc->sk.sk_data_ready = smc_rx_data_ready; +} |