Diffstat (limited to 'net/rds')
-rw-r--r--   net/rds/Kconfig            |    7
-rw-r--r--   net/rds/Makefile           |    4
-rw-r--r--   net/rds/af_rds.c           |   26
-rw-r--r--   net/rds/ib.c               |   47
-rw-r--r--   net/rds/ib.h               |   37
-rw-r--r--   net/rds/ib_cm.c            |   59
-rw-r--r--   net/rds/ib_fmr.c           |  248
-rw-r--r--   net/rds/ib_frmr.c          |  376
-rw-r--r--   net/rds/ib_mr.h            |  148
-rw-r--r--   net/rds/ib_rdma.c          |  495
-rw-r--r--   net/rds/ib_send.c          |    6
-rw-r--r--   net/rds/ib_stats.c         |    2
-rw-r--r--   net/rds/iw.c               |  312
-rw-r--r--   net/rds/iw.h               |  398
-rw-r--r--   net/rds/iw_cm.c            |  769
-rw-r--r--   net/rds/iw_rdma.c          |  837
-rw-r--r--   net/rds/iw_recv.c          |  904
-rw-r--r--   net/rds/iw_ring.c          |  169
-rw-r--r--   net/rds/iw_send.c          |  981
-rw-r--r--   net/rds/iw_stats.c         |   95
-rw-r--r--   net/rds/iw_sysctl.c        |  123
-rw-r--r--   net/rds/rdma_transport.c   |   21
-rw-r--r--   net/rds/rdma_transport.h   |    5
-rw-r--r--   net/rds/rds.h              |    1
-rw-r--r--   net/rds/recv.c             |   20
-rw-r--r--   net/rds/tcp.c              |  146
26 files changed, 1193 insertions(+), 5043 deletions(-)
diff --git a/net/rds/Kconfig b/net/rds/Kconfig index f2c670ba7b9b..bffde4b46c5d 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -4,14 +4,13 @@ config RDS depends on INET ---help--- The RDS (Reliable Datagram Sockets) protocol provides reliable, - sequenced delivery of datagrams over Infiniband, iWARP, - or TCP. + sequenced delivery of datagrams over Infiniband or TCP. config RDS_RDMA - tristate "RDS over Infiniband and iWARP" + tristate "RDS over Infiniband" depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS ---help--- - Allow RDS to use Infiniband and iWARP as a transport. + Allow RDS to use Infiniband as a transport. This transport supports RDMA operations. config RDS_TCP diff --git a/net/rds/Makefile b/net/rds/Makefile index 56d3f6023ced..0e72bec1529f 100644 --- a/net/rds/Makefile +++ b/net/rds/Makefile @@ -6,9 +6,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ obj-$(CONFIG_RDS_RDMA) += rds_rdma.o rds_rdma-y := rdma_transport.o \ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ - ib_sysctl.o ib_rdma.o \ - iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ - iw_sysctl.o iw_rdma.o + ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o obj-$(CONFIG_RDS_TCP) += rds_tcp.o diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index b5476aebd68d..6beaeb1138f3 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -277,6 +277,27 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval, return rs->rs_transport ? 0 : -ENOPROTOOPT; } +static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, + int optlen) +{ + int val, valbool; + + if (optlen != sizeof(int)) + return -EFAULT; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + valbool = val ? 1 : 0; + + if (valbool) + sock_set_flag(sk, SOCK_RCVTSTAMP); + else + sock_reset_flag(sk, SOCK_RCVTSTAMP); + + return 0; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -312,6 +333,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, ret = rds_set_transport(rs, optval, optlen); release_sock(sock->sk); break; + case SO_TIMESTAMP: + lock_sock(sock->sk); + ret = rds_enable_recvtstamp(sock->sk, optval, optlen); + release_sock(sock->sk); + break; default: ret = -ENOPROTOOPT; } diff --git a/net/rds/ib.c b/net/rds/ib.c index 9481d55ff6cb..b5342fddaf98 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -42,15 +42,16 @@ #include "rds.h" #include "ib.h" +#include "ib_mr.h" -unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE; -unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE; +unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; +unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; -module_param(rds_ib_fmr_1m_pool_size, int, 0444); -MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA"); -module_param(rds_ib_fmr_8k_pool_size, int, 0444); -MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA"); +module_param(rds_ib_mr_1m_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA"); +module_param(rds_ib_mr_8k_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); @@ -139,14 +140,20 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_wrs = 
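
The af_rds.c hunk above wires SO_TIMESTAMP into RDS through the new rds_enable_recvtstamp() helper, so an RDS socket can request receive timestamps the same way a UDP socket does. A minimal userspace sketch, assuming a kernel with this patch applied; the bind address and port below are purely illustrative:

/* enable SO_TIMESTAMP on an RDS socket, then read the SCM_TIMESTAMP
 * control message on receive; error handling trimmed for brevity */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <netinet/in.h>

#ifndef PF_RDS
#define PF_RDS 21                      /* AF_RDS/PF_RDS on Linux */
#endif

int main(void)
{
    char data[1024];
    union {                            /* properly aligned cmsg buffer */
        char buf[CMSG_SPACE(sizeof(struct timeval))];
        struct cmsghdr align;
    } cbuf;
    struct sockaddr_in sin = { .sin_family = AF_INET };
    struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
    struct msghdr msg = {
        .msg_iov = &iov, .msg_iovlen = 1,
        .msg_control = cbuf.buf, .msg_controllen = sizeof(cbuf.buf),
    };
    struct cmsghdr *cm;
    int fd, on = 1;

    fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
    /* lands in rds_setsockopt() -> rds_enable_recvtstamp() */
    setsockopt(fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on));

    sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);   /* illustrative */
    sin.sin_port = htons(18634);                    /* illustrative */
    bind(fd, (struct sockaddr *)&sin, sizeof(sin));

    if (recvmsg(fd, &msg, 0) < 0)
        return 1;
    for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
        if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_TIMESTAMP) {
            struct timeval tv;

            memcpy(&tv, CMSG_DATA(cm), sizeof(tv));
            printf("rx at %ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
        }
    }
    return 0;
}
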
device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); + rds_ibdev->has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr); + rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; - rds_ibdev->max_1m_fmrs = device->attrs.max_mr ? + rds_ibdev->max_1m_mrs = device->attrs.max_mr ? min_t(unsigned int, (device->attrs.max_mr / 2), - rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; + rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size; - rds_ibdev->max_8k_fmrs = device->attrs.max_mr ? + rds_ibdev->max_8k_mrs = device->attrs.max_mr ? min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), - rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; + rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size; rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; @@ -172,10 +179,14 @@ static void rds_ib_add_one(struct ib_device *device) goto put_dev; } - rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", + rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, - rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, - rds_ibdev->max_8k_fmrs); + rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs, + rds_ibdev->max_8k_mrs); + + pr_info("RDS/IB: %s: %s supported and preferred\n", + device->name, + rds_ibdev->use_fastreg ? "FRMR" : "FMR"); INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); @@ -364,7 +375,7 @@ void rds_ib_exit(void) rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); - rds_ib_fmr_exit(); + rds_ib_mr_exit(); } struct rds_transport rds_ib_transport = { @@ -400,13 +411,13 @@ int rds_ib_init(void) INIT_LIST_HEAD(&rds_ib_devices); - ret = rds_ib_fmr_init(); + ret = rds_ib_mr_init(); if (ret) goto out; ret = ib_register_client(&rds_ib_client); if (ret) - goto out_fmr_exit; + goto out_mr_exit; ret = rds_ib_sysctl_init(); if (ret) @@ -430,8 +441,8 @@ out_sysctl: rds_ib_sysctl_exit(); out_ibreg: rds_ib_unregister_client(); -out_fmr_exit: - rds_ib_fmr_exit(); +out_mr_exit: + rds_ib_mr_exit(); out: return ret; } diff --git a/net/rds/ib.h b/net/rds/ib.h index b3fdebb57460..627fb79aee65 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -9,17 +9,12 @@ #include "rds.h" #include "rdma_transport.h" -#define RDS_FMR_1M_POOL_SIZE (8192 / 2) -#define RDS_FMR_1M_MSG_SIZE 256 -#define RDS_FMR_8K_MSG_SIZE 2 -#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1)) -#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) - #define RDS_IB_MAX_SGE 8 #define RDS_IB_RECV_SGE 2 #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 +#define RDS_IB_DEFAULT_FR_WR 512 #define RDS_IB_DEFAULT_RETRY_COUNT 2 @@ -28,7 +23,6 @@ #define RDS_IB_RECYCLE_BATCH_COUNT 32 #define RDS_IB_WC_MAX 32 -#define RDS_IB_SEND_OP BIT_ULL(63) extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; @@ -129,6 +123,9 @@ struct rds_ib_connection { struct ib_wc i_send_wc[RDS_IB_WC_MAX]; struct ib_wc i_recv_wc[RDS_IB_WC_MAX]; + /* To control the number of wrs from fastreg */ + atomic_t i_fastreg_wrs; + /* interrupt handling */ struct 
tasklet_struct i_send_tasklet; struct tasklet_struct i_recv_tasklet; @@ -207,12 +204,16 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - unsigned int max_fmrs; + bool has_fmr; + bool has_fr; + bool use_fastreg; + + unsigned int max_mrs; struct rds_ib_mr_pool *mr_1m_pool; struct rds_ib_mr_pool *mr_8k_pool; unsigned int fmr_max_remaps; - unsigned int max_8k_fmrs; - unsigned int max_1m_fmrs; + unsigned int max_8k_mrs; + unsigned int max_1m_mrs; int max_sge; unsigned int max_wrs; unsigned int max_initiator_depth; @@ -266,6 +267,8 @@ struct rds_ib_statistics { uint64_t s_ib_rdma_mr_1m_pool_flush; uint64_t s_ib_rdma_mr_1m_pool_wait; uint64_t s_ib_rdma_mr_1m_pool_depleted; + uint64_t s_ib_rdma_mr_8k_reused; + uint64_t s_ib_rdma_mr_1m_reused; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; }; @@ -317,8 +320,6 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; -extern unsigned int rds_ib_fmr_1m_pool_size; -extern unsigned int rds_ib_fmr_8k_pool_size; extern unsigned int rds_ib_retry_count; extern spinlock_t ib_nodev_conns_lock; @@ -348,17 +349,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_destroy_nodev_conns(void); -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, - int npages); -void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); -void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); -void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, - struct rds_sock *rs, u32 *key_ret); -void rds_ib_sync_mr(void *trans_private, int dir); -void rds_ib_free_mr(void *trans_private, int invalidate); -void rds_ib_flush_mrs(void); -int rds_ib_fmr_init(void); -void rds_ib_fmr_exit(void); +void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc); /* ib_recv.c */ int rds_ib_recv_init(void); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index da5a7fb98c77..8764970f0c24 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -236,12 +236,10 @@ static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context) tasklet_schedule(&ic->i_recv_tasklet); } -static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq, - struct ib_wc *wcs, - struct rds_ib_ack_state *ack_state) +static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs) { - int nr; - int i; + int nr, i; struct ib_wc *wc; while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { @@ -251,10 +249,12 @@ static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq, (unsigned long long)wc->wr_id, wc->status, wc->byte_len, be32_to_cpu(wc->ex.imm_data)); - if (wc->wr_id & RDS_IB_SEND_OP) + if (wc->wr_id <= ic->i_send_ring.w_nr || + wc->wr_id == RDS_IB_ACK_WR_ID) rds_ib_send_cqe_handler(ic, wc); else - rds_ib_recv_cqe_handler(ic, wc, ack_state); + rds_ib_mr_cqe_handler(ic, wc); + } } } @@ -263,14 +263,12 @@ static void rds_ib_tasklet_fn_send(unsigned long data) { struct rds_ib_connection *ic = (struct rds_ib_connection *)data; struct rds_connection *conn = ic->conn; - struct rds_ib_ack_state state; rds_ib_stats_inc(s_ib_tasklet_call); - memset(&state, 0, sizeof(state)); - poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + poll_scq(ic, 
ic->i_send_cq, ic->i_send_wc); ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); - poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state); + poll_scq(ic, ic->i_send_cq, ic->i_send_wc); if (rds_conn_up(conn) && (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || @@ -278,6 +276,25 @@ static void rds_ib_tasklet_fn_send(unsigned long data) rds_send_xmit(ic->conn); } +static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs, + struct rds_ib_ack_state *ack_state) +{ + int nr, i; + struct ib_wc *wc; + + while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) { + for (i = 0; i < nr; i++) { + wc = wcs + i; + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, + wc->byte_len, be32_to_cpu(wc->ex.imm_data)); + + rds_ib_recv_cqe_handler(ic, wc, ack_state); + } + } +} + static void rds_ib_tasklet_fn_recv(unsigned long data) { struct rds_ib_connection *ic = (struct rds_ib_connection *)data; @@ -291,9 +308,9 @@ static void rds_ib_tasklet_fn_recv(unsigned long data) rds_ib_stats_inc(s_ib_tasklet_call); memset(&state, 0, sizeof(state)); - poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); + poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); if (state.ack_next_valid) rds_ib_set_ack(ic, state.ack_next, state.ack_required); @@ -351,7 +368,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct ib_qp_init_attr attr; struct ib_cq_init_attr cq_attr = {}; struct rds_ib_device *rds_ibdev; - int ret; + int ret, fr_queue_space; /* * It's normal to see a null device if an incoming connection races @@ -361,6 +378,12 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!rds_ibdev) return -EOPNOTSUPP; + /* The fr_queue_space is currently set to 512, to add extra space on + * completion queue and send queue. This extra space is used for FRMR + * registration and invalidation work requests + */ + fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0); + /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); @@ -372,7 +395,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; - cq_attr.cqe = ic->i_send_ring.w_nr + 1; + cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1; ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, rds_ib_cq_event_handler, conn, @@ -412,7 +435,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.event_handler = rds_ib_qp_event_handler; attr.qp_context = conn; /* + 1 to allow for the single ack message */ - attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; + attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1; attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; attr.cap.max_send_sge = rds_ibdev->max_sge; attr.cap.max_recv_sge = RDS_IB_RECV_SGE; @@ -420,6 +443,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.qp_type = IB_QPT_RC; attr.send_cq = ic->i_send_cq; attr.recv_cq = ic->i_recv_cq; + atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); /* * XXX this can fail if max_*_wr is too large? 
Are we supposed @@ -739,7 +763,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) */ wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && - (atomic_read(&ic->i_signaled_sends) == 0)); + (atomic_read(&ic->i_signaled_sends) == 0) && + (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR)); tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c new file mode 100644 index 000000000000..4fe8f4fec4ee --- /dev/null +++ b/net/rds/ib_fmr.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
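
One note on the ib_cm.c changes above before the new ib_fmr.c starts: the FRMR work-request budget has to agree in three places, and the patch keeps them in sync by construction. Restated as a standalone kernel-style sketch (the helper name is invented):

/* sketch: the three places RDS_IB_DEFAULT_FR_WR (512) shows up */
static void size_frmr_queues(struct rds_ib_connection *ic,
                             struct rds_ib_device *rds_ibdev,
                             struct ib_cq_init_attr *cq_attr,
                             struct ib_qp_init_attr *attr)
{
    /* extra CQ/QP slots reserved for REG_MR and LOCAL_INV WRs */
    int fr_queue_space = rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0;

    /* + 1 allows for the single ack message */
    cq_attr->cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
    attr->cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;

    /* every REG_MR/LOCAL_INV post takes one credit and its CQE returns
     * it; rds_ib_conn_shutdown() waits for the count to drain back */
    atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
}
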
+ */ + +#include "ib_mr.h" + +struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages) +{ + struct rds_ib_mr_pool *pool; + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_fmr *fmr; + int err = 0; + + if (npages <= RDS_MR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + + ibmr = rds_ib_try_reuse_ibmr(pool); + if (ibmr) + return ibmr; + + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, + rdsibdev_to_node(rds_ibdev)); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + fmr = &ibmr->u.fmr; + fmr->fmr = ib_alloc_fmr(rds_ibdev->pd, + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC), + &pool->fmr_attr); + if (IS_ERR(fmr->fmr)) { + err = PTR_ERR(fmr->fmr); + fmr->fmr = NULL; + pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err); + goto out_no_cigar; + } + + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + + return ibmr; + +out_no_cigar: + if (ibmr) { + if (fmr->fmr) + ib_dealloc_fmr(fmr->fmr); + kfree(ibmr); + } + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int nents) +{ + struct ib_device *dev = rds_ibdev->dev; + struct rds_ib_fmr *fmr = &ibmr->u.fmr; + struct scatterlist *scat = sg; + u64 io_addr = 0; + u64 *dma_pages; + u32 len; + int page_cnt, sg_dma_len; + int i, j; + int ret; + + sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL); + if (unlikely(!sg_dma_len)) { + pr_warn("RDS/IB: %s failed!\n", __func__); + return -EBUSY; + } + + len = 0; + page_cnt = 0; + + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + if (dma_addr & ~PAGE_MASK) { + if (i > 0) + return -EINVAL; + else + ++page_cnt; + } + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < sg_dma_len - 1) + return -EINVAL; + else + ++page_cnt; + } + + len += dma_len; + } + + page_cnt += len >> PAGE_SHIFT; + if (page_cnt > ibmr->pool->fmr_attr.max_pages) + return -EINVAL; + + dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); + if (!dma_pages) + return -ENOMEM; + + page_cnt = 0; + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); + u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + + for (j = 0; j < dma_len; j += PAGE_SIZE) + dma_pages[page_cnt++] = + (dma_addr & PAGE_MASK) + j; + } + + ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr); + if (ret) + goto out; + + /* Success - we successfully remapped the MR, so we can + * safely tear down the old mapping. 
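
The mapping loop in rds_ib_map_fmr() above enforces an invariant that is easy to miss in the flattened hunk: the scatterlist must describe one contiguous run of whole pages, so only the first DMA segment may start off a page boundary and only the last may end off one. The same check as a self-contained sketch (kernel context assumed; the helper name is invented):

/* count the pages an FMR mapping would need, or -EINVAL if the
 * scatterlist cannot be expressed as a flat page list */
static int rds_count_fmr_pages(struct ib_device *dev,
                               struct scatterlist *scat, int sg_dma_len)
{
    int i, page_cnt = 0;
    u32 len = 0;

    for (i = 0; i < sg_dma_len; ++i) {
        unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
        u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);

        if (dma_addr & ~PAGE_MASK) {             /* unaligned start */
            if (i > 0)
                return -EINVAL;
            ++page_cnt;
        }
        if ((dma_addr + dma_len) & ~PAGE_MASK) { /* unaligned end */
            if (i < sg_dma_len - 1)
                return -EINVAL;
            ++page_cnt;
        }
        len += dma_len;
    }
    return page_cnt + (len >> PAGE_SHIFT);
}
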
+ */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = scat; + ibmr->sg_len = nents; + ibmr->sg_dma_len = sg_dma_len; + ibmr->remap_count++; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); + ret = 0; + +out: + kfree(dma_pages); + + return ret; +} + +struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev, + struct scatterlist *sg, + unsigned long nents, + u32 *key) +{ + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_fmr *fmr; + int ret; + + ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); + if (IS_ERR(ibmr)) + return ibmr; + + ibmr->device = rds_ibdev; + fmr = &ibmr->u.fmr; + ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); + if (ret == 0) + *key = fmr->fmr->rkey; + else + rds_ib_free_mr(ibmr, 0); + + return ibmr; +} + +void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal) +{ + struct rds_ib_mr *ibmr, *next; + struct rds_ib_fmr *fmr; + LIST_HEAD(fmr_list); + int ret = 0; + unsigned int freed = *nfreed; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, list, unmap_list) { + fmr = &ibmr->u.fmr; + list_add(&fmr->fmr->list, &fmr_list); + } + + ret = ib_unmap_fmr(&fmr_list); + if (ret) + pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, list, unmap_list) { + fmr = &ibmr->u.fmr; + *unpinned += ibmr->sg_len; + __rds_ib_teardown_mr(ibmr); + if (freed < goal || + ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) { + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); + list_del(&ibmr->unmap_list); + ib_dealloc_fmr(fmr->fmr); + kfree(ibmr); + freed++; + } + } + *nfreed = freed; +} + +void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + + if (ibmr->remap_count >= pool->fmr_attr.max_maps) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); +} diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c new file mode 100644 index 000000000000..93ff038ea9d1 --- /dev/null +++ b/net/rds/ib_frmr.c @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ib_mr.h" + +static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, + int npages) +{ + struct rds_ib_mr_pool *pool; + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_frmr *frmr; + int err = 0; + + if (npages <= RDS_MR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + + ibmr = rds_ib_try_reuse_ibmr(pool); + if (ibmr) + return ibmr; + + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, + rdsibdev_to_node(rds_ibdev)); + if (!ibmr) { + err = -ENOMEM; + goto out_no_cigar; + } + + frmr = &ibmr->u.frmr; + frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG, + pool->fmr_attr.max_pages); + if (IS_ERR(frmr->mr)) { + pr_warn("RDS/IB: %s failed to allocate MR", __func__); + goto out_no_cigar; + } + + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + + if (atomic_read(&pool->item_count) > pool->max_items_soft) + pool->max_items_soft = pool->max_items; + + frmr->fr_state = FRMR_IS_FREE; + return ibmr; + +out_no_cigar: + kfree(ibmr); + atomic_dec(&pool->item_count); + return ERR_PTR(err); +} + +static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + + if (drop) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); + atomic_add(ibmr->sg_len, &pool->free_pinned); + atomic_inc(&pool->dirty_count); + + /* If we've pinned too many pages, request a flush */ + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || + atomic_read(&pool->dirty_count) >= pool->max_items / 5) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); +} + +static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) +{ + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + struct ib_send_wr *failed_wr; + struct ib_reg_wr reg_wr; + int ret; + + while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastreg_wrs); + cpu_relax(); + } + + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE); + if (unlikely(ret != ibmr->sg_len)) + return ret < 0 ? ret : -EINVAL; + + /* Perform a WR for the fast_reg_mr. Each individual page + * in the sg list is added to the fast reg page list and placed + * inside the fast_reg_mr WR. The key used is a rolling 8bit + * counter, which should guarantee uniqueness. 
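
rds_ib_post_reg_frmr() above (and rds_ib_post_inv() below) both open with the same credit-taking idiom against ic->i_fastreg_wrs. As an isolated sketch with an invented helper name:

/* claim one of the RDS_IB_DEFAULT_FR_WR reserved send-queue slots;
 * if none are free, back off and spin until a completion in
 * rds_ib_mr_cqe_handler() returns one */
static void rds_take_fastreg_credit(atomic_t *credits)
{
    while (atomic_dec_return(credits) <= 0) {
        atomic_inc(credits);    /* undo the failed take */
        cpu_relax();
    }
}

Both post paths must hand the credit back themselves when ib_post_send() fails, which is why each error branch does an atomic_inc() alongside marking the MR stale.
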
+ */ + ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++); + frmr->fr_state = FRMR_IS_INUSE; + + memset(®_wr, 0, sizeof(reg_wr)); + reg_wr.wr.wr_id = (unsigned long)(void *)ibmr; + reg_wr.wr.opcode = IB_WR_REG_MR; + reg_wr.wr.num_sge = 0; + reg_wr.mr = frmr->mr; + reg_wr.key = frmr->mr->rkey; + reg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + reg_wr.wr.send_flags = IB_SEND_SIGNALED; + + failed_wr = ®_wr.wr; + ret = ib_post_send(ibmr->ic->i_cm_id->qp, ®_wr.wr, &failed_wr); + WARN_ON(failed_wr != ®_wr.wr); + if (unlikely(ret)) { + /* Failure here can be because of -ENOMEM as well */ + frmr->fr_state = FRMR_IS_STALE; + atomic_inc(&ibmr->ic->i_fastreg_wrs); + if (printk_ratelimit()) + pr_warn("RDS/IB: %s returned error(%d)\n", + __func__, ret); + } + return ret; +} + +static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_mr_pool *pool, + struct rds_ib_mr *ibmr, + struct scatterlist *sg, unsigned int sg_len) +{ + struct ib_device *dev = rds_ibdev->dev; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + int i; + u32 len; + int ret = 0; + + /* We want to teardown old ibmr values here and fill it up with + * new sg values + */ + rds_ib_teardown_mr(ibmr); + + ibmr->sg = sg; + ibmr->sg_len = sg_len; + ibmr->sg_dma_len = 0; + frmr->sg_byte_len = 0; + WARN_ON(ibmr->sg_dma_len); + ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + if (unlikely(!ibmr->sg_dma_len)) { + pr_warn("RDS/IB: %s failed!\n", __func__); + return -EBUSY; + } + + frmr->sg_byte_len = 0; + frmr->dma_npages = 0; + len = 0; + + ret = -EINVAL; + for (i = 0; i < ibmr->sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]); + u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]); + + frmr->sg_byte_len += dma_len; + if (dma_addr & ~PAGE_MASK) { + if (i > 0) + goto out_unmap; + else + ++frmr->dma_npages; + } + + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < ibmr->sg_dma_len - 1) + goto out_unmap; + else + ++frmr->dma_npages; + } + + len += dma_len; + } + frmr->dma_npages += len >> PAGE_SHIFT; + + if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) { + ret = -EMSGSIZE; + goto out_unmap; + } + + ret = rds_ib_post_reg_frmr(ibmr); + if (ret) + goto out_unmap; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); + + return ret; + +out_unmap: + ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len, + DMA_BIDIRECTIONAL); + ibmr->sg_dma_len = 0; + return ret; +} + +static int rds_ib_post_inv(struct rds_ib_mr *ibmr) +{ + struct ib_send_wr *s_wr, *failed_wr; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id; + int ret = -EINVAL; + + if (!i_cm_id || !i_cm_id->qp || !frmr->mr) + goto out; + + if (frmr->fr_state != FRMR_IS_INUSE) + goto out; + + while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastreg_wrs); + cpu_relax(); + } + + frmr->fr_inv = true; + s_wr = &frmr->fr_wr; + + memset(s_wr, 0, sizeof(*s_wr)); + s_wr->wr_id = (unsigned long)(void *)ibmr; + s_wr->opcode = IB_WR_LOCAL_INV; + s_wr->ex.invalidate_rkey = frmr->mr->rkey; + s_wr->send_flags = IB_SEND_SIGNALED; + + failed_wr = s_wr; + ret = ib_post_send(i_cm_id->qp, s_wr, &failed_wr); + WARN_ON(failed_wr != s_wr); + if (unlikely(ret)) { + frmr->fr_state = FRMR_IS_STALE; + frmr->fr_inv = false; + atomic_inc(&ibmr->ic->i_fastreg_wrs); + pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); + goto out; 
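
For readability, the LOCAL_INV work request that rds_ib_post_inv() builds above can be summarized on its own; the wr_id carries the MR pointer so the send-CQ demux can route the completion back to rds_ib_mr_cqe_handler() (sketch; helper name invented):

static void init_local_inv_wr(struct ib_send_wr *s_wr,
                              struct rds_ib_mr *ibmr, u32 rkey)
{
    memset(s_wr, 0, sizeof(*s_wr));
    s_wr->wr_id = (unsigned long)(void *)ibmr;  /* routed to MR handler */
    s_wr->opcode = IB_WR_LOCAL_INV;
    s_wr->ex.invalidate_rkey = rkey;
    s_wr->send_flags = IB_SEND_SIGNALED;        /* we need the CQE */
}
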
+ } +out: + return ret; +} + +void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) +{ + struct rds_ib_mr *ibmr = (void *)(unsigned long)wc->wr_id; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + + if (wc->status != IB_WC_SUCCESS) { + frmr->fr_state = FRMR_IS_STALE; + if (rds_conn_up(ic->conn)) + rds_ib_conn_error(ic->conn, + "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n", + &ic->conn->c_laddr, + &ic->conn->c_faddr, + wc->status, + ib_wc_status_msg(wc->status), + wc->vendor_err); + } + + if (frmr->fr_inv) { + frmr->fr_state = FRMR_IS_FREE; + frmr->fr_inv = false; + } + + atomic_inc(&ic->i_fastreg_wrs); +} + +void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal) +{ + struct rds_ib_mr *ibmr, *next; + struct rds_ib_frmr *frmr; + int ret = 0; + unsigned int freed = *nfreed; + + /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ + list_for_each_entry(ibmr, list, unmap_list) { + if (ibmr->sg_dma_len) + ret |= rds_ib_post_inv(ibmr); + } + if (ret) + pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret); + + /* Now we can destroy the DMA mapping and unpin any pages */ + list_for_each_entry_safe(ibmr, next, list, unmap_list) { + *unpinned += ibmr->sg_len; + frmr = &ibmr->u.frmr; + __rds_ib_teardown_mr(ibmr); + if (freed < goal || frmr->fr_state == FRMR_IS_STALE) { + /* Don't de-allocate if the MR is not free yet */ + if (frmr->fr_state == FRMR_IS_INUSE) + continue; + + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); + list_del(&ibmr->unmap_list); + if (frmr->mr) + ib_dereg_mr(frmr->mr); + kfree(ibmr); + freed++; + } + } + *nfreed = freed; +} + +struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_connection *ic, + struct scatterlist *sg, + unsigned long nents, u32 *key) +{ + struct rds_ib_mr *ibmr = NULL; + struct rds_ib_frmr *frmr; + int ret; + + do { + if (ibmr) + rds_ib_free_frmr(ibmr, true); + ibmr = rds_ib_alloc_frmr(rds_ibdev, nents); + if (IS_ERR(ibmr)) + return ibmr; + frmr = &ibmr->u.frmr; + } while (frmr->fr_state != FRMR_IS_FREE); + + ibmr->ic = ic; + ibmr->device = rds_ibdev; + ret = rds_ib_map_frmr(rds_ibdev, ibmr->pool, ibmr, sg, nents); + if (ret == 0) { + *key = frmr->mr->rkey; + } else { + rds_ib_free_frmr(ibmr, false); + ibmr = ERR_PTR(ret); + } + + return ibmr; +} + +void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr) +{ + struct rds_ib_mr_pool *pool = ibmr->pool; + struct rds_ib_frmr *frmr = &ibmr->u.frmr; + + if (frmr->fr_state == FRMR_IS_STALE) + llist_add(&ibmr->llnode, &pool->drop_list); + else + llist_add(&ibmr->llnode, &pool->free_list); +} diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h new file mode 100644 index 000000000000..1c754f4acbe5 --- /dev/null +++ b/net/rds/ib_mr.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
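
The completion handler above closes the FRMR state machine: a REG_MR post moves an MR to FRMR_IS_INUSE, a LOCAL_INV completion (fr_inv set) moves it back to FRMR_IS_FREE, and any failed work completion pins it at FRMR_IS_STALE so it is dropped instead of reused. The reuse rule that rds_ib_reg_frmr()'s do/while loop enforces reduces to one predicate (sketch; the enum is defined in ib_mr.h just below):

static bool frmr_reusable(const struct rds_ib_frmr *frmr)
{
    /* INUSE must wait for its LOCAL_INV to complete;
     * STALE goes to the pool's drop_list and is never handed out */
    return frmr->fr_state == FRMR_IS_FREE;
}
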
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _RDS_IB_MR_H +#define _RDS_IB_MR_H + +#include <linux/kernel.h> + +#include "rds.h" +#include "ib.h" + +#define RDS_MR_1M_POOL_SIZE (8192 / 2) +#define RDS_MR_1M_MSG_SIZE 256 +#define RDS_MR_8K_MSG_SIZE 2 +#define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1)) +#define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) + +struct rds_ib_fmr { + struct ib_fmr *fmr; + u64 *dma; +}; + +enum rds_ib_fr_state { + FRMR_IS_FREE, /* mr invalidated & ready for use */ + FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */ + FRMR_IS_STALE, /* Stale MR and needs to be dropped */ +}; + +struct rds_ib_frmr { + struct ib_mr *mr; + enum rds_ib_fr_state fr_state; + bool fr_inv; + struct ib_send_wr fr_wr; + unsigned int dma_npages; + unsigned int sg_byte_len; +}; + +/* This is stored as mr->r_trans_private. 
*/ +struct rds_ib_mr { + struct rds_ib_device *device; + struct rds_ib_mr_pool *pool; + struct rds_ib_connection *ic; + + struct llist_node llnode; + + /* unmap_list is for freeing */ + struct list_head unmap_list; + unsigned int remap_count; + + struct scatterlist *sg; + unsigned int sg_len; + int sg_dma_len; + + union { + struct rds_ib_fmr fmr; + struct rds_ib_frmr frmr; + } u; +}; + +/* Our own little MR pool */ +struct rds_ib_mr_pool { + unsigned int pool_type; + struct mutex flush_lock; /* serialize fmr invalidate */ + struct delayed_work flush_worker; /* flush worker */ + + atomic_t item_count; /* total # of MRs */ + atomic_t dirty_count; /* # dirty of MRs */ + + struct llist_head drop_list; /* MRs not reached max_maps */ + struct llist_head free_list; /* unused MRs */ + struct llist_head clean_list; /* unused & unmapped MRs */ + wait_queue_head_t flush_wait; + + atomic_t free_pinned; /* memory pinned by free MRs */ + unsigned long max_items; + unsigned long max_items_soft; + unsigned long max_free_pinned; + struct ib_fmr_attr fmr_attr; + bool use_fastreg; +}; + +extern struct workqueue_struct *rds_ib_mr_wq; +extern unsigned int rds_ib_mr_1m_pool_size; +extern unsigned int rds_ib_mr_8k_pool_size; +extern bool prefer_frmr; + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, + int npages); +void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, + struct rds_info_rdma_connection *iinfo); +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); +void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, + struct rds_sock *rs, u32 *key_ret); +void rds_ib_sync_mr(void *trans_private, int dir); +void rds_ib_free_mr(void *trans_private, int invalidate); +void rds_ib_flush_mrs(void); +int rds_ib_mr_init(void); +void rds_ib_mr_exit(void); + +void __rds_ib_teardown_mr(struct rds_ib_mr *); +void rds_ib_teardown_mr(struct rds_ib_mr *); +struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int); +int rds_ib_map_fmr(struct rds_ib_device *, struct rds_ib_mr *, + struct scatterlist *, unsigned int); +struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *); +int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **); +struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *, + unsigned long, u32 *); +struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *); +void rds_ib_unreg_fmr(struct list_head *, unsigned int *, + unsigned long *, unsigned int); +void rds_ib_free_fmr_list(struct rds_ib_mr *); +struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_connection *ic, + struct scatterlist *sg, + unsigned long nents, u32 *key); +void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, + unsigned long *unpinned, unsigned int goal); +void rds_ib_free_frmr_list(struct rds_ib_mr *); +#endif diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a2340748ec86..f7164ac1ffc1 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -35,78 +35,13 @@ #include <linux/rculist.h> #include <linux/llist.h> -#include "rds.h" -#include "ib.h" +#include "ib_mr.h" + +struct workqueue_struct *rds_ib_mr_wq; static DEFINE_PER_CPU(unsigned long, clean_list_grace); #define CLEAN_LIST_BUSY_BIT 0 -/* - * This is stored as mr->r_trans_private. 
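
With rds_ib_fmr and rds_ib_frmr now behind a union in struct rds_ib_mr, the pool's three lock-free lists carry the whole MR lifecycle. A sketch of the FMR retirement decision (mirroring rds_ib_free_fmr_list() above), with the rough flow in the comment:

/* lifecycle, roughly:
 *   alloc/map                      -> in use by a socket
 *   rds_ib_free_mr(): remaps left  -> free_list (mapping kept for reuse)
 *                     out of maps  -> drop_list (to be unmapped/freed)
 *   flush worker:     invalidate   -> clean_list (or destroyed)
 *   rds_ib_reuse_mr(): pop clean_list -> in use again (counts *_reused)
 */
static struct llist_head *fmr_retire_list(struct rds_ib_mr_pool *pool,
                                          struct rds_ib_mr *ibmr)
{
    return ibmr->remap_count >= pool->fmr_attr.max_maps ?
           &pool->drop_list : &pool->free_list;
}
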
- */ -struct rds_ib_mr { - struct rds_ib_device *device; - struct rds_ib_mr_pool *pool; - struct ib_fmr *fmr; - - struct llist_node llnode; - - /* unmap_list is for freeing */ - struct list_head unmap_list; - unsigned int remap_count; - - struct scatterlist *sg; - unsigned int sg_len; - u64 *dma; - int sg_dma_len; -}; - -/* - * Our own little FMR pool - */ -struct rds_ib_mr_pool { - unsigned int pool_type; - struct mutex flush_lock; /* serialize fmr invalidate */ - struct delayed_work flush_worker; /* flush worker */ - - atomic_t item_count; /* total # of MRs */ - atomic_t dirty_count; /* # dirty of MRs */ - - struct llist_head drop_list; /* MRs that have reached their max_maps limit */ - struct llist_head free_list; /* unused MRs */ - struct llist_head clean_list; /* global unused & unamapped MRs */ - wait_queue_head_t flush_wait; - - atomic_t free_pinned; /* memory pinned by free MRs */ - unsigned long max_items; - unsigned long max_items_soft; - unsigned long max_free_pinned; - struct ib_fmr_attr fmr_attr; -}; - -static struct workqueue_struct *rds_ib_fmr_wq; - -int rds_ib_fmr_init(void) -{ - rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); - if (!rds_ib_fmr_wq) - return -ENOMEM; - return 0; -} - -/* By the time this is called all the IB devices should have been torn down and - * had their pools freed. As each pool is freed its work struct is waited on, - * so the pool flushing work queue should be idle by the time we get here. - */ -void rds_ib_fmr_exit(void) -{ - destroy_workqueue(rds_ib_fmr_wq); -} - -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); -static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); -static void rds_ib_mr_pool_flush_worker(struct work_struct *work); - static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { struct rds_ib_device *rds_ibdev; @@ -235,41 +170,6 @@ void rds_ib_destroy_nodev_conns(void) rds_conn_destroy(ic->conn); } -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, - int pool_type) -{ - struct rds_ib_mr_pool *pool; - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) - return ERR_PTR(-ENOMEM); - - pool->pool_type = pool_type; - init_llist_head(&pool->free_list); - init_llist_head(&pool->drop_list); - init_llist_head(&pool->clean_list); - mutex_init(&pool->flush_lock); - init_waitqueue_head(&pool->flush_wait); - INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); - - if (pool_type == RDS_IB_MR_1M_POOL) { - /* +1 allows for unaligned MRs */ - pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1; - pool->max_items = RDS_FMR_1M_POOL_SIZE; - } else { - /* pool_type == RDS_IB_MR_8K_POOL */ - pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1; - pool->max_items = RDS_FMR_8K_POOL_SIZE; - } - - pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; - pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; - pool->fmr_attr.page_shift = PAGE_SHIFT; - pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; - - return pool; -} - void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; @@ -278,16 +178,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } -void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) -{ - cancel_delayed_work_sync(&pool->flush_worker); - rds_ib_flush_mr_pool(pool, 1, NULL); - WARN_ON(atomic_read(&pool->item_count)); - 
WARN_ON(atomic_read(&pool->free_pinned)); - kfree(pool); -} - -static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) +struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; struct llist_node *ret; @@ -297,8 +188,13 @@ static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) flag = this_cpu_ptr(&clean_list_grace); set_bit(CLEAN_LIST_BUSY_BIT, flag); ret = llist_del_first(&pool->clean_list); - if (ret) + if (ret) { ibmr = llist_entry(ret, struct rds_ib_mr, llnode); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_reused); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); + } clear_bit(CLEAN_LIST_BUSY_BIT, flag); preempt_enable(); @@ -317,190 +213,6 @@ static inline void wait_clean_list_grace(void) } } -static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, - int npages) -{ - struct rds_ib_mr_pool *pool; - struct rds_ib_mr *ibmr = NULL; - int err = 0, iter = 0; - - if (npages <= RDS_FMR_8K_MSG_SIZE) - pool = rds_ibdev->mr_8k_pool; - else - pool = rds_ibdev->mr_1m_pool; - - if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); - - /* Switch pools if one of the pool is reaching upper limit */ - if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { - if (pool->pool_type == RDS_IB_MR_8K_POOL) - pool = rds_ibdev->mr_1m_pool; - else - pool = rds_ibdev->mr_8k_pool; - } - - while (1) { - ibmr = rds_ib_reuse_fmr(pool); - if (ibmr) - return ibmr; - - /* No clean MRs - now we have the choice of either - * allocating a fresh MR up to the limit imposed by the - * driver, or flush any dirty unused MRs. - * We try to avoid stalling in the send path if possible, - * so we allocate as long as we're allowed to. - * - * We're fussy with enforcing the FMR limit, though. If the driver - * tells us we can't use more than N fmrs, we shouldn't start - * arguing with it */ - if (atomic_inc_return(&pool->item_count) <= pool->max_items) - break; - - atomic_dec(&pool->item_count); - - if (++iter > 2) { - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); - return ERR_PTR(-EAGAIN); - } - - /* We do have some empty MRs. Flush them out. 
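
rds_ib_reuse_mr() above is the hot-path consumer of clean_list, and the per-CPU CLEAN_LIST_BUSY_BIT it toggles is what wait_clean_list_grace() spins on before the flusher splices the whole clean list away. Condensed into a kernel-style sketch (invented helper name; clean_list_grace is the per-CPU variable from ib_rdma.c):

static struct rds_ib_mr *pop_clean_mr(struct rds_ib_mr_pool *pool)
{
    struct rds_ib_mr *ibmr = NULL;
    struct llist_node *node;
    unsigned long *flag;

    preempt_disable();                   /* stay on this CPU's flag */
    flag = this_cpu_ptr(&clean_list_grace);
    set_bit(CLEAN_LIST_BUSY_BIT, flag);  /* tell flushers we're inside */
    node = llist_del_first(&pool->clean_list);
    if (node)
        ibmr = llist_entry(node, struct rds_ib_mr, llnode);
    clear_bit(CLEAN_LIST_BUSY_BIT, flag);
    preempt_enable();
    return ibmr;
}
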
*/ - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); - rds_ib_flush_mr_pool(pool, 0, &ibmr); - if (ibmr) - return ibmr; - } - - ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); - if (!ibmr) { - err = -ENOMEM; - goto out_no_cigar; - } - - ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, - (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE| - IB_ACCESS_REMOTE_ATOMIC), - &pool->fmr_attr); - if (IS_ERR(ibmr->fmr)) { - err = PTR_ERR(ibmr->fmr); - ibmr->fmr = NULL; - printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err); - goto out_no_cigar; - } - - ibmr->pool = pool; - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); - - return ibmr; - -out_no_cigar: - if (ibmr) { - if (ibmr->fmr) - ib_dealloc_fmr(ibmr->fmr); - kfree(ibmr); - } - atomic_dec(&pool->item_count); - return ERR_PTR(err); -} - -static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, - struct scatterlist *sg, unsigned int nents) -{ - struct ib_device *dev = rds_ibdev->dev; - struct scatterlist *scat = sg; - u64 io_addr = 0; - u64 *dma_pages; - u32 len; - int page_cnt, sg_dma_len; - int i, j; - int ret; - - sg_dma_len = ib_dma_map_sg(dev, sg, nents, - DMA_BIDIRECTIONAL); - if (unlikely(!sg_dma_len)) { - printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n"); - return -EBUSY; - } - - len = 0; - page_cnt = 0; - - for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); - u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); - - if (dma_addr & ~PAGE_MASK) { - if (i > 0) - return -EINVAL; - else - ++page_cnt; - } - if ((dma_addr + dma_len) & ~PAGE_MASK) { - if (i < sg_dma_len - 1) - return -EINVAL; - else - ++page_cnt; - } - - len += dma_len; - } - - page_cnt += len >> PAGE_SHIFT; - if (page_cnt > ibmr->pool->fmr_attr.max_pages) - return -EINVAL; - - dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, - rdsibdev_to_node(rds_ibdev)); - if (!dma_pages) - return -ENOMEM; - - page_cnt = 0; - for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); - u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); - - for (j = 0; j < dma_len; j += PAGE_SIZE) - dma_pages[page_cnt++] = - (dma_addr & PAGE_MASK) + j; - } - - ret = ib_map_phys_fmr(ibmr->fmr, - dma_pages, page_cnt, io_addr); - if (ret) - goto out; - - /* Success - we successfully remapped the MR, so we can - * safely tear down the old mapping. 
*/ - rds_ib_teardown_mr(ibmr); - - ibmr->sg = scat; - ibmr->sg_len = nents; - ibmr->sg_dma_len = sg_dma_len; - ibmr->remap_count++; - - if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_used); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_used); - ret = 0; - -out: - kfree(dma_pages); - - return ret; -} - void rds_ib_sync_mr(void *trans_private, int direction) { struct rds_ib_mr *ibmr = trans_private; @@ -518,7 +230,7 @@ void rds_ib_sync_mr(void *trans_private, int direction) } } -static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { struct rds_ib_device *rds_ibdev = ibmr->device; @@ -549,7 +261,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) } } -static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) +void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { unsigned int pinned = ibmr->sg_len; @@ -623,17 +335,15 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. */ -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, - int free_all, struct rds_ib_mr **ibmr_ret) +int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, + int free_all, struct rds_ib_mr **ibmr_ret) { - struct rds_ib_mr *ibmr, *next; + struct rds_ib_mr *ibmr; struct llist_node *clean_nodes; struct llist_node *clean_tail; LIST_HEAD(unmap_list); - LIST_HEAD(fmr_list); unsigned long unpinned = 0; unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; - int ret = 0; if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); @@ -643,7 +353,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (ibmr_ret) { DEFINE_WAIT(wait); while (!mutex_trylock(&pool->flush_lock)) { - ibmr = rds_ib_reuse_fmr(pool); + ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); @@ -655,7 +365,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (llist_empty(&pool->clean_list)) schedule(); - ibmr = rds_ib_reuse_fmr(pool); + ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); @@ -667,7 +377,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, mutex_lock(&pool->flush_lock); if (ibmr_ret) { - ibmr = rds_ib_reuse_fmr(pool); + ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; goto out; @@ -687,30 +397,10 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (list_empty(&unmap_list)) goto out; - /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ - list_for_each_entry(ibmr, &unmap_list, unmap_list) - list_add(&ibmr->fmr->list, &fmr_list); - - ret = ib_unmap_fmr(&fmr_list); - if (ret) - printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); - - /* Now we can destroy the DMA mapping and unpin any pages */ - list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { - unpinned += ibmr->sg_len; - __rds_ib_teardown_mr(ibmr); - if (nfreed < free_goal || - ibmr->remap_count >= pool->fmr_attr.max_maps) { - if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_free); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_free); - list_del(&ibmr->unmap_list); - ib_dealloc_fmr(ibmr->fmr); - kfree(ibmr); - nfreed++; - } - } + if (pool->use_fastreg) + rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); + else + rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal); if 
(!list_empty(&unmap_list)) { /* we have to make sure that none of the things we're about @@ -743,7 +433,47 @@ out: if (waitqueue_active(&pool->flush_wait)) wake_up(&pool->flush_wait); out_nolock: - return ret; + return 0; +} + +struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) +{ + struct rds_ib_mr *ibmr = NULL; + int iter = 0; + + if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); + + while (1) { + ibmr = rds_ib_reuse_mr(pool); + if (ibmr) + return ibmr; + + if (atomic_inc_return(&pool->item_count) <= pool->max_items) + break; + + atomic_dec(&pool->item_count); + + if (++iter > 2) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); + return ERR_PTR(-EAGAIN); + } + + /* We do have some empty MRs. Flush them out. */ + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); + + rds_ib_flush_mr_pool(pool, 0, &ibmr); + if (ibmr) + return ibmr; + } + + return ibmr; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) @@ -762,10 +492,10 @@ void rds_ib_free_mr(void *trans_private, int invalidate) rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); /* Return it to the pool's free list */ - if (ibmr->remap_count >= pool->fmr_attr.max_maps) - llist_add(&ibmr->llnode, &pool->drop_list); + if (rds_ibdev->use_fastreg) + rds_ib_free_frmr_list(ibmr); else - llist_add(&ibmr->llnode, &pool->free_list); + rds_ib_free_fmr_list(ibmr); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); @@ -773,7 +503,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 5) - queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { @@ -782,7 +512,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* We get here if the user created a MR marked * as use_once and invalidate at the same time. 
*/ - queue_delayed_work(rds_ib_fmr_wq, + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); } } @@ -810,6 +540,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; + struct rds_ib_connection *ic = rs->rs_conn->c_transport_data; int ret; rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); @@ -823,29 +554,81 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } - ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); - if (IS_ERR(ibmr)) { - rds_ib_dev_put(rds_ibdev); - return ibmr; - } - - ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); - if (ret == 0) - *key_ret = ibmr->fmr->rkey; + if (rds_ibdev->use_fastreg) + ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); else - printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); - - ibmr->device = rds_ibdev; - rds_ibdev = NULL; + ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret); + if (ibmr) + rds_ibdev = NULL; out: - if (ret) { - if (ibmr) - rds_ib_free_mr(ibmr, 0); - ibmr = ERR_PTR(ret); - } + if (!ibmr) + pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); + if (rds_ibdev) rds_ib_dev_put(rds_ibdev); + return ibmr; } +void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) +{ + cancel_delayed_work_sync(&pool->flush_worker); + rds_ib_flush_mr_pool(pool, 1, NULL); + WARN_ON(atomic_read(&pool->item_count)); + WARN_ON(atomic_read(&pool->free_pinned)); + kfree(pool); +} + +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, + int pool_type) +{ + struct rds_ib_mr_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->pool_type = pool_type; + init_llist_head(&pool->free_list); + init_llist_head(&pool->drop_list); + init_llist_head(&pool->clean_list); + mutex_init(&pool->flush_lock); + init_waitqueue_head(&pool->flush_wait); + INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + + if (pool_type == RDS_IB_MR_1M_POOL) { + /* +1 allows for unaligned MRs */ + pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; + pool->max_items = RDS_MR_1M_POOL_SIZE; + } else { + /* pool_type == RDS_IB_MR_8K_POOL */ + pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; + pool->max_items = RDS_MR_8K_POOL_SIZE; + } + + pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; + pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; + pool->fmr_attr.page_shift = PAGE_SHIFT; + pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; + pool->use_fastreg = rds_ibdev->use_fastreg; + + return pool; +} + +int rds_ib_mr_init(void) +{ + rds_ib_mr_wq = create_workqueue("rds_mr_flushd"); + if (!rds_ib_mr_wq) + return -ENOMEM; + return 0; +} + +/* By the time this is called all the IB devices should have been torn down and + * had their pools freed. As each pool is freed its work struct is waited on, + * so the pool flushing work queue should be idle by the time we get here. 
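
For concreteness, the sizing in rds_ib_create_mr_pool() above works out as follows, assuming 4 KiB pages (these numbers are derived from the constants, not stated in the patch):

/* 1M pool: fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1  = 257
 *          max_items          = RDS_MR_1M_POOL_SIZE     = 8192/2 = 4096
 *          max_free_pinned    = 4096 * 257 / 4          = 263168 pages (~1 GiB)
 * 8K pool: RDS_MR_8K_SCALE    = 256 / (2 + 1)           = 85
 *          max_items          = 85 * (8192/2)           = 348160
 *          fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1  = 3
 */
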
+ */ +void rds_ib_mr_exit(void) +{ + destroy_workqueue(rds_ib_mr_wq); +} diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index eac30bf486d7..f27d2c82b036 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -195,7 +195,7 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_op = NULL; - send->s_wr.wr_id = i | RDS_IB_SEND_OP; + send->s_wr.wr_id = i; send->s_wr.sg_list = send->s_sge; send->s_wr.ex.imm_data = 0; @@ -263,9 +263,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) oldest = rds_ib_ring_oldest(&ic->i_send_ring); - completed = rds_ib_ring_completed(&ic->i_send_ring, - (wc->wr_id & ~RDS_IB_SEND_OP), - oldest); + completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest); for (i = 0; i < completed; i++) { send = &ic->i_sends[oldest]; diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index d77e04473056..7e78dca1f252 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -73,6 +73,8 @@ static const char *const rds_ib_stat_names[] = { "ib_rdma_mr_1m_pool_flush", "ib_rdma_mr_1m_pool_wait", "ib_rdma_mr_1m_pool_depleted", + "ib_rdma_mr_8k_reused", + "ib_rdma_mr_1m_reused", "ib_atomic_cswp", "ib_atomic_fadd", }; diff --git a/net/rds/iw.c b/net/rds/iw.c deleted file mode 100644 index f4a9fff829e0..000000000000 --- a/net/rds/iw.c +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
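
The ib_send.c hunk above is the other half of the new wr_id scheme: with the RDS_IB_SEND_OP tag bit removed, data sends carry bare ring indices while FRMR work requests carry the struct rds_ib_mr pointer, so the send-CQ demux in poll_scq() reduces to a range check (sketch; helper name invented, RDS_IB_ACK_WR_ID is the existing ACK sentinel from ib.h):

static void demux_send_cqe(struct rds_ib_connection *ic, struct ib_wc *wc)
{
    if (wc->wr_id <= ic->i_send_ring.w_nr ||
        wc->wr_id == RDS_IB_ACK_WR_ID)
        rds_ib_send_cqe_handler(ic, wc);   /* data send or ACK */
    else
        rds_ib_mr_cqe_handler(ic, wc);     /* REG_MR / LOCAL_INV */
}
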
- * - */ -#include <linux/kernel.h> -#include <linux/in.h> -#include <linux/if.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/if_arp.h> -#include <linux/delay.h> -#include <linux/slab.h> -#include <linux/module.h> - -#include "rds.h" -#include "iw.h" - -unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE; -unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */ - -module_param(fastreg_pool_size, int, 0444); -MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device"); -module_param(fastreg_message_size, int, 0444); -MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)"); - -struct list_head rds_iw_devices; - -/* NOTE: if also grabbing iwdev lock, grab this first */ -DEFINE_SPINLOCK(iw_nodev_conns_lock); -LIST_HEAD(iw_nodev_conns); - -static void rds_iw_add_one(struct ib_device *device) -{ - struct rds_iw_device *rds_iwdev; - - /* Only handle iwarp devices */ - if (device->node_type != RDMA_NODE_RNIC) - return; - - rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); - if (!rds_iwdev) - return; - - spin_lock_init(&rds_iwdev->spinlock); - - rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); - rds_iwdev->max_wrs = device->attrs.max_qp_wr; - rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE); - - rds_iwdev->dev = device; - rds_iwdev->pd = ib_alloc_pd(device); - if (IS_ERR(rds_iwdev->pd)) - goto free_dev; - - if (!rds_iwdev->dma_local_lkey) { - rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd, - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_iwdev->mr)) - goto err_pd; - } else - rds_iwdev->mr = NULL; - - rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev); - if (IS_ERR(rds_iwdev->mr_pool)) { - rds_iwdev->mr_pool = NULL; - goto err_mr; - } - - INIT_LIST_HEAD(&rds_iwdev->cm_id_list); - INIT_LIST_HEAD(&rds_iwdev->conn_list); - list_add_tail(&rds_iwdev->list, &rds_iw_devices); - - ib_set_client_data(device, &rds_iw_client, rds_iwdev); - return; - -err_mr: - if (rds_iwdev->mr) - ib_dereg_mr(rds_iwdev->mr); -err_pd: - ib_dealloc_pd(rds_iwdev->pd); -free_dev: - kfree(rds_iwdev); -} - -static void rds_iw_remove_one(struct ib_device *device, void *client_data) -{ - struct rds_iw_device *rds_iwdev = client_data; - struct rds_iw_cm_id *i_cm_id, *next; - - if (!rds_iwdev) - return; - - spin_lock_irq(&rds_iwdev->spinlock); - list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) { - list_del(&i_cm_id->list); - kfree(i_cm_id); - } - spin_unlock_irq(&rds_iwdev->spinlock); - - rds_iw_destroy_conns(rds_iwdev); - - if (rds_iwdev->mr_pool) - rds_iw_destroy_mr_pool(rds_iwdev->mr_pool); - - if (rds_iwdev->mr) - ib_dereg_mr(rds_iwdev->mr); - - ib_dealloc_pd(rds_iwdev->pd); - - list_del(&rds_iwdev->list); - kfree(rds_iwdev); -} - -struct ib_client rds_iw_client = { - .name = "rds_iw", - .add = rds_iw_add_one, - .remove = rds_iw_remove_one -}; - -static int rds_iw_conn_info_visitor(struct rds_connection *conn, - void *buffer) -{ - struct rds_info_rdma_connection *iinfo = buffer; - struct rds_iw_connection *ic; - - /* We will only ever look at IB transports */ - if (conn->c_trans != &rds_iw_transport) - return 0; - - iinfo->src_addr = conn->c_laddr; - iinfo->dst_addr = conn->c_faddr; - - memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); - memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); - if (rds_conn_state(conn) == RDS_CONN_UP) { - struct rds_iw_device *rds_iwdev; - struct rdma_dev_addr 
*dev_addr; - - ic = conn->c_transport_data; - dev_addr = &ic->i_cm_id->route.addr.dev_addr; - - rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); - rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); - - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - iinfo->max_send_wr = ic->i_send_ring.w_nr; - iinfo->max_recv_wr = ic->i_recv_ring.w_nr; - iinfo->max_send_sge = rds_iwdev->max_sge; - rds_iw_get_mr_info(rds_iwdev, iinfo); - } - return 1; -} - -static void rds_iw_ic_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens) -{ - rds_for_each_conn_info(sock, len, iter, lens, - rds_iw_conn_info_visitor, - sizeof(struct rds_info_rdma_connection)); -} - - -/* - * Early RDS/IB was built to only bind to an address if there is an IPoIB - * device with that address set. - * - * If it were me, I'd advocate for something more flexible. Sending and - * receiving should be device-agnostic. Transports would try and maintain - * connections between peers who have messages queued. Userspace would be - * allowed to influence which paths have priority. We could call userspace - * asserting this policy "routing". - */ -static int rds_iw_laddr_check(struct net *net, __be32 addr) -{ - int ret; - struct rdma_cm_id *cm_id; - struct sockaddr_in sin; - - /* Create a CMA ID and try to bind it. This catches both - * IB and iWARP capable NICs. - */ - cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(cm_id)) - return PTR_ERR(cm_id); - - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; - - /* rdma_bind_addr will only succeed for IB & iWARP devices */ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); - /* due to this, we will claim to support IB devices unless we - check node_type. */ - if (ret || !cm_id->device || - cm_id->device->node_type != RDMA_NODE_RNIC) - ret = -EADDRNOTAVAIL; - - rdsdebug("addr %pI4 ret %d node type %d\n", - &addr, ret, - cm_id->device ? 
cm_id->device->node_type : -1); - - rdma_destroy_id(cm_id); - - return ret; -} - -void rds_iw_exit(void) -{ - rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); - rds_iw_destroy_nodev_conns(); - ib_unregister_client(&rds_iw_client); - rds_iw_sysctl_exit(); - rds_iw_recv_exit(); - rds_trans_unregister(&rds_iw_transport); -} - -struct rds_transport rds_iw_transport = { - .laddr_check = rds_iw_laddr_check, - .xmit_complete = rds_iw_xmit_complete, - .xmit = rds_iw_xmit, - .xmit_rdma = rds_iw_xmit_rdma, - .recv = rds_iw_recv, - .conn_alloc = rds_iw_conn_alloc, - .conn_free = rds_iw_conn_free, - .conn_connect = rds_iw_conn_connect, - .conn_shutdown = rds_iw_conn_shutdown, - .inc_copy_to_user = rds_iw_inc_copy_to_user, - .inc_free = rds_iw_inc_free, - .cm_initiate_connect = rds_iw_cm_initiate_connect, - .cm_handle_connect = rds_iw_cm_handle_connect, - .cm_connect_complete = rds_iw_cm_connect_complete, - .stats_info_copy = rds_iw_stats_info_copy, - .exit = rds_iw_exit, - .get_mr = rds_iw_get_mr, - .sync_mr = rds_iw_sync_mr, - .free_mr = rds_iw_free_mr, - .flush_mrs = rds_iw_flush_mrs, - .t_owner = THIS_MODULE, - .t_name = "iwarp", - .t_type = RDS_TRANS_IWARP, - .t_prefer_loopback = 1, -}; - -int rds_iw_init(void) -{ - int ret; - - INIT_LIST_HEAD(&rds_iw_devices); - - ret = ib_register_client(&rds_iw_client); - if (ret) - goto out; - - ret = rds_iw_sysctl_init(); - if (ret) - goto out_ibreg; - - ret = rds_iw_recv_init(); - if (ret) - goto out_sysctl; - - ret = rds_trans_register(&rds_iw_transport); - if (ret) - goto out_recv; - - rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); - - goto out; - -out_recv: - rds_iw_recv_exit(); -out_sysctl: - rds_iw_sysctl_exit(); -out_ibreg: - ib_unregister_client(&rds_iw_client); -out: - return ret; -} - -MODULE_LICENSE("GPL"); - diff --git a/net/rds/iw.h b/net/rds/iw.h deleted file mode 100644 index 5af01d1758b3..000000000000 --- a/net/rds/iw.h +++ /dev/null @@ -1,398 +0,0 @@ -#ifndef _RDS_IW_H -#define _RDS_IW_H - -#include <linux/interrupt.h> -#include <rdma/ib_verbs.h> -#include <rdma/rdma_cm.h> -#include "rds.h" -#include "rdma_transport.h" - -#define RDS_FASTREG_SIZE 20 -#define RDS_FASTREG_POOL_SIZE 2048 - -#define RDS_IW_MAX_SGE 8 -#define RDS_IW_RECV_SGE 2 - -#define RDS_IW_DEFAULT_RECV_WR 1024 -#define RDS_IW_DEFAULT_SEND_WR 256 - -#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ - -extern struct list_head rds_iw_devices; - -/* - * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to - * try and minimize the amount of memory tied up both the device and - * socket receive queues. - */ -/* page offset of the final full frag that fits in the page */ -#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) -struct rds_page_frag { - struct list_head f_item; - struct page *f_page; - unsigned long f_offset; - dma_addr_t f_mapped; -}; - -struct rds_iw_incoming { - struct list_head ii_frags; - struct rds_incoming ii_inc; -}; - -struct rds_iw_connect_private { - /* Add new fields at the end, and don't permute existing fields. 
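
The append-only rule in this comment is what keeps the connect private data parseable across versions: every field keeps its offset, and an older peer simply ignores the tail it doesn't know about. A compile-time guard for such a wire struct might look like the following sketch (userspace fixed-width types stand in for the kernel's __be* types; the offsets follow from the fields below):

#include <stddef.h>
#include <stdint.h>

struct wire_connect_private {
	uint32_t dp_saddr;
	uint32_t dp_daddr;
	uint8_t  dp_protocol_major;
	uint8_t  dp_protocol_minor;
	uint16_t dp_protocol_minor_mask;
	uint32_t dp_reserved1;
	uint64_t dp_ack_seq;
	uint32_t dp_credit;
};

/* Appending fields keeps these offsets stable; permuting would break them. */
_Static_assert(offsetof(struct wire_connect_private, dp_protocol_major) == 8, "major moved");
_Static_assert(offsetof(struct wire_connect_private, dp_ack_seq) == 16, "ack_seq moved");
_Static_assert(offsetof(struct wire_connect_private, dp_credit) == 24, "credit moved");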
*/ - __be32 dp_saddr; - __be32 dp_daddr; - u8 dp_protocol_major; - u8 dp_protocol_minor; - __be16 dp_protocol_minor_mask; /* bitmask */ - __be32 dp_reserved1; - __be64 dp_ack_seq; - __be32 dp_credit; /* non-zero enables flow ctl */ -}; - -struct rds_iw_scatterlist { - struct scatterlist *list; - unsigned int len; - int dma_len; - unsigned int dma_npages; - unsigned int bytes; -}; - -struct rds_iw_mapping { - spinlock_t m_lock; /* protect the mapping struct */ - struct list_head m_list; - struct rds_iw_mr *m_mr; - uint32_t m_rkey; - struct rds_iw_scatterlist m_sg; -}; - -struct rds_iw_send_work { - struct rds_message *s_rm; - - /* We should really put these into a union: */ - struct rm_rdma_op *s_op; - struct rds_iw_mapping *s_mapping; - struct ib_mr *s_mr; - unsigned char s_remap_count; - - union { - struct ib_send_wr s_send_wr; - struct ib_rdma_wr s_rdma_wr; - struct ib_reg_wr s_reg_wr; - }; - struct ib_sge s_sge[RDS_IW_MAX_SGE]; - unsigned long s_queued; -}; - -struct rds_iw_recv_work { - struct rds_iw_incoming *r_iwinc; - struct rds_page_frag *r_frag; - struct ib_recv_wr r_wr; - struct ib_sge r_sge[2]; -}; - -struct rds_iw_work_ring { - u32 w_nr; - u32 w_alloc_ptr; - u32 w_alloc_ctr; - u32 w_free_ptr; - atomic_t w_free_ctr; -}; - -struct rds_iw_device; - -struct rds_iw_connection { - - struct list_head iw_node; - struct rds_iw_device *rds_iwdev; - struct rds_connection *conn; - - /* alphabet soup, IBTA style */ - struct rdma_cm_id *i_cm_id; - struct ib_pd *i_pd; - struct ib_mr *i_mr; - struct ib_cq *i_send_cq; - struct ib_cq *i_recv_cq; - - /* tx */ - struct rds_iw_work_ring i_send_ring; - struct rds_message *i_rm; - struct rds_header *i_send_hdrs; - u64 i_send_hdrs_dma; - struct rds_iw_send_work *i_sends; - - /* rx */ - struct tasklet_struct i_recv_tasklet; - struct mutex i_recv_mutex; - struct rds_iw_work_ring i_recv_ring; - struct rds_iw_incoming *i_iwinc; - u32 i_recv_data_rem; - struct rds_header *i_recv_hdrs; - u64 i_recv_hdrs_dma; - struct rds_iw_recv_work *i_recvs; - struct rds_page_frag i_frag; - u64 i_ack_recv; /* last ACK received */ - - /* sending acks */ - unsigned long i_ack_flags; -#ifdef KERNEL_HAS_ATOMIC64 - atomic64_t i_ack_next; /* next ACK to send */ -#else - spinlock_t i_ack_lock; /* protect i_ack_next */ - u64 i_ack_next; /* next ACK to send */ -#endif - struct rds_header *i_ack; - struct ib_send_wr i_ack_wr; - struct ib_sge i_ack_sge; - u64 i_ack_dma; - unsigned long i_ack_queued; - - /* Flow control related information - * - * Our algorithm uses a pair variables that we need to access - * atomically - one for the send credits, and one posted - * recv credits we need to transfer to remote. 
- * Rather than protect them using a slow spinlock, we put both into - * a single atomic_t and update it using cmpxchg - */ - atomic_t i_credits; - - /* Protocol version specific information */ - unsigned int i_flowctl:1; /* enable/disable flow ctl */ - unsigned int i_dma_local_lkey:1; - unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */ - /* Batched completions */ - unsigned int i_unsignaled_wrs; - long i_unsignaled_bytes; -}; - -/* This assumes that atomic_t is at least 32 bits */ -#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff) -#define IB_GET_POST_CREDITS(v) ((v) >> 16) -#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff) -#define IB_SET_POST_CREDITS(v) ((v) << 16) - -struct rds_iw_cm_id { - struct list_head list; - struct rdma_cm_id *cm_id; -}; - -struct rds_iw_device { - struct list_head list; - struct list_head cm_id_list; - struct list_head conn_list; - struct ib_device *dev; - struct ib_pd *pd; - struct ib_mr *mr; - struct rds_iw_mr_pool *mr_pool; - int max_sge; - unsigned int max_wrs; - unsigned int dma_local_lkey:1; - spinlock_t spinlock; /* protect the above */ -}; - -/* bits for i_ack_flags */ -#define IB_ACK_IN_FLIGHT 0 -#define IB_ACK_REQUESTED 1 - -/* Magic WR_ID for ACKs */ -#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL) -#define RDS_IW_REG_WR_ID ((u64)0xefefefefefefefefULL) -#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL) - -struct rds_iw_statistics { - uint64_t s_iw_connect_raced; - uint64_t s_iw_listen_closed_stale; - uint64_t s_iw_tx_cq_call; - uint64_t s_iw_tx_cq_event; - uint64_t s_iw_tx_ring_full; - uint64_t s_iw_tx_throttle; - uint64_t s_iw_tx_sg_mapping_failure; - uint64_t s_iw_tx_stalled; - uint64_t s_iw_tx_credit_updates; - uint64_t s_iw_rx_cq_call; - uint64_t s_iw_rx_cq_event; - uint64_t s_iw_rx_ring_empty; - uint64_t s_iw_rx_refill_from_cq; - uint64_t s_iw_rx_refill_from_thread; - uint64_t s_iw_rx_alloc_limit; - uint64_t s_iw_rx_credit_updates; - uint64_t s_iw_ack_sent; - uint64_t s_iw_ack_send_failure; - uint64_t s_iw_ack_send_delayed; - uint64_t s_iw_ack_send_piggybacked; - uint64_t s_iw_ack_received; - uint64_t s_iw_rdma_mr_alloc; - uint64_t s_iw_rdma_mr_free; - uint64_t s_iw_rdma_mr_used; - uint64_t s_iw_rdma_mr_pool_flush; - uint64_t s_iw_rdma_mr_pool_wait; - uint64_t s_iw_rdma_mr_pool_depleted; -}; - -extern struct workqueue_struct *rds_iw_wq; - -/* - * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h - * doesn't define it. - */ -static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) -{ - unsigned int i; - - for (i = 0; i < sg_dma_len; ++i) { - ib_dma_sync_single_for_cpu(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), - direction); - } -} -#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu - -static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev, - struct scatterlist *sg, unsigned int sg_dma_len, int direction) -{ - unsigned int i; - - for (i = 0; i < sg_dma_len; ++i) { - ib_dma_sync_single_for_device(dev, - ib_sg_dma_address(dev, &sg[i]), - ib_sg_dma_len(dev, &sg[i]), - direction); - } -} -#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device - -static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) -{ - return ic->i_dma_local_lkey ? 
ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey; -} - -/* ib.c */ -extern struct rds_transport rds_iw_transport; -extern struct ib_client rds_iw_client; - -extern unsigned int fastreg_pool_size; -extern unsigned int fastreg_message_size; - -extern spinlock_t iw_nodev_conns_lock; -extern struct list_head iw_nodev_conns; - -/* ib_cm.c */ -int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp); -void rds_iw_conn_free(void *arg); -int rds_iw_conn_connect(struct rds_connection *conn); -void rds_iw_conn_shutdown(struct rds_connection *conn); -void rds_iw_state_change(struct sock *sk); -int rds_iw_listen_init(void); -void rds_iw_listen_stop(void); -void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); -int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event); -int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id); -void rds_iw_cm_connect_complete(struct rds_connection *conn, - struct rdma_cm_event *event); - - -#define rds_iw_conn_error(conn, fmt...) \ - __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt) - -/* ib_rdma.c */ -int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); -void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); -void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); -void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock); -static inline void rds_iw_destroy_nodev_conns(void) -{ - __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock); -} -static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev) -{ - __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock); -} -struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *); -void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo); -void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *); -void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, - struct rds_sock *rs, u32 *key_ret); -void rds_iw_sync_mr(void *trans_private, int dir); -void rds_iw_free_mr(void *trans_private, int invalidate); -void rds_iw_flush_mrs(void); - -/* ib_recv.c */ -int rds_iw_recv_init(void); -void rds_iw_recv_exit(void); -int rds_iw_recv(struct rds_connection *conn); -int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill); -void rds_iw_inc_free(struct rds_incoming *inc); -int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); -void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context); -void rds_iw_recv_tasklet_fn(unsigned long data); -void rds_iw_recv_init_ring(struct rds_iw_connection *ic); -void rds_iw_recv_clear_ring(struct rds_iw_connection *ic); -void rds_iw_recv_init_ack(struct rds_iw_connection *ic); -void rds_iw_attempt_ack(struct rds_iw_connection *ic); -void rds_iw_ack_send_complete(struct rds_iw_connection *ic); -u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic); - -/* ib_ring.c */ -void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr); -void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr); -u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos); -void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val); -void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val); -int rds_iw_ring_empty(struct rds_iw_work_ring *ring); -int rds_iw_ring_low(struct rds_iw_work_ring *ring); -u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring); -u32 rds_iw_ring_completed(struct 
rds_iw_work_ring *ring, u32 wr_id, u32 oldest); -extern wait_queue_head_t rds_iw_ring_empty_wait; - -/* ib_send.c */ -void rds_iw_xmit_complete(struct rds_connection *conn); -int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, - unsigned int hdr_off, unsigned int sg, unsigned int off); -void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); -void rds_iw_send_init_ring(struct rds_iw_connection *ic); -void rds_iw_send_clear_ring(struct rds_iw_connection *ic); -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); -void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); -void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); -int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, - u32 *adv_credits, int need_posted, int max_posted); - -/* ib_stats.c */ -DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); -#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member) -unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, - unsigned int avail); - -/* ib_sysctl.c */ -int rds_iw_sysctl_init(void); -void rds_iw_sysctl_exit(void); -extern unsigned long rds_iw_sysctl_max_send_wr; -extern unsigned long rds_iw_sysctl_max_recv_wr; -extern unsigned long rds_iw_sysctl_max_unsig_wrs; -extern unsigned long rds_iw_sysctl_max_unsig_bytes; -extern unsigned long rds_iw_sysctl_max_recv_allocation; -extern unsigned int rds_iw_sysctl_flow_control; - -/* - * Helper functions for getting/setting the header and data SGEs in - * RDS packets (not RDMA) - */ -static inline struct ib_sge * -rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge) -{ - return &sge[0]; -} - -static inline struct ib_sge * -rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge) -{ - return &sge[1]; -} - -#endif diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c deleted file mode 100644 index aea4c911bc76..000000000000 --- a/net/rds/iw_cm.c +++ /dev/null @@ -1,769 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ -#include <linux/kernel.h> -#include <linux/in.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/ratelimit.h> - -#include "rds.h" -#include "iw.h" - -/* - * Set the selected protocol version - */ -static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version) -{ - conn->c_version = version; -} - -/* - * Set up flow control - */ -static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - if (rds_iw_sysctl_flow_control && credits != 0) { - /* We're doing flow control */ - ic->i_flowctl = 1; - rds_iw_send_add_credits(conn, credits); - } else { - ic->i_flowctl = 0; - } -} - -/* - * Connection established. - * We get here for both outgoing and incoming connection. - */ -void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) -{ - const struct rds_iw_connect_private *dp = NULL; - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_device *rds_iwdev; - int err; - - if (event->param.conn.private_data_len) { - dp = event->param.conn.private_data; - - rds_iw_set_protocol(conn, - RDS_PROTOCOL(dp->dp_protocol_major, - dp->dp_protocol_minor)); - rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); - } - - /* update ib_device with this local ipaddr & conn */ - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id); - if (err) - printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err); - rds_iw_add_conn(rds_iwdev, conn); - - /* If the peer gave us the last packet it saw, process this as if - * we had received a regular ACK. */ - if (dp && dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); - - printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n", - &conn->c_laddr, &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); - - rds_connect_complete(conn); -} - -static void rds_iw_cm_fill_conn_param(struct rds_connection *conn, - struct rdma_conn_param *conn_param, - struct rds_iw_connect_private *dp, - u32 protocol_version) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - memset(conn_param, 0, sizeof(struct rdma_conn_param)); - /* XXX tune these? 
*/ - conn_param->responder_resources = 1; - conn_param->initiator_depth = 1; - - if (dp) { - memset(dp, 0, sizeof(*dp)); - dp->dp_saddr = conn->c_laddr; - dp->dp_daddr = conn->c_faddr; - dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); - dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); - dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS); - dp->dp_ack_seq = rds_iw_piggyb_ack(ic); - - /* Advertise flow control */ - if (ic->i_flowctl) { - unsigned int credits; - - credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); - dp->dp_credit = cpu_to_be32(credits); - atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); - } - - conn_param->private_data = dp; - conn_param->private_data_len = sizeof(*dp); - } -} - -static void rds_iw_cq_event_handler(struct ib_event *event, void *data) -{ - rdsdebug("event %u data %p\n", event->event, data); -} - -static void rds_iw_qp_event_handler(struct ib_event *event, void *data) -{ - struct rds_connection *conn = data; - struct rds_iw_connection *ic = conn->c_transport_data; - - rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); - - switch (event->event) { - case IB_EVENT_COMM_EST: - rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); - break; - case IB_EVENT_QP_REQ_ERR: - case IB_EVENT_QP_FATAL: - default: - rdsdebug("Fatal QP Event %u " - "- connection %pI4->%pI4, reconnecting\n", - event->event, &conn->c_laddr, - &conn->c_faddr); - rds_conn_drop(conn); - break; - } -} - -/* - * Create a QP - */ -static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, - struct rds_iw_device *rds_iwdev, - struct rds_iw_work_ring *send_ring, - void (*send_cq_handler)(struct ib_cq *, void *), - struct rds_iw_work_ring *recv_ring, - void (*recv_cq_handler)(struct ib_cq *, void *), - void *context) -{ - struct ib_device *dev = rds_iwdev->dev; - struct ib_cq_init_attr cq_attr = {}; - unsigned int send_size, recv_size; - int ret; - - /* The offset of 1 is to accommodate the additional ACK WR. 
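
Worked through with the defaults, the arithmetic that follows gives the queue pair one more slot than the ring, so the ACK WR always fits; a standalone sketch of the sizing (the 16384 device limit is illustrative):

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int dev_max_wrs = 16384;	/* assumed HCA limit */
	unsigned int max_send_wr = 256;		/* default send sysctl */
	unsigned int max_recv_wr = 1024;	/* default recv sysctl */

	/* the queue gets sysctl + 1 entries, the ring is one smaller */
	unsigned int send_size = min_u(dev_max_wrs, max_send_wr + 1);
	unsigned int recv_size = min_u(dev_max_wrs, max_recv_wr + 1);

	printf("send qp %u ring %u\n", send_size, send_size - 1);	/* 257 256 */
	printf("recv qp %u ring %u\n", recv_size, recv_size - 1);	/* 1025 1024 */
	return 0;
}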
*/ - send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); - recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); - rds_iw_ring_resize(send_ring, send_size - 1); - rds_iw_ring_resize(recv_ring, recv_size - 1); - - memset(attr, 0, sizeof(*attr)); - attr->event_handler = rds_iw_qp_event_handler; - attr->qp_context = context; - attr->cap.max_send_wr = send_size; - attr->cap.max_recv_wr = recv_size; - attr->cap.max_send_sge = rds_iwdev->max_sge; - attr->cap.max_recv_sge = RDS_IW_RECV_SGE; - attr->sq_sig_type = IB_SIGNAL_REQ_WR; - attr->qp_type = IB_QPT_RC; - - cq_attr.cqe = send_size; - attr->send_cq = ib_create_cq(dev, send_cq_handler, - rds_iw_cq_event_handler, - context, &cq_attr); - if (IS_ERR(attr->send_cq)) { - ret = PTR_ERR(attr->send_cq); - attr->send_cq = NULL; - rdsdebug("ib_create_cq send failed: %d\n", ret); - goto out; - } - - cq_attr.cqe = recv_size; - attr->recv_cq = ib_create_cq(dev, recv_cq_handler, - rds_iw_cq_event_handler, - context, &cq_attr); - if (IS_ERR(attr->recv_cq)) { - ret = PTR_ERR(attr->recv_cq); - attr->recv_cq = NULL; - rdsdebug("ib_create_cq send failed: %d\n", ret); - goto out; - } - - ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP); - if (ret) { - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - goto out; - } - - ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED); - if (ret) { - rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); - goto out; - } - -out: - if (ret) { - if (attr->send_cq) - ib_destroy_cq(attr->send_cq); - if (attr->recv_cq) - ib_destroy_cq(attr->recv_cq); - } - return ret; -} - -/* - * This needs to be very careful to not leave IS_ERR pointers around for - * cleanup to trip over. - */ -static int rds_iw_setup_qp(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct ib_device *dev = ic->i_cm_id->device; - struct ib_qp_init_attr attr; - struct rds_iw_device *rds_iwdev; - int ret; - - /* rds_iw_add_one creates a rds_iw_device object per IB device, - * and allocates a protection domain, memory range and MR pool - * for each. If that fails for any reason, it will not register - * the rds_iwdev at all. - */ - rds_iwdev = ib_get_client_data(dev, &rds_iw_client); - if (!rds_iwdev) { - printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n", - dev->name); - return -EOPNOTSUPP; - } - - /* Protection domain and memory range */ - ic->i_pd = rds_iwdev->pd; - ic->i_mr = rds_iwdev->mr; - - ret = rds_iw_init_qp_attrs(&attr, rds_iwdev, - &ic->i_send_ring, rds_iw_send_cq_comp_handler, - &ic->i_recv_ring, rds_iw_recv_cq_comp_handler, - conn); - if (ret < 0) - goto out; - - ic->i_send_cq = attr.send_cq; - ic->i_recv_cq = attr.recv_cq; - - /* - * XXX this can fail if max_*_wr is too large? Are we supposed - * to back off until we get a value that the hardware can support? 
- */
-	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
-	if (ret) {
-		rdsdebug("rdma_create_qp failed: %d\n", ret);
-		goto out;
-	}
-
-	ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
-					ic->i_send_ring.w_nr *
-						sizeof(struct rds_header),
-					&ic->i_send_hdrs_dma, GFP_KERNEL);
-	if (!ic->i_send_hdrs) {
-		ret = -ENOMEM;
-		rdsdebug("ib_dma_alloc_coherent send failed\n");
-		goto out;
-	}
-
-	ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
-					ic->i_recv_ring.w_nr *
-						sizeof(struct rds_header),
-					&ic->i_recv_hdrs_dma, GFP_KERNEL);
-	if (!ic->i_recv_hdrs) {
-		ret = -ENOMEM;
-		rdsdebug("ib_dma_alloc_coherent recv failed\n");
-		goto out;
-	}
-
-	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
-				       &ic->i_ack_dma, GFP_KERNEL);
-	if (!ic->i_ack) {
-		ret = -ENOMEM;
-		rdsdebug("ib_dma_alloc_coherent ack failed\n");
-		goto out;
-	}
-
-	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
-	if (!ic->i_sends) {
-		ret = -ENOMEM;
-		rdsdebug("send allocation failed\n");
-		goto out;
-	}
-	rds_iw_send_init_ring(ic);
-
-	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
-	if (!ic->i_recvs) {
-		ret = -ENOMEM;
-		rdsdebug("recv allocation failed\n");
-		goto out;
-	}
-
-	rds_iw_recv_init_ring(ic);
-	rds_iw_recv_init_ack(ic);
-
-	/* Post receive buffers - as a side effect, this will update
-	 * the posted credit count. */
-	rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
-
-	rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
-		 ic->i_send_cq, ic->i_recv_cq);
-
-out:
-	return ret;
-}
-
-static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
-{
-	u16 common;
-	u32 version = 0;
-
-	/* rdma_cm private data is odd - when there is any private data in the
-	 * request, we will be given a pretty large buffer without telling us the
-	 * original size. The only way to tell the difference is by looking at
-	 * the contents, which are initialized to zero.
-	 * If the protocol version fields aren't set, this is a connection attempt
-	 * from an older version. This could be 3.0 or 2.0 - we can't tell.
-	 * We really should have changed this for OFED 1.3 :-( */
-	if (dp->dp_protocol_major == 0)
-		return RDS_PROTOCOL_3_0;
-
-	common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
-	if (dp->dp_protocol_major == 3 && common) {
-		version = RDS_PROTOCOL_3_0;
-		while ((common >>= 1) != 0)
-			version++;
-	} else
-		printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using "
-				"incompatible protocol version %u.%u\n",
-				&dp->dp_saddr,
-				dp->dp_protocol_major,
-				dp->dp_protocol_minor);
-	return version;
-}
-
-int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
-				    struct rdma_cm_event *event)
-{
-	const struct rds_iw_connect_private *dp = event->param.conn.private_data;
-	struct rds_iw_connect_private dp_rep;
-	struct rds_connection *conn = NULL;
-	struct rds_iw_connection *ic = NULL;
-	struct rdma_conn_param conn_param;
-	struct rds_iw_device *rds_iwdev;
-	u32 version;
-	int err, destroy = 1;
-
-	/* Check whether the remote protocol version matches ours.
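
The check that follows picks the highest minor version both peers advertise: each side sends a bitmask of the minors it speaks, and the index of the highest common bit wins. The arithmetic in isolation, with a mask of 0x3 standing in for RDS_IW_SUPPORTED_PROTOCOLS (minors 0 and 1):

#include <stdio.h>
#include <stdint.h>

#define SUPPORTED_MINORS 0x0003	/* we speak minor versions 0 and 1 */

/* Highest minor version both peers support, or -1 if none. */
static int negotiate_minor(uint16_t peer_mask)
{
	uint16_t common = peer_mask & SUPPORTED_MINORS;
	int minor = -1;

	while (common) {
		minor++;
		common >>= 1;
	}
	return minor;
}

int main(void)
{
	printf("%d\n", negotiate_minor(0x0003));	/* 1 - settle on 3.1 */
	printf("%d\n", negotiate_minor(0x0001));	/* 0 - fall back to 3.0 */
	printf("%d\n", negotiate_minor(0x0000));	/* -1 - incompatible */
	return 0;
}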
*/ - version = rds_iw_protocol_compatible(dp); - if (!version) - goto out; - - rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n", - &dp->dp_saddr, &dp->dp_daddr, - RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); - - /* RDS/IW is not currently netns aware, thus init_net */ - conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, - &rds_iw_transport, GFP_KERNEL); - if (IS_ERR(conn)) { - rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); - conn = NULL; - goto out; - } - - /* - * The connection request may occur while the - * previous connection exist, e.g. in case of failover. - * But as connections may be initiated simultaneously - * by both hosts, we have a random backoff mechanism - - * see the comment above rds_queue_reconnect() - */ - mutex_lock(&conn->c_cm_lock); - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { - if (rds_conn_state(conn) == RDS_CONN_UP) { - rdsdebug("incoming connect while connecting\n"); - rds_conn_drop(conn); - rds_iw_stats_inc(s_iw_listen_closed_stale); - } else - if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { - /* Wait and see - our connect may still be succeeding */ - rds_iw_stats_inc(s_iw_connect_raced); - } - mutex_unlock(&conn->c_cm_lock); - goto out; - } - - ic = conn->c_transport_data; - - rds_iw_set_protocol(conn, version); - rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); - - /* If the peer gave us the last packet it saw, process this as if - * we had received a regular ACK. */ - if (dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); - - BUG_ON(cm_id->context); - BUG_ON(ic->i_cm_id); - - ic->i_cm_id = cm_id; - cm_id->context = conn; - - rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client); - ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; - - /* We got halfway through setting up the ib_connection, if we - * fail now, we have to take the long route out of this mess. */ - destroy = 0; - - err = rds_iw_setup_qp(conn); - if (err) { - rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); - mutex_unlock(&conn->c_cm_lock); - goto out; - } - - rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); - - /* rdma_accept() calls rdma_reject() internally if it fails */ - err = rdma_accept(cm_id, &conn_param); - mutex_unlock(&conn->c_cm_lock); - if (err) { - rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err); - goto out; - } - - return 0; - -out: - rdma_reject(cm_id, NULL, 0); - return destroy; -} - - -int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id) -{ - struct rds_connection *conn = cm_id->context; - struct rds_iw_connection *ic = conn->c_transport_data; - struct rdma_conn_param conn_param; - struct rds_iw_connect_private dp; - int ret; - - /* If the peer doesn't do protocol negotiation, we must - * default to RDSv3.0 */ - rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0); - ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */ - - ret = rds_iw_setup_qp(conn); - if (ret) { - rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret); - goto out; - } - - rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); - - ret = rdma_connect(cm_id, &conn_param); - if (ret) - rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret); - -out: - /* Beware - returning non-zero tells the rdma_cm to destroy - * the cm_id. We should certainly not do it as long as we still - * "own" the cm_id. 
*/ - if (ret) { - struct rds_iw_connection *ic = conn->c_transport_data; - - if (ic->i_cm_id == cm_id) - ret = 0; - } - return ret; -} - -int rds_iw_conn_connect(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_device *rds_iwdev; - struct sockaddr_in src, dest; - int ret; - - /* XXX I wonder what affect the port space has */ - /* delegate cm event handler to rdma_transport */ - ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(ic->i_cm_id)) { - ret = PTR_ERR(ic->i_cm_id); - ic->i_cm_id = NULL; - rdsdebug("rdma_create_id() failed: %d\n", ret); - goto out; - } - - rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); - - src.sin_family = AF_INET; - src.sin_addr.s_addr = (__force u32)conn->c_laddr; - src.sin_port = (__force u16)htons(0); - - /* First, bind to the local address and device. */ - ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src); - if (ret) { - rdsdebug("rdma_bind_addr(%pI4) failed: %d\n", - &conn->c_laddr, ret); - rdma_destroy_id(ic->i_cm_id); - ic->i_cm_id = NULL; - goto out; - } - - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey; - - dest.sin_family = AF_INET; - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; - dest.sin_port = (__force u16)htons(RDS_PORT); - - ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, - (struct sockaddr *)&dest, - RDS_RDMA_RESOLVE_TIMEOUT_MS); - if (ret) { - rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, - ret); - rdma_destroy_id(ic->i_cm_id); - ic->i_cm_id = NULL; - } - -out: - return ret; -} - -/* - * This is so careful about only cleaning up resources that were built up - * so that it can be called at any point during startup. In fact it - * can be called multiple times for a given connection. - */ -void rds_iw_conn_shutdown(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - int err = 0; - struct ib_qp_attr qp_attr; - - rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, - ic->i_pd, ic->i_send_cq, ic->i_recv_cq, - ic->i_cm_id ? ic->i_cm_id->qp : NULL); - - if (ic->i_cm_id) { - struct ib_device *dev = ic->i_cm_id->device; - - rdsdebug("disconnecting cm %p\n", ic->i_cm_id); - err = rdma_disconnect(ic->i_cm_id); - if (err) { - /* Actually this may happen quite frequently, when - * an outgoing connect raced with an incoming connect. 
- */ - rdsdebug("failed to disconnect, cm: %p err %d\n", - ic->i_cm_id, err); - } - - if (ic->i_cm_id->qp) { - qp_attr.qp_state = IB_QPS_ERR; - ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); - } - - wait_event(rds_iw_ring_empty_wait, - rds_iw_ring_empty(&ic->i_send_ring) && - rds_iw_ring_empty(&ic->i_recv_ring)); - - if (ic->i_send_hdrs) - ib_dma_free_coherent(dev, - ic->i_send_ring.w_nr * - sizeof(struct rds_header), - ic->i_send_hdrs, - ic->i_send_hdrs_dma); - - if (ic->i_recv_hdrs) - ib_dma_free_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - ic->i_recv_hdrs, - ic->i_recv_hdrs_dma); - - if (ic->i_ack) - ib_dma_free_coherent(dev, sizeof(struct rds_header), - ic->i_ack, ic->i_ack_dma); - - if (ic->i_sends) - rds_iw_send_clear_ring(ic); - if (ic->i_recvs) - rds_iw_recv_clear_ring(ic); - - if (ic->i_cm_id->qp) - rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) - ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) - ib_destroy_cq(ic->i_recv_cq); - - /* - * If associated with an rds_iw_device: - * Move connection back to the nodev list. - * Remove cm_id from the device cm_id list. - */ - if (ic->rds_iwdev) - rds_iw_remove_conn(ic->rds_iwdev, conn); - - rdma_destroy_id(ic->i_cm_id); - - ic->i_cm_id = NULL; - ic->i_pd = NULL; - ic->i_mr = NULL; - ic->i_send_cq = NULL; - ic->i_recv_cq = NULL; - ic->i_send_hdrs = NULL; - ic->i_recv_hdrs = NULL; - ic->i_ack = NULL; - } - BUG_ON(ic->rds_iwdev); - - /* Clear pending transmit */ - if (ic->i_rm) { - rds_message_put(ic->i_rm); - ic->i_rm = NULL; - } - - /* Clear the ACK state */ - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); -#ifdef KERNEL_HAS_ATOMIC64 - atomic64_set(&ic->i_ack_next, 0); -#else - ic->i_ack_next = 0; -#endif - ic->i_ack_recv = 0; - - /* Clear flow control state */ - ic->i_flowctl = 0; - atomic_set(&ic->i_credits, 0); - - rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); - rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); - - if (ic->i_iwinc) { - rds_inc_put(&ic->i_iwinc->ii_inc); - ic->i_iwinc = NULL; - } - - vfree(ic->i_sends); - ic->i_sends = NULL; - vfree(ic->i_recvs); - ic->i_recvs = NULL; - rdsdebug("shutdown complete\n"); -} - -int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) -{ - struct rds_iw_connection *ic; - unsigned long flags; - - /* XXX too lazy? */ - ic = kzalloc(sizeof(struct rds_iw_connection), gfp); - if (!ic) - return -ENOMEM; - - INIT_LIST_HEAD(&ic->iw_node); - tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn, - (unsigned long) ic); - mutex_init(&ic->i_recv_mutex); -#ifndef KERNEL_HAS_ATOMIC64 - spin_lock_init(&ic->i_ack_lock); -#endif - - /* - * rds_iw_conn_shutdown() waits for these to be emptied so they - * must be initialized before it can be called. - */ - rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr); - rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr); - - ic->conn = conn; - conn->c_transport_data = ic; - - spin_lock_irqsave(&iw_nodev_conns_lock, flags); - list_add_tail(&ic->iw_node, &iw_nodev_conns); - spin_unlock_irqrestore(&iw_nodev_conns_lock, flags); - - - rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); - return 0; -} - -/* - * Free a connection. Connection must be shut down and not set for reconnect. - */ -void rds_iw_conn_free(void *arg) -{ - struct rds_iw_connection *ic = arg; - spinlock_t *lock_ptr; - - rdsdebug("ic %p\n", ic); - - /* - * Conn is either on a dev's list or on the nodev list. 
- * A race with shutdown() or connect() would cause problems - * (since rds_iwdev would change) but that should never happen. - */ - lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock; - - spin_lock_irq(lock_ptr); - list_del(&ic->iw_node); - spin_unlock_irq(lock_ptr); - - kfree(ic); -} - -/* - * An error occurred on the connection - */ -void -__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...) -{ - va_list ap; - - rds_conn_drop(conn); - - va_start(ap, fmt); - vprintk(fmt, ap); - va_end(ap); -} diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c deleted file mode 100644 index b09a40c1adce..000000000000 --- a/net/rds/iw_rdma.c +++ /dev/null @@ -1,837 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/ratelimit.h> - -#include "rds.h" -#include "iw.h" - - -/* - * This is stored as mr->r_trans_private. 
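
The r_trans_private field mentioned here is the usual opaque-handle pattern: the transport returns a pointer from get_mr(), the core stores it without looking inside, and sync_mr()/free_mr() cast it back. A minimal standalone model (names are illustrative, not the RDS API):

#include <stdlib.h>

struct xport_mr {
	unsigned int rkey;	/* transport-private MR state */
};

static void *xport_get_mr(unsigned int *key_ret)
{
	struct xport_mr *mr = calloc(1, sizeof(*mr));

	if (!mr)
		return NULL;
	mr->rkey = 0x1234;	/* illustrative key */
	*key_ret = mr->rkey;
	return mr;		/* the core stores this as r_trans_private */
}

static void xport_free_mr(void *trans_private)
{
	struct xport_mr *mr = trans_private;	/* transport casts it back */

	free(mr);
}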
- */
-struct rds_iw_mr {
-	struct rds_iw_device	*device;
-	struct rds_iw_mr_pool	*pool;
-	struct rdma_cm_id	*cm_id;
-
-	struct ib_mr	*mr;
-
-	struct rds_iw_mapping	mapping;
-	unsigned char		remap_count;
-};
-
-/*
- * Our own little MR pool
- */
-struct rds_iw_mr_pool {
-	struct rds_iw_device	*device;		/* back ptr to the device that owns us */
-
-	struct mutex		flush_lock;		/* serialize fmr invalidate */
-	struct work_struct	flush_worker;		/* flush worker */
-
-	spinlock_t		list_lock;		/* protect variables below */
-	atomic_t		item_count;		/* total # of MRs */
-	atomic_t		dirty_count;		/* # of dirty MRs */
-	struct list_head	dirty_list;		/* dirty mappings */
-	struct list_head	clean_list;		/* unused & unmapped MRs */
-	atomic_t		free_pinned;		/* memory pinned by free MRs */
-	unsigned long		max_message_size;	/* in pages */
-	unsigned long		max_items;
-	unsigned long		max_items_soft;
-	unsigned long		max_free_pinned;
-	int			max_pages;
-};
-
-static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
-static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
-static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static int rds_iw_map_reg(struct rds_iw_mr_pool *pool,
-			  struct rds_iw_mr *ibmr,
-			  struct scatterlist *sg, unsigned int nents);
-static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
-			struct list_head *unmap_list,
-			struct list_head *kill_list,
-			int *unpinned);
-static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-
-static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
-			     struct rds_iw_device **rds_iwdev,
-			     struct rdma_cm_id **cm_id)
-{
-	struct rds_iw_device *iwdev;
-	struct rds_iw_cm_id *i_cm_id;
-
-	*rds_iwdev = NULL;
-	*cm_id = NULL;
-
-	list_for_each_entry(iwdev, &rds_iw_devices, list) {
-		spin_lock_irq(&iwdev->spinlock);
-		list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
-			struct sockaddr_in *src_addr, *dst_addr;
-
-			src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
-			dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
-
-			rdsdebug("local ipaddr = %x port %d, "
-				 "remote ipaddr = %x port %d"
-				 "..looking for %x port %d, "
-				 "remote ipaddr = %x port %d\n",
-				 src_addr->sin_addr.s_addr,
-				 src_addr->sin_port,
-				 dst_addr->sin_addr.s_addr,
-				 dst_addr->sin_port,
-				 src->sin_addr.s_addr,
-				 src->sin_port,
-				 dst->sin_addr.s_addr,
-				 dst->sin_port);
-#ifdef WORKING_TUPLE_DETECTION
-			if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr &&
-			    src_addr->sin_port == src->sin_port &&
-			    dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr &&
-			    dst_addr->sin_port == dst->sin_port) {
-#else
-			/* FIXME - needs to compare the local and remote
-			 * ipaddr/port tuple, but the ipaddr is the only
-			 * available information in the rds_sock (as the rest are
-			 * zero'ed). It doesn't appear to be properly populated
-			 * during connection setup...
- */ - if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) { -#endif - spin_unlock_irq(&iwdev->spinlock); - *rds_iwdev = iwdev; - *cm_id = i_cm_id->cm_id; - return 0; - } - } - spin_unlock_irq(&iwdev->spinlock); - } - - return 1; -} - -static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) -{ - struct rds_iw_cm_id *i_cm_id; - - i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL); - if (!i_cm_id) - return -ENOMEM; - - i_cm_id->cm_id = cm_id; - - spin_lock_irq(&rds_iwdev->spinlock); - list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list); - spin_unlock_irq(&rds_iwdev->spinlock); - - return 0; -} - -static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, - struct rdma_cm_id *cm_id) -{ - struct rds_iw_cm_id *i_cm_id; - - spin_lock_irq(&rds_iwdev->spinlock); - list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) { - if (i_cm_id->cm_id == cm_id) { - list_del(&i_cm_id->list); - kfree(i_cm_id); - break; - } - } - spin_unlock_irq(&rds_iwdev->spinlock); -} - - -int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) -{ - struct sockaddr_in *src_addr, *dst_addr; - struct rds_iw_device *rds_iwdev_old; - struct rdma_cm_id *pcm_id; - int rc; - - src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; - dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; - - rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id); - if (rc) - rds_iw_remove_cm_id(rds_iwdev, cm_id); - - return rds_iw_add_cm_id(rds_iwdev, cm_id); -} - -void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - /* conn was previously on the nodev_conns_list */ - spin_lock_irq(&iw_nodev_conns_lock); - BUG_ON(list_empty(&iw_nodev_conns)); - BUG_ON(list_empty(&ic->iw_node)); - list_del(&ic->iw_node); - - spin_lock(&rds_iwdev->spinlock); - list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); - spin_unlock(&rds_iwdev->spinlock); - spin_unlock_irq(&iw_nodev_conns_lock); - - ic->rds_iwdev = rds_iwdev; -} - -void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - /* place conn on nodev_conns_list */ - spin_lock(&iw_nodev_conns_lock); - - spin_lock_irq(&rds_iwdev->spinlock); - BUG_ON(list_empty(&ic->iw_node)); - list_del(&ic->iw_node); - spin_unlock_irq(&rds_iwdev->spinlock); - - list_add_tail(&ic->iw_node, &iw_nodev_conns); - - spin_unlock(&iw_nodev_conns_lock); - - rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id); - ic->rds_iwdev = NULL; -} - -void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock) -{ - struct rds_iw_connection *ic, *_ic; - LIST_HEAD(tmp_list); - - /* avoid calling conn_destroy with irqs off */ - spin_lock_irq(list_lock); - list_splice(list, &tmp_list); - INIT_LIST_HEAD(list); - spin_unlock_irq(list_lock); - - list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) - rds_conn_destroy(ic->conn); -} - -static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg, - struct scatterlist *list, unsigned int sg_len) -{ - sg->list = list; - sg->len = sg_len; - sg->dma_len = 0; - sg->dma_npages = 0; - sg->bytes = 0; -} - -static int rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev, - struct rds_iw_scatterlist *sg) -{ - struct ib_device *dev = rds_iwdev->dev; - int i, ret; - - WARN_ON(sg->dma_len); - - sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL); - if (unlikely(!sg->dma_len)) { - printk(KERN_WARNING "RDS/IW: 
dma_map_sg failed!\n"); - return -EBUSY; - } - - sg->bytes = 0; - sg->dma_npages = 0; - - ret = -EINVAL; - for (i = 0; i < sg->dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]); - u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]); - u64 end_addr; - - sg->bytes += dma_len; - - end_addr = dma_addr + dma_len; - if (dma_addr & PAGE_MASK) { - if (i > 0) - goto out_unmap; - dma_addr &= ~PAGE_MASK; - } - if (end_addr & PAGE_MASK) { - if (i < sg->dma_len - 1) - goto out_unmap; - end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK; - } - - sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT; - } - - /* Now gather the dma addrs into one list */ - if (sg->dma_npages > fastreg_message_size) - goto out_unmap; - - - - return 0; - -out_unmap: - ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL); - sg->dma_len = 0; - return ret; -} - - -struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev) -{ - struct rds_iw_mr_pool *pool; - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) { - printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n"); - return ERR_PTR(-ENOMEM); - } - - pool->device = rds_iwdev; - INIT_LIST_HEAD(&pool->dirty_list); - INIT_LIST_HEAD(&pool->clean_list); - mutex_init(&pool->flush_lock); - spin_lock_init(&pool->list_lock); - INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker); - - pool->max_message_size = fastreg_message_size; - pool->max_items = fastreg_pool_size; - pool->max_free_pinned = pool->max_items * pool->max_message_size / 4; - pool->max_pages = fastreg_message_size; - - /* We never allow more than max_items MRs to be allocated. - * When we exceed more than max_items_soft, we start freeing - * items more aggressively. - * Make sure that max_items > max_items_soft > max_items / 2 - */ - pool->max_items_soft = pool->max_items * 3 / 4; - - return pool; -} - -void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo) -{ - struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; - - iinfo->rdma_mr_max = pool->max_items; - iinfo->rdma_mr_size = pool->max_pages; -} - -void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool) -{ - flush_workqueue(rds_wq); - rds_iw_flush_mr_pool(pool, 1); - BUG_ON(atomic_read(&pool->item_count)); - BUG_ON(atomic_read(&pool->free_pinned)); - kfree(pool); -} - -static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool) -{ - struct rds_iw_mr *ibmr = NULL; - unsigned long flags; - - spin_lock_irqsave(&pool->list_lock, flags); - if (!list_empty(&pool->clean_list)) { - ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list); - list_del_init(&ibmr->mapping.m_list); - } - spin_unlock_irqrestore(&pool->list_lock, flags); - - return ibmr; -} - -static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev) -{ - struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool; - struct rds_iw_mr *ibmr = NULL; - int err = 0, iter = 0; - - while (1) { - ibmr = rds_iw_reuse_fmr(pool); - if (ibmr) - return ibmr; - - /* No clean MRs - now we have the choice of either - * allocating a fresh MR up to the limit imposed by the - * driver, or flush any dirty unused MRs. - * We try to avoid stalling in the send path if possible, - * so we allocate as long as we're allowed to. - * - * We're fussy with enforcing the FMR limit, though. 
If the driver - * tells us we can't use more than N fmrs, we shouldn't start - * arguing with it */ - if (atomic_inc_return(&pool->item_count) <= pool->max_items) - break; - - atomic_dec(&pool->item_count); - - if (++iter > 2) { - rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted); - return ERR_PTR(-EAGAIN); - } - - /* We do have some empty MRs. Flush them out. */ - rds_iw_stats_inc(s_iw_rdma_mr_pool_wait); - rds_iw_flush_mr_pool(pool, 0); - } - - ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); - if (!ibmr) { - err = -ENOMEM; - goto out_no_cigar; - } - - spin_lock_init(&ibmr->mapping.m_lock); - INIT_LIST_HEAD(&ibmr->mapping.m_list); - ibmr->mapping.m_mr = ibmr; - - err = rds_iw_init_reg(pool, ibmr); - if (err) - goto out_no_cigar; - - rds_iw_stats_inc(s_iw_rdma_mr_alloc); - return ibmr; - -out_no_cigar: - if (ibmr) { - rds_iw_destroy_fastreg(pool, ibmr); - kfree(ibmr); - } - atomic_dec(&pool->item_count); - return ERR_PTR(err); -} - -void rds_iw_sync_mr(void *trans_private, int direction) -{ - struct rds_iw_mr *ibmr = trans_private; - struct rds_iw_device *rds_iwdev = ibmr->device; - - switch (direction) { - case DMA_FROM_DEVICE: - ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list, - ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); - break; - case DMA_TO_DEVICE: - ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list, - ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL); - break; - } -} - -/* - * Flush our pool of MRs. - * At a minimum, all currently unused MRs are unmapped. - * If the number of MRs allocated exceeds the limit, we also try - * to free as many MRs as needed to get back to this limit. - */ -static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all) -{ - struct rds_iw_mr *ibmr, *next; - LIST_HEAD(unmap_list); - LIST_HEAD(kill_list); - unsigned long flags; - unsigned int nfreed = 0, ncleaned = 0, unpinned = 0; - - rds_iw_stats_inc(s_iw_rdma_mr_pool_flush); - - mutex_lock(&pool->flush_lock); - - spin_lock_irqsave(&pool->list_lock, flags); - /* Get the list of all mappings to be destroyed */ - list_splice_init(&pool->dirty_list, &unmap_list); - if (free_all) - list_splice_init(&pool->clean_list, &kill_list); - spin_unlock_irqrestore(&pool->list_lock, flags); - - /* Batched invalidate of dirty MRs. - * For FMR based MRs, the mappings on the unmap list are - * actually members of an ibmr (ibmr->mapping). They either - * migrate to the kill_list, or have been cleaned and should be - * moved to the clean_list. - * For fastregs, they will be dynamically allocated, and - * will be destroyed by the unmap function. - */ - if (!list_empty(&unmap_list)) { - ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, - &kill_list, &unpinned); - /* If we've been asked to destroy all MRs, move those - * that were simply cleaned to the kill list */ - if (free_all) - list_splice_init(&unmap_list, &kill_list); - } - - /* Destroy any MRs that are past their best before date */ - list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) { - rds_iw_stats_inc(s_iw_rdma_mr_free); - list_del(&ibmr->mapping.m_list); - rds_iw_destroy_fastreg(pool, ibmr); - kfree(ibmr); - nfreed++; - } - - /* Anything that remains are laundered ibmrs, which we can add - * back to the clean list. 
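
Reduced to its bookkeeping, the flush finishes by debiting three counters, matching the atomic_sub() calls just below; a sketch:

struct pool_counters {
	int free_pinned;	/* pages pinned by unused MRs */
	int dirty_count;	/* mappings awaiting invalidation */
	int item_count;		/* MRs currently allocated */
};

static void account_flush(struct pool_counters *c, int unpinned,
			  int ncleaned, int nfreed)
{
	c->free_pinned -= unpinned;
	c->dirty_count -= ncleaned;
	c->item_count -= nfreed;
}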
- */
-	if (!list_empty(&unmap_list)) {
-		spin_lock_irqsave(&pool->list_lock, flags);
-		list_splice(&unmap_list, &pool->clean_list);
-		spin_unlock_irqrestore(&pool->list_lock, flags);
-	}
-
-	atomic_sub(unpinned, &pool->free_pinned);
-	atomic_sub(ncleaned, &pool->dirty_count);
-	atomic_sub(nfreed, &pool->item_count);
-
-	mutex_unlock(&pool->flush_lock);
-}
-
-static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
-{
-	struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
-
-	rds_iw_flush_mr_pool(pool, 0);
-}
-
-void rds_iw_free_mr(void *trans_private, int invalidate)
-{
-	struct rds_iw_mr *ibmr = trans_private;
-	struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
-
-	rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
-	if (!pool)
-		return;
-
-	/* Return it to the pool's free list */
-	rds_iw_free_fastreg(pool, ibmr);
-
-	/* If we've pinned too many pages, request a flush */
-	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
-	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-		queue_work(rds_wq, &pool->flush_worker);
-
-	if (invalidate) {
-		if (likely(!in_interrupt())) {
-			rds_iw_flush_mr_pool(pool, 0);
-		} else {
-			/* We get here if the user created a MR marked
-			 * as use_once and invalidate at the same time. */
-			queue_work(rds_wq, &pool->flush_worker);
-		}
-	}
-}
-
-void rds_iw_flush_mrs(void)
-{
-	struct rds_iw_device *rds_iwdev;
-
-	list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
-		struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
-
-		if (pool)
-			rds_iw_flush_mr_pool(pool, 0);
-	}
-}
-
-void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
-		    struct rds_sock *rs, u32 *key_ret)
-{
-	struct rds_iw_device *rds_iwdev;
-	struct rds_iw_mr *ibmr = NULL;
-	struct rdma_cm_id *cm_id;
-	struct sockaddr_in src = {
-		.sin_addr.s_addr = rs->rs_bound_addr,
-		.sin_port = rs->rs_bound_port,
-	};
-	struct sockaddr_in dst = {
-		.sin_addr.s_addr = rs->rs_conn_addr,
-		.sin_port = rs->rs_conn_port,
-	};
-	int ret;
-
-	ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id);
-	if (ret || !cm_id) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	if (!rds_iwdev->mr_pool) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	ibmr = rds_iw_alloc_mr(rds_iwdev);
-	if (IS_ERR(ibmr))
-		return ibmr;
-
-	ibmr->cm_id = cm_id;
-	ibmr->device = rds_iwdev;
-
-	ret = rds_iw_map_reg(rds_iwdev->mr_pool, ibmr, sg, nents);
-	if (ret == 0)
-		*key_ret = ibmr->mr->rkey;
-	else
-		printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
-
-out:
-	if (ret) {
-		if (ibmr)
-			rds_iw_free_mr(ibmr, 0);
-		ibmr = ERR_PTR(ret);
-	}
-	return ibmr;
-}
-
-/*
- * iWARP reg handling
- *
- * The life cycle of a fastreg registration is a bit different from
- * FMRs.
- * The idea behind fastreg is to have one MR, to which we bind different
- * mappings over time. To avoid stalling on the expensive map and invalidate
- * operations, these operations are pipelined on the same send queue on
- * which we want to send the message containing the r_key.
- *
- * This creates a bit of a problem for us, as we do not have the destination
- * IP in GET_MR, so the connection must be set up prior to the GET_MR call for
- * RDMA to be set up correctly. If a fastreg request is present, rds_iw_xmit
- * will try to queue a LOCAL_INV (if needed) and a REG_MR work request
- * before queuing the SEND. When completions for these arrive, they are
- * dispatched to the MR, which then has a bit set showing that RDMA can be
- * performed.
- *
- * There is another interesting aspect that's related to invalidation.
- * The application can request that a mapping is invalidated in FREE_MR. - * The expectation there is that this invalidation step includes ALL - * PREVIOUSLY FREED MRs. - */ -static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr) -{ - struct rds_iw_device *rds_iwdev = pool->device; - struct ib_mr *mr; - int err; - - mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG, - pool->max_message_size); - if (IS_ERR(mr)) { - err = PTR_ERR(mr); - - printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err); - return err; - } - - ibmr->mr = mr; - return 0; -} - -static int rds_iw_rdma_reg_mr(struct rds_iw_mapping *mapping) -{ - struct rds_iw_mr *ibmr = mapping->m_mr; - struct rds_iw_scatterlist *m_sg = &mapping->m_sg; - struct ib_reg_wr reg_wr; - struct ib_send_wr *failed_wr; - int ret, n; - - n = ib_map_mr_sg_zbva(ibmr->mr, m_sg->list, m_sg->len, PAGE_SIZE); - if (unlikely(n != m_sg->len)) - return n < 0 ? n : -EINVAL; - - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_id = RDS_IW_REG_WR_ID; - reg_wr.wr.num_sge = 0; - reg_wr.mr = ibmr->mr; - reg_wr.key = mapping->m_rkey; - reg_wr.access = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE; - - /* - * Perform a WR for the reg_mr. Each individual page - * in the sg list is added to the fast reg page list and placed - * inside the reg_mr WR. The key used is a rolling 8bit - * counter, which should guarantee uniqueness. - */ - ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++); - mapping->m_rkey = ibmr->mr->rkey; - - failed_wr = &reg_wr.wr; - ret = ib_post_send(ibmr->cm_id->qp, &reg_wr.wr, &failed_wr); - BUG_ON(failed_wr != &reg_wr.wr); - if (ret) - printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", - __func__, __LINE__, ret); - return ret; -} - -static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr) -{ - struct ib_send_wr s_wr, *failed_wr; - int ret = 0; - - if (!ibmr->cm_id->qp || !ibmr->mr) - goto out; - - memset(&s_wr, 0, sizeof(s_wr)); - s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID; - s_wr.opcode = IB_WR_LOCAL_INV; - s_wr.ex.invalidate_rkey = ibmr->mr->rkey; - s_wr.send_flags = IB_SEND_SIGNALED; - - failed_wr = &s_wr; - ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr); - if (ret) { - printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n", - __func__, __LINE__, ret); - goto out; - } -out: - return ret; -} - -static int rds_iw_map_reg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr, - struct scatterlist *sg, - unsigned int sg_len) -{ - struct rds_iw_device *rds_iwdev = pool->device; - struct rds_iw_mapping *mapping = &ibmr->mapping; - u64 *dma_pages; - int ret = 0; - - rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len); - - ret = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg); - if (ret) { - dma_pages = NULL; - goto out; - } - - if (mapping->m_sg.dma_len > pool->max_message_size) { - ret = -EMSGSIZE; - goto out; - } - - ret = rds_iw_rdma_reg_mr(mapping); - if (ret) - goto out; - - rds_iw_stats_inc(s_iw_rdma_mr_used); - -out: - kfree(dma_pages); - - return ret; -} - -/* - * "Free" a fastreg MR. - */ -static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr) -{ - unsigned long flags; - int ret; - - if (!ibmr->mapping.m_sg.dma_len) - return; - - ret = rds_iw_rdma_fastreg_inv(ibmr); - if (ret) - return; - - /* Try to post the LOCAL_INV WR to the queue. */
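The rolling 8-bit key used by rds_iw_rdma_reg_mr() above deserves a closer look. ib_update_fast_reg_key() keeps the HCA-assigned upper 24 bits of the key and swaps in the caller's counter as the low byte, so each remap publishes a key value the peer has not seen for the last 255 remaps. A minimal userspace sketch (the harness and names are hypothetical; the mask arithmetic is modeled on the kernel helper):

#include <stdint.h>
#include <stdio.h>

/* same mask arithmetic as ib_update_fast_reg_key(): the upper 24 bits
 * identify the MR, the low 8 bits carry the caller's rolling counter */
static uint32_t update_fast_reg_key(uint32_t key, uint8_t newkey)
{
    return (key & 0xffffff00u) | newkey;
}

int main(void)
{
    uint32_t rkey = 0x00abcd42;    /* made-up HCA-assigned key */
    uint8_t remap_count = 0;

    for (int i = 0; i < 3; i++) {
        rkey = update_fast_reg_key(rkey, remap_count++);
        printf("remap %d -> rkey 0x%08x\n", i, (unsigned int)rkey);
    }
    return 0;    /* low byte cycles 00, 01, 02; the index bits stay put */
}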
- spin_lock_irqsave(&pool->list_lock, flags); - - list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list); - atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned); - atomic_inc(&pool->dirty_count); - - spin_unlock_irqrestore(&pool->list_lock, flags); -} - -static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, - struct list_head *unmap_list, - struct list_head *kill_list, - int *unpinned) -{ - struct rds_iw_mapping *mapping, *next; - unsigned int ncleaned = 0; - LIST_HEAD(laundered); - - /* Batched invalidation of fastreg MRs. - * Why do we do it this way, even though we could pipeline unmap - * and remap? The reason is the application semantics - when the - * application requests an invalidation of MRs, it expects all - * previously released R_Keys to become invalid. - * - * If we implement MR reuse naively, we risk memory corruption - * (this has actually been observed). So the default behavior - * requires that a MR goes through an explicit unmap operation before - * we can reuse it again. - * - * We could probably improve on this a little, by allowing immediate - * reuse of a MR on the same socket (e.g. you could add a small - * cache of unused MRs to struct rds_socket - GET_MR could grab one - * of these without requiring an explicit invalidate). - */ - while (!list_empty(unmap_list)) { - unsigned long flags; - - spin_lock_irqsave(&pool->list_lock, flags); - list_for_each_entry_safe(mapping, next, unmap_list, m_list) { - *unpinned += mapping->m_sg.len; - list_move(&mapping->m_list, &laundered); - ncleaned++; - } - spin_unlock_irqrestore(&pool->list_lock, flags); - } - - /* Move all laundered mappings back to the unmap list. - * We do not kill any WRs right now - it doesn't seem the - * fastreg API has a max_remap limit. */ - list_splice_init(&laundered, unmap_list); - - return ncleaned; -} - -static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, - struct rds_iw_mr *ibmr) -{ - if (ibmr->mr) - ib_dereg_mr(ibmr->mr); -} diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c deleted file mode 100644 index a66d1794b2d0..000000000000 --- a/net/rds/iw_recv.c +++ /dev/null @@ -1,904 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/pci.h> -#include <linux/dma-mapping.h> -#include <rdma/rdma_cm.h> - -#include "rds.h" -#include "iw.h" - -static struct kmem_cache *rds_iw_incoming_slab; -static struct kmem_cache *rds_iw_frag_slab; -static atomic_t rds_iw_allocation = ATOMIC_INIT(0); - -static void rds_iw_frag_drop_page(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - __free_page(frag->f_page); - frag->f_page = NULL; -} - -static void rds_iw_frag_free(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page); - kmem_cache_free(rds_iw_frag_slab, frag); -} - -/* - * We map a page at a time. Its fragments are posted in order. This - * is called in fragment order as the fragments get send completion events. - * Only the last frag in the page performs the unmapping. - * - * It's OK for ring cleanup to call this in whatever order it likes because - * DMA is not in flight and so we can unmap while other ring entries still - * hold page references in their frags. - */ -static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic, - struct rds_iw_recv_work *recv) -{ - struct rds_page_frag *frag = recv->r_frag; - - rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); - if (frag->f_mapped) - ib_dma_unmap_page(ic->i_cm_id->device, - frag->f_mapped, - RDS_FRAG_SIZE, DMA_FROM_DEVICE); - frag->f_mapped = 0; -} - -void rds_iw_recv_init_ring(struct rds_iw_connection *ic) -{ - struct rds_iw_recv_work *recv; - u32 i; - - for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) { - struct ib_sge *sge; - - recv->r_iwinc = NULL; - recv->r_frag = NULL; - - recv->r_wr.next = NULL; - recv->r_wr.wr_id = i; - recv->r_wr.sg_list = recv->r_sge; - recv->r_wr.num_sge = RDS_IW_RECV_SGE; - - sge = rds_iw_data_sge(ic, recv->r_sge); - sge->addr = 0; - sge->length = RDS_FRAG_SIZE; - sge->lkey = 0; - - sge = rds_iw_header_sge(ic, recv->r_sge); - sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = 0; - } -} - -static void rds_iw_recv_clear_one(struct rds_iw_connection *ic, - struct rds_iw_recv_work *recv) -{ - if (recv->r_iwinc) { - rds_inc_put(&recv->r_iwinc->ii_inc); - recv->r_iwinc = NULL; - } - if (recv->r_frag) { - rds_iw_recv_unmap_page(ic, recv); - if (recv->r_frag->f_page) - rds_iw_frag_drop_page(recv->r_frag); - rds_iw_frag_free(recv->r_frag); - recv->r_frag = NULL; - } -} - -void rds_iw_recv_clear_ring(struct rds_iw_connection *ic) -{ - u32 i; - - for (i = 0; i < ic->i_recv_ring.w_nr; i++) - rds_iw_recv_clear_one(ic, &ic->i_recvs[i]); - - if (ic->i_frag.f_page) - rds_iw_frag_drop_page(&ic->i_frag); -} - -static int rds_iw_recv_refill_one(struct rds_connection *conn, - struct rds_iw_recv_work *recv, - gfp_t kptr_gfp, gfp_t page_gfp) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - dma_addr_t dma_addr; - struct ib_sge *sge; - int ret = -ENOMEM; - - if (!recv->r_iwinc) { - if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { - rds_iw_stats_inc(s_iw_rx_alloc_limit); - goto out; - } - recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, - kptr_gfp); - if (!recv->r_iwinc) { - atomic_dec(&rds_iw_allocation); - goto out; - } - INIT_LIST_HEAD(&recv->r_iwinc->ii_frags); - rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); - } - - if (!recv->r_frag) { - recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); - if (!recv->r_frag) - goto out; - 
INIT_LIST_HEAD(&recv->r_frag->f_item); - recv->r_frag->f_page = NULL; - } - - if (!ic->i_frag.f_page) { - ic->i_frag.f_page = alloc_page(page_gfp); - if (!ic->i_frag.f_page) - goto out; - ic->i_frag.f_offset = 0; - } - - dma_addr = ib_dma_map_page(ic->i_cm_id->device, - ic->i_frag.f_page, - ic->i_frag.f_offset, - RDS_FRAG_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) - goto out; - - /* - * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap() - * must be called on this recv. This happens as completions hit - * in order or on connection shutdown. - */ - recv->r_frag->f_page = ic->i_frag.f_page; - recv->r_frag->f_offset = ic->i_frag.f_offset; - recv->r_frag->f_mapped = dma_addr; - - sge = rds_iw_data_sge(ic, recv->r_sge); - sge->addr = dma_addr; - sge->length = RDS_FRAG_SIZE; - - sge = rds_iw_header_sge(ic, recv->r_sge); - sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); - sge->length = sizeof(struct rds_header); - - get_page(recv->r_frag->f_page); - - if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { - ic->i_frag.f_offset += RDS_FRAG_SIZE; - } else { - put_page(ic->i_frag.f_page); - ic->i_frag.f_page = NULL; - ic->i_frag.f_offset = 0; - } - - ret = 0; -out: - return ret; -} - -/* - * This tries to allocate and post unused work requests after making sure that - * they have all the allocations they need to queue received fragments into - * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc - * pairs don't go unmatched. - * - * -1 is returned if posting fails due to temporary resource exhaustion. - */ -int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_recv_work *recv; - struct ib_recv_wr *failed_wr; - unsigned int posted = 0; - int ret = 0; - u32 pos; - - while ((prefill || rds_conn_up(conn)) && - rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { - if (pos >= ic->i_recv_ring.w_nr) { - printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", - pos); - ret = -EINVAL; - break; - } - - recv = &ic->i_recvs[pos]; - ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp); - if (ret) { - ret = -1; - break; - } - - /* XXX when can this fail? */ - ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); - rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv, - recv->r_iwinc, recv->r_frag->f_page, - (long) recv->r_frag->f_mapped, ret); - if (ret) { - rds_iw_conn_error(conn, "recv post on " - "%pI4 returned %d, disconnecting and " - "reconnecting\n", &conn->c_faddr, - ret); - ret = -1; - break; - } - - posted++; - } - - /* We're doing flow control - update the window. 
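The refill path above carves one page into RDS_FRAG_SIZE receive fragments, taking a page reference per fragment and dropping the allocator's reference once the last slice has been handed out. A standalone sketch of that accounting (the sizes are assumptions for illustration; with 4K pages and 4K frags the loop runs exactly once):

#include <stdio.h>

#define MODEL_PAGE_SIZE 65536    /* pretend 64K pages */
#define MODEL_FRAG_SIZE  4096    /* an RDS_FRAG_SIZE-style slice */
#define MODEL_PAGE_LAST_OFF (MODEL_PAGE_SIZE - MODEL_FRAG_SIZE)

struct page_model { int refcount; };

int main(void)
{
    struct page_model page = { .refcount = 1 };    /* alloc_page() ref */
    unsigned int f_offset = 0;
    int frags = 0;

    for (;;) {
        page.refcount++;    /* get_page() on behalf of the frag */
        frags++;
        if (f_offset < MODEL_PAGE_LAST_OFF) {
            f_offset += MODEL_FRAG_SIZE;    /* advance to the next slice */
        } else {
            page.refcount--;    /* put_page(): frags now own the page */
            break;
        }
    }
    printf("%d frags carved, %d references held\n", frags, page.refcount);
    return 0;    /* 16 frags, 16 references */
}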
*/ - if (ic->i_flowctl && posted) - rds_iw_advertise_credits(conn, posted); - - if (ret) - rds_iw_ring_unalloc(&ic->i_recv_ring, 1); - return ret; -} - -static void rds_iw_inc_purge(struct rds_incoming *inc) -{ - struct rds_iw_incoming *iwinc; - struct rds_page_frag *frag; - struct rds_page_frag *pos; - - iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); - rdsdebug("purging iwinc %p inc %p\n", iwinc, inc); - - list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) { - list_del_init(&frag->f_item); - rds_iw_frag_drop_page(frag); - rds_iw_frag_free(frag); - } -} - -void rds_iw_inc_free(struct rds_incoming *inc) -{ - struct rds_iw_incoming *iwinc; - - iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); - - rds_iw_inc_purge(inc); - rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc); - BUG_ON(!list_empty(&iwinc->ii_frags)); - kmem_cache_free(rds_iw_incoming_slab, iwinc); - atomic_dec(&rds_iw_allocation); - BUG_ON(atomic_read(&rds_iw_allocation) < 0); -} - -int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) -{ - struct rds_iw_incoming *iwinc; - struct rds_page_frag *frag; - unsigned long to_copy; - unsigned long frag_off = 0; - int copied = 0; - int ret; - u32 len; - - iwinc = container_of(inc, struct rds_iw_incoming, ii_inc); - frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); - len = be32_to_cpu(inc->i_hdr.h_len); - - while (iov_iter_count(to) && copied < len) { - if (frag_off == RDS_FRAG_SIZE) { - frag = list_entry(frag->f_item.next, - struct rds_page_frag, f_item); - frag_off = 0; - } - to_copy = min_t(unsigned long, iov_iter_count(to), - RDS_FRAG_SIZE - frag_off); - to_copy = min_t(unsigned long, to_copy, len - copied); - - /* XXX needs + offset for multiple recvs per page */ - rds_stats_add(s_copy_to_user, to_copy); - ret = copy_page_to_iter(frag->f_page, - frag->f_offset + frag_off, - to_copy, - to); - if (ret != to_copy) - return -EFAULT; - - frag_off += to_copy; - copied += to_copy; - } - - return copied; -} - -/* ic starts out kzalloc()ed */ -void rds_iw_recv_init_ack(struct rds_iw_connection *ic) -{ - struct ib_send_wr *wr = &ic->i_ack_wr; - struct ib_sge *sge = &ic->i_ack_sge; - - sge->addr = ic->i_ack_dma; - sge->length = sizeof(struct rds_header); - sge->lkey = rds_iw_local_dma_lkey(ic); - - wr->sg_list = sge; - wr->num_sge = 1; - wr->opcode = IB_WR_SEND; - wr->wr_id = RDS_IW_ACK_WR_ID; - wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; -} - -/* - * You'd think that with reliable IB connections you wouldn't need to ack - * messages that have been received. The problem is that IB hardware generates - * an ack message before it has DMAed the message into memory. This creates a - * potential message loss if the HCA is disabled for any reason between when it - * sends the ack and before the message is DMAed and processed. This is only a - * potential issue if another HCA is available for fail-over. - * - * When the remote host receives our ack they'll free the sent message from - * their send queue. To decrease the latency of this we always send an ack - * immediately after we've received messages. - * - * For simplicity, we only have one ack in flight at a time. This puts - * pressure on senders to have deep enough send queues to absorb the latency of - * a single ack frame being in flight. This might not be good enough. - * - * This is implemented by having a long-lived send_wr and sge which point to a - * statically allocated ack frame. 
This ack wr does not fall under the ring - * accounting that the tx and rx wrs do. The QP attribute specifically makes - * room for it beyond the ring size. Send completion notices its special - * wr_id and avoids working with the ring in that case. - */ -#ifndef KERNEL_HAS_ATOMIC64 -static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, - int ack_required) -{ - unsigned long flags; - - spin_lock_irqsave(&ic->i_ack_lock, flags); - ic->i_ack_next = seq; - if (ack_required) - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - spin_unlock_irqrestore(&ic->i_ack_lock, flags); -} - -static u64 rds_iw_get_ack(struct rds_iw_connection *ic) -{ - unsigned long flags; - u64 seq; - - clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - - spin_lock_irqsave(&ic->i_ack_lock, flags); - seq = ic->i_ack_next; - spin_unlock_irqrestore(&ic->i_ack_lock, flags); - - return seq; -} -#else -static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, - int ack_required) -{ - atomic64_set(&ic->i_ack_next, seq); - if (ack_required) { - smp_mb__before_atomic(); - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - } -} - -static u64 rds_iw_get_ack(struct rds_iw_connection *ic) -{ - clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_atomic(); - - return atomic64_read(&ic->i_ack_next); -} -#endif - - -static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits) -{ - struct rds_header *hdr = ic->i_ack; - struct ib_send_wr *failed_wr; - u64 seq; - int ret; - - seq = rds_iw_get_ack(ic); - - rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq); - rds_message_populate_header(hdr, 0, 0, 0); - hdr->h_ack = cpu_to_be64(seq); - hdr->h_credit = adv_credits; - rds_message_make_checksum(hdr); - ic->i_ack_queued = jiffies; - - ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr); - if (unlikely(ret)) { - /* Failed to send. Release the WR, and - * force another ACK. - */ - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - - rds_iw_stats_inc(s_iw_ack_send_failure); - - rds_iw_conn_error(ic->conn, "sending ack failed\n"); - } else - rds_iw_stats_inc(s_iw_ack_sent); -} - -/* - * There are 3 ways of getting acknowledgements to the peer: - * 1. We call rds_iw_attempt_ack from the recv completion handler - * to send an ACK-only frame. - * However, there can be only one such frame in the send queue - * at any time, so we may have to postpone it. - * 2. When another (data) packet is transmitted while there's - * an ACK in the queue, we piggyback the ACK sequence number - * on the data packet. - * 3. If the ACK WR is done sending, we get called from the - * send queue completion handler, and check whether there's - * another ACK pending (postponed because the WR was on the - * queue). If so, we transmit it. - * - * We maintain 2 variables: - * - i_ack_flags, which keeps track of whether the ACK WR - * is currently in the send queue or not (IB_ACK_IN_FLIGHT) - * - i_ack_next, which is the last sequence number we received - * - * Potentially, send queue and receive queue handlers can run concurrently. - * It would be nice to not have to use a spinlock to synchronize things, - * but the one problem that rules this out is that 64bit updates are - * not atomic on all platforms. Things would be a lot simpler if - * we had atomic64 or maybe cmpxchg64 everywhere. - * - * Reconnecting complicates this picture just slightly. When we - * reconnect, we may be seeing duplicate packets. 
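The atomic64 variant above can be modeled in userspace with C11 atomics. The point of the pair of operations is a lock-free latch: receive processing keeps overwriting the next-ack sequence with the newest value, and the ack sender consumes the request flag before sampling it, so a concurrent update at worst produces one harmless duplicate ack. A sketch (the names and the driver in main() are invented):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define ACK_REQUESTED_BIT 1u

static _Atomic uint64_t ack_next;    /* stands in for ic->i_ack_next */
static atomic_uint ack_flags;        /* stands in for ic->i_ack_flags */

static void set_ack(uint64_t seq, int ack_required)
{
    atomic_store(&ack_next, seq);    /* publish the newest sequence */
    if (ack_required)
        atomic_fetch_or(&ack_flags, ACK_REQUESTED_BIT);
}

static uint64_t get_ack(void)
{
    /* consume the request first; a racing set_ack() just re-arms it */
    atomic_fetch_and(&ack_flags, ~ACK_REQUESTED_BIT);
    return atomic_load(&ack_next);
}

int main(void)
{
    set_ack(41, 0);
    set_ack(42, 1);    /* a frame carrying ACK_REQUIRED arrived */
    printf("acking up to %llu\n", (unsigned long long)get_ack());
    return 0;
}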
The peer - * is retransmitting them, because it hasn't seen an ACK for - * them. It is important that we ACK these. - * - * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with - * this flag set *MUST* be acknowledged immediately. - */ - -/* - * When we get here, we're called from the recv queue handler. - * Check whether we ought to transmit an ACK. - */ -void rds_iw_attempt_ack(struct rds_iw_connection *ic) -{ - unsigned int adv_credits; - - if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) - return; - - if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) { - rds_iw_stats_inc(s_iw_ack_send_delayed); - return; - } - - /* Can we get a send credit? */ - if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { - rds_iw_stats_inc(s_iw_tx_throttle); - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - return; - } - - clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - rds_iw_send_ack(ic, adv_credits); -} - -/* - * We get here from the send completion handler, when the - * adapter tells us the ACK frame was sent. - */ -void rds_iw_ack_send_complete(struct rds_iw_connection *ic) -{ - clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - rds_iw_attempt_ack(ic); -} - -/* - * This is called by the regular xmit code when it wants to piggyback - * an ACK on an outgoing frame. - */ -u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic) -{ - if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) - rds_iw_stats_inc(s_iw_ack_send_piggybacked); - return rds_iw_get_ack(ic); -} - -/* - * It's kind of lame that we're copying from the posted receive pages into - * long-lived bitmaps. We could have posted the bitmaps and rdma written into - * them. But receiving new congestion bitmaps should be a *rare* event, so - * hopefully we won't need to invest that complexity in making it more - * efficient. By copying we can share a simpler core with TCP which has to - * copy. - */ -static void rds_iw_cong_recv(struct rds_connection *conn, - struct rds_iw_incoming *iwinc) -{ - struct rds_cong_map *map; - unsigned int map_off; - unsigned int map_page; - struct rds_page_frag *frag; - unsigned long frag_off; - unsigned long to_copy; - unsigned long copied; - uint64_t uncongested = 0; - void *addr; - - /* catch completely corrupt packets */ - if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) - return; - - map = conn->c_fcong; - map_page = 0; - map_off = 0; - - frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item); - frag_off = 0; - - copied = 0; - - while (copied < RDS_CONG_MAP_BYTES) { - uint64_t *src, *dst; - unsigned int k; - - to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); - BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ - - addr = kmap_atomic(frag->f_page); - - src = addr + frag_off; - dst = (void *)map->m_page_addrs[map_page] + map_off; - for (k = 0; k < to_copy; k += 8) { - /* Record ports that became uncongested, ie - * bits that changed from 0 to 1. 
*/ - uncongested |= ~(*src) & *dst; - *dst++ = *src++; - } - kunmap_atomic(addr); - - copied += to_copy; - - map_off += to_copy; - if (map_off == PAGE_SIZE) { - map_off = 0; - map_page++; - } - - frag_off += to_copy; - if (frag_off == RDS_FRAG_SIZE) { - frag = list_entry(frag->f_item.next, - struct rds_page_frag, f_item); - frag_off = 0; - } - } - - /* the congestion map is in little endian order */ - uncongested = le64_to_cpu(uncongested); - - rds_cong_map_updated(map, uncongested); -} - -/* - * Rings are posted with all the allocations they'll need to queue the - * incoming message to the receiving socket so this can't fail. - * All fragments start with a header, so we can make sure we're not receiving - * garbage, and we can tell a small 8 byte fragment from an ACK frame. - */ -struct rds_iw_ack_state { - u64 ack_next; - u64 ack_recv; - unsigned int ack_required:1; - unsigned int ack_next_valid:1; - unsigned int ack_recv_valid:1; -}; - -static void rds_iw_process_recv(struct rds_connection *conn, - struct rds_iw_recv_work *recv, u32 byte_len, - struct rds_iw_ack_state *state) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_incoming *iwinc = ic->i_iwinc; - struct rds_header *ihdr, *hdr; - - /* XXX shut down the connection if port 0,0 are seen? */ - - rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv, - byte_len); - - if (byte_len < sizeof(struct rds_header)) { - rds_iw_conn_error(conn, "incoming message " - "from %pI4 didn't include a " - "header, disconnecting and " - "reconnecting\n", - &conn->c_faddr); - return; - } - byte_len -= sizeof(struct rds_header); - - ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; - - /* Validate the checksum. */ - if (!rds_message_verify_checksum(ihdr)) { - rds_iw_conn_error(conn, "incoming message " - "from %pI4 has corrupted header - " - "forcing a reconnect\n", - &conn->c_faddr); - rds_stats_inc(s_recv_drop_bad_checksum); - return; - } - - /* Process the ACK sequence which comes with every packet */ - state->ack_recv = be64_to_cpu(ihdr->h_ack); - state->ack_recv_valid = 1; - - /* Process the credits update if there was one */ - if (ihdr->h_credit) - rds_iw_send_add_credits(conn, ihdr->h_credit); - - if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) { - /* This is an ACK-only packet. It gets special - * treatment here because historically, ACKs - * were rather special beasts. - */ - rds_iw_stats_inc(s_iw_ack_received); - - /* - * Usually the frags make their way on to incs and are then freed as - * the inc is freed. We don't go that route, so we have to drop the - * page ref ourselves. We can't just leave the page on the recv - * because that confuses the dma mapping of pages and each recv's use - * of a partial page. We can leave the frag, though, it will be - * reused. - * - * FIXME: Fold this into the code path below. - */ - rds_iw_frag_drop_page(recv->r_frag); - return; - } - - /* - * If we don't already have an inc on the connection then this - * fragment has a header and starts a message; copy its header - * into the inc and save the inc so we can hang upcoming fragments - * off its list. 
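For the congestion-map copy a little further up, the uncongested computation is easiest to see with concrete bits: the expression collects bits that are set in the stored copy (*dst) but clear in the incoming update (*src), i.e. ports whose congestion just cleared. A worked example (the values are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t dst = 0xF0;    /* stored map: ports 4-7 marked congested */
    uint64_t src = 0x3C;    /* incoming map: ports 2-5 marked congested */
    uint64_t uncongested = ~src & dst;    /* set before, clear now */

    printf("uncongested mask: 0x%llx\n",    /* 0xC0: ports 6 and 7 */
           (unsigned long long)uncongested);
    dst = src;    /* adopt the new map, as *dst++ = *src++ does */
    return 0;
}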
- */ - if (!iwinc) { - iwinc = recv->r_iwinc; - recv->r_iwinc = NULL; - ic->i_iwinc = iwinc; - - hdr = &iwinc->ii_inc.i_hdr; - memcpy(hdr, ihdr, sizeof(*hdr)); - ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); - - rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc, - ic->i_recv_data_rem, hdr->h_flags); - } else { - hdr = &iwinc->ii_inc.i_hdr; - /* We can't just use memcmp here; fragments of a - * single message may carry different ACKs */ - if (hdr->h_sequence != ihdr->h_sequence || - hdr->h_len != ihdr->h_len || - hdr->h_sport != ihdr->h_sport || - hdr->h_dport != ihdr->h_dport) { - rds_iw_conn_error(conn, - "fragment header mismatch; forcing reconnect\n"); - return; - } - } - - list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags); - recv->r_frag = NULL; - - if (ic->i_recv_data_rem > RDS_FRAG_SIZE) - ic->i_recv_data_rem -= RDS_FRAG_SIZE; - else { - ic->i_recv_data_rem = 0; - ic->i_iwinc = NULL; - - if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) - rds_iw_cong_recv(conn, iwinc); - else { - rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, - &iwinc->ii_inc, GFP_ATOMIC); - state->ack_next = be64_to_cpu(hdr->h_sequence); - state->ack_next_valid = 1; - } - - /* Evaluate the ACK_REQUIRED flag *after* we received - * the complete frame, and after bumping the next_rx - * sequence. */ - if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { - rds_stats_inc(s_recv_ack_required); - state->ack_required = 1; - } - - rds_inc_put(&iwinc->ii_inc); - } -} - -/* - * Plucking the oldest entry from the ring can be done concurrently with - * the thread refilling the ring. Each ring operation is protected by - * spinlocks and the transient state of refilling doesn't change the - * recording of which entry is oldest. - * - * This relies on IB only calling one cq comp_handler for each cq so that - * there will only be one caller of rds_recv_incoming() per RDS connection. - */ -void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context) -{ - struct rds_connection *conn = context; - struct rds_iw_connection *ic = conn->c_transport_data; - - rdsdebug("conn %p cq %p\n", conn, cq); - - rds_iw_stats_inc(s_iw_rx_cq_call); - - tasklet_schedule(&ic->i_recv_tasklet); -} - -static inline void rds_poll_cq(struct rds_iw_connection *ic, - struct rds_iw_ack_state *state) -{ - struct rds_connection *conn = ic->conn; - struct ib_wc wc; - struct rds_iw_recv_work *recv; - - while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_iw_stats_inc(s_iw_rx_cq_event); - - recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)]; - - rds_iw_recv_unmap_page(ic, recv); - - /* - * Also process recvs in connecting state because it is possible - * to get a recv completion _before_ the rdmacm ESTABLISHED - * event is processed. 
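The reassembly bookkeeping above boils down to counting i_recv_data_rem down by one fragment per completion until the message is whole. A toy version of that countdown, assuming 4K fragments as in the other sketches here:

#include <stdio.h>

#define MODEL_FRAG 4096    /* assumed RDS_FRAG_SIZE-sized slices */

int main(void)
{
    unsigned int data_rem = 10000;    /* h_len taken from the first fragment */
    int frags = 0;

    for (;;) {
        frags++;
        if (data_rem > MODEL_FRAG) {
            data_rem -= MODEL_FRAG;    /* more fragments still to come */
        } else {
            data_rem = 0;    /* message complete: hand the inc upstream */
            break;
        }
    }
    printf("message reassembled from %d fragments\n", frags);    /* 3 */
    return 0;
}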
- */ - if (rds_conn_up(conn) || rds_conn_connecting(conn)) { - /* We expect errors as the qp is drained during shutdown */ - if (wc.status == IB_WC_SUCCESS) { - rds_iw_process_recv(conn, recv, wc.byte_len, state); - } else { - rds_iw_conn_error(conn, "recv completion on " - "%pI4 had status %u, disconnecting and " - "reconnecting\n", &conn->c_faddr, - wc.status); - } - } - - rds_iw_ring_free(&ic->i_recv_ring, 1); - } -} - -void rds_iw_recv_tasklet_fn(unsigned long data) -{ - struct rds_iw_connection *ic = (struct rds_iw_connection *) data; - struct rds_connection *conn = ic->conn; - struct rds_iw_ack_state state = { 0, }; - - rds_poll_cq(ic, &state); - ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - rds_poll_cq(ic, &state); - - if (state.ack_next_valid) - rds_iw_set_ack(ic, state.ack_next, state.ack_required); - if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { - rds_send_drop_acked(conn, state.ack_recv, NULL); - ic->i_ack_recv = state.ack_recv; - } - if (rds_conn_up(conn)) - rds_iw_attempt_ack(ic); - - /* If we ever end up with a really empty receive ring, we're - * in deep trouble, as the sender will definitely see RNR - * timeouts. */ - if (rds_iw_ring_empty(&ic->i_recv_ring)) - rds_iw_stats_inc(s_iw_rx_ring_empty); - - /* - * If the ring is running low, then schedule the thread to refill. - */ - if (rds_iw_ring_low(&ic->i_recv_ring)) - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); -} - -int rds_iw_recv(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - int ret = 0; - - rdsdebug("conn %p\n", conn); - - /* - * If we get a temporary posting failure in this context then - * we're really low and we want the caller to back off for a bit. - */ - mutex_lock(&ic->i_recv_mutex); - if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) - ret = -ENOMEM; - else - rds_iw_stats_inc(s_iw_rx_refill_from_thread); - mutex_unlock(&ic->i_recv_mutex); - - if (rds_conn_up(conn)) - rds_iw_attempt_ack(ic); - - return ret; -} - -int rds_iw_recv_init(void) -{ - struct sysinfo si; - int ret = -ENOMEM; - - /* Default to 30% of all available RAM for recv memory */ - si_meminfo(&si); - rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE; - - rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", - sizeof(struct rds_iw_incoming), - 0, 0, NULL); - if (!rds_iw_incoming_slab) - goto out; - - rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", - sizeof(struct rds_page_frag), - 0, 0, NULL); - if (!rds_iw_frag_slab) - kmem_cache_destroy(rds_iw_incoming_slab); - else - ret = 0; -out: - return ret; -} - -void rds_iw_recv_exit(void) -{ - kmem_cache_destroy(rds_iw_incoming_slab); - kmem_cache_destroy(rds_iw_frag_slab); -} diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c deleted file mode 100644 index da8e3b63f663..000000000000 --- a/net/rds/iw_ring.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ -#include <linux/kernel.h> - -#include "rds.h" -#include "iw.h" - -/* - * Locking for IB rings. - * We assume that allocation is always protected by a mutex - * in the caller (this is a valid assumption for the current - * implementation). - * - * Freeing always happens in an interrupt, and hence only - * races with allocations, but not with other free()s. - * - * The interaction between allocation and freeing is that - * the alloc code has to determine the number of free entries. - * To this end, we maintain two counters; an allocation counter - * and a free counter. Both are allowed to run freely, and wrap - * around. - * The number of used entries is always (alloc_ctr - free_ctr) % NR. - * - * The current implementation makes free_ctr atomic. When the - * caller finds an allocation fails, it should set an "alloc fail" - * bit and retry the allocation. The "alloc fail" bit essentially tells - * the CQ completion handlers to wake it up after freeing some - * more entries. - */ - -/* - * This only happens on shutdown. - */ -DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait); - -void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr) -{ - memset(ring, 0, sizeof(*ring)); - ring->w_nr = nr; - rdsdebug("ring %p nr %u\n", ring, ring->w_nr); -} - -static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring) -{ - u32 diff; - - /* This assumes that atomic_t has at least as many bits as u32 */ - diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); - BUG_ON(diff > ring->w_nr); - - return diff; -} - -void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr) -{ - /* We only ever get called from the connection setup code, - * prior to creating the QP. 
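The free-running counters described above rely on unsigned wraparound: the difference alloc_ctr - free_ctr stays correct in 32-bit modular arithmetic even after both counters wrap, which is why __rds_iw_ring_used() needs no locking against the free side. A self-contained check:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* used = (alloc_ctr - free_ctr) survives wraparound in u32 arithmetic */
static uint32_t ring_used(uint32_t alloc_ctr, uint32_t free_ctr)
{
    return alloc_ctr - free_ctr;    /* modulo 2^32 by C unsigned rules */
}

int main(void)
{
    /* both counters have wrapped past zero */
    uint32_t alloc_ctr = 3;             /* 2^32 + 3 allocations so far */
    uint32_t free_ctr  = 0xfffffffeu;   /* 2^32 - 2 frees so far */

    assert(ring_used(alloc_ctr, free_ctr) == 5);
    printf("used entries: %u\n", ring_used(alloc_ctr, free_ctr));
    return 0;
}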
*/ - BUG_ON(__rds_iw_ring_used(ring)); - ring->w_nr = nr; -} - -static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring) -{ - return __rds_iw_ring_used(ring) == 0; -} - -u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos) -{ - u32 ret = 0, avail; - - avail = ring->w_nr - __rds_iw_ring_used(ring); - - rdsdebug("ring %p val %u next %u free %u\n", ring, val, - ring->w_alloc_ptr, avail); - - if (val && avail) { - ret = min(val, avail); - *pos = ring->w_alloc_ptr; - - ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr; - ring->w_alloc_ctr += ret; - } - - return ret; -} - -void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val) -{ - ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; - atomic_add(val, &ring->w_free_ctr); - - if (__rds_iw_ring_empty(ring) && - waitqueue_active(&rds_iw_ring_empty_wait)) - wake_up(&rds_iw_ring_empty_wait); -} - -void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val) -{ - ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr; - ring->w_alloc_ctr -= val; -} - -int rds_iw_ring_empty(struct rds_iw_work_ring *ring) -{ - return __rds_iw_ring_empty(ring); -} - -int rds_iw_ring_low(struct rds_iw_work_ring *ring) -{ - return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1); -} - - -/* - * returns the oldest alloced ring entry. This will be the next one - * freed. This can't be called if there are none allocated. - */ -u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring) -{ - return ring->w_free_ptr; -} - -/* - * returns the number of completed work requests. - */ - -u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest) -{ - u32 ret; - - if (oldest <= (unsigned long long)wr_id) - ret = (unsigned long long)wr_id - oldest + 1; - else - ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1; - - rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret, - wr_id, oldest); - return ret; -} diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c deleted file mode 100644 index e20bd503f4bd..000000000000 --- a/net/rds/iw_send.c +++ /dev/null @@ -1,981 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ -#include <linux/kernel.h> -#include <linux/in.h> -#include <linux/device.h> -#include <linux/dmapool.h> -#include <linux/ratelimit.h> - -#include "rds.h" -#include "iw.h" - -static void rds_iw_send_rdma_complete(struct rds_message *rm, - int wc_status) -{ - int notify_status; - - switch (wc_status) { - case IB_WC_WR_FLUSH_ERR: - return; - - case IB_WC_SUCCESS: - notify_status = RDS_RDMA_SUCCESS; - break; - - case IB_WC_REM_ACCESS_ERR: - notify_status = RDS_RDMA_REMOTE_ERROR; - break; - - default: - notify_status = RDS_RDMA_OTHER_ERROR; - break; - } - rds_rdma_send_complete(rm, notify_status); -} - -static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, - struct rm_rdma_op *op) -{ - if (op->op_mapped) { - ib_dma_unmap_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, - op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->op_mapped = 0; - } -} - -static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, - struct rds_iw_send_work *send, - int wc_status) -{ - struct rds_message *rm = send->s_rm; - - rdsdebug("ic %p send %p rm %p\n", ic, send, rm); - - ib_dma_unmap_sg(ic->i_cm_id->device, - rm->data.op_sg, rm->data.op_nents, - DMA_TO_DEVICE); - - if (rm->rdma.op_active) { - rds_iw_send_unmap_rdma(ic, &rm->rdma); - - /* If the user asked for a completion notification on this - * message, we can implement three different semantics: - * 1. Notify when we received the ACK on the RDS message - * that was queued with the RDMA. This provides reliable - * notification of RDMA status at the expense of a one-way - * packet delay. - * 2. Notify when the IB stack gives us the completion event for - * the RDMA operation. - * 3. Notify when the IB stack gives us the completion event for - * the accompanying RDS messages. - * Here, we implement approach #3. To implement approach #2, - * call rds_rdma_send_complete from the cq_handler. To implement #1, - * don't call rds_rdma_send_complete at all, and fall back to the notify - * handling in the ACK processing code. - * - * Note: There's no need to explicitly sync any RDMA buffers using - * ib_dma_sync_sg_for_cpu - the completion for the RDMA - * operation itself unmapped the RDMA buffers, which takes care - * of synching. 
- */ - rds_iw_send_rdma_complete(rm, wc_status); - - if (rm->rdma.op_write) - rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); - else - rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); - } - - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); - - rds_message_put(rm); - send->s_rm = NULL; -} - -void rds_iw_send_init_ring(struct rds_iw_connection *ic) -{ - struct rds_iw_send_work *send; - u32 i; - - for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - struct ib_sge *sge; - - send->s_rm = NULL; - send->s_op = NULL; - send->s_mapping = NULL; - - send->s_send_wr.next = NULL; - send->s_send_wr.wr_id = i; - send->s_send_wr.sg_list = send->s_sge; - send->s_send_wr.num_sge = 1; - send->s_send_wr.opcode = IB_WR_SEND; - send->s_send_wr.send_flags = 0; - send->s_send_wr.ex.imm_data = 0; - - sge = rds_iw_data_sge(ic, send->s_sge); - sge->lkey = 0; - - sge = rds_iw_header_sge(ic, send->s_sge); - sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = 0; - - send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG, - fastreg_message_size); - if (IS_ERR(send->s_mr)) { - printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n"); - break; - } - } -} - -void rds_iw_send_clear_ring(struct rds_iw_connection *ic) -{ - struct rds_iw_send_work *send; - u32 i; - - for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - BUG_ON(!send->s_mr); - ib_dereg_mr(send->s_mr); - if (send->s_send_wr.opcode == 0xdead) - continue; - if (send->s_rm) - rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); - if (send->s_op) - rds_iw_send_unmap_rdma(ic, send->s_op); - } -} - -/* - * The _oldest/_free ring operations here race cleanly with the alloc/unalloc - * operations performed in the send path. As the sender allocs and potentially - * unallocs the next free entry in the ring it doesn't alter which is - * the next to be freed, which is what this is concerned with. 
- */ -void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) -{ - struct rds_connection *conn = context; - struct rds_iw_connection *ic = conn->c_transport_data; - struct ib_wc wc; - struct rds_iw_send_work *send; - u32 completed; - u32 oldest; - u32 i; - int ret; - - rdsdebug("cq %p conn %p\n", cq, conn); - rds_iw_stats_inc(s_iw_tx_cq_call); - ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (ret) - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - - while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_iw_stats_inc(s_iw_tx_cq_event); - - if (wc.status != IB_WC_SUCCESS) { - printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode); - break; - } - - if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) { - ic->i_fastreg_posted = 0; - continue; - } - - if (wc.opcode == IB_WC_REG_MR && wc.wr_id == RDS_IW_REG_WR_ID) { - ic->i_fastreg_posted = 1; - continue; - } - - if (wc.wr_id == RDS_IW_ACK_WR_ID) { - if (time_after(jiffies, ic->i_ack_queued + HZ/2)) - rds_iw_stats_inc(s_iw_tx_stalled); - rds_iw_ack_send_complete(ic); - continue; - } - - oldest = rds_iw_ring_oldest(&ic->i_send_ring); - - completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); - - for (i = 0; i < completed; i++) { - send = &ic->i_sends[oldest]; - - /* In the error case, wc.opcode sometimes contains garbage */ - switch (send->s_send_wr.opcode) { - case IB_WR_SEND: - if (send->s_rm) - rds_iw_send_unmap_rm(ic, send, wc.status); - break; - case IB_WR_REG_MR: - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_READ: - case IB_WR_RDMA_READ_WITH_INV: - /* Nothing to be done - the SG list will be unmapped - * when the SEND completes. */ - break; - default: - printk_ratelimited(KERN_NOTICE - "RDS/IW: %s: unexpected opcode 0x%x in WR!\n", - __func__, send->s_send_wr.opcode); - break; - } - - send->s_send_wr.opcode = 0xdead; - send->s_send_wr.num_sge = 1; - if (time_after(jiffies, send->s_queued + HZ/2)) - rds_iw_stats_inc(s_iw_tx_stalled); - - /* If an RDMA operation produced an error, signal this right - * away. If we don't, the subsequent SEND that goes with this - * RDMA will be canceled with ERR_WFLUSH, and the application - * never learns that the RDMA failed. */ - if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { - struct rds_message *rm; - - rm = rds_send_get_message(conn, send->s_op); - if (rm) - rds_iw_send_rdma_complete(rm, wc.status); - } - - oldest = (oldest + 1) % ic->i_send_ring.w_nr; - } - - rds_iw_ring_free(&ic->i_send_ring, completed); - - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - /* We expect errors as the qp is drained during shutdown */ - if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_iw_conn_error(conn, - "send completion on %pI4 " - "had status %u, disconnecting and reconnecting\n", - &conn->c_faddr, wc.status); - } - } -} - -/* - * This is the main function for allocating credits when sending - * messages. - * - * Conceptually, we have two counters: - * - send credits: this tells us how many WRs we're allowed - * to submit without overrunning the receiver's queue. For - * each SEND WR we post, we decrement this by one. - * - * - posted credits: this tells us how many WRs we recently - * posted to the receive queue. 
This value is transferred - * to the peer as a "credit update" in a RDS header field. - * Every time we transmit credits to the peer, we subtract - * the amount of transferred credits from this counter. - * - * It is essential that we avoid situations where both sides have - * exhausted their send credits, and are unable to send new credits - * to the peer. We achieve this by requiring that we send at least - * one credit update to the peer before exhausting our credits. - * When new credits arrive, we subtract one credit that is withheld - * until we've posted new buffers and are ready to transmit these - * credits (see rds_iw_send_add_credits below). - * - * The RDS send code is essentially single-threaded; rds_send_xmit - * grabs c_send_lock to ensure exclusive access to the send ring. - * However, the ACK sending code is independent and can race with - * message SENDs. - * - * In the send path, we need to update the counters for send credits - * and the counter of posted buffers atomically - when we use the - * last available credit, we cannot allow another thread to race us - * and grab the posted credits counter. Hence, we have to use a - * spinlock to protect the credit counter, or use atomics. - * - * Spinlocks shared between the send and the receive path are bad, - * because they create unnecessary delays. An early implementation - * using a spinlock showed a 5% degradation in throughput at some - * loads. - * - * This implementation avoids spinlocks completely, putting both - * counters into a single atomic, and updating that atomic using - * atomic_add (in the receive path, when receiving fresh credits), - * and using atomic_cmpxchg when updating the two counters. - */ -int rds_iw_send_grab_credits(struct rds_iw_connection *ic, - u32 wanted, u32 *adv_credits, int need_posted, int max_posted) -{ - unsigned int avail, posted, got = 0, advertise; - long oldval, newval; - - *adv_credits = 0; - if (!ic->i_flowctl) - return wanted; - -try_again: - advertise = 0; - oldval = newval = atomic_read(&ic->i_credits); - posted = IB_GET_POST_CREDITS(oldval); - avail = IB_GET_SEND_CREDITS(oldval); - - rdsdebug("wanted=%u credits=%u posted=%u\n", - wanted, avail, posted); - - /* The last credit must be used to send a credit update. */ - if (avail && !posted) - avail--; - - if (avail < wanted) { - struct rds_connection *conn = ic->i_cm_id->context; - - /* Oops, there aren't that many credits left! */ - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); - got = avail; - } else { - /* Sometimes you get what you want, lalala. */ - got = wanted; - } - newval -= IB_SET_SEND_CREDITS(got); - - /* - * If need_posted is non-zero, then the caller wants - * the posted regardless of whether any send credits are - * available. - */ - if (posted && (got || need_posted)) { - advertise = min_t(unsigned int, posted, max_posted); - newval -= IB_SET_POST_CREDITS(advertise); - } - - /* Finally bill everything */ - if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval) - goto try_again; - - *adv_credits = advertise; - return got; -} - -void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - if (credits == 0) - return; - - rdsdebug("credits=%u current=%u%s\n", - credits, - IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)), - test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? 
", ll_send_full" : ""); - - atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); - - rds_iw_stats_inc(s_iw_rx_credit_updates); -} - -void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - if (posted == 0) - return; - - atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits); - - /* Decide whether to send an update to the peer now. - * If we would send a credit update for every single buffer we - * post, we would end up with an ACK storm (ACK arrives, - * consumes buffer, we refill the ring, send ACK to remote - * advertising the newly posted buffer... ad inf) - * - * Performance pretty much depends on how often we send - * credit updates - too frequent updates mean lots of ACKs. - * Too infrequent updates, and the peer will run out of - * credits and has to throttle. - * For the time being, 16 seems to be a good compromise. - */ - if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16) - set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); -} - -static inline void -rds_iw_xmit_populate_wr(struct rds_iw_connection *ic, - struct rds_iw_send_work *send, unsigned int pos, - unsigned long buffer, unsigned int length, - int send_flags) -{ - struct ib_sge *sge; - - WARN_ON(pos != send - ic->i_sends); - - send->s_send_wr.send_flags = send_flags; - send->s_send_wr.opcode = IB_WR_SEND; - send->s_send_wr.num_sge = 2; - send->s_send_wr.next = NULL; - send->s_queued = jiffies; - send->s_op = NULL; - - if (length != 0) { - sge = rds_iw_data_sge(ic, send->s_sge); - sge->addr = buffer; - sge->length = length; - sge->lkey = rds_iw_local_dma_lkey(ic); - - sge = rds_iw_header_sge(ic, send->s_sge); - } else { - /* We're sending a packet with no payload. There is only - * one SGE */ - send->s_send_wr.num_sge = 1; - sge = &send->s_sge[0]; - } - - sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = rds_iw_local_dma_lkey(ic); -} - -/* - * This can be called multiple times for a given message. The first time - * we see a message we map its scatterlist into the IB device so that - * we can provide that mapped address to the IB scatter gather entries - * in the IB work requests. We translate the scatterlist into a series - * of work requests that fragment the message. These work requests complete - * in order so we pass ownership of the message to the completion handler - * once we send the final fragment. - * - * The RDS core uses the c_send_lock to only enter this function once - * per connection. This makes sure that the tx ring alloc/unalloc pairs - * don't get out of sync and confuse the ring. 
- */ -int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, - unsigned int hdr_off, unsigned int sg, unsigned int off) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct ib_device *dev = ic->i_cm_id->device; - struct rds_iw_send_work *send = NULL; - struct rds_iw_send_work *first; - struct rds_iw_send_work *prev; - struct ib_send_wr *failed_wr; - struct scatterlist *scat; - u32 pos; - u32 i; - u32 work_alloc; - u32 credit_alloc; - u32 posted; - u32 adv_credits = 0; - int send_flags = 0; - int sent; - int ret; - int flow_controlled = 0; - - BUG_ON(off % RDS_FRAG_SIZE); - BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); - - /* Fastreg support */ - if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) { - ret = -EAGAIN; - goto out; - } - - /* FIXME we may overallocate here */ - if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) - i = 1; - else - i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE); - - work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); - if (work_alloc == 0) { - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); - rds_iw_stats_inc(s_iw_tx_ring_full); - ret = -ENOMEM; - goto out; - } - - credit_alloc = work_alloc; - if (ic->i_flowctl) { - credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); - adv_credits += posted; - if (credit_alloc < work_alloc) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); - work_alloc = credit_alloc; - flow_controlled++; - } - if (work_alloc == 0) { - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); - rds_iw_stats_inc(s_iw_tx_throttle); - ret = -ENOMEM; - goto out; - } - } - - /* map the message the first time we see it */ - if (!ic->i_rm) { - /* - printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", - be16_to_cpu(rm->m_inc.i_hdr.h_dport), - rm->m_inc.i_hdr.h_flags, - be32_to_cpu(rm->m_inc.i_hdr.h_len)); - */ - if (rm->data.op_nents) { - rm->data.op_count = ib_dma_map_sg(dev, - rm->data.op_sg, - rm->data.op_nents, - DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); - if (rm->data.op_count == 0) { - rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - ret = -ENOMEM; /* XXX ? */ - goto out; - } - } else { - rm->data.op_count = 0; - } - - ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; - rds_message_addref(rm); - rm->data.op_dmasg = 0; - rm->data.op_dmaoff = 0; - ic->i_rm = rm; - - /* Finalize the header */ - if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) - rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; - if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) - rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; - - /* If it has a RDMA op, tell the peer we did it. This is - * used by the peer to release use-once RDMA MRs. */ - if (rm->rdma.op_active) { - struct rds_ext_header_rdma ext_hdr; - - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); - rds_message_add_extension(&rm->m_inc.i_hdr, - RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); - } - if (rm->m_rdma_cookie) { - rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, - rds_rdma_cookie_key(rm->m_rdma_cookie), - rds_rdma_cookie_offset(rm->m_rdma_cookie)); - } - - /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so - * we should not do this unless we have a chance of at least - * sticking the header into the send ring. Which is why we - * should call rds_iw_ring_alloc first. 
*/ - rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic)); - rds_message_make_checksum(&rm->m_inc.i_hdr); - - /* - * Update adv_credits since we reset the ACK_REQUIRED bit. - */ - rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); - adv_credits += posted; - BUG_ON(adv_credits > 255); - } - - send = &ic->i_sends[pos]; - first = send; - prev = NULL; - scat = &rm->data.op_sg[rm->data.op_dmasg]; - sent = 0; - i = 0; - - /* Sometimes you want to put a fence between an RDMA - * READ and the following SEND. - * We could either do this all the time - * or when requested by the user. Right now, we let - * the application choose. - */ - if (rm->rdma.op_active && rm->rdma.op_fence) - send_flags = IB_SEND_FENCE; - - /* - * We could be copying the header into the unused tail of the page. - * That would need to be changed in the future when those pages might - * be mapped userspace pages or page cache pages. So instead we always - * use a second sge and our long-lived ring of mapped headers. We send - * the header after the data so that the data payload can be aligned on - * the receiver. - */ - - /* handle a 0-len message */ - if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { - rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); - goto add_header; - } - - /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { - unsigned int len; - - send = &ic->i_sends[pos]; - - len = min(RDS_FRAG_SIZE, - ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff); - rds_iw_xmit_populate_wr(ic, send, pos, - ib_sg_dma_address(dev, scat) + rm->data.op_dmaoff, len, - send_flags); - - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time - * on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } - - ic->i_unsignaled_bytes -= len; - if (ic->i_unsignaled_bytes <= 0) { - ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; - send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } - - /* - * Always signal the last one if we're stopping due to flow control. - */ - if (flow_controlled && i == (work_alloc-1)) - send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - - rdsdebug("send %p wr %p num_sge %u next %p\n", send, - &send->s_send_wr, send->s_send_wr.num_sge, send->s_send_wr.next); - - sent += len; - rm->data.op_dmaoff += len; - if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) { - scat++; - rm->data.op_dmaoff = 0; - rm->data.op_dmasg++; - } - -add_header: - /* Tack on the header after the data. The header SGE should already - * have been set up to point to the right header buffer. 
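The unsignaled-completion throttling in the loop above, where a completion is requested every rds_iw_sysctl_max_unsig_wrs work requests or every max_unsig_bytes bytes, whichever trips first, can be sketched in isolation (the knob values are arbitrary stand-ins):

#include <stdio.h>

#define MAX_UNSIG_WRS   16      /* stand-in for the sysctl knobs */
#define MAX_UNSIG_BYTES 65536

int main(void)
{
    int unsignaled_wrs = MAX_UNSIG_WRS;
    long unsignaled_bytes = MAX_UNSIG_BYTES;

    for (int i = 0; i < 40; i++) {
        int len = 4096;    /* one fragment-sized payload */
        int signal = 0;

        if (unsignaled_wrs-- == 0) {    /* WR-count threshold tripped */
            unsignaled_wrs = MAX_UNSIG_WRS;
            signal = 1;
        }
        unsignaled_bytes -= len;
        if (unsignaled_bytes <= 0) {    /* byte threshold tripped */
            unsignaled_bytes = MAX_UNSIG_BYTES;
            signal = 1;
        }
        if (signal)
            printf("WR %d requests a completion\n", i);
    }
    return 0;
}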
*/ - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - - if (0) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; - - printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", - be16_to_cpu(hdr->h_dport), - hdr->h_flags, - be32_to_cpu(hdr->h_len)); - } - if (adv_credits) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; - - /* add credit and redo the header checksum */ - hdr->h_credit = adv_credits; - rds_message_make_checksum(hdr); - adv_credits = 0; - rds_iw_stats_inc(s_iw_tx_credit_updates); - } - - if (prev) - prev->s_send_wr.next = &send->s_send_wr; - prev = send; - - pos = (pos + 1) % ic->i_send_ring.w_nr; - } - - /* Account the RDS header in the number of bytes we sent, but just once. - * The caller has no concept of fragmentation. */ - if (hdr_off == 0) - sent += sizeof(struct rds_header); - - /* if we finished the message then send completion owns it */ - if (scat == &rm->data.op_sg[rm->data.op_count]) { - prev->s_rm = ic->i_rm; - prev->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - ic->i_rm = NULL; - } - - if (i < work_alloc) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); - work_alloc = i; - } - if (ic->i_flowctl && i < credit_alloc) - rds_iw_send_add_credits(conn, credit_alloc - i); - - /* XXX need to worry about failed_wr and partial sends. */ - failed_wr = &first->s_send_wr; - ret = ib_post_send(ic->i_cm_id->qp, &first->s_send_wr, &failed_wr); - rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, - first, &first->s_send_wr, ret, failed_wr); - BUG_ON(failed_wr != &first->s_send_wr); - if (ret) { - printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 " - "returned %d\n", &conn->c_faddr, ret); - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - if (prev->s_rm) { - ic->i_rm = prev->s_rm; - prev->s_rm = NULL; - } - goto out; - } - - ret = sent; -out: - BUG_ON(adv_credits); - return ret; -} - -static int rds_iw_build_send_reg(struct rds_iw_send_work *send, - struct scatterlist *sg, - int sg_nents) -{ - int n; - - n = ib_map_mr_sg(send->s_mr, sg, sg_nents, PAGE_SIZE); - if (unlikely(n != sg_nents)) - return n < 0 ? n : -EINVAL; - - send->s_reg_wr.wr.opcode = IB_WR_REG_MR; - send->s_reg_wr.wr.wr_id = 0; - send->s_reg_wr.wr.num_sge = 0; - send->s_reg_wr.mr = send->s_mr; - send->s_reg_wr.key = send->s_mr->rkey; - send->s_reg_wr.access = IB_ACCESS_REMOTE_WRITE; - - ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); - - return 0; -} - -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - struct rds_iw_send_work *send = NULL; - struct rds_iw_send_work *first; - struct rds_iw_send_work *prev; - struct ib_send_wr *failed_wr; - struct rds_iw_device *rds_iwdev; - struct scatterlist *scat; - unsigned long len; - u64 remote_addr = op->op_remote_addr; - u32 pos, fr_pos; - u32 work_alloc; - u32 i; - u32 j; - int sent; - int ret; - int num_sge; - int sg_nents; - - rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); - - /* map the message the first time we see it */ - if (!op->op_mapped) { - op->op_count = ib_dma_map_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, (op->op_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); - if (op->op_count == 0) { - rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); - ret = -ENOMEM; /* XXX ? 
*/ - goto out; - } - - op->op_mapped = 1; - } - - if (!op->op_write) { - /* Alloc space on the send queue for the fastreg */ - work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); - if (work_alloc != 1) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - rds_iw_stats_inc(s_iw_tx_ring_full); - ret = -ENOMEM; - goto out; - } - } - - /* - * Instead of knowing how to return a partial rdma read/write we insist that there - * be enough work requests to send the entire message. - */ - i = ceil(op->op_count, rds_iwdev->max_sge); - - work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); - if (work_alloc != i) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - rds_iw_stats_inc(s_iw_tx_ring_full); - ret = -ENOMEM; - goto out; - } - - send = &ic->i_sends[pos]; - if (!op->op_write) { - first = prev = &ic->i_sends[fr_pos]; - } else { - first = send; - prev = NULL; - } - scat = &op->op_sg[0]; - sent = 0; - num_sge = op->op_count; - sg_nents = 0; - - for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { - send->s_rdma_wr.wr.send_flags = 0; - send->s_queued = jiffies; - - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; - send->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED; - } - - /* To avoid the need to have the plumbing to invalidate the fastreg_mr used - * for local access after RDS is finished with it, using - * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. - */ - if (op->op_write) - send->s_rdma_wr.wr.opcode = IB_WR_RDMA_WRITE; - else - send->s_rdma_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; - - send->s_rdma_wr.remote_addr = remote_addr; - send->s_rdma_wr.rkey = op->op_rkey; - send->s_op = op; - - if (num_sge > rds_iwdev->max_sge) { - send->s_rdma_wr.wr.num_sge = rds_iwdev->max_sge; - num_sge -= rds_iwdev->max_sge; - } else - send->s_rdma_wr.wr.num_sge = num_sge; - - send->s_rdma_wr.wr.next = NULL; - - if (prev) - prev->s_send_wr.next = &send->s_rdma_wr.wr; - - for (j = 0; j < send->s_rdma_wr.wr.num_sge && - scat != &op->op_sg[op->op_count]; j++) { - len = ib_sg_dma_len(ic->i_cm_id->device, scat); - - if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) - sg_nents++; - else { - send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); - send->s_sge[j].length = len; - send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic); - } - - sent += len; - rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); - remote_addr += len; - - scat++; - } - - if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) { - send->s_rdma_wr.wr.num_sge = 1; - send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr; - send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes; - send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey; - } - - rdsdebug("send %p wr %p num_sge %u next %p\n", send, - &send->s_rdma_wr, - send->s_rdma_wr.wr.num_sge, - send->s_rdma_wr.wr.next); - - prev = send; - if (++send == &ic->i_sends[ic->i_send_ring.w_nr]) - send = ic->i_sends; - } - - /* if we finished the message then send completion owns it */ - if (scat == &op->op_sg[op->op_count]) - first->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED; - - if (i < work_alloc) { - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i); - work_alloc = i; - } - - /* On iWARP, local memory access by a remote system (ie, RDMA Read) is not - * recommended. 
Putting the lkey on the wire is a security hole, as it can - * allow for memory access to all of memory on the remote system. Some - * adapters do not allow using the lkey for this at all. To bypass this use a - * fastreg_mr (or possibly a dma_mr) - */ - if (!op->op_write) { - ret = rds_iw_build_send_reg(&ic->i_sends[fr_pos], - &op->op_sg[0], sg_nents); - if (ret) { - printk(KERN_WARNING "RDS/IW: failed to reg send mem\n"); - goto out; - } - work_alloc++; - } - - failed_wr = &first->s_rdma_wr.wr; - ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr); - rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, - first, &first->s_rdma_wr, ret, failed_wr); - BUG_ON(failed_wr != &first->s_rdma_wr.wr); - if (ret) { - printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 " - "returned %d\n", &conn->c_faddr, ret); - rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); - goto out; - } - -out: - return ret; -} - -void rds_iw_xmit_complete(struct rds_connection *conn) -{ - struct rds_iw_connection *ic = conn->c_transport_data; - - /* We may have a pending ACK or window update we were unable - * to send previously (due to flow control). Try again. */ - rds_iw_attempt_ack(ic); -} diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c deleted file mode 100644 index 5fe67f6a1d80..000000000000 --- a/net/rds/iw_stats.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ -#include <linux/percpu.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> - -#include "rds.h" -#include "iw.h" - -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); - -static const char *const rds_iw_stat_names[] = { - "iw_connect_raced", - "iw_listen_closed_stale", - "iw_tx_cq_call", - "iw_tx_cq_event", - "iw_tx_ring_full", - "iw_tx_throttle", - "iw_tx_sg_mapping_failure", - "iw_tx_stalled", - "iw_tx_credit_updates", - "iw_rx_cq_call", - "iw_rx_cq_event", - "iw_rx_ring_empty", - "iw_rx_refill_from_cq", - "iw_rx_refill_from_thread", - "iw_rx_alloc_limit", - "iw_rx_credit_updates", - "iw_ack_sent", - "iw_ack_send_failure", - "iw_ack_send_delayed", - "iw_ack_send_piggybacked", - "iw_ack_received", - "iw_rdma_mr_alloc", - "iw_rdma_mr_free", - "iw_rdma_mr_used", - "iw_rdma_mr_pool_flush", - "iw_rdma_mr_pool_wait", - "iw_rdma_mr_pool_depleted", -}; - -unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, - unsigned int avail) -{ - struct rds_iw_statistics stats = {0, }; - uint64_t *src; - uint64_t *sum; - size_t i; - int cpu; - - if (avail < ARRAY_SIZE(rds_iw_stat_names)) - goto out; - - for_each_online_cpu(cpu) { - src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu)); - sum = (uint64_t *)&stats; - for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) - *(sum++) += *(src++); - } - - rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names, - ARRAY_SIZE(rds_iw_stat_names)); -out: - return ARRAY_SIZE(rds_iw_stat_names); -} diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c deleted file mode 100644 index 139239d2cb22..000000000000 --- a/net/rds/iw_sysctl.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2006 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ -#include <linux/kernel.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> - -#include "iw.h" - -static struct ctl_table_header *rds_iw_sysctl_hdr; - -unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR; -unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR; -unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE; -static unsigned long rds_iw_sysctl_max_wr_min = 1; -/* hardware will fail CQ creation long before this */ -static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0; - -unsigned long rds_iw_sysctl_max_unsig_wrs = 16; -static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1; -static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64; - -unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20); -static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1; -static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; - -unsigned int rds_iw_sysctl_flow_control = 1; - -static struct ctl_table rds_iw_sysctl_table[] = { - { - .procname = "max_send_wr", - .data = &rds_iw_sysctl_max_send_wr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_wr_min, - .extra2 = &rds_iw_sysctl_max_wr_max, - }, - { - .procname = "max_recv_wr", - .data = &rds_iw_sysctl_max_recv_wr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_wr_min, - .extra2 = &rds_iw_sysctl_max_wr_max, - }, - { - .procname = "max_unsignaled_wr", - .data = &rds_iw_sysctl_max_unsig_wrs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_unsig_wr_min, - .extra2 = &rds_iw_sysctl_max_unsig_wr_max, - }, - { - .procname = "max_unsignaled_bytes", - .data = &rds_iw_sysctl_max_unsig_bytes, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_iw_sysctl_max_unsig_bytes_min, - .extra2 = &rds_iw_sysctl_max_unsig_bytes_max, - }, - { - .procname = "max_recv_allocation", - .data = &rds_iw_sysctl_max_recv_allocation, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "flow_control", - .data = &rds_iw_sysctl_flow_control, - .maxlen = sizeof(rds_iw_sysctl_flow_control), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; - -void rds_iw_sysctl_exit(void) -{ - unregister_net_sysctl_table(rds_iw_sysctl_hdr); -} - -int rds_iw_sysctl_init(void) -{ - rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table); - if (!rds_iw_sysctl_hdr) - return -ENOMEM; - return 0; -} diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 9c1fed81bf0f..7220bebcf558 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -49,9 +49,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, event->event, rdma_event_msg(event->event)); - if (cm_id->device->node_type == RDMA_NODE_RNIC) - trans = &rds_iw_transport; - else + if (cm_id->device->node_type == RDMA_NODE_IB_CA) trans = &rds_ib_transport; /* Prevent shutdown from tearing down the connection @@ -119,6 +117,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, rds_conn_drop(conn); break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + if (conn) { + pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n", + &conn->c_laddr, &conn->c_faddr); + rds_conn_drop(conn); 
+ } + break; + default: /* things like device disconnect? */ printk(KERN_ERR "RDS: unknown event %u (%s)!\n", @@ -200,10 +206,6 @@ static int rds_rdma_init(void) if (ret) goto out; - ret = rds_iw_init(); - if (ret) - goto err_iw_init; - ret = rds_ib_init(); if (ret) goto err_ib_init; @@ -211,8 +213,6 @@ static int rds_rdma_init(void) goto out; err_ib_init: - rds_iw_exit(); -err_iw_init: rds_rdma_listen_stop(); out: return ret; @@ -224,11 +224,10 @@ static void rds_rdma_exit(void) /* stop listening first to ensure no new connections are attempted */ rds_rdma_listen_stop(); rds_ib_exit(); - rds_iw_exit(); } module_exit(rds_rdma_exit); MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); -MODULE_DESCRIPTION("RDS: IB/iWARP transport"); +MODULE_DESCRIPTION("RDS: IB transport"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index faba4e382695..ff2010e9d20c 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -16,9 +16,4 @@ extern struct rds_transport rds_ib_transport; int rds_ib_init(void); void rds_ib_exit(void); -/* from iw.c */ -extern struct rds_transport rds_iw_transport; -int rds_iw_init(void); -void rds_iw_exit(void); - #endif diff --git a/net/rds/rds.h b/net/rds/rds.h index 0e2797bdc316..80256b08eac0 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -222,6 +222,7 @@ struct rds_incoming { __be32 i_saddr; rds_rdma_cookie_t i_rdma_cookie; + struct timeval i_rx_tstamp; }; struct rds_mr { diff --git a/net/rds/recv.c b/net/rds/recv.c index a00462b0d01d..c0be1ecd11c9 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -35,6 +35,8 @@ #include <net/sock.h> #include <linux/in.h> #include <linux/export.h> +#include <linux/time.h> +#include <linux/rds.h> #include "rds.h" @@ -46,6 +48,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, inc->i_conn = conn; inc->i_saddr = saddr; inc->i_rdma_cookie = 0; + inc->i_rx_tstamp.tv_sec = 0; + inc->i_rx_tstamp.tv_usec = 0; } EXPORT_SYMBOL_GPL(rds_inc_init); @@ -228,6 +232,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); + if (sock_flag(sk, SOCK_RCVTSTAMP)) + do_gettimeofday(&inc->i_rx_tstamp); rds_inc_addref(inc); list_add_tail(&inc->i_item, &rs->rs_recv_queue); __rds_wake_sk_sleep(sk); @@ -381,7 +387,8 @@ static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr) /* * Receive any control messages. 
*/ -static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) +static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, + struct rds_sock *rs) { int ret = 0; @@ -392,6 +399,15 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg) return ret; } + if ((inc->i_rx_tstamp.tv_sec != 0) && + sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { + ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, + sizeof(struct timeval), + &inc->i_rx_tstamp); + if (ret) + return ret; + } + return 0; } @@ -474,7 +490,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, msg->msg_flags |= MSG_TRUNC; } - if (rds_cmsg_recv(inc, msg)) { + if (rds_cmsg_recv(inc, msg, rs)) { ret = -EFAULT; goto out; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 9d6ddbacd875..61ed2a8764ba 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -37,7 +37,6 @@ #include <net/tcp.h> #include <net/net_namespace.h> #include <net/netns/generic.h> -#include <net/tcp.h> #include "rds.h" #include "tcp.h" @@ -53,7 +52,34 @@ static LIST_HEAD(rds_tcp_conn_list); static struct kmem_cache *rds_tcp_conn_slab; -#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024) +static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *fpos); + +int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; +int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; + +static struct ctl_table rds_tcp_sysctl_table[] = { +#define RDS_TCP_SNDBUF 0 + { + .procname = "rds_tcp_sndbuf", + /* data is per-net pointer */ + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rds_tcp_skbuf_handler, + .extra1 = &rds_tcp_min_sndbuf, + }, +#define RDS_TCP_RCVBUF 1 + { + .procname = "rds_tcp_rcvbuf", + /* data is per-net pointer */ + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rds_tcp_skbuf_handler, + .extra1 = &rds_tcp_min_rcvbuf, + }, + { } +}; /* doing it this way avoids calling tcp_sk() */ void rds_tcp_nonagle(struct socket *sock) @@ -67,15 +93,6 @@ void rds_tcp_nonagle(struct socket *sock) set_fs(oldfs); } -/* All module specific customizations to the RDS-TCP socket should be done in - * rds_tcp_tune() and applied after socket creation. In general these - * customizations should be tunable via module_param() - */ -void rds_tcp_tune(struct socket *sock) -{ - rds_tcp_nonagle(sock); -} - u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) { return tcp_sk(tc->t_sock->sk)->snd_nxt; @@ -273,8 +290,34 @@ static int rds_tcp_netid; struct rds_tcp_net { struct socket *rds_tcp_listen_sock; struct work_struct rds_tcp_accept_w; + struct ctl_table_header *rds_tcp_sysctl; + struct ctl_table *ctl_table; + int sndbuf_size; + int rcvbuf_size; }; +/* All module specific customizations to the RDS-TCP socket should be done in + * rds_tcp_tune() and applied after socket creation. 
+ */
+void rds_tcp_tune(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+	rds_tcp_nonagle(sock);
+	lock_sock(sk);
+	if (rtn->sndbuf_size > 0) {
+		sk->sk_sndbuf = rtn->sndbuf_size;
+		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+	if (rtn->rcvbuf_size > 0) {
+		sk->sk_rcvbuf = rtn->rcvbuf_size;
+		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	}
+	release_sock(sk);
+}
+
 static void rds_tcp_accept_worker(struct work_struct *work)
 {
 	struct rds_tcp_net *rtn = container_of(work,
@@ -296,20 +339,60 @@ void rds_tcp_accept_work(struct sock *sk)
 static __net_init int rds_tcp_init_net(struct net *net)
 {
 	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct ctl_table *tbl;
+	int err = 0;
 
+	memset(rtn, 0, sizeof(*rtn));
+
+	/* {snd, rcv}buf_size default to 0, which implies we let the
+	 * stack pick the value, and permit auto-tuning of buffer size.
+	 */
+	if (net == &init_net) {
+		tbl = rds_tcp_sysctl_table;
+	} else {
+		tbl = kmemdup(rds_tcp_sysctl_table,
+			      sizeof(rds_tcp_sysctl_table), GFP_KERNEL);
+		if (!tbl) {
+			pr_warn("could not allocate sysctl table\n");
+			return -ENOMEM;
+		}
+		rtn->ctl_table = tbl;
+	}
+	tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size;
+	tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size;
+	rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl);
+	if (!rtn->rds_tcp_sysctl) {
+		pr_warn("could not register sysctl\n");
+		err = -ENOMEM;
+		goto fail;
+	}
 	rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
 	if (!rtn->rds_tcp_listen_sock) {
 		pr_warn("could not set up listen sock\n");
-		return -EAFNOSUPPORT;
+		unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+		rtn->rds_tcp_sysctl = NULL;
+		err = -EAFNOSUPPORT;
+		goto fail;
 	}
 	INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
 	return 0;
+
+fail:
+	if (net != &init_net)
+		kfree(tbl);
+	return err;
 }
 
 static void __net_exit rds_tcp_exit_net(struct net *net)
 {
 	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
 
+	if (rtn->rds_tcp_sysctl)
+		unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+
+	if (net != &init_net && rtn->ctl_table)
+		kfree(rtn->ctl_table);
+
 	/* If rds_tcp_exit_net() is called as a result of netns deletion,
 	 * the rds_tcp_kill_sock() device notifier would already have cleaned
 	 * up the listen socket, thus there is no work to do in this function.
@@ -384,6 +467,45 @@ static struct notifier_block rds_tcp_dev_notifier = {
 	.priority = -10, /* must be called after other network notifiers */
 };
 
+/* When sysctl is used to modify some kernel socket parameters, this
+ * function resets the RDS connections in that netns so that we can
+ * restart with new parameters. The assumption is that such reset
+ * events are few and far between.
+ */
+static void rds_tcp_sysctl_reset(struct net *net)
+{
+	struct rds_tcp_connection *tc, *_tc;
+
+	spin_lock_irq(&rds_tcp_conn_lock);
+	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
+		struct net *c_net = read_pnet(&tc->conn->c_net);
+
+		if (net != c_net || !tc->t_sock)
+			continue;
+
+		rds_conn_drop(tc->conn); /* reconnect with new parameters */
+	}
+	spin_unlock_irq(&rds_tcp_conn_lock);
+}
+
+static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *fpos)
+{
+	struct net *net = current->nsproxy->net_ns;
+	int err;
+
+	err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos);
+	if (err < 0) {
+		pr_warn("Invalid input. Must be >= %d\n",
+			*(int *)(ctl->extra1));
+		return err;
+	}
+	if (write)
+		rds_tcp_sysctl_reset(net);
+	return 0;
+}
+
 static void rds_tcp_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
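
The recv.c hunks above stamp each incoming message in rds_recv_incoming() when SOCK_RCVTSTAMP is set on the socket, and rds_cmsg_recv() then hands the stored i_rx_tstamp to userspace as an SCM_TIMESTAMP control message. A minimal userspace sketch of consuming that cmsg follows; it is illustrative only, not part of the patch, and assumes "fd" is an RDS socket on which SO_TIMESTAMP has already been enabled via setsockopt().

#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/uio.h>

/* Sketch: receive one RDS datagram and extract the SCM_TIMESTAMP
 * control message filled in by rds_cmsg_recv() above.  "fd" is an
 * assumed, already-bound RDS socket with SO_TIMESTAMP enabled.
 * Returns bytes received, or -1 on error.
 */
static ssize_t recv_with_tstamp(int fd, void *buf, size_t len,
				struct timeval *tv)
{
	union {
		char buf[CMSG_SPACE(sizeof(struct timeval))];
		struct cmsghdr align;	/* force cmsghdr alignment */
	} u;
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= u.buf,
		.msg_controllen	= sizeof(u.buf),
	};
	struct cmsghdr *cmsg;
	ssize_t n;

	n = recvmsg(fd, &msg, 0);
	if (n < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_TIMESTAMP) {
			memcpy(tv, CMSG_DATA(cmsg), sizeof(*tv));
			break;
		}
	}
	return n;
}

Note that rds_cmsg_recv() only emits the cmsg when i_rx_tstamp.tv_sec is non-zero, so a caller should treat an absent SCM_TIMESTAMP as "not stamped" rather than as an error.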
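
The ctl_table registered in rds_tcp_init_net() surfaces per netns as /proc/sys/net/rds/tcp/rds_tcp_sndbuf and /proc/sys/net/rds/tcp/rds_tcp_rcvbuf, and rds_tcp_skbuf_handler() invokes rds_tcp_sysctl_reset() on every successful write so that existing connections reconnect with the new sizes. Below is a hedged userspace sketch of driving the send-buffer tunable; the path follows from the register_net_sysctl() call above, and the 1 MB value is an arbitrary example, not a recommendation.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Sketch: raise the RDS-TCP send buffer for the current netns via the
 * sysctl added above.  Existing RDS-TCP connections in this netns are
 * dropped by rds_tcp_sysctl_reset() and reconnect with the new size;
 * writing 0 restores stack-chosen, auto-tuned buffers.
 */
int main(void)
{
	const char *path = "/proc/sys/net/rds/tcp/rds_tcp_sndbuf";
	const char *val = "1048576\n";	/* 1 MB, arbitrary example value */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, val, strlen(val)) != (ssize_t)strlen(val)) {
		perror("write");
		close(fd);
		return 1;
	}
	return close(fd) ? 1 : 0;
}

Values below SOCK_MIN_SNDBUF / SOCK_MIN_RCVBUF are rejected by proc_dointvec_minmax() through the extra1 bounds in the table, and the per-netns default of 0 leaves buffer sizing to the stack's auto-tuning, as the comment in rds_tcp_init_net() notes.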