summaryrefslogtreecommitdiff
path: root/drivers/nvme/host/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/tcp.c')
-rw-r--r--drivers/nvme/host/tcp.c217
1 files changed, 192 insertions, 25 deletions
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 55abfe5e1d25..83a6b18b01ad 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -54,6 +54,8 @@ MODULE_PARM_DESC(tls_handshake_timeout,
"nvme TLS handshake timeout in seconds (default 10)");
#endif
+static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
* sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
@@ -127,6 +129,7 @@ enum nvme_tcp_queue_flags {
NVME_TCP_Q_ALLOCATED = 0,
NVME_TCP_Q_LIVE = 1,
NVME_TCP_Q_POLLING = 2,
+ NVME_TCP_Q_IO_CPU_SET = 3,
};
enum nvme_tcp_recv_state {
@@ -214,6 +217,19 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
return queue - queue->ctrl->queues;
}
+static inline bool nvme_tcp_recv_pdu_supported(enum nvme_tcp_pdu_type type)
+{
+ switch (type) {
+ case nvme_tcp_c2h_term:
+ case nvme_tcp_c2h_data:
+ case nvme_tcp_r2t:
+ case nvme_tcp_rsp:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* Check if the queue is TLS encrypted
*/
@@ -437,7 +453,8 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
return NULL;
}
- list_del(&req->entry);
+ list_del_init(&req->entry);
+ init_llist_node(&req->lentry);
return req;
}
@@ -545,6 +562,8 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
req->queue = queue;
nvme_req(rq)->ctrl = &ctrl->ctrl;
nvme_req(rq)->cmd = &pdu->cmd;
+ init_llist_node(&req->lentry);
+ INIT_LIST_HEAD(&req->entry);
return 0;
}
@@ -749,6 +768,14 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
return -EPROTO;
}
+ if (llist_on_list(&req->lentry) ||
+ !list_empty(&req->entry)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "req %d unexpected r2t while processing request\n",
+ rq->tag);
+ return -EPROTO;
+ }
+
req->pdu_len = 0;
req->h2cdata_left = r2t_length;
req->h2cdata_offset = r2t_offset;
@@ -760,6 +787,40 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
return 0;
}
+static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue,
+ struct nvme_tcp_term_pdu *pdu)
+{
+ u16 fes;
+ const char *msg;
+ u32 plen = le32_to_cpu(pdu->hdr.plen);
+
+ static const char * const msg_table[] = {
+ [NVME_TCP_FES_INVALID_PDU_HDR] = "Invalid PDU Header Field",
+ [NVME_TCP_FES_PDU_SEQ_ERR] = "PDU Sequence Error",
+ [NVME_TCP_FES_HDR_DIGEST_ERR] = "Header Digest Error",
+ [NVME_TCP_FES_DATA_OUT_OF_RANGE] = "Data Transfer Out Of Range",
+ [NVME_TCP_FES_DATA_LIMIT_EXCEEDED] = "Data Transfer Limit Exceeded",
+ [NVME_TCP_FES_UNSUPPORTED_PARAM] = "Unsupported Parameter",
+ };
+
+ if (plen < NVME_TCP_MIN_C2HTERM_PLEN ||
+ plen > NVME_TCP_MAX_C2HTERM_PLEN) {
+ dev_err(queue->ctrl->ctrl.device,
+ "Received a malformed C2HTermReq PDU (plen = %u)\n",
+ plen);
+ return;
+ }
+
+ fes = le16_to_cpu(pdu->fes);
+ if (fes && fes < ARRAY_SIZE(msg_table))
+ msg = msg_table[fes];
+ else
+ msg = "Unknown";
+
+ dev_err(queue->ctrl->ctrl.device,
+ "Received C2HTermReq (FES = %s)\n", msg);
+}
+
static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
unsigned int *offset, size_t *len)
{
@@ -781,6 +842,25 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
return 0;
hdr = queue->pdu;
+ if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) {
+ if (!nvme_tcp_recv_pdu_supported(hdr->type))
+ goto unsupported_pdu;
+
+ dev_err(queue->ctrl->ctrl.device,
+ "pdu type %d has unexpected header length (%d)\n",
+ hdr->type, hdr->hlen);
+ return -EPROTO;
+ }
+
+ if (unlikely(hdr->type == nvme_tcp_c2h_term)) {
+ /*
+ * C2HTermReq never includes Header or Data digests.
+ * Skip the checks.
+ */
+ nvme_tcp_handle_c2h_term(queue, (void *)queue->pdu);
+ return -EINVAL;
+ }
+
if (queue->hdr_digest) {
ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
if (unlikely(ret))
@@ -804,10 +884,13 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
nvme_tcp_init_recv_ctx(queue);
return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
default:
- dev_err(queue->ctrl->ctrl.device,
- "unsupported pdu type (%d)\n", hdr->type);
- return -EINVAL;
+ goto unsupported_pdu;
}
+
+unsupported_pdu:
+ dev_err(queue->ctrl->ctrl.device,
+ "unsupported pdu type (%d)\n", hdr->type);
+ return -EINVAL;
}
static inline void nvme_tcp_end_request(struct request *rq, u16 status)
@@ -1277,7 +1360,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
queue->nr_cqe = 0;
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
release_sock(sk);
- return consumed;
+ return consumed == -EAGAIN ? 0 : consumed;
}
static void nvme_tcp_io_work(struct work_struct *w)
@@ -1305,6 +1388,11 @@ static void nvme_tcp_io_work(struct work_struct *w)
else if (unlikely(result < 0))
return;
+ /* did we get some space after spending time in recv? */
+ if (nvme_tcp_queue_has_pending(queue) &&
+ sk_stream_is_writeable(queue->sock->sk))
+ pending = true;
+
if (!pending || !queue->rd_enabled)
return;
@@ -1446,8 +1534,11 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
msg.msg_control = cbuf;
msg.msg_controllen = sizeof(cbuf);
}
+ msg.msg_flags = MSG_WAITALL;
ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
iov.iov_len, msg.msg_flags);
+ if (ret >= 0 && ret < sizeof(*icresp))
+ ret = -ECONNRESET;
if (ret < 0) {
pr_warn("queue %d: failed to receive icresp, error %d\n",
nvme_tcp_queue_id(queue), ret);
@@ -1562,23 +1653,56 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
ctrl->io_queues[HCTX_TYPE_POLL];
}
+/*
+ * Track the number of queues assigned to each cpu using a global per-cpu
+ * counter and select the least used cpu from the mq_map. Our goal is to spread
+ * different controllers I/O threads across different cpu cores.
+ *
+ * Note that the accounting is not 100% perfect, but we don't need to be, we're
+ * simply putting our best effort to select the best candidate cpu core that we
+ * find at any given point.
+ */
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
- int qid = nvme_tcp_queue_id(queue);
- int n = 0;
+ struct blk_mq_tag_set *set = &ctrl->tag_set;
+ int qid = nvme_tcp_queue_id(queue) - 1;
+ unsigned int *mq_map = NULL;
+ int cpu, min_queues = INT_MAX, io_cpu;
+
+ if (wq_unbound)
+ goto out;
if (nvme_tcp_default_queue(queue))
- n = qid - 1;
+ mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
else if (nvme_tcp_read_queue(queue))
- n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
+ mq_map = set->map[HCTX_TYPE_READ].mq_map;
else if (nvme_tcp_poll_queue(queue))
- n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
- ctrl->io_queues[HCTX_TYPE_READ] - 1;
- if (wq_unbound)
- queue->io_cpu = WORK_CPU_UNBOUND;
- else
- queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+ mq_map = set->map[HCTX_TYPE_POLL].mq_map;
+
+ if (WARN_ON(!mq_map))
+ goto out;
+
+ /* Search for the least used cpu from the mq_map */
+ io_cpu = WORK_CPU_UNBOUND;
+ for_each_online_cpu(cpu) {
+ int num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
+
+ if (mq_map[cpu] != qid)
+ continue;
+ if (num_queues < min_queues) {
+ io_cpu = cpu;
+ min_queues = num_queues;
+ }
+ }
+ if (io_cpu != WORK_CPU_UNBOUND) {
+ queue->io_cpu = io_cpu;
+ atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
+ set_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags);
+ }
+out:
+ dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
+ qid, queue->io_cpu);
}
static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
@@ -1653,9 +1777,14 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
qid, ret);
tls_handshake_cancel(queue->sock->sk);
} else {
- dev_dbg(nctrl->device,
- "queue %d: TLS handshake complete, error %d\n",
- qid, queue->tls_err);
+ if (queue->tls_err) {
+ dev_err(nctrl->device,
+ "queue %d: TLS handshake complete, error %d\n",
+ qid, queue->tls_err);
+ } else {
+ dev_dbg(nctrl->device,
+ "queue %d: TLS handshake complete\n", qid);
+ }
ret = queue->tls_err;
}
return ret;
@@ -1722,7 +1851,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
queue->sock->sk->sk_allocation = GFP_ATOMIC;
queue->sock->sk->sk_use_task_frag = false;
- nvme_tcp_set_queue_io_cpu(queue);
+ queue->io_cpu = WORK_CPU_UNBOUND;
queue->request = NULL;
queue->data_remaining = 0;
queue->ddgst_remaining = 0;
@@ -1836,7 +1965,7 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
cancel_work_sync(&queue->io_work);
}
-static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
+static void nvme_tcp_stop_queue_nowait(struct nvme_ctrl *nctrl, int qid)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
@@ -1844,6 +1973,9 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
return;
+ if (test_and_clear_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags))
+ atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]);
+
mutex_lock(&queue->queue_lock);
if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
__nvme_tcp_stop_queue(queue);
@@ -1852,6 +1984,31 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
mutex_unlock(&queue->queue_lock);
}
+static void nvme_tcp_wait_queue(struct nvme_ctrl *nctrl, int qid)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+ int timeout = 100;
+
+ while (timeout > 0) {
+ if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags) ||
+ !sk_wmem_alloc_get(queue->sock->sk))
+ return;
+ msleep(2);
+ timeout -= 2;
+ }
+ dev_warn(nctrl->device,
+ "qid %d: timeout draining sock wmem allocation expired\n",
+ qid);
+}
+
+static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
+{
+ nvme_tcp_stop_queue_nowait(nctrl, qid);
+ nvme_tcp_wait_queue(nctrl, qid);
+}
+
+
static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
{
write_lock_bh(&queue->sock->sk->sk_callback_lock);
@@ -1878,9 +2035,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
nvme_tcp_init_recv_ctx(queue);
nvme_tcp_setup_sock_ops(queue);
- if (idx)
+ if (idx) {
+ nvme_tcp_set_queue_io_cpu(queue);
ret = nvmf_connect_io_queue(nctrl, idx);
- else
+ } else
ret = nvmf_connect_admin_queue(nctrl);
if (!ret) {
@@ -1918,7 +2076,9 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
int i;
for (i = 1; i < ctrl->queue_count; i++)
- nvme_tcp_stop_queue(ctrl, i);
+ nvme_tcp_stop_queue_nowait(ctrl, i);
+ for (i = 1; i < ctrl->queue_count; i++)
+ nvme_tcp_wait_queue(ctrl, i);
}
static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
@@ -2455,6 +2615,8 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.offset = 0;
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
+ init_llist_node(&ctrl->async_req.lentry);
+ INIT_LIST_HEAD(&ctrl->async_req.entry);
nvme_tcp_queue_request(&ctrl->async_req, true, true);
}
@@ -2624,6 +2786,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct nvme_tcp_queue *queue = hctx->driver_data;
struct sock *sk = queue->sock->sk;
+ int ret;
if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
return 0;
@@ -2631,9 +2794,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
set_bit(NVME_TCP_Q_POLLING, &queue->flags);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
- nvme_tcp_try_recv(queue);
+ ret = nvme_tcp_try_recv(queue);
clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
- return queue->nr_cqe;
+ return ret < 0 ? ret : queue->nr_cqe;
}
static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
@@ -2856,6 +3019,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
static int __init nvme_tcp_init_module(void)
{
unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
+ int cpu;
BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
@@ -2873,6 +3037,9 @@ static int __init nvme_tcp_init_module(void)
if (!nvme_tcp_wq)
return -ENOMEM;
+ for_each_possible_cpu(cpu)
+ atomic_set(&nvme_tcp_cpu_queues[cpu], 0);
+
nvmf_register_transport(&nvme_tcp_transport);
return 0;
}