From a249d3066de62ce2ed68fdf6445556658ecba222 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 26 Apr 2023 08:04:41 -0700 Subject: nvme-fabrics: add queue setup helpers tcp and rdma transports have lots of duplicate code setting up the different queue mappings. Add common helpers. Cc: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 92 +++---------------------------------------------- 1 file changed, 5 insertions(+), 87 deletions(-) (limited to 'drivers/nvme/host/tcp.c') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index bf0230442d57..260b3554d821 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1802,58 +1802,12 @@ out_free_queues: return ret; } -static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) -{ - unsigned int nr_io_queues; - - nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); - nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); - nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus()); - - return nr_io_queues; -} - -static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, - unsigned int nr_io_queues) -{ - struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); - struct nvmf_ctrl_options *opts = nctrl->opts; - - if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) { - /* - * separate read/write queues - * hand out dedicated default queues only after we have - * sufficient read queues. - */ - ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues; - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; - ctrl->io_queues[HCTX_TYPE_DEFAULT] = - min(opts->nr_write_queues, nr_io_queues); - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } else { - /* - * shared read/write queues - * either no write queues were requested, or we don't have - * sufficient queue count to have dedicated default queues. - */ - ctrl->io_queues[HCTX_TYPE_DEFAULT] = - min(opts->nr_io_queues, nr_io_queues); - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } - - if (opts->nr_poll_queues && nr_io_queues) { - /* map dedicated poll queues only if we have queues left */ - ctrl->io_queues[HCTX_TYPE_POLL] = - min(opts->nr_poll_queues, nr_io_queues); - } -} - static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { unsigned int nr_io_queues; int ret; - nr_io_queues = nvme_tcp_nr_io_queues(ctrl); + nr_io_queues = nvmf_nr_io_queues(ctrl->opts); ret = nvme_set_queue_count(ctrl, &nr_io_queues); if (ret) return ret; @@ -1868,8 +1822,8 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); - nvme_tcp_set_io_queues(ctrl, nr_io_queues); - + nvmf_set_io_queues(ctrl->opts, nr_io_queues, + to_tcp_ctrl(ctrl)->io_queues); return __nvme_tcp_alloc_io_queues(ctrl); } @@ -2449,44 +2403,8 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, static void nvme_tcp_map_queues(struct blk_mq_tag_set *set) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data); - struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - - if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { - /* separate read/write queues */ - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = - ctrl->io_queues[HCTX_TYPE_READ]; - set->map[HCTX_TYPE_READ].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } else { - /* shared read/write queues */ - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_READ].queue_offset = 0; - } - blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); - blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); - - if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { - /* map dedicated poll queues only if we have queues left */ - set->map[HCTX_TYPE_POLL].nr_queues = - ctrl->io_queues[HCTX_TYPE_POLL]; - set->map[HCTX_TYPE_POLL].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT] + - ctrl->io_queues[HCTX_TYPE_READ]; - blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); - } - - dev_info(ctrl->ctrl.device, - "mapped %d/%d/%d default/read/poll queues.\n", - ctrl->io_queues[HCTX_TYPE_DEFAULT], - ctrl->io_queues[HCTX_TYPE_READ], - ctrl->io_queues[HCTX_TYPE_POLL]); + + nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues); } static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) -- cgit v1.2.3 From 7769887817c3dc49b55957badcbd515cf925728b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2023 23:55:04 +0100 Subject: nvme-tcp: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage When transmitting data, call down into TCP using a sendmsg with MSG_SPLICE_PAGES instead of sendpage. Signed-off-by: David Howells Tested-by: Sagi Grimberg Acked-by: Willem de Bruijn cc: Keith Busch cc: Jens Axboe cc: Christoph Hellwig cc: Chaitanya Kulkarni cc: Jens Axboe cc: Matthew Wilcox cc: linux-nvme@lists.infradead.org Link: https://lore.kernel.org/r/20230623225513.2732256-8-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- drivers/nvme/host/tcp.c | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'drivers/nvme/host/tcp.c') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index bf0230442d57..47ae17f16c05 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -997,25 +997,28 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) u32 h2cdata_left = req->h2cdata_left; while (true) { + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, + }; struct page *page = nvme_tcp_req_cur_page(req); size_t offset = nvme_tcp_req_cur_offset(req); size_t len = nvme_tcp_req_cur_length(req); bool last = nvme_tcp_pdu_last_send(req, len); int req_data_sent = req->data_sent; - int ret, flags = MSG_DONTWAIT; + int ret; if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) - flags |= MSG_EOR; + msg.msg_flags |= MSG_EOR; else - flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; + msg.msg_flags |= MSG_MORE; - if (sendpage_ok(page)) { - ret = kernel_sendpage(queue->sock, page, offset, len, - flags); - } else { - ret = sock_no_sendpage(queue->sock, page, offset, len, - flags); - } + if (!sendpage_ok(page)) + msg.msg_flags &= ~MSG_SPLICE_PAGES, + + bvec_set_page(&bvec, page, len, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(queue->sock, &msg); if (ret <= 0) return ret; @@ -1054,22 +1057,24 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; bool inline_data = nvme_tcp_has_inline_data(req); u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) + hdgst - req->offset; - int flags = MSG_DONTWAIT; int ret; if (inline_data || nvme_tcp_queue_more(queue)) - flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; + msg.msg_flags |= MSG_MORE; else - flags |= MSG_EOR; + msg.msg_flags |= MSG_EOR; if (queue->hdr_digest && !req->offset) nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); - ret = kernel_sendpage(queue->sock, virt_to_page(pdu), - offset_in_page(pdu) + req->offset, len, flags); + bvec_set_virt(&bvec, (void *)pdu + req->offset, len); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(queue->sock, &msg); if (unlikely(ret <= 0)) return ret; @@ -1093,6 +1098,8 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req); + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, }; u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) - req->offset + hdgst; int ret; @@ -1101,13 +1108,11 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); if (!req->h2cdata_left) - ret = kernel_sendpage(queue->sock, virt_to_page(pdu), - offset_in_page(pdu) + req->offset, len, - MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); - else - ret = sock_no_sendpage(queue->sock, virt_to_page(pdu), - offset_in_page(pdu) + req->offset, len, - MSG_DONTWAIT | MSG_MORE); + msg.msg_flags |= MSG_SPLICE_PAGES; + + bvec_set_virt(&bvec, (void *)pdu + req->offset, len); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(queue->sock, &msg); if (unlikely(ret <= 0)) return ret; -- cgit v1.2.3 From c97d3fb9e0e716b77d5c0d3051d46e2f41dc6fe4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Jun 2023 22:47:53 +0100 Subject: nvme-tcp: Fix comma-related oops Fix a comma that should be a semicolon. The comma is at the end of an if-body and thus makes the statement after (a bvec_set_page()) conditional too, resulting in an oops because we didn't fill out the bio_vec[]: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page ... Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] RIP: 0010:skb_splice_from_iter+0xf1/0x370 ... Call Trace: tcp_sendmsg_locked+0x3a6/0xdd0 tcp_sendmsg+0x31/0x50 inet_sendmsg+0x47/0x80 sock_sendmsg+0x99/0xb0 nvme_tcp_try_send_data+0x149/0x490 [nvme_tcp] nvme_tcp_try_send+0x1b7/0x300 [nvme_tcp] nvme_tcp_io_work+0x40/0xc0 [nvme_tcp] process_one_work+0x21c/0x430 worker_thread+0x54/0x3e0 kthread+0xf8/0x130 Fixes: 7769887817c3 ("nvme-tcp: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage") Reported-by: Aurelien Aptel Link: https://lore.kernel.org/r/253mt0il43o.fsf@mtr-vdi-124.i-did-not-set--mail-host-address--so-tickle-me/ Signed-off-by: David Howells cc: Sagi Grimberg cc: Willem de Bruijn cc: Keith Busch cc: Jens Axboe cc: Christoph Hellwig cc: Chaitanya Kulkarni cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: Jens Axboe cc: Jens Axboe cc: Matthew Wilcox cc: linux-nvme@lists.infradead.org cc: netdev@vger.kernel.org Reviewed-by: Chaitanya Kulkarni Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme/host/tcp.c') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 3e7dd6f91832..9ce417cd32a7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1014,7 +1014,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) msg.msg_flags |= MSG_MORE; if (!sendpage_ok(page)) - msg.msg_flags &= ~MSG_SPLICE_PAGES, + msg.msg_flags &= ~MSG_SPLICE_PAGES; bvec_set_page(&bvec, page, len, offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); -- cgit v1.2.3