From 71f28f3136aff5890cd56de78abc673f8393cad9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 13 Jul 2022 22:07:10 +0800 Subject: ublk_drv: add io_uring based userspace block driver This is the driver part of userspace block driver(ublk driver), the other part is userspace daemon part(ublksrv)[1]. The two parts communicate by io_uring's IORING_OP_URING_CMD with one shared cmd buffer for storing io command, and the buffer is read only for ublksrv, each io command is indexed by io request tag directly, and is written by ublk driver. For example, when one READ io request is submitted to ublk block driver, ublk driver stores the io command into cmd buffer first, then completes one IORING_OP_URING_CMD for notifying ublksrv, and the URING_CMD is issued to ublk driver beforehand by ublksrv for getting notification of any new io request, and each URING_CMD is associated with one io request by tag. After ublksrv gets the io command, it translates and handles the ublk io request, such as, for the ublk-loop target, ublksrv translates the request into same request on another file or disk, like the kernel loop block driver. In ublksrv's implementation, the io is still handled by io_uring, and share same ring with IORING_OP_URING_CMD command. When the target io request is done, the same IORING_OP_URING_CMD is issued to ublk driver for both committing io request result and getting future notification of new io request. Another thing done by ublk driver is to copy data between kernel io request and ublksrv's io buffer: 1) before ubsrv handles WRITE request, copy the request's data into ublksrv's userspace io buffer, so that ublksrv can handle the write request 2) after ubsrv handles READ request, copy ublksrv's userspace io buffer into this READ request, then ublk driver can complete the READ request Zero copy may be switched if mm is ready to support it. ublk driver doesn't handle any logic of the specific user space driver, so it is small/simple enough. [1] ublksrv https://github.com/ming1/ubdsrv Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220713140711.97356-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 156 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 include/uapi/linux/ublk_cmd.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h new file mode 100644 index 000000000000..4f0c16ec875e --- /dev/null +++ b/include/uapi/linux/ublk_cmd.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef USER_BLK_DRV_CMD_INC_H +#define USER_BLK_DRV_CMD_INC_H + +#include + +/* ublk server command definition */ + +/* + * Admin commands, issued by ublk server, and handled by ublk driver. + */ +#define UBLK_CMD_GET_QUEUE_AFFINITY 0x01 +#define UBLK_CMD_GET_DEV_INFO 0x02 +#define UBLK_CMD_ADD_DEV 0x04 +#define UBLK_CMD_DEL_DEV 0x05 +#define UBLK_CMD_START_DEV 0x06 +#define UBLK_CMD_STOP_DEV 0x07 + +/* + * IO commands, issued by ublk server, and handled by ublk driver. + * + * FETCH_REQ: issued via sqe(URING_CMD) beforehand for fetching IO request + * from ublk driver, should be issued only when starting device. After + * the associated cqe is returned, request's tag can be retrieved via + * cqe->userdata. + * + * COMMIT_AND_FETCH_REQ: issued via sqe(URING_CMD) after ublkserver handled + * this IO request, request's handling result is committed to ublk + * driver, meantime FETCH_REQ is piggyback, and FETCH_REQ has to be + * handled before completing io request. + */ +#define UBLK_IO_FETCH_REQ 0x20 +#define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21 + +/* only ABORT means that no re-fetch */ +#define UBLK_IO_RES_OK 0 +#define UBLK_IO_RES_ABORT (-ENODEV) + +#define UBLKSRV_CMD_BUF_OFFSET 0 +#define UBLKSRV_IO_BUF_OFFSET 0x80000000 + +/* tag bit is 12bit, so at most 4096 IOs for each queue */ +#define UBLK_MAX_QUEUE_DEPTH 4096 + +/* + * zero copy requires 4k block size, and can remap ublk driver's io + * request into ublksrv's vm space + */ +#define UBLK_F_SUPPORT_ZERO_COPY (1UL << 0) + +/* device state */ +#define UBLK_S_DEV_DEAD 0 +#define UBLK_S_DEV_LIVE 1 + +/* shipped via sqe->cmd of io_uring command */ +struct ublksrv_ctrl_cmd { + /* sent to which device, must be valid */ + __u32 dev_id; + + /* sent to which queue, must be -1 if the cmd isn't for queue */ + __u16 queue_id; + /* + * cmd specific buffer, can be IN or OUT. + */ + __u16 len; + __u64 addr; + + /* inline data */ + __u64 data[2]; +}; + +struct ublksrv_ctrl_dev_info { + __u16 nr_hw_queues; + __u16 queue_depth; + __u16 block_size; + __u16 state; + + __u32 rq_max_blocks; + __u32 dev_id; + + __u64 dev_blocks; + + __s32 ublksrv_pid; + __s32 reserved0; + __u64 flags[2]; + + /* For ublksrv internal use, invisible to ublk driver */ + __u64 ublksrv_flags; + __u64 reserved1[9]; +}; + +#define UBLK_IO_OP_READ 0 +#define UBLK_IO_OP_WRITE 1 +#define UBLK_IO_OP_FLUSH 2 +#define UBLK_IO_OP_DISCARD 3 +#define UBLK_IO_OP_WRITE_SAME 4 +#define UBLK_IO_OP_WRITE_ZEROES 5 + +#define UBLK_IO_F_FAILFAST_DEV (1U << 8) +#define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) +#define UBLK_IO_F_FAILFAST_DRIVER (1U << 10) +#define UBLK_IO_F_META (1U << 11) +#define UBLK_IO_F_INTEGRITY (1U << 12) +#define UBLK_IO_F_FUA (1U << 13) +#define UBLK_IO_F_PREFLUSH (1U << 14) +#define UBLK_IO_F_NOUNMAP (1U << 15) +#define UBLK_IO_F_SWAP (1U << 16) + +/* + * io cmd is described by this structure, and stored in share memory, indexed + * by request tag. + * + * The data is stored by ublk driver, and read by ublksrv after one fetch command + * returns. + */ +struct ublksrv_io_desc { + /* op: bit 0-7, flags: bit 8-31 */ + __u32 op_flags; + + __u32 nr_sectors; + + /* start sector for this io */ + __u64 start_sector; + + /* buffer address in ublksrv daemon vm space, from ublk driver */ + __u64 addr; +}; + +static inline __u8 ublksrv_get_op(const struct ublksrv_io_desc *iod) +{ + return iod->op_flags & 0xff; +} + +static inline __u32 ublksrv_get_flags(const struct ublksrv_io_desc *iod) +{ + return iod->op_flags >> 8; +} + +/* issued to ublk driver via /dev/ublkcN */ +struct ublksrv_io_cmd { + __u16 q_id; + + /* for fetch/commit which result */ + __u16 tag; + + /* io result, it is valid for COMMIT* command only */ + __s32 result; + + /* + * userspace buffer address in ublksrv daemon process, valid for + * FETCH* command only + */ + __u64 addr; +}; + +#endif -- cgit v1.2.3 From 0edb3696c1713c42f52acbd8355b545e58f782b1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 13 Jul 2022 22:07:11 +0800 Subject: ublk_drv: support to complete io command via task_work_add Use task_work_add if it is available, since task_work_add can bring up better performance, especially batching signaling ->ubq_daemon can be done. It is observed that task_work_add() can boost iops by +4% on random 4k io test. Also except for completing io command, all other code paths are same with completing io command via io_uring_cmd_complete_in_task. Meantime add one flag of UBLK_F_URING_CMD_COMP_IN_TASK for comparing the mode easily. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220713140711.97356-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 75 ++++++++++++++++++++++++++++++++++++++----- include/uapi/linux/ublk_cmd.h | 6 ++++ 2 files changed, 73 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 922a84c86fc6..35fa06ee70ff 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -41,10 +41,15 @@ #include #include #include +#include #include #define UBLK_MINORS (1U << MINORBITS) +struct ublk_rq_data { + struct callback_head work; +}; + struct ublk_uring_cmd_pdu { struct request *req; }; @@ -91,6 +96,7 @@ struct ublk_queue { int q_id; int q_depth; + unsigned long flags; struct task_struct *ubq_daemon; char *io_cmd_buf; @@ -149,6 +155,14 @@ static DEFINE_MUTEX(ublk_ctl_mutex); static struct miscdevice ublk_misc; +static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) +{ + if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && + !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK)) + return true; + return false; +} + static struct ublk_device *ublk_get_device(struct ublk_device *ub) { if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) @@ -500,12 +514,10 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req) #define UBLK_REQUEUE_DELAY_MS 3 -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +static inline void __ublk_rq_task_work(struct request *req) { - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - struct ublk_device *ub = cmd->file->private_data; - struct request *req = pdu->req; struct ublk_queue *ubq = req->mq_hctx->driver_data; + struct ublk_device *ub = ubq->dev; int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; bool task_exiting = current != ubq->ubq_daemon || @@ -557,13 +569,27 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0); } +static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + __ublk_rq_task_work(pdu->req); +} + +static void ublk_rq_task_work_fn(struct callback_head *work) +{ + struct ublk_rq_data *data = container_of(work, + struct ublk_rq_data, work); + struct request *req = blk_mq_rq_from_pdu(data); + + __ublk_rq_task_work(req); +} + static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct ublk_queue *ubq = hctx->driver_data; struct request *rq = bd->rq; - struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); blk_status_t res; /* fill iod to slot in io cmd buffer */ @@ -574,16 +600,36 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); if (unlikely(ubq_daemon_is_dying(ubq))) { + fail: mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); return BLK_STS_IOERR; } - pdu->req = rq; - io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + if (ublk_can_use_task_work(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + enum task_work_notify_mode notify_mode = bd->last ? + TWA_SIGNAL_NO_IPI : TWA_NONE; + + if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) + goto fail; + } else { + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + pdu->req = rq; + io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + } return BLK_STS_OK; } +static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct ublk_queue *ubq = hctx->driver_data; + + if (ublk_can_use_task_work(ubq)) + __set_notify_signal(ubq->ubq_daemon); +} static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) @@ -595,9 +641,20 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, return 0; } +static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + init_task_work(&data->work, ublk_rq_task_work_fn); + return 0; +} + static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, + .commit_rqs = ublk_commit_rqs, .init_hctx = ublk_init_hctx, + .init_request = ublk_init_rq, }; static int ublk_ch_open(struct inode *inode, struct file *filp) @@ -912,6 +969,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) void *ptr; int size; + ubq->flags = ub->dev_info.flags[0]; ubq->q_id = q_id; ubq->q_depth = ub->dev_info.queue_depth; size = ublk_queue_cmd_buf_size(ub, q_id); @@ -1099,6 +1157,7 @@ static int ublk_add_dev(struct ublk_device *ub) ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ub->tag_set.driver_data = ub; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 4f0c16ec875e..a3f5e7c21807 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -48,6 +48,12 @@ */ #define UBLK_F_SUPPORT_ZERO_COPY (1UL << 0) +/* + * Force to complete io cmd via io_uring_cmd_complete_in_task so that + * performance comparison is done easily with using task_work_add + */ +#define UBLK_F_URING_CMD_COMP_IN_TASK (1UL << 1) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- cgit v1.2.3 From d276a22314c2bad9136c5e0b09eb3c8a560e1161 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Jul 2022 08:30:13 +0200 Subject: ublk: remove UBLK_IO_F_INTEGRITY The ublk protocol has no mechanism to actually transfer the integrity metadata, so don't define this flag, which requires that an integrity payload is attached to a bio. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220718063013.335531-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 --- include/uapi/linux/ublk_cmd.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 42afab25864f..796d8230fb60 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -389,9 +389,6 @@ static inline unsigned int ublk_req_build_flags(struct request *req) if (req->cmd_flags & REQ_META) flags |= UBLK_IO_F_META; - if (req->cmd_flags & REQ_INTEGRITY) - flags |= UBLK_IO_F_INTEGRITY; - if (req->cmd_flags & REQ_FUA) flags |= UBLK_IO_F_FUA; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a3f5e7c21807..d6879eea2fde 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -106,7 +106,6 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) #define UBLK_IO_F_FAILFAST_DRIVER (1U << 10) #define UBLK_IO_F_META (1U << 11) -#define UBLK_IO_F_INTEGRITY (1U << 12) #define UBLK_IO_F_FUA (1U << 13) #define UBLK_IO_F_PREFLUSH (1U << 14) #define UBLK_IO_F_NOUNMAP (1U << 15) -- cgit v1.2.3 From 5f8bcc837a9640ba4bf5e7b1d7f9b254ea029f47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Jul 2022 15:09:10 +0200 Subject: ublk: remove UBLK_IO_F_PREFLUSH REQ_PREFLUSH is turned into REQ_OP_FLUSH by the flush state machine and thus never seen by a blk-mq based driver. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220721130916.1869719-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 --- include/uapi/linux/ublk_cmd.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b90481b295a7..07913b5bccd9 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -392,9 +392,6 @@ static inline unsigned int ublk_req_build_flags(struct request *req) if (req->cmd_flags & REQ_FUA) flags |= UBLK_IO_F_FUA; - if (req->cmd_flags & REQ_PREFLUSH) - flags |= UBLK_IO_F_PREFLUSH; - if (req->cmd_flags & REQ_NOUNMAP) flags |= UBLK_IO_F_NOUNMAP; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index d6879eea2fde..917580b34198 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -107,7 +107,6 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_F_FAILFAST_DRIVER (1U << 10) #define UBLK_IO_F_META (1U << 11) #define UBLK_IO_F_FUA (1U << 13) -#define UBLK_IO_F_PREFLUSH (1U << 14) #define UBLK_IO_F_NOUNMAP (1U << 15) #define UBLK_IO_F_SWAP (1U << 16) -- cgit v1.2.3 From 6d8c5afc9ab14595707ff25d971dde45728eba3e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 22 Jul 2022 18:38:17 +0800 Subject: ublk_drv: make sure that correct flags(features) returned to userspace Userspace may support more features or new added flags, but the driver side can be old, so make sure correct flags(features) returned to userpsace, then userspace can work as expected. Also mark the 2nd flags as reversed, just use the 1st one. When we run out of flags, the reserved one can be handled at that time. Reviewed-by: Christoph Hellwig Reviewed-by: ZiyangZhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220722103817.631258-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 17 ++++++++++++++--- include/uapi/linux/ublk_cmd.h | 7 ++++--- 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 67f91a80a7ab..255b2de46a24 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -46,6 +46,9 @@ #define UBLK_MINORS (1U << MINORBITS) +/* All UBLK_F_* have to be included into UBLK_F_ALL */ +#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK) + struct ublk_rq_data { struct callback_head work; }; @@ -953,7 +956,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) void *ptr; int size; - ubq->flags = ub->dev_info.flags[0]; + ubq->flags = ub->dev_info.flags; ubq->q_id = q_id; ubq->q_depth = ub->dev_info.queue_depth; size = ublk_queue_cmd_buf_size(ub, q_id); @@ -1246,7 +1249,7 @@ out_put_device: static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) { pr_devel("%s: dev id %d flags %llx\n", __func__, - info->dev_id, info->flags[0]); + info->dev_id, info->flags); pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", info->nr_hw_queues, info->queue_depth, info->block_size, info->dev_blocks); @@ -1298,8 +1301,16 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) /* update device id */ ub->dev_info.dev_id = ub->ub_number; + /* + * 64bit flags will be copied back to userspace as feature + * negotiation result, so have to clear flags which driver + * doesn't support yet, then userspace can get correct flags + * (features) to handle. + */ + ub->dev_info.flags &= UBLK_F_ALL; + /* We are not ready to support zero copy */ - ub->dev_info.flags[0] &= ~UBLK_F_SUPPORT_ZERO_COPY; + ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; ub->bs_shift = ilog2(ub->dev_info.block_size); ub->dev_info.nr_hw_queues = min_t(unsigned int, diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 917580b34198..ca33092354ab 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -46,13 +46,13 @@ * zero copy requires 4k block size, and can remap ublk driver's io * request into ublksrv's vm space */ -#define UBLK_F_SUPPORT_ZERO_COPY (1UL << 0) +#define UBLK_F_SUPPORT_ZERO_COPY (1ULL << 0) /* * Force to complete io cmd via io_uring_cmd_complete_in_task so that * performance comparison is done easily with using task_work_add */ -#define UBLK_F_URING_CMD_COMP_IN_TASK (1UL << 1) +#define UBLK_F_URING_CMD_COMP_IN_TASK (1ULL << 1) /* device state */ #define UBLK_S_DEV_DEAD 0 @@ -88,7 +88,8 @@ struct ublksrv_ctrl_dev_info { __s32 ublksrv_pid; __s32 reserved0; - __u64 flags[2]; + __u64 flags; + __u64 flags_reserved; /* For ublksrv internal use, invisible to ublk driver */ __u64 ublksrv_flags; -- cgit v1.2.3