diff options
Diffstat (limited to 'drivers/block/ublk_drv.c')
-rw-r--r-- | drivers/block/ublk_drv.c | 1095 |
1 files changed, 767 insertions, 328 deletions
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 40f971a66d3e..99abd67b708b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -48,8 +48,12 @@ #define UBLK_MINORS (1U << MINORBITS) +#define UBLK_INVALID_BUF_IDX ((u16)-1) + /* private ioctl command mirror */ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) +#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) +#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) @@ -64,7 +68,12 @@ | UBLK_F_CMD_IOCTL_ENCODE \ | UBLK_F_USER_COPY \ | UBLK_F_ZONED \ - | UBLK_F_USER_RECOVERY_FAIL_IO) + | UBLK_F_USER_RECOVERY_FAIL_IO \ + | UBLK_F_UPDATE_SIZE \ + | UBLK_F_AUTO_BUF_REG \ + | UBLK_F_QUIESCE \ + | UBLK_F_PER_IO_DAEMON \ + | UBLK_F_BUF_REG_OFF_DAEMON) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -76,10 +85,6 @@ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) -struct ublk_rq_data { - struct kref ref; -}; - struct ublk_uring_cmd_pdu { /* * Store requests in same batch temporarily for queuing them to @@ -99,6 +104,7 @@ struct ublk_uring_cmd_pdu { * setup in ublk uring_cmd handler */ struct ublk_queue *ubq; + u16 tag; }; @@ -131,28 +137,68 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_IO_FLAG_NEED_GET_DATA 0x08 +/* + * request buffer is registered automatically, so we have to unregister it + * before completing this request. + * + * io_uring will unregister buffer automatically for us during exiting. + */ +#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10 + /* atomic RW with ubq->cancel_lock */ #define UBLK_IO_FLAG_CANCELED 0x80000000 +/* + * Initialize refcount to a large number to include any registered buffers. + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for + * any buffers registered on the io daemon task. + */ +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) + struct ublk_io { /* userspace buffer address from io cmd */ - __u64 addr; + union { + __u64 addr; + struct ublk_auto_buf_reg buf; + }; unsigned int flags; int res; - struct io_uring_cmd *cmd; -}; + union { + /* valid if UBLK_IO_FLAG_ACTIVE is set */ + struct io_uring_cmd *cmd; + /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */ + struct request *req; + }; + + struct task_struct *task; + + /* + * The number of uses of this I/O by the ublk server + * if user copy or zero copy are enabled: + * - UBLK_REFCOUNT_INIT from dispatch to the server + * until UBLK_IO_COMMIT_AND_FETCH_REQ + * - 1 for each inflight ublk_ch_{read,write}_iter() call + * - 1 for each io_uring registered buffer not registered on task + * The I/O can only be completed once all references are dropped. + * User copy and buffer registration operations are only permitted + * if the reference count is nonzero. + */ + refcount_t ref; + /* Count of buffers registered on task and not yet unregistered */ + unsigned task_registered_buffers; + + void *buf_ctx_handle; +} ____cacheline_aligned_in_smp; struct ublk_queue { int q_id; int q_depth; unsigned long flags; - struct task_struct *ubq_daemon; struct ublksrv_io_desc *io_cmd_buf; bool force_abort; - bool timeout; bool canceling; bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ unsigned short nr_io_ready; /* how many ios setup */ @@ -189,7 +235,10 @@ struct ublk_device { struct completion completion; unsigned int nr_queues_ready; - unsigned int nr_privileged_daemon; + bool unprivileged_daemons; + struct mutex cancel_mutex; + bool canceling; + pid_t ublksrv_tgid; }; /* header of ublk_params */ @@ -198,16 +247,18 @@ struct ublk_params_header { __u32 types; }; +static void ublk_io_release(void *priv); static void ublk_stop_dev_unlocked(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - struct ublk_queue *ubq, int tag, size_t offset); + const struct ublk_queue *ubq, struct ublk_io *io, + size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); -static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, - int tag); -static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) + +static inline struct ublksrv_io_desc * +ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) { - return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); + return &ubq->io_cmd_buf[tag]; } static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) @@ -361,8 +412,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector, if (ret) goto free_req; - ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, - GFP_KERNEL); + ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL); if (ret) goto erase_desc; @@ -482,7 +532,6 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, #endif static inline void __ublk_complete_rq(struct request *req); -static void ublk_complete_rq(struct kref *ref); static dev_t ublk_chr_devt; static const struct class ublk_chr_class = { @@ -609,14 +658,25 @@ static void ublk_apply_params(struct ublk_device *ub) ublk_dev_param_zoned_apply(ub); } +static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; +} + +static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_AUTO_BUF_REG; +} + static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) { - return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); + return ubq->flags & UBLK_F_USER_COPY; } static inline bool ublk_need_map_io(const struct ublk_queue *ubq) { - return !ublk_support_user_copy(ubq); + return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) && + !ublk_support_auto_buf_reg(ubq); } static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) @@ -624,42 +684,42 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) /* * read()/write() is involved in user copy, so request reference * has to be grabbed + * + * for zero copy, request buffer need to be registered to io_uring + * buffer table, so reference is needed + * + * For auto buffer register, ublk server still may issue + * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up, + * so reference is required too. */ - return ublk_support_user_copy(ubq); + return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) || + ublk_support_auto_buf_reg(ubq); } static inline void ublk_init_req_ref(const struct ublk_queue *ubq, - struct request *req) + struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - kref_init(&data->ref); - } + if (ublk_need_req_ref(ubq)) + refcount_set(&io->ref, UBLK_REFCOUNT_INIT); } -static inline bool ublk_get_req_ref(const struct ublk_queue *ubq, - struct request *req) +static inline bool ublk_get_req_ref(struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - return kref_get_unless_zero(&data->ref); - } + return refcount_inc_not_zero(&io->ref); +} - return true; +static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req) +{ + if (refcount_dec_and_test(&io->ref)) + __ublk_complete_rq(req); } -static inline void ublk_put_req_ref(const struct ublk_queue *ubq, - struct request *req) +static inline bool ublk_sub_req_ref(struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers; - kref_put(&data->ref, ublk_complete_rq); - } else { - __ublk_complete_rq(req); - } + io->task_registered_buffers = 0; + return refcount_sub_and_test(sub_refs, &io->ref); } static inline bool ublk_need_get_data(const struct ublk_queue *ubq) @@ -692,12 +752,6 @@ static inline bool ublk_rq_has_data(const struct request *rq) return bio_has_data(rq->bio); } -static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, - int tag) -{ - return &ubq->io_cmd_buf[tag]; -} - static inline struct ublksrv_io_desc * ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) { @@ -942,7 +996,7 @@ static inline bool ublk_need_unmap_req(const struct request *req) } static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, - struct ublk_io *io) + const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); @@ -966,7 +1020,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, static int ublk_unmap_io(const struct ublk_queue *ubq, const struct request *req, - struct ublk_io *io) + const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); @@ -1061,11 +1115,6 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu); } -static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) -{ - return !ubq->ubq_daemon || ubq->ubq_daemon->flags & PF_EXITING; -} - /* todo: handle partial completion */ static inline void __ublk_complete_rq(struct request *req) { @@ -1106,7 +1155,7 @@ static inline void __ublk_complete_rq(struct request *req) if (blk_update_request(req, BLK_STS_OK, io->res)) blk_mq_requeue_request(req, true); - else + else if (likely(!blk_should_fake_timeout(req->q))) __blk_mq_end_request(req, BLK_STS_OK); return; @@ -1114,18 +1163,12 @@ exit: blk_mq_end_request(req, res); } -static void ublk_complete_rq(struct kref *ref) +static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io, + struct request *req) { - struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data, - ref); - struct request *req = blk_mq_rq_from_pdu(data); - - __ublk_complete_rq(req); -} + /* read cmd first because req will overwrite it */ + struct io_uring_cmd *cmd = io->cmd; -static void ubq_complete_io_cmd(struct ublk_io *io, int res, - unsigned issue_flags) -{ /* mark this cmd owned by ublksrv */ io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; @@ -1135,8 +1178,17 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res, */ io->flags &= ~UBLK_IO_FLAG_ACTIVE; + io->req = req; + return cmd; +} + +static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, + int res, unsigned issue_flags) +{ + struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req); + /* tell ublksrv one io request is coming */ - io_uring_cmd_done(io->cmd, res, 0, issue_flags); + io_uring_cmd_done(cmd, res, 0, issue_flags); } #define UBLK_REQUEUE_DELAY_MS 3 @@ -1151,28 +1203,97 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, blk_mq_end_request(rq, BLK_STS_IOERR); } +static void +ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io) +{ + unsigned tag = io - ubq->ios; + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag); + + iod->op_flags |= UBLK_IO_F_NEED_REG_BUF; +} + +static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io, unsigned int issue_flags) +{ + int ret; + + ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release, + io->buf.index, issue_flags); + if (ret) { + if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) { + ublk_auto_buf_reg_fallback(ubq, io); + return true; + } + blk_mq_end_request(req, BLK_STS_IOERR); + return false; + } + + io->task_registered_buffers = 1; + io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd); + io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; + return true; +} + +static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq, + struct request *req, struct ublk_io *io, + unsigned int issue_flags) +{ + ublk_init_req_ref(ubq, io); + if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) + return ublk_auto_buf_reg(ubq, req, io, issue_flags); + + return true; +} + +static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io) +{ + unsigned mapped_bytes = ublk_map_io(ubq, req, io); + + /* partially mapped, update io descriptor */ + if (unlikely(mapped_bytes != blk_rq_bytes(req))) { + /* + * Nothing mapped, retry until we succeed. + * + * We may never succeed in mapping any bytes here because + * of OOM. TODO: reserve one buffer with single page pinned + * for providing forward progress guarantee. + */ + if (unlikely(!mapped_bytes)) { + blk_mq_requeue_request(req, false); + blk_mq_delay_kick_requeue_list(req->q, + UBLK_REQUEUE_DELAY_MS); + return false; + } + + ublk_get_iod(ubq, req->tag)->nr_sectors = + mapped_bytes >> 9; + } + + return true; +} + static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req, unsigned int issue_flags) { int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; - unsigned int mapped_bytes; - pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n", - __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, + pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n", + __func__, ubq->q_id, req->tag, io->flags, ublk_get_iod(ubq, req->tag)->addr); /* * Task is exiting if either: * - * (1) current != ubq_daemon. + * (1) current != io->task. * io_uring_cmd_complete_in_task() tries to run task_work - * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING. + * in a workqueue if cmd's task is PF_EXITING. * * (2) current->flags & PF_EXITING. */ - if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { + if (unlikely(current != io->task || current->flags & PF_EXITING)) { __ublk_abort_rq(ubq, req); return; } @@ -1180,54 +1301,22 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) { /* * We have not handled UBLK_IO_NEED_GET_DATA command yet, - * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv + * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv * and notify it. */ - if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) { - io->flags |= UBLK_IO_FLAG_NEED_GET_DATA; - pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n", - __func__, io->cmd->cmd_op, ubq->q_id, - req->tag, io->flags); - ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags); - return; - } - /* - * We have handled UBLK_IO_NEED_GET_DATA command, - * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just - * do the copy work. - */ - io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; - /* update iod->addr because ublksrv may have passed a new io buffer */ - ublk_get_iod(ubq, req->tag)->addr = io->addr; - pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n", - __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, - ublk_get_iod(ubq, req->tag)->addr); + io->flags |= UBLK_IO_FLAG_NEED_GET_DATA; + pr_devel("%s: need get data. qid %d tag %d io_flags %x\n", + __func__, ubq->q_id, req->tag, io->flags); + ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA, + issue_flags); + return; } - mapped_bytes = ublk_map_io(ubq, req, io); - - /* partially mapped, update io descriptor */ - if (unlikely(mapped_bytes != blk_rq_bytes(req))) { - /* - * Nothing mapped, retry until we succeed. - * - * We may never succeed in mapping any bytes here because - * of OOM. TODO: reserve one buffer with single page pinned - * for providing forward progress guarantee. - */ - if (unlikely(!mapped_bytes)) { - blk_mq_requeue_request(req, false); - blk_mq_delay_kick_requeue_list(req->q, - UBLK_REQUEUE_DELAY_MS); - return; - } - - ublk_get_iod(ubq, req->tag)->nr_sectors = - mapped_bytes >> 9; - } + if (!ublk_start_io(ubq, req, io)) + return; - ublk_init_req_ref(ubq, req); - ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); + if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags)) + ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); } static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, @@ -1253,24 +1342,22 @@ static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct request *rq = pdu->req_list; - struct ublk_queue *ubq = pdu->ubq; struct request *next; do { next = rq->rq_next; rq->rq_next = NULL; - ublk_dispatch_req(ubq, rq, issue_flags); + ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags); rq = next; } while (rq); } -static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) +static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l) { - struct request *rq = rq_list_peek(l); - struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct io_uring_cmd *cmd = io->cmd; struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - pdu->req_list = rq; + pdu->req_list = rq_list_peek(l); rq_list_init(l); io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb); } @@ -1278,17 +1365,23 @@ static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) static enum blk_eh_timer_return ublk_timeout(struct request *rq) { struct ublk_queue *ubq = rq->mq_hctx->driver_data; + pid_t tgid = ubq->dev->ublksrv_tgid; + struct task_struct *p; + struct pid *pid; - if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { - if (!ubq->timeout) { - send_sig(SIGKILL, ubq->ubq_daemon, 0); - ubq->timeout = true; - } + if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV)) + return BLK_EH_RESET_TIMER; - return BLK_EH_DONE; - } + if (unlikely(!tgid)) + return BLK_EH_RESET_TIMER; - return BLK_EH_RESET_TIMER; + rcu_read_lock(); + pid = find_vpid(tgid); + p = pid_task(pid, PIDTYPE_PID); + if (p) + send_sig(SIGKILL, p, 0); + rcu_read_unlock(); + return BLK_EH_DONE; } static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, @@ -1296,7 +1389,7 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, { blk_status_t res; - if (unlikely(ubq->fail_io)) + if (unlikely(READ_ONCE(ubq->fail_io))) return BLK_STS_TARGET; /* With recovery feature enabled, force_abort is set in @@ -1308,7 +1401,8 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, * Note: force_abort is guaranteed to be seen because it is set * before request queue is unqiuesced. */ - if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) + if (ublk_nosrv_should_queue_io(ubq) && + unlikely(READ_ONCE(ubq->force_abort))) return BLK_STS_IOERR; if (check_cancel && unlikely(ubq->canceling)) @@ -1348,28 +1442,39 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } +static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, + const struct ublk_io *io2) +{ + return (io_uring_cmd_ctx_handle(io->cmd) == + io_uring_cmd_ctx_handle(io2->cmd)) && + (io->task == io2->task); +} + static void ublk_queue_rqs(struct rq_list *rqlist) { struct rq_list requeue_list = { }; struct rq_list submit_list = { }; - struct ublk_queue *ubq = NULL; + struct ublk_io *io = NULL; struct request *req; while ((req = rq_list_pop(rqlist))) { struct ublk_queue *this_q = req->mq_hctx->driver_data; + struct ublk_io *this_io = &this_q->ios[req->tag]; - if (ubq && ubq != this_q && !rq_list_empty(&submit_list)) - ublk_queue_cmd_list(ubq, &submit_list); - ubq = this_q; - - if (ublk_prep_req(ubq, req, true) == BLK_STS_OK) - rq_list_add_tail(&submit_list, req); - else + if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) { rq_list_add_tail(&requeue_list, req); + continue; + } + + if (io && !ublk_belong_to_same_batch(io, this_io) && + !rq_list_empty(&submit_list)) + ublk_queue_cmd_list(io, &submit_list); + io = this_io; + rq_list_add_tail(&submit_list, req); } - if (ubq && !rq_list_empty(&submit_list)) - ublk_queue_cmd_list(ubq, &submit_list); + if (!rq_list_empty(&submit_list)) + ublk_queue_cmd_list(io, &submit_list); *rqlist = requeue_list; } @@ -1397,17 +1502,6 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) /* All old ioucmds have to be completed */ ubq->nr_io_ready = 0; - /* - * old daemon is PF_EXITING, put it now - * - * It could be NULL in case of closing one quisced device. - */ - if (ubq->ubq_daemon) - put_task_struct(ubq->ubq_daemon); - /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ - ubq->ubq_daemon = NULL; - ubq->timeout = false; - for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -1418,6 +1512,20 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) io->flags &= UBLK_IO_FLAG_CANCELED; io->cmd = NULL; io->addr = 0; + + /* + * old task is PF_EXITING, put it now + * + * It could be NULL in case of closing one quiesced + * device. + */ + if (io->task) { + put_task_struct(io->task); + io->task = NULL; + } + + WARN_ON_ONCE(refcount_read(&io->ref)); + WARN_ON_ONCE(io->task_registered_buffers); } } @@ -1429,6 +1537,7 @@ static int ublk_ch_open(struct inode *inode, struct file *filp) if (test_and_set_bit(UB_STATE_OPEN, &ub->state)) return -EBUSY; filp->private_data = ub; + ub->ublksrv_tgid = current->tgid; return 0; } @@ -1439,10 +1548,11 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) for (i = 0; i < ub->dev_info.nr_hw_queues; i++) ublk_queue_reinit(ub, ublk_get_queue(ub, i)); - /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ + /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ ub->mm = NULL; ub->nr_queues_ready = 0; - ub->nr_privileged_daemon = 0; + ub->unprivileged_daemons = false; + ub->ublksrv_tgid = -1; } static struct gendisk *ublk_get_disk(struct ublk_device *ub) @@ -1464,6 +1574,27 @@ static void ublk_put_disk(struct gendisk *disk) put_device(disk_to_dev(disk)); } +/* + * Use this function to ensure that ->canceling is consistently set for + * the device and all queues. Do not set these flags directly. + * + * Caller must ensure that: + * - cancel_mutex is held. This ensures that there is no concurrent + * access to ub->canceling and no concurrent writes to ubq->canceling. + * - there are no concurrent reads of ubq->canceling from the queue_rq + * path. This can be done by quiescing the queue, or through other + * means. + */ +static void ublk_set_canceling(struct ublk_device *ub, bool canceling) + __must_hold(&ub->cancel_mutex) +{ + int i; + + ub->canceling = canceling; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->canceling = canceling; +} + static int ublk_ch_release(struct inode *inode, struct file *filp) { struct ublk_device *ub = filp->private_data; @@ -1492,12 +1623,11 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) * All requests may be inflight, so ->canceling may not be set, set * it now. */ - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); - - ubq->canceling = true; - ublk_abort_queue(ub, ubq); - } + mutex_lock(&ub->cancel_mutex); + ublk_set_canceling(ub, true); + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_abort_queue(ub, ublk_get_queue(ub, i)); + mutex_unlock(&ub->cancel_mutex); blk_mq_kick_requeue_list(disk->queue); /* @@ -1515,7 +1645,6 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) * Transition the device to the nosrv state. What exactly this * means depends on the recovery flags */ - blk_mq_quiesce_queue(disk->queue); if (ublk_nosrv_should_stop_dev(ub)) { /* * Allow any pending/future I/O to pass through quickly @@ -1523,8 +1652,7 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) * waits for all pending I/O to complete */ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_get_queue(ub, i)->force_abort = true; - blk_mq_unquiesce_queue(disk->queue); + WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true); ublk_stop_dev_unlocked(ub); } else { @@ -1534,9 +1662,8 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) } else { ub->dev_info.state = UBLK_S_DEV_FAIL_IO; for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_get_queue(ub, i)->fail_io = true; + WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true); } - blk_mq_unquiesce_queue(disk->queue); } unlock: mutex_unlock(&ub->mutex); @@ -1587,30 +1714,6 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } -static void ublk_commit_completion(struct ublk_device *ub, - const struct ublksrv_io_cmd *ub_cmd) -{ - u32 qid = ub_cmd->q_id, tag = ub_cmd->tag; - struct ublk_queue *ubq = ublk_get_queue(ub, qid); - struct ublk_io *io = &ubq->ios[tag]; - struct request *req; - - /* now this cmd slot is owned by nbd driver */ - io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; - io->res = ub_cmd->result; - - /* find the io request and complete */ - req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); - if (WARN_ON_ONCE(unlikely(!req))) - return; - - if (req_op(req) == REQ_OP_ZONE_APPEND) - req->__sector = ub_cmd->zone_append_lba; - - if (likely(!blk_should_fake_timeout(req->q))) - ublk_put_req_ref(ubq, req); -} - static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, struct request *req) { @@ -1639,37 +1742,22 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; - if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) { - struct request *rq; - - /* - * Either we fail the request or ublk_rq_task_work_cb - * will do it - */ - rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); - if (rq && blk_mq_request_started(rq)) - __ublk_fail_req(ubq, io, rq); - } + if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) + __ublk_fail_req(ubq, io, io->req); } } -/* Must be called when queue is frozen */ -static void ublk_mark_queue_canceling(struct ublk_queue *ubq) -{ - spin_lock(&ubq->cancel_lock); - if (!ubq->canceling) - ubq->canceling = true; - spin_unlock(&ubq->cancel_lock); -} - -static void ublk_start_cancel(struct ublk_queue *ubq) +static void ublk_start_cancel(struct ublk_device *ub) { - struct ublk_device *ub = ubq->dev; struct gendisk *disk = ublk_get_disk(ub); /* Our disk has been dead */ if (!disk) return; + + mutex_lock(&ub->cancel_mutex); + if (ub->canceling) + goto out; /* * Now we are serialized with ublk_queue_rq() * @@ -1678,8 +1766,10 @@ static void ublk_start_cancel(struct ublk_queue *ubq) * touch completed uring_cmd */ blk_mq_quiesce_queue(disk->queue); - ublk_mark_queue_canceling(ubq); + ublk_set_canceling(ub, true); blk_mq_unquiesce_queue(disk->queue); +out: + mutex_unlock(&ub->cancel_mutex); ublk_put_disk(disk); } @@ -1705,7 +1795,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, * that ublk_dispatch_req() is always called */ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); - if (req && blk_mq_request_started(req)) + if (req && blk_mq_request_started(req) && req->tag == tag) return; spin_lock(&ubq->cancel_lock); @@ -1739,6 +1829,7 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; struct task_struct *task; + struct ublk_io *io; if (WARN_ON_ONCE(!ubq)) return; @@ -1747,13 +1838,13 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, return; task = io_uring_cmd_get_task(cmd); - if (WARN_ON_ONCE(task && task != ubq->ubq_daemon)) + io = &ubq->ios[pdu->tag]; + if (WARN_ON_ONCE(task && task != io->task)) return; - if (!ubq->canceling) - ublk_start_cancel(ubq); + ublk_start_cancel(ubq->dev); - WARN_ON_ONCE(ubq->ios[pdu->tag].cmd != cmd); + WARN_ON_ONCE(io->cmd != cmd); ublk_cancel_cmd(ubq, pdu->tag, issue_flags); } @@ -1875,9 +1966,11 @@ static void ublk_reset_io_flags(struct ublk_device *ub) for (j = 0; j < ubq->q_depth; j++) ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; spin_unlock(&ubq->cancel_lock); - ubq->canceling = false; ubq->fail_io = false; } + mutex_lock(&ub->cancel_mutex); + ublk_set_canceling(ub, false); + mutex_unlock(&ub->cancel_mutex); } /* device can only be started after all IOs are ready */ @@ -1885,14 +1978,10 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) __must_hold(&ub->mutex) { ubq->nr_io_ready++; - if (ublk_queue_ready(ubq)) { - ubq->ubq_daemon = current; - get_task_struct(ubq->ubq_daemon); + if (ublk_queue_ready(ubq)) ub->nr_queues_ready++; - - if (capable(CAP_SYS_ADMIN)) - ub->nr_privileged_daemon++; - } + if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) + ub->unprivileged_daemons = true; if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) { /* now we are ready for handling ublk io request */ @@ -1914,12 +2003,66 @@ static inline int ublk_check_cmd_op(u32 cmd_op) return 0; } -static inline void ublk_fill_io_cmd(struct ublk_io *io, - struct io_uring_cmd *cmd, unsigned long buf_addr) +static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd) +{ + io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); + + if (io->buf.reserved0 || io->buf.reserved1) + return -EINVAL; + + if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) + return -EINVAL; + return 0; +} + +static int ublk_handle_auto_buf_reg(struct ublk_io *io, + struct io_uring_cmd *cmd, + u16 *buf_idx) { + if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) { + io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG; + + /* + * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ` + * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same + * `io_ring_ctx`. + * + * If this uring_cmd's io_ring_ctx isn't same with the + * one for registering the buffer, it is ublk server's + * responsibility for unregistering the buffer, otherwise + * this ublk request gets stuck. + */ + if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) + *buf_idx = io->buf.index; + } + + return ublk_set_auto_buf_reg(io, cmd); +} + +/* Once we return, `io->req` can't be used any more */ +static inline struct request * +ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd) +{ + struct request *req = io->req; + io->cmd = cmd; io->flags |= UBLK_IO_FLAG_ACTIVE; + /* now this cmd slot is owned by ublk driver */ + io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; + + return req; +} + +static inline int +ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io, + struct io_uring_cmd *cmd, unsigned long buf_addr, + u16 *buf_idx) +{ + if (ublk_support_auto_buf_reg(ubq)) + return ublk_handle_auto_buf_reg(io, cmd, buf_idx); + io->addr = buf_addr; + return 0; } static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, @@ -1941,38 +2084,99 @@ static void ublk_io_release(void *priv) { struct request *rq = priv; struct ublk_queue *ubq = rq->mq_hctx->driver_data; + struct ublk_io *io = &ubq->ios[rq->tag]; - ublk_put_req_ref(ubq, rq); + /* + * task_registered_buffers may be 0 if buffers were registered off task + * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ. + */ + if (current == io->task && io->task_registered_buffers) + io->task_registered_buffers--; + else + ublk_put_req_ref(io, rq); } static int ublk_register_io_buf(struct io_uring_cmd *cmd, - struct ublk_queue *ubq, unsigned int tag, + const struct ublk_queue *ubq, + struct ublk_io *io, unsigned int index, unsigned int issue_flags) { struct ublk_device *ub = cmd->file->private_data; struct request *req; int ret; - req = __ublk_check_and_get_req(ub, ubq, tag, 0); + if (!ublk_support_zero_copy(ubq)) + return -EINVAL; + + req = __ublk_check_and_get_req(ub, ubq, io, 0); if (!req) return -EINVAL; ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, issue_flags); if (ret) { - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return ret; } return 0; } +static int +ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, + const struct ublk_queue *ubq, struct ublk_io *io, + unsigned index, unsigned issue_flags) +{ + unsigned new_registered_buffers; + struct request *req = io->req; + int ret; + + /* + * Ensure there are still references for ublk_sub_req_ref() to release. + * If not, fall back on the thread-safe buffer registration. + */ + new_registered_buffers = io->task_registered_buffers + 1; + if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT)) + return ublk_register_io_buf(cmd, ubq, io, index, issue_flags); + + if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req)) + return -EINVAL; + + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, + issue_flags); + if (ret) + return ret; + + io->task_registered_buffers = new_registered_buffers; + return 0; +} + static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, + const struct ublk_device *ub, unsigned int index, unsigned int issue_flags) { + if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)) + return -EINVAL; + return io_buffer_unregister_bvec(cmd, index, issue_flags); } +static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr) +{ + if (ublk_need_map_io(ubq)) { + /* + * FETCH_RQ has to provide IO buffer if NEED GET + * DATA is not enabled + */ + if (!buf_addr && !ublk_need_get_data(ubq)) + return -EINVAL; + } else if (buf_addr) { + /* User copy requires addr to be unset */ + return -EINVAL; + } + return 0; +} + static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, struct ublk_io *io, __u64 buf_addr) { @@ -1999,59 +2203,135 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); - if (ublk_need_map_io(ubq)) { - /* - * FETCH_RQ has to provide IO buffer if NEED GET - * DATA is not enabled - */ - if (!buf_addr && !ublk_need_get_data(ubq)) - goto out; - } else if (buf_addr) { - /* User copy requires addr to be unset */ - ret = -EINVAL; + ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL); + if (ret) goto out; - } - ublk_fill_io_cmd(io, cmd, buf_addr); + WRITE_ONCE(io->task, get_task_struct(current)); ublk_mark_io_ready(ub, ubq); out: mutex_unlock(&ub->mutex); return ret; } +static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq, + struct ublk_io *io, __u64 buf_addr) +{ + struct request *req = io->req; + + if (ublk_need_map_io(ubq)) { + /* + * COMMIT_AND_FETCH_REQ has to provide IO buffer if + * NEED GET DATA is not enabled or it is Read IO. + */ + if (!buf_addr && (!ublk_need_get_data(ubq) || + req_op(req) == REQ_OP_READ)) + return -EINVAL; + } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) { + /* + * User copy requires addr to be unset when command is + * not zone append + */ + return -EINVAL; + } + + return 0; +} + +static bool ublk_need_complete_req(const struct ublk_queue *ubq, + struct ublk_io *io) +{ + if (ublk_need_req_ref(ubq)) + return ublk_sub_req_ref(io); + return true; +} + +static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) +{ + /* + * We have handled UBLK_IO_NEED_GET_DATA command, + * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just + * do the copy work. + */ + io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; + /* update iod->addr because ublksrv may have passed a new io buffer */ + ublk_get_iod(ubq, req->tag)->addr = io->addr; + pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n", + __func__, ubq->q_id, req->tag, io->flags, + ublk_get_iod(ubq, req->tag)->addr); + + return ublk_start_io(ubq, req, io); +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, const struct ublksrv_io_cmd *ub_cmd) { + u16 buf_idx = UBLK_INVALID_BUF_IDX; struct ublk_device *ub = cmd->file->private_data; struct ublk_queue *ubq; struct ublk_io *io; u32 cmd_op = cmd->cmd_op; unsigned tag = ub_cmd->tag; - int ret = -EINVAL; struct request *req; + int ret; + bool compl; pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", __func__, cmd->cmd_op, ub_cmd->q_id, tag, ub_cmd->result); - if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) + ret = ublk_check_cmd_op(cmd_op); + if (ret) goto out; - ubq = ublk_get_queue(ub, ub_cmd->q_id); - if (!ubq || ub_cmd->q_id != ubq->q_id) - goto out; + /* + * io_buffer_unregister_bvec() doesn't access the ubq or io, + * so no need to validate the q_id, tag, or task + */ + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) + return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr, + issue_flags); - if (ubq->ubq_daemon && ubq->ubq_daemon != current) + ret = -EINVAL; + if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) goto out; + ubq = ublk_get_queue(ub, ub_cmd->q_id); + if (tag >= ubq->q_depth) goto out; io = &ubq->ios[tag]; + /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */ + if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) { + ret = ublk_check_fetch_buf(ubq, ub_cmd->addr); + if (ret) + goto out; + ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); + if (ret) + goto out; + + ublk_prep_cancel(cmd, issue_flags, ubq, tag); + return -EIOCBQUEUED; + } + + if (READ_ONCE(io->task) != current) { + /* + * ublk_register_io_buf() accesses only the io's refcount, + * so can be handled on any task + */ + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) + return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr, + issue_flags); + + goto out; + } /* there is pending io cmd, something must be wrong */ - if (io->flags & UBLK_IO_FLAG_ACTIVE) { + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) { ret = -EBUSY; goto out; } @@ -2064,54 +2344,44 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; - ret = ublk_check_cmd_op(cmd_op); - if (ret) - goto out; - - ret = -EINVAL; switch (_IOC_NR(cmd_op)) { case UBLK_IO_REGISTER_IO_BUF: - return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); - case UBLK_IO_UNREGISTER_IO_BUF: - return ublk_unregister_io_buf(cmd, ub_cmd->addr, issue_flags); - case UBLK_IO_FETCH_REQ: - ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); - if (ret) - goto out; - break; + return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr, + issue_flags); case UBLK_IO_COMMIT_AND_FETCH_REQ: - req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); - - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr); + if (ret) goto out; + io->res = ub_cmd->result; + req = ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx); + compl = ublk_need_complete_req(ubq, io); + + /* can't touch 'ublk_io' any more */ + if (buf_idx != UBLK_INVALID_BUF_IDX) + io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ub_cmd->zone_append_lba; + if (compl) + __ublk_complete_rq(req); - if (ublk_need_map_io(ubq)) { - /* - * COMMIT_AND_FETCH_REQ has to provide IO buffer if - * NEED GET DATA is not enabled or it is Read IO. - */ - if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || - req_op(req) == REQ_OP_READ)) - goto out; - } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) { - /* - * User copy requires addr to be unset when command is - * not zone append - */ - ret = -EINVAL; + if (ret) goto out; - } - - ublk_fill_io_cmd(io, cmd, ub_cmd->addr); - ublk_commit_completion(ub, ub_cmd); break; case UBLK_IO_NEED_GET_DATA: - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) - goto out; - ublk_fill_io_cmd(io, cmd, ub_cmd->addr); - req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); - ublk_dispatch_req(ubq, req, issue_flags); - return -EIOCBQUEUED; + /* + * ublk_get_data() may fail and fallback to requeue, so keep + * uring_cmd active first and prepare for handling new requeued + * request + */ + req = ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL); + WARN_ON_ONCE(ret); + if (likely(ublk_get_data(ubq, io, req))) { + __ublk_prep_compl_io_cmd(io, req); + return UBLK_IO_RES_OK; + } + break; default: goto out; } @@ -2125,18 +2395,20 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - struct ublk_queue *ubq, int tag, size_t offset) + const struct ublk_queue *ubq, struct ublk_io *io, size_t offset) { + unsigned tag = io - ubq->ios; struct request *req; - if (!ublk_need_req_ref(ubq)) - return NULL; - + /* + * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ, + * which would overwrite it with io->cmd + */ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); if (!req) return NULL; - if (!ublk_get_req_ref(ubq, req)) + if (!ublk_get_req_ref(io)) return NULL; if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) @@ -2150,7 +2422,7 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, return req; fail_put: - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return NULL; } @@ -2217,7 +2489,8 @@ static inline bool ublk_check_ubuf_dir(const struct request *req, } static struct request *ublk_check_and_get_req(struct kiocb *iocb, - struct iov_iter *iter, size_t *off, int dir) + struct iov_iter *iter, size_t *off, int dir, + struct ublk_io **io) { struct ublk_device *ub = iocb->ki_filp->private_data; struct ublk_queue *ubq; @@ -2245,10 +2518,14 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, if (!ubq) return ERR_PTR(-EINVAL); + if (!ublk_support_user_copy(ubq)) + return ERR_PTR(-EACCES); + if (tag >= ubq->q_depth) return ERR_PTR(-EINVAL); - req = __ublk_check_and_get_req(ub, ubq, tag, buf_off); + *io = &ubq->ios[tag]; + req = __ublk_check_and_get_req(ub, ubq, *io, buf_off); if (!req) return ERR_PTR(-EINVAL); @@ -2261,42 +2538,40 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, *off = buf_off; return req; fail: - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(*io, req); return ERR_PTR(-EACCES); } static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct ublk_queue *ubq; struct request *req; + struct ublk_io *io; size_t buf_off; size_t ret; - req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST); + req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io); if (IS_ERR(req)) return PTR_ERR(req); ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST); - ubq = req->mq_hctx->driver_data; - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return ret; } static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct ublk_queue *ubq; struct request *req; + struct ublk_io *io; size_t buf_off; size_t ret; - req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE); + req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io); if (IS_ERR(req)) return PTR_ERR(req); ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE); - ubq = req->mq_hctx->driver_data; - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return ret; } @@ -2315,9 +2590,16 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id) { int size = ublk_queue_cmd_buf_size(ub, q_id); struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + int i; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + if (io->task) + put_task_struct(io->task); + WARN_ON_ONCE(refcount_read(&io->ref)); + WARN_ON_ONCE(io->task_registered_buffers); + } - if (ubq->ubq_daemon) - put_task_struct(ubq->ubq_daemon); if (ubq->io_cmd_buf) free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); } @@ -2354,7 +2636,7 @@ static void ublk_deinit_queues(struct ublk_device *ub) for (i = 0; i < nr_queues; i++) ublk_deinit_queue(ub, i); - kfree(ub->__queues); + kvfree(ub->__queues); } static int ublk_init_queues(struct ublk_device *ub) @@ -2365,7 +2647,7 @@ static int ublk_init_queues(struct ublk_device *ub) int i, ret = -ENOMEM; ub->queue_size = ubq_size; - ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL); + ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL); if (!ub->__queues) return ret; @@ -2421,6 +2703,7 @@ static void ublk_cdev_rel(struct device *dev) ublk_deinit_queues(ub); ublk_free_dev_number(ub); mutex_destroy(&ub->mutex); + mutex_destroy(&ub->cancel_mutex); kfree(ub); } @@ -2468,7 +2751,6 @@ static int ublk_add_tag_set(struct ublk_device *ub) ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; - ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); ub->tag_set.driver_data = ub; return blk_mq_alloc_tag_set(&ub->tag_set); } @@ -2570,6 +2852,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; + if (ub->ublksrv_tgid != ublksrv_pid) + return -EINVAL; + mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_LIVE || test_bit(UB_STATE_USED, &ub->state)) { @@ -2591,8 +2876,8 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, ublk_apply_params(ub); - /* don't probe partitions if any one ubq daemon is un-trusted */ - if (ub->nr_privileged_daemon != ub->nr_queues_ready) + /* don't probe partitions if any daemon task is un-trusted */ + if (ub->unprivileged_daemons) set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); ublk_get_device(ub); @@ -2691,6 +2976,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; + if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth || + info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues) + return -EINVAL; + if (capable(CAP_SYS_ADMIN)) info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) @@ -2709,6 +2998,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) return -EINVAL; } + if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) { + pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n"); + return -EINVAL; + } + /* * unprivileged device can't be trusted, but RECOVERY and * RECOVERY_REISSUE still may hang error handling, so can't @@ -2725,8 +3019,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) * For USER_COPY, we depends on userspace to fill request * buffer by pwrite() to ublk char device, which can't be * used for unprivileged device + * + * Same with zero copy or auto buffer register. */ - if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) + if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | + UBLK_F_AUTO_BUF_REG)) return -EINVAL; } @@ -2762,6 +3059,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) goto out_unlock; mutex_init(&ub->mutex); spin_lock_init(&ub->lock); + mutex_init(&ub->cancel_mutex); ret = ublk_alloc_dev_number(ub, header->dev_id); if (ret < 0) @@ -2781,15 +3079,23 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ub->dev_info.flags &= UBLK_F_ALL; ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | - UBLK_F_URING_CMD_COMP_IN_TASK; + UBLK_F_URING_CMD_COMP_IN_TASK | + UBLK_F_PER_IO_DAEMON | + UBLK_F_BUF_REG_OFF_DAEMON; - /* GET_DATA isn't needed any more with USER_COPY */ - if (ublk_dev_is_user_copy(ub)) + /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ + if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | + UBLK_F_AUTO_BUF_REG)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; - /* Zoned storage support requires user copy feature */ + /* + * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for + * returning write_append_lba, which is only allowed in case of + * user copy or zero copy + */ if (ublk_dev_is_zoned(ub) && - (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) { + (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags & + (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) { ret = -EINVAL; goto out_free_dev_number; } @@ -2825,6 +3131,7 @@ out_free_dev_number: ublk_free_dev_number(ub); out_free_ub: mutex_destroy(&ub->mutex); + mutex_destroy(&ub->cancel_mutex); kfree(ub); out_unlock: mutex_unlock(&ublk_ctl_mutex); @@ -3040,14 +3347,17 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, int ublksrv_pid = (int)header->data[0]; int ret = -EINVAL; - pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", - __func__, ub->dev_info.nr_hw_queues, header->dev_id); - /* wait until new ubq_daemon sending all FETCH_REQ */ + pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__, + header->dev_id); + if (wait_for_completion_interruptible(&ub->completion)) return -EINTR; - pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", - __func__, ub->dev_info.nr_hw_queues, header->dev_id); + pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__, + header->dev_id); + + if (ub->ublksrv_tgid != ublksrv_pid) + return -EINVAL; mutex_lock(&ub->mutex); if (ublk_nosrv_should_stop_dev(ub)) @@ -3082,6 +3392,125 @@ static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header) return 0; } +static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header) +{ + struct ublk_param_basic *p = &ub->params.basic; + u64 new_size = header->data[0]; + + mutex_lock(&ub->mutex); + p->dev_sectors = new_size; + set_capacity_and_notify(ub->ub_disk, p->dev_sectors); + mutex_unlock(&ub->mutex); +} + +struct count_busy { + const struct ublk_queue *ubq; + unsigned int nr_busy; +}; + +static bool ublk_count_busy_req(struct request *rq, void *data) +{ + struct count_busy *idle = data; + + if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq) + idle->nr_busy += 1; + return true; +} + +/* uring_cmd is guaranteed to be active if the associated request is idle */ +static bool ubq_has_idle_io(const struct ublk_queue *ubq) +{ + struct count_busy data = { + .ubq = ubq, + }; + + blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data); + return data.nr_busy < ubq->q_depth; +} + +/* Wait until each hw queue has at least one idle IO */ +static int ublk_wait_for_idle_io(struct ublk_device *ub, + unsigned int timeout_ms) +{ + unsigned int elapsed = 0; + int ret; + + while (elapsed < timeout_ms && !signal_pending(current)) { + unsigned int queues_cancelable = 0; + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + queues_cancelable += !!ubq_has_idle_io(ubq); + } + + /* + * Each queue needs at least one active command for + * notifying ublk server + */ + if (queues_cancelable == ub->dev_info.nr_hw_queues) + break; + + msleep(UBLK_REQUEUE_DELAY_MS); + elapsed += UBLK_REQUEUE_DELAY_MS; + } + + if (signal_pending(current)) + ret = -EINTR; + else if (elapsed >= timeout_ms) + ret = -EBUSY; + else + ret = 0; + + return ret; +} + +static int ublk_ctrl_quiesce_dev(struct ublk_device *ub, + const struct ublksrv_ctrl_cmd *header) +{ + /* zero means wait forever */ + u64 timeout_ms = header->data[0]; + struct gendisk *disk; + int ret = -ENODEV; + + if (!(ub->dev_info.flags & UBLK_F_QUIESCE)) + return -EOPNOTSUPP; + + mutex_lock(&ub->mutex); + disk = ublk_get_disk(ub); + if (!disk) + goto unlock; + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + goto put_disk; + + ret = 0; + /* already in expected state */ + if (ub->dev_info.state != UBLK_S_DEV_LIVE) + goto put_disk; + + /* Mark the device as canceling */ + mutex_lock(&ub->cancel_mutex); + blk_mq_quiesce_queue(disk->queue); + ublk_set_canceling(ub, true); + blk_mq_unquiesce_queue(disk->queue); + mutex_unlock(&ub->cancel_mutex); + + if (!timeout_ms) + timeout_ms = UINT_MAX; + ret = ublk_wait_for_idle_io(ub, timeout_ms); + +put_disk: + ublk_put_disk(disk); +unlock: + mutex_unlock(&ub->mutex); + + /* Cancel pending uring_cmd */ + if (!ret) + ublk_cancel_dev(ub); + return ret; +} + /* * All control commands are sent via /dev/ublk-control, so we have to check * the destination device's permission @@ -3167,6 +3596,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, case UBLK_CMD_SET_PARAMS: case UBLK_CMD_START_USER_RECOVERY: case UBLK_CMD_END_USER_RECOVERY: + case UBLK_CMD_UPDATE_SIZE: + case UBLK_CMD_QUIESCE_DEV: mask = MAY_READ | MAY_WRITE; break; default: @@ -3258,6 +3689,13 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_END_USER_RECOVERY: ret = ublk_ctrl_end_recovery(ub, header); break; + case UBLK_CMD_UPDATE_SIZE: + ublk_ctrl_set_size(ub, header); + ret = 0; + break; + case UBLK_CMD_QUIESCE_DEV: + ret = ublk_ctrl_quiesce_dev(ub, header); + break; default: ret = -EOPNOTSUPP; break; @@ -3291,6 +3729,7 @@ static int __init ublk_init(void) BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); + BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8); init_waitqueue_head(&ublk_idr_wq); |