diff options
Diffstat (limited to 'tools/testing/selftests/ublk/kublk.c')
-rw-r--r-- | tools/testing/selftests/ublk/kublk.c | 885 |
1 files changed, 718 insertions, 167 deletions
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 91c282bc7674..95188065b2e9 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -5,22 +5,24 @@ #include "kublk.h" +#define MAX_NR_TGT_ARG 64 + unsigned int ublk_dbg_mask = UBLK_LOG; static const struct ublk_tgt_ops *tgt_ops_list[] = { &null_tgt_ops, &loop_tgt_ops, &stripe_tgt_ops, + &fault_inject_tgt_ops, }; static const struct ublk_tgt_ops *ublk_find_tgt(const char *name) { - const struct ublk_tgt_ops *ops; int i; if (name == NULL) return NULL; - for (i = 0; sizeof(tgt_ops_list) / sizeof(ops); i++) + for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) if (strcmp(tgt_ops_list[i]->name, name) == 0) return tgt_ops_list[i]; return NULL; @@ -118,6 +120,27 @@ static int ublk_ctrl_start_dev(struct ublk_dev *dev, return __ublk_ctrl_cmd(dev, &data); } +static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_START_USER_RECOVERY, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_END_USER_RECOVERY, + .flags = CTRL_CMD_HAS_DATA, + }; + + dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid; + + return __ublk_ctrl_cmd(dev, &data); +} + static int ublk_ctrl_add_dev(struct ublk_dev *dev) { struct ublk_ctrl_cmd_data data = { @@ -193,6 +216,30 @@ static int ublk_ctrl_get_features(struct ublk_dev *dev, return __ublk_ctrl_cmd(dev, &data); } +static int ublk_ctrl_update_size(struct ublk_dev *dev, + __u64 nr_sects) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_UPDATE_SIZE, + .flags = CTRL_CMD_HAS_DATA, + }; + + data.data[0] = nr_sects; + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev, + unsigned int timeout_ms) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_QUIESCE_DEV, + .flags = CTRL_CMD_HAS_DATA, + }; + + data.data[0] = timeout_ms; + return __ublk_ctrl_cmd(dev, &data); +} + static const char *ublk_dev_state_desc(struct ublk_dev *dev) { switch (dev->dev_info.state) { @@ -207,10 +254,73 @@ static const char *ublk_dev_state_desc(struct ublk_dev *dev) }; } +static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len) +{ + unsigned done = 0; + int i; + + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, set)) + done += snprintf(&buf[done], len - done, "%d ", i); + } +} + +static void ublk_adjust_affinity(cpu_set_t *set) +{ + int j, updated = 0; + + /* + * Just keep the 1st CPU now. + * + * In future, auto affinity selection can be tried. + */ + for (j = 0; j < CPU_SETSIZE; j++) { + if (CPU_ISSET(j, set)) { + if (!updated) { + updated = 1; + continue; + } + CPU_CLR(j, set); + } + } +} + +/* Caller must free the allocated buffer */ +static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_GET_QUEUE_AFFINITY, + .flags = CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF, + }; + cpu_set_t *buf; + int i, ret; + + buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues); + if (!buf) + return -ENOMEM; + + for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) { + data.data[0] = i; + data.len = sizeof(cpu_set_t); + data.addr = (__u64)&buf[i]; + + ret = __ublk_ctrl_cmd(ctrl_dev, &data); + if (ret < 0) { + free(buf); + return ret; + } + ublk_adjust_affinity(&buf[i]); + } + + *ptr_buf = buf; + return 0; +} + static void ublk_ctrl_dump(struct ublk_dev *dev) { struct ublksrv_ctrl_dev_info *info = &dev->dev_info; struct ublk_params p; + cpu_set_t *affinity; int ret; ret = ublk_ctrl_get_params(dev, &p); @@ -219,12 +329,31 @@ static void ublk_ctrl_dump(struct ublk_dev *dev) return; } + ret = ublk_ctrl_get_affinity(dev, &affinity); + if (ret < 0) { + ublk_err("failed to get affinity %m\n"); + return; + } + ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", info->dev_id, info->nr_hw_queues, info->queue_depth, 1 << p.basic.logical_bs_shift, p.basic.dev_sectors); ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n", info->max_io_buf_bytes, info->ublksrv_pid, info->flags, ublk_dev_state_desc(dev)); + + if (affinity) { + char buf[512]; + int i; + + for (i = 0; i < info->nr_hw_queues; i++) { + ublk_print_cpu_set(&affinity[i], buf, sizeof(buf)); + printf("\tqueue %u: affinity(%s)\n", + i, buf); + } + free(affinity); + } + fflush(stdout); } @@ -283,16 +412,6 @@ static void ublk_queue_deinit(struct ublk_queue *q) int i; int nr_ios = q->q_depth; - io_uring_unregister_buffers(&q->ring); - - io_uring_unregister_ring_fd(&q->ring); - - if (q->ring.ring_fd > 0) { - io_uring_unregister_files(&q->ring); - close(q->ring.ring_fd); - q->ring.ring_fd = -1; - } - if (q->io_cmd_buf) munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); @@ -300,25 +419,32 @@ static void ublk_queue_deinit(struct ublk_queue *q) free(q->ios[i].buf_addr); } -static int ublk_queue_init(struct ublk_queue *q) +static void ublk_thread_deinit(struct ublk_thread *t) +{ + io_uring_unregister_buffers(&t->ring); + + io_uring_unregister_ring_fd(&t->ring); + + if (t->ring.ring_fd > 0) { + io_uring_unregister_files(&t->ring); + close(t->ring.ring_fd); + t->ring.ring_fd = -1; + } +} + +static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags) { struct ublk_dev *dev = q->dev; int depth = dev->dev_info.queue_depth; - int i, ret = -1; + int i; int cmd_buf_size, io_buf_size; unsigned long off; - int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; q->tgt_ops = dev->tgt.ops; - q->state = 0; + q->flags = 0; q->q_depth = depth; - q->cmd_inflight = 0; - q->tid = gettid(); - - if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { - q->state |= UBLKSRV_NO_BUF; - q->state |= UBLKSRV_ZC; - } + q->flags = dev->dev_info.flags; + q->flags |= extra_flags; cmd_buf_size = ublk_queue_cmd_buf_sz(q); off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); @@ -333,9 +459,10 @@ static int ublk_queue_init(struct ublk_queue *q) io_buf_size = dev->dev_info.max_io_buf_bytes; for (i = 0; i < q->q_depth; i++) { q->ios[i].buf_addr = NULL; - q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE; + q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE; + q->ios[i].tag = i; - if (q->state & UBLKSRV_NO_BUF) + if (ublk_queue_no_buf(q)) continue; if (posix_memalign((void **)&q->ios[i].buf_addr, @@ -346,37 +473,57 @@ static int ublk_queue_init(struct ublk_queue *q) } } - ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth, - IORING_SETUP_COOP_TASKRUN); + return 0; + fail: + ublk_queue_deinit(q); + ublk_err("ublk dev %d queue %d failed\n", + dev->dev_info.dev_id, q->q_id); + return -ENOMEM; +} + +static int ublk_thread_init(struct ublk_thread *t) +{ + struct ublk_dev *dev = t->dev; + int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; + int ret; + + ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, + IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_SINGLE_ISSUER | + IORING_SETUP_DEFER_TASKRUN); if (ret < 0) { - ublk_err("ublk dev %d queue %d setup io_uring failed %d\n", - q->dev->dev_info.dev_id, q->q_id, ret); + ublk_err("ublk dev %d thread %d setup io_uring failed %d\n", + dev->dev_info.dev_id, t->idx, ret); goto fail; } - if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { - ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth); + if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) { + unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues; + unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads; + max_nr_ios_per_thread += !!(nr_ios % dev->nthreads); + ret = io_uring_register_buffers_sparse( + &t->ring, max_nr_ios_per_thread); if (ret) { - ublk_err("ublk dev %d queue %d register spare buffers failed %d", - dev->dev_info.dev_id, q->q_id, ret); + ublk_err("ublk dev %d thread %d register spare buffers failed %d", + dev->dev_info.dev_id, t->idx, ret); goto fail; } } - io_uring_register_ring_fd(&q->ring); + io_uring_register_ring_fd(&t->ring); - ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds); + ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds); if (ret) { - ublk_err("ublk dev %d queue %d register files failed %d\n", - q->dev->dev_info.dev_id, q->q_id, ret); + ublk_err("ublk dev %d thread %d register files failed %d\n", + t->dev->dev_info.dev_id, t->idx, ret); goto fail; } return 0; - fail: - ublk_queue_deinit(q); - ublk_err("ublk dev %d queue %d failed\n", - dev->dev_info.dev_id, q->q_id); +fail: + ublk_thread_deinit(t); + ublk_err("ublk dev %d thread %d init failed\n", + dev->dev_info.dev_id, t->idx); return -ENOMEM; } @@ -418,34 +565,57 @@ static void ublk_dev_unprep(struct ublk_dev *dev) close(dev->fds[0]); } -int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag) +static void ublk_set_auto_buf_reg(const struct ublk_queue *q, + struct io_uring_sqe *sqe, + unsigned short tag) { + struct ublk_auto_buf_reg buf = {}; + + if (q->tgt_ops->buf_index) + buf.index = q->tgt_ops->buf_index(q, tag); + else + buf.index = q->ios[tag].buf_index; + + if (ublk_queue_auto_zc_fallback(q)) + buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; + + sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf); +} + +int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) +{ + struct ublk_queue *q = ublk_io_to_queue(io); struct ublksrv_io_cmd *cmd; struct io_uring_sqe *sqe[1]; unsigned int cmd_op = 0; __u64 user_data; /* only freed io can be issued */ - if (!(io->flags & UBLKSRV_IO_FREE)) + if (!(io->flags & UBLKS_IO_FREE)) return 0; - /* we issue because we need either fetching or committing */ + /* + * we issue because we need either fetching or committing or + * getting data + */ if (!(io->flags & - (UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP))) + (UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA))) return 0; - if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP) + if (io->flags & UBLKS_IO_NEED_GET_DATA) + cmd_op = UBLK_U_IO_NEED_GET_DATA; + else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP) cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ; - else if (io->flags & UBLKSRV_NEED_FETCH_RQ) + else if (io->flags & UBLKS_IO_NEED_FETCH_RQ) cmd_op = UBLK_U_IO_FETCH_REQ; - if (io_uring_sq_space_left(&q->ring) < 1) - io_uring_submit(&q->ring); + if (io_uring_sq_space_left(&t->ring) < 1) + io_uring_submit(&t->ring); - ublk_queue_alloc_sqes(q, sqe, 1); + ublk_io_alloc_sqes(t, sqe, 1); if (!sqe[0]) { - ublk_err("%s: run out of sqe %d, tag %d\n", - __func__, q->q_id, tag); + ublk_err("%s: run out of sqe. thread %u, tag %d\n", + __func__, t->idx, io->tag); return -1; } @@ -460,49 +630,84 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag) sqe[0]->opcode = IORING_OP_URING_CMD; sqe[0]->flags = IOSQE_FIXED_FILE; sqe[0]->rw_flags = 0; - cmd->tag = tag; + cmd->tag = io->tag; cmd->q_id = q->q_id; - if (!(q->state & UBLKSRV_NO_BUF)) + if (!ublk_queue_no_buf(q)) cmd->addr = (__u64) (uintptr_t) io->buf_addr; else cmd->addr = 0; - user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0); + if (ublk_queue_use_auto_zc(q)) + ublk_set_auto_buf_reg(q, sqe[0], io->tag); + + user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0); io_uring_sqe_set_data64(sqe[0], user_data); io->flags = 0; - q->cmd_inflight += 1; + t->cmd_inflight += 1; - ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n", - __func__, q->q_id, tag, cmd_op, - io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING)); + ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n", + __func__, t->idx, q->q_id, io->tag, cmd_op, + io->flags, !!(t->state & UBLKS_T_STOPPING)); return 1; } -static void ublk_submit_fetch_commands(struct ublk_queue *q) +static void ublk_submit_fetch_commands(struct ublk_thread *t) { - int i = 0; + struct ublk_queue *q; + struct ublk_io *io; + int i = 0, j = 0; - for (i = 0; i < q->q_depth; i++) - ublk_queue_io_cmd(q, &q->ios[i], i); + if (t->dev->per_io_tasks) { + /* + * Lexicographically order all the (qid,tag) pairs, with + * qid taking priority (so (1,0) > (0,1)). Then make + * this thread the daemon for every Nth entry in this + * list (N is the number of threads), starting at this + * thread's index. This ensures that each queue is + * handled by as many ublk server threads as possible, + * so that load that is concentrated on one or a few + * queues can make use of all ublk server threads. + */ + const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info; + int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth; + for (i = t->idx; i < nr_ios; i += t->dev->nthreads) { + int q_id = i / dinfo->queue_depth; + int tag = i % dinfo->queue_depth; + q = &t->dev->q[q_id]; + io = &q->ios[tag]; + io->buf_index = j++; + ublk_queue_io_cmd(t, io); + } + } else { + /* + * Service exclusively the queue whose q_id matches our + * thread index. + */ + struct ublk_queue *q = &t->dev->q[t->idx]; + for (i = 0; i < q->q_depth; i++) { + io = &q->ios[i]; + io->buf_index = i; + ublk_queue_io_cmd(t, io); + } + } } -static int ublk_queue_is_idle(struct ublk_queue *q) +static int ublk_thread_is_idle(struct ublk_thread *t) { - return !io_uring_sq_ready(&q->ring) && !q->io_inflight; + return !io_uring_sq_ready(&t->ring) && !t->io_inflight; } -static int ublk_queue_is_done(struct ublk_queue *q) +static int ublk_thread_is_done(struct ublk_thread *t) { - return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q); + return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t); } -static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q, - struct io_uring_cqe *cqe) +static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t, + struct ublk_queue *q, + struct io_uring_cqe *cqe) { - unsigned tag = user_data_to_tag(cqe->user_data); - if (cqe->res < 0 && cqe->res != -EAGAIN) ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n", __func__, cqe->res, q->q_id, @@ -510,122 +715,159 @@ static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q, user_data_to_op(cqe->user_data)); if (q->tgt_ops->tgt_io_done) - q->tgt_ops->tgt_io_done(q, tag, cqe); + q->tgt_ops->tgt_io_done(t, q, cqe); } -static void ublk_handle_cqe(struct io_uring *r, - struct io_uring_cqe *cqe, void *data) +static void ublk_handle_uring_cmd(struct ublk_thread *t, + struct ublk_queue *q, + const struct io_uring_cqe *cqe) { - struct ublk_queue *q = container_of(r, struct ublk_queue, ring); - unsigned tag = user_data_to_tag(cqe->user_data); - unsigned cmd_op = user_data_to_op(cqe->user_data); int fetch = (cqe->res != UBLK_IO_RES_ABORT) && - !(q->state & UBLKSRV_QUEUE_STOPPING); - struct ublk_io *io; - - if (cqe->res < 0 && cqe->res != -ENODEV) - ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, - cqe->res, cqe->user_data, q->state); - - ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", - __func__, cqe->res, q->q_id, tag, cmd_op, - is_target_io(cqe->user_data), - user_data_to_tgt_data(cqe->user_data), - (q->state & UBLKSRV_QUEUE_STOPPING)); - - /* Don't retrieve io in case of target io */ - if (is_target_io(cqe->user_data)) { - ublksrv_handle_tgt_cqe(q, cqe); - return; - } - - io = &q->ios[tag]; - q->cmd_inflight--; + !(t->state & UBLKS_T_STOPPING); + unsigned tag = user_data_to_tag(cqe->user_data); + struct ublk_io *io = &q->ios[tag]; if (!fetch) { - q->state |= UBLKSRV_QUEUE_STOPPING; - io->flags &= ~UBLKSRV_NEED_FETCH_RQ; + t->state |= UBLKS_T_STOPPING; + io->flags &= ~UBLKS_IO_NEED_FETCH_RQ; } if (cqe->res == UBLK_IO_RES_OK) { assert(tag < q->q_depth); if (q->tgt_ops->queue_io) - q->tgt_ops->queue_io(q, tag); + q->tgt_ops->queue_io(t, q, tag); + } else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) { + io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE; + ublk_queue_io_cmd(t, io); } else { /* * COMMIT_REQ will be completed immediately since no fetching * piggyback is required. * * Marking IO_FREE only, then this io won't be issued since - * we only issue io with (UBLKSRV_IO_FREE | UBLKSRV_NEED_*) + * we only issue io with (UBLKS_IO_FREE | UBLKSRV_NEED_*) * * */ - io->flags = UBLKSRV_IO_FREE; + io->flags = UBLKS_IO_FREE; + } +} + +static void ublk_handle_cqe(struct ublk_thread *t, + struct io_uring_cqe *cqe, void *data) +{ + struct ublk_dev *dev = t->dev; + unsigned q_id = user_data_to_q_id(cqe->user_data); + struct ublk_queue *q = &dev->q[q_id]; + unsigned cmd_op = user_data_to_op(cqe->user_data); + + if (cqe->res < 0 && cqe->res != -ENODEV) + ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, + cqe->res, cqe->user_data, q->flags); + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", + __func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data), + cmd_op, is_target_io(cqe->user_data), + user_data_to_tgt_data(cqe->user_data), + (t->state & UBLKS_T_STOPPING)); + + /* Don't retrieve io in case of target io */ + if (is_target_io(cqe->user_data)) { + ublksrv_handle_tgt_cqe(t, q, cqe); + return; } + + t->cmd_inflight--; + + ublk_handle_uring_cmd(t, q, cqe); } -static int ublk_reap_events_uring(struct io_uring *r) +static int ublk_reap_events_uring(struct ublk_thread *t) { struct io_uring_cqe *cqe; unsigned head; int count = 0; - io_uring_for_each_cqe(r, head, cqe) { - ublk_handle_cqe(r, cqe, NULL); + io_uring_for_each_cqe(&t->ring, head, cqe) { + ublk_handle_cqe(t, cqe, NULL); count += 1; } - io_uring_cq_advance(r, count); + io_uring_cq_advance(&t->ring, count); return count; } -static int ublk_process_io(struct ublk_queue *q) +static int ublk_process_io(struct ublk_thread *t) { int ret, reapped; - ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n", - q->dev->dev_info.dev_id, - q->q_id, io_uring_sq_ready(&q->ring), - q->cmd_inflight, - (q->state & UBLKSRV_QUEUE_STOPPING)); + ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n", + t->dev->dev_info.dev_id, + t->idx, io_uring_sq_ready(&t->ring), + t->cmd_inflight, + (t->state & UBLKS_T_STOPPING)); - if (ublk_queue_is_done(q)) + if (ublk_thread_is_done(t)) return -ENODEV; - ret = io_uring_submit_and_wait(&q->ring, 1); - reapped = ublk_reap_events_uring(&q->ring); + ret = io_uring_submit_and_wait(&t->ring, 1); + reapped = ublk_reap_events_uring(t); - ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n", - ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING), - (q->state & UBLKSRV_QUEUE_IDLE)); + ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n", + ret, reapped, (t->state & UBLKS_T_STOPPING), + (t->state & UBLKS_T_IDLE)); return reapped; } +static void ublk_thread_set_sched_affinity(const struct ublk_thread *t, + cpu_set_t *cpuset) +{ + if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0) + ublk_err("ublk dev %u thread %u set affinity failed", + t->dev->dev_info.dev_id, t->idx); +} + +struct ublk_thread_info { + struct ublk_dev *dev; + unsigned idx; + sem_t *ready; + cpu_set_t *affinity; +}; + static void *ublk_io_handler_fn(void *data) { - struct ublk_queue *q = data; - int dev_id = q->dev->dev_info.dev_id; + struct ublk_thread_info *info = data; + struct ublk_thread *t = &info->dev->threads[info->idx]; + int dev_id = info->dev->dev_info.dev_id; int ret; - ret = ublk_queue_init(q); + t->dev = info->dev; + t->idx = info->idx; + + ret = ublk_thread_init(t); if (ret) { - ublk_err("ublk dev %d queue %d init queue failed\n", - dev_id, q->q_id); + ublk_err("ublk dev %d thread %u init failed\n", + dev_id, t->idx); return NULL; } - ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n", - q->tid, dev_id, q->q_id); + /* IO perf is sensitive with queue pthread affinity on NUMA machine*/ + if (info->affinity) + ublk_thread_set_sched_affinity(t, info->affinity); + sem_post(info->ready); + + ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", + gettid(), dev_id, t->idx); /* submit all io commands to ublk driver */ - ublk_submit_fetch_commands(q); + ublk_submit_fetch_commands(t); do { - if (ublk_process_io(q) < 0) + if (ublk_process_io(t) < 0) break; } while (1); - ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id); - ublk_queue_deinit(q); + ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n", + gettid(), dev_id, t->idx); + ublk_thread_deinit(t); return NULL; } @@ -639,7 +881,7 @@ static void ublk_set_parameters(struct ublk_dev *dev) dev->dev_info.dev_id, ret); } -static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id) +static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id) { uint64_t id; int evtfd = ctx->_evtfd; @@ -652,36 +894,91 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id) else id = ERROR_EVTFD_DEVID; + if (dev && ctx->shadow_dev) + memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q)); + if (write(evtfd, &id, sizeof(id)) != sizeof(id)) return -EINVAL; + close(evtfd); + shmdt(ctx->shadow_dev); + return 0; } static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) { - int ret, i; - void *thread_ret; const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; + struct ublk_thread_info *tinfo; + unsigned long long extra_flags = 0; + cpu_set_t *affinity_buf; + void *thread_ret; + sem_t ready; + int ret, i; ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__); + tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads); + if (!tinfo) + return -ENOMEM; + + sem_init(&ready, 0, 0); ret = ublk_dev_prep(ctx, dev); if (ret) return ret; + ret = ublk_ctrl_get_affinity(dev, &affinity_buf); + if (ret) + return ret; + + if (ctx->auto_zc_fallback) + extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK; + for (i = 0; i < dinfo->nr_hw_queues; i++) { dev->q[i].dev = dev; dev->q[i].q_id = i; - pthread_create(&dev->q[i].thread, NULL, + + ret = ublk_queue_init(&dev->q[i], extra_flags); + if (ret) { + ublk_err("ublk dev %d queue %d init queue failed\n", + dinfo->dev_id, i); + goto fail; + } + } + + for (i = 0; i < dev->nthreads; i++) { + tinfo[i].dev = dev; + tinfo[i].idx = i; + tinfo[i].ready = &ready; + + /* + * If threads are not tied 1:1 to queues, setting thread + * affinity based on queue affinity makes little sense. + * However, thread CPU affinity has significant impact + * on performance, so to compare fairly, we'll still set + * thread CPU affinity based on queue affinity where + * possible. + */ + if (dev->nthreads == dinfo->nr_hw_queues) + tinfo[i].affinity = &affinity_buf[i]; + pthread_create(&dev->threads[i].thread, NULL, ublk_io_handler_fn, - &dev->q[i]); + &tinfo[i]); } + for (i = 0; i < dev->nthreads; i++) + sem_wait(&ready); + free(tinfo); + free(affinity_buf); + /* everything is fine now, start us */ - ublk_set_parameters(dev); - ret = ublk_ctrl_start_dev(dev, getpid()); + if (ctx->recovery) + ret = ublk_ctrl_end_user_recovery(dev, getpid()); + else { + ublk_set_parameters(dev); + ret = ublk_ctrl_start_dev(dev, getpid()); + } if (ret < 0) { ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret); goto fail; @@ -691,12 +988,14 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) if (ctx->fg) ublk_ctrl_dump(dev); else - ublk_send_dev_event(ctx, dev->dev_info.dev_id); + ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id); /* wait until we are terminated */ - for (i = 0; i < dinfo->nr_hw_queues; i++) - pthread_join(dev->q[i].thread, &thread_ret); + for (i = 0; i < dev->nthreads; i++) + pthread_join(dev->threads[i].thread, &thread_ret); fail: + for (i = 0; i < dinfo->nr_hw_queues; i++) + ublk_queue_deinit(&dev->q[i]); ublk_dev_unprep(dev); ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__); @@ -802,13 +1101,14 @@ wait: static int __cmd_dev_add(const struct dev_ctx *ctx) { + unsigned nthreads = ctx->nthreads; unsigned nr_queues = ctx->nr_hw_queues; const char *tgt_type = ctx->tgt_type; unsigned depth = ctx->queue_depth; __u64 features; const struct ublk_tgt_ops *ops; struct ublksrv_ctrl_dev_info *info; - struct ublk_dev *dev; + struct ublk_dev *dev = NULL; int dev_id = ctx->dev_id; int ret, i; @@ -816,35 +1116,66 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) if (!ops) { ublk_err("%s: no such tgt type, type %s\n", __func__, tgt_type); - return -ENODEV; + ret = -ENODEV; + goto fail; } if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) { ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n", __func__, nr_queues, depth); - return -EINVAL; + ret = -EINVAL; + goto fail; + } + + /* default to 1:1 threads:queues if nthreads is unspecified */ + if (!nthreads) + nthreads = nr_queues; + + if (nthreads > UBLK_MAX_THREADS) { + ublk_err("%s: %u is too many threads (max %u)\n", + __func__, nthreads, UBLK_MAX_THREADS); + ret = -EINVAL; + goto fail; + } + + if (nthreads != nr_queues && !ctx->per_io_tasks) { + ublk_err("%s: threads %u must be same as queues %u if " + "not using per_io_tasks\n", + __func__, nthreads, nr_queues); + ret = -EINVAL; + goto fail; } dev = ublk_ctrl_init(); if (!dev) { ublk_err("%s: can't alloc dev id %d, type %s\n", __func__, dev_id, tgt_type); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } /* kernel doesn't support get_features */ ret = ublk_ctrl_get_features(dev, &features); - if (ret < 0) - return -EINVAL; + if (ret < 0) { + ret = -EINVAL; + goto fail; + } - if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) - return -ENOTSUP; + if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) { + ret = -ENOTSUP; + goto fail; + } info = &dev->dev_info; info->dev_id = ctx->dev_id; info->nr_hw_queues = nr_queues; info->queue_depth = depth; info->flags = ctx->flags; + if ((features & UBLK_F_QUIESCE) && + (info->flags & UBLK_F_USER_RECOVERY)) + info->flags |= UBLK_F_QUIESCE; + dev->nthreads = nthreads; + dev->per_io_tasks = ctx->per_io_tasks; dev->tgt.ops = ops; dev->tgt.sq_depth = depth; dev->tgt.cq_depth = depth; @@ -856,7 +1187,10 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) } } - ret = ublk_ctrl_add_dev(dev); + if (ctx->recovery) + ret = ublk_ctrl_start_user_recovery(dev); + else + ret = ublk_ctrl_add_dev(dev); if (ret < 0) { ublk_err("%s: can't add dev id %d, type %s ret %d\n", __func__, dev_id, tgt_type, ret); @@ -870,8 +1204,9 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) fail: if (ret < 0) - ublk_send_dev_event(ctx, -1); - ublk_ctrl_deinit(dev); + ublk_send_dev_event(ctx, dev, -1); + if (dev) + ublk_ctrl_deinit(dev); return ret; } @@ -884,30 +1219,60 @@ static int cmd_dev_add(struct dev_ctx *ctx) if (ctx->fg) goto run; + ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666); + if (ctx->_shmid < 0) { + ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno)); + exit(-1); + } + ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0); + if (ctx->shadow_dev == (struct ublk_dev *)-1) { + ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno)); + exit(-1); + } ctx->_evtfd = eventfd(0, 0); if (ctx->_evtfd < 0) { ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno)); exit(-1); } - setsid(); res = fork(); if (res == 0) { + int res2; + + setsid(); + res2 = fork(); + if (res2 == 0) { + /* prepare for detaching */ + close(STDIN_FILENO); + close(STDOUT_FILENO); + close(STDERR_FILENO); run: - res = __cmd_dev_add(ctx); - return res; + res = __cmd_dev_add(ctx); + return res; + } else { + /* detached from the foreground task */ + exit(EXIT_SUCCESS); + } } else if (res > 0) { uint64_t id; + int exit_code = EXIT_FAILURE; res = read(ctx->_evtfd, &id, sizeof(id)); close(ctx->_evtfd); if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) { ctx->dev_id = id - 1; - return __cmd_dev_list(ctx); + if (__cmd_dev_list(ctx) >= 0) + exit_code = EXIT_SUCCESS; } - exit(EXIT_FAILURE); + shmdt(ctx->shadow_dev); + shmctl(ctx->_shmid, IPC_RMID, NULL); + /* wait for child and detach from it */ + wait(NULL); + if (exit_code == EXIT_FAILURE) + ublk_err("%s: command failed\n", __func__); + exit(exit_code); } else { - return res; + exit(EXIT_FAILURE); } } @@ -969,6 +1334,9 @@ static int __cmd_dev_list(struct dev_ctx *ctx) ublk_err("%s: can't get dev info from %d: %d\n", __func__, ctx->dev_id, ret); } else { + if (ctx->shadow_dev) + memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q)); + ublk_ctrl_dump(dev); } @@ -1006,6 +1374,10 @@ static int cmd_dev_get_features(void) [const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY", [const_ilog2(UBLK_F_ZONED)] = "ZONED", [const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO", + [const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE", + [const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG", + [const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE", + [const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON", }; struct ublk_dev *dev; __u64 features = 0; @@ -1039,15 +1411,105 @@ static int cmd_dev_get_features(void) return ret; } +static int cmd_dev_update_size(struct dev_ctx *ctx) +{ + struct ublk_dev *dev = ublk_ctrl_init(); + struct ublk_params p; + int ret = -EINVAL; + + if (!dev) + return -ENODEV; + + if (ctx->dev_id < 0) { + fprintf(stderr, "device id isn't provided\n"); + goto out; + } + + dev->dev_info.dev_id = ctx->dev_id; + ret = ublk_ctrl_get_params(dev, &p); + if (ret < 0) { + ublk_err("failed to get params %d %s\n", ret, strerror(-ret)); + goto out; + } + + if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) { + ublk_err("size isn't aligned with logical block size\n"); + ret = -EINVAL; + goto out; + } + + ret = ublk_ctrl_update_size(dev, ctx->size >> 9); +out: + ublk_ctrl_deinit(dev); + return ret; +} + +static int cmd_dev_quiesce(struct dev_ctx *ctx) +{ + struct ublk_dev *dev = ublk_ctrl_init(); + int ret = -EINVAL; + + if (!dev) + return -ENODEV; + + if (ctx->dev_id < 0) { + fprintf(stderr, "device id isn't provided for quiesce\n"); + goto out; + } + dev->dev_info.dev_id = ctx->dev_id; + ret = ublk_ctrl_quiesce_dev(dev, 10000); + +out: + ublk_ctrl_deinit(dev); + return ret; +} + +static void __cmd_create_help(char *exe, bool recovery) +{ + int i; + + printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n", + exe, recovery ? "recover" : "add"); + printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n"); + printf("\t[-e 0|1 ] [-i 0|1]\n"); + printf("\t[--nthreads threads] [--per_io_tasks]\n"); + printf("\t[target options] [backfile1] [backfile2] ...\n"); + printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); + printf("\tdefault: nthreads=nr_queues"); + + for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) { + const struct ublk_tgt_ops *ops = tgt_ops_list[i]; + + if (ops->usage) + ops->usage(ops); + } +} + +static void cmd_add_help(char *exe) +{ + __cmd_create_help(exe, false); + printf("\n"); +} + +static void cmd_recover_help(char *exe) +{ + __cmd_create_help(exe, true); + printf("\tPlease provide exact command line for creating this device with real dev_id\n"); + printf("\n"); +} + static int cmd_dev_help(char *exe) { - printf("%s add -t [null|loop] [-q nr_queues] [-d depth] [-n dev_id] [backfile1] [backfile2] ...\n", exe); - printf("\t default: nr_queues=2(max 4), depth=128(max 128), dev_id=-1(auto allocation)\n"); + cmd_add_help(exe); + cmd_recover_help(exe); + printf("%s del [-n dev_id] -a \n", exe); - printf("\t -a delete all devices -n delete specified device\n"); + printf("\t -a delete all devices -n delete specified device\n\n"); printf("%s list [-n dev_id] -a \n", exe); - printf("\t -a list all devices, -n list specified device, default -a \n"); + printf("\t -a list all devices, -n list specified device, default -a \n\n"); printf("%s features\n", exe); + printf("%s update_size -n dev_id -s|--size size_in_bytes \n", exe); + printf("%s quiesce -n dev_id\n", exe); return 0; } @@ -1063,9 +1525,18 @@ int main(int argc, char *argv[]) { "quiet", 0, NULL, 0 }, { "zero_copy", 0, NULL, 'z' }, { "foreground", 0, NULL, 0 }, - { "chunk_size", 1, NULL, 0 }, + { "recovery", 1, NULL, 'r' }, + { "recovery_fail_io", 1, NULL, 'e'}, + { "recovery_reissue", 1, NULL, 'i'}, + { "get_data", 1, NULL, 'g'}, + { "auto_zc", 0, NULL, 0 }, + { "auto_zc_fallback", 0, NULL, 0 }, + { "size", 1, NULL, 's'}, + { "nthreads", 1, NULL, 0 }, + { "per_io_tasks", 0, NULL, 0 }, { 0, 0, 0, 0 } }; + const struct ublk_tgt_ops *ops = NULL; int option_idx, opt; const char *cmd = argv[1]; struct dev_ctx ctx = { @@ -1073,15 +1544,18 @@ int main(int argc, char *argv[]) .nr_hw_queues = 2, .dev_id = -1, .tgt_type = "unknown", - .chunk_size = 65536, /* def chunk size is 64K */ }; int ret = -EINVAL, i; + int tgt_argc = 1; + char *tgt_argv[MAX_NR_TGT_ARG] = { NULL }; + int value; if (argc == 1) return ret; + opterr = 0; optind = 2; - while ((opt = getopt_long(argc, argv, "t:n:d:q:az", + while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gaz", longopts, &option_idx)) != -1) { switch (opt) { case 'a': @@ -1103,6 +1577,27 @@ int main(int argc, char *argv[]) case 'z': ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY; break; + case 'r': + value = strtol(optarg, NULL, 10); + if (value) + ctx.flags |= UBLK_F_USER_RECOVERY; + break; + case 'e': + value = strtol(optarg, NULL, 10); + if (value) + ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO; + break; + case 'i': + value = strtol(optarg, NULL, 10); + if (value) + ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE; + break; + case 'g': + ctx.flags |= UBLK_F_NEED_GET_DATA; + break; + case 's': + ctx.size = strtoull(optarg, NULL, 10); + break; case 0: if (!strcmp(longopts[option_idx].name, "debug_mask")) ublk_dbg_mask = strtol(optarg, NULL, 16); @@ -1110,19 +1605,71 @@ int main(int argc, char *argv[]) ublk_dbg_mask = 0; if (!strcmp(longopts[option_idx].name, "foreground")) ctx.fg = 1; - if (!strcmp(longopts[option_idx].name, "chunk_size")) - ctx.chunk_size = strtol(optarg, NULL, 10); + if (!strcmp(longopts[option_idx].name, "auto_zc")) + ctx.flags |= UBLK_F_AUTO_BUF_REG; + if (!strcmp(longopts[option_idx].name, "auto_zc_fallback")) + ctx.auto_zc_fallback = 1; + if (!strcmp(longopts[option_idx].name, "nthreads")) + ctx.nthreads = strtol(optarg, NULL, 10); + if (!strcmp(longopts[option_idx].name, "per_io_tasks")) + ctx.per_io_tasks = 1; + break; + case '?': + /* + * target requires every option must have argument + */ + if (argv[optind][0] == '-' || argv[optind - 1][0] != '-') { + fprintf(stderr, "every target option requires argument: %s %s\n", + argv[optind - 1], argv[optind]); + exit(EXIT_FAILURE); + } + + if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) { + tgt_argv[tgt_argc++] = argv[optind - 1]; + tgt_argv[tgt_argc++] = argv[optind]; + } else { + fprintf(stderr, "too many target options\n"); + exit(EXIT_FAILURE); + } + optind += 1; + break; } } + /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */ + if (ctx.auto_zc_fallback && + !((ctx.flags & UBLK_F_AUTO_BUF_REG) && + (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) { + ublk_err("%s: auto_zc_fallback is set but neither " + "F_AUTO_BUF_REG nor F_SUPPORT_ZERO_COPY is enabled\n", + __func__); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; } + ops = ublk_find_tgt(ctx.tgt_type); + if (ops && ops->parse_cmd_line) { + optind = 0; + + tgt_argv[0] = ctx.tgt_type; + ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv); + } + if (!strcmp(cmd, "add")) ret = cmd_dev_add(&ctx); - else if (!strcmp(cmd, "del")) + else if (!strcmp(cmd, "recover")) { + if (ctx.dev_id < 0) { + fprintf(stderr, "device id isn't provided for recovering\n"); + ret = -EINVAL; + } else { + ctx.recovery = 1; + ret = cmd_dev_add(&ctx); + } + } else if (!strcmp(cmd, "del")) ret = cmd_dev_del(&ctx); else if (!strcmp(cmd, "list")) { ctx.all = 1; @@ -1131,6 +1678,10 @@ int main(int argc, char *argv[]) ret = cmd_dev_help(argv[0]); else if (!strcmp(cmd, "features")) ret = cmd_dev_get_features(); + else if (!strcmp(cmd, "update_size")) + ret = cmd_dev_update_size(&ctx); + else if (!strcmp(cmd, "quiesce")) + ret = cmd_dev_quiesce(&ctx); else cmd_dev_help(argv[0]); |