diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-14 01:51:31 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-14 01:51:31 +0300 |
| commit | 7fe6ac157b7e15c8976bd62ad7cb98e248884e83 (patch) | |
| tree | 64677a680f3bccc7efb8f4cfcb288006e1433cd3 /tools | |
| parent | b8f82cb0d84d00c04cdbdce42f67df71b8507e8b (diff) | |
| parent | 36446de0c30c62b9d89502fd36c4904996d86ecd (diff) | |
| download | linux-7fe6ac157b7e15c8976bd62ad7cb98e248884e83.tar.xz | |
Merge tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe:
- Add shared memory zero-copy I/O support for ublk, bypassing per-I/O
copies between kernel and userspace by matching registered buffer
PFNs at I/O time. Includes selftests.
- Refactor bio integrity to support filesystem-initiated integrity
  operations and arbitrary buffer alignment.
- Clean up bio allocation, splitting bio_alloc_bioset() into clear fast
and slow paths. Add bio_await() and bio_submit_or_kill() helpers,
unify synchronous bi_end_io callbacks.
- Fix zone write plug refcount handling and plug removal races. Add
support for serializing zone writes at QD=1 for rotational zoned
devices, yielding significant throughput improvements.
- Add SED-OPAL ioctls for Single User Mode management and a STACK_RESET
command.
- Add io_uring passthrough (uring_cmd) support to the BSG layer.
- Replace pp_buf in partition scanning with struct seq_buf.
- zloop improvements and cleanups.
- drbd genl cleanup, switching to pre_doit/post_doit.
- NVMe pull request via Keith:
- Fabrics authentication updates
- Enhanced block queue limits support
- Workqueue usage updates
- A new write zeroes device quirk
- Tagset cleanup fix for loop device
- MD pull requests via Yu Kuai:
- Fix raid5 soft lockup in retry_aligned_read()
- Fix raid10 deadlock with check operation and nowait requests
- Fix raid1 overlapping writes on writemostly disks
- Fix sysfs deadlock on array_state=clear
- Proactive RAID-5 parity building with llbitmap, with
write_zeroes_unmap optimization for initial sync
- Fix llbitmap barrier ordering, rdev skipping, and bitmap_ops
version mismatch fallback
- Fix bcache use-after-free and uninitialized closure
- Validate raid5 journal metadata payload size
- Various cleanups
- Various other fixes, improvements, and cleanups
* tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (146 commits)
ublk: fix tautological comparison warning in ublk_ctrl_reg_buf
scsi: bsg: fix buffer overflow in scsi_bsg_uring_cmd()
block: refactor blkdev_zone_mgmt_ioctl
MAINTAINERS: update ublk driver maintainer email
Documentation: ublk: address review comments for SHMEM_ZC docs
ublk: allow buffer registration before device is started
ublk: replace xarray with IDA for shmem buffer index allocation
ublk: simplify PFN range loop in __ublk_ctrl_reg_buf
ublk: verify all pages in multi-page bvec fall within registered range
ublk: widen ublk_shmem_buf_reg.len to __u64 for 4GB buffer support
xfs: use bio_await in xfs_zone_gc_reset_sync
block: add a bio_submit_or_kill helper
block: factor out a bio_await helper
block: unify the synchronous bi_end_io callbacks
xfs: fix number of GC bvecs
selftests/ublk: add read-only buffer registration test
selftests/ublk: add filesystem fio verify test for shmem_zc
selftests/ublk: add hugetlbfs shmem_zc test for loop target
selftests/ublk: add shared memory zero-copy test
selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target
...
Diffstat (limited to 'tools')
| -rw-r--r-- | tools/testing/selftests/ublk/Makefile | 6 | ||||
| -rw-r--r-- | tools/testing/selftests/ublk/fault_inject.c | 52 | ||||
| -rw-r--r-- | tools/testing/selftests/ublk/file_backed.c | 38 | ||||
| -rw-r--r-- | tools/testing/selftests/ublk/kublk.c | 354 | ||||
| -rw-r--r-- | tools/testing/selftests/ublk/kublk.h | 18 | ||||
| -rwxr-xr-x | tools/testing/selftests/ublk/test_common.sh | 15 | ||||
| -rwxr-xr-x | tools/testing/selftests/ublk/test_generic_17.sh | 35 | ||||
| -rwxr-xr-x | tools/testing/selftests/ublk/test_shmemzc_01.sh | 72 | ||||
| -rwxr-xr-x | tools/testing/selftests/ublk/test_shmemzc_02.sh | 68 | ||||
| -rwxr-xr-x | tools/testing/selftests/ublk/test_shmemzc_03.sh | 69 | ||||
| -rwxr-xr-x | tools/testing/selftests/ublk/test_shmemzc_04.sh | 72 |
11 files changed, 790 insertions, 9 deletions
diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 8ac2d4a682a1..ec6a8ce83d38 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -18,6 +18,7 @@ TEST_PROGS += test_generic_10.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh TEST_PROGS += test_generic_16.sh +TEST_PROGS += test_generic_17.sh TEST_PROGS += test_batch_01.sh TEST_PROGS += test_batch_02.sh @@ -51,6 +52,11 @@ TEST_PROGS += test_stripe_06.sh TEST_PROGS += test_part_01.sh TEST_PROGS += test_part_02.sh +TEST_PROGS += test_shmemzc_01.sh +TEST_PROGS += test_shmemzc_02.sh +TEST_PROGS += test_shmemzc_03.sh +TEST_PROGS += test_shmemzc_04.sh + TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh TEST_PROGS += test_stress_03.sh diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c index 3b897f69c014..150896e02ff8 100644 --- a/tools/testing/selftests/ublk/fault_inject.c +++ b/tools/testing/selftests/ublk/fault_inject.c @@ -10,11 +10,17 @@ #include "kublk.h" +struct fi_opts { + long long delay_ns; + bool die_during_fetch; +}; + static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { const struct ublksrv_ctrl_dev_info *info = &dev->dev_info; unsigned long dev_size = 250UL << 30; + struct fi_opts *opts = NULL; if (ctx->auto_zc_fallback) { ublk_err("%s: not support auto_zc_fallback\n", __func__); @@ -35,17 +41,52 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, }; ublk_set_integrity_params(ctx, &dev->tgt.params); - dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000); + opts = calloc(1, sizeof(*opts)); + if (!opts) { + ublk_err("%s: couldn't allocate memory for opts\n", __func__); + return -ENOMEM; + } + + opts->delay_ns = ctx->fault_inject.delay_us * 1000; + opts->die_during_fetch = ctx->fault_inject.die_during_fetch; + dev->private_data = opts; + return 0; } +static void 
ublk_fault_inject_pre_fetch_io(struct ublk_thread *t, + struct ublk_queue *q, int tag, + bool batch) +{ + struct fi_opts *opts = q->dev->private_data; + + if (!opts->die_during_fetch) + return; + + /* + * Each queue fetches its IOs in increasing order of tags, so + * dying just before we're about to fetch tag 1 (regardless of + * what queue we're on) guarantees that we've fetched a nonempty + * proper subset of the tags on that queue. + */ + if (tag == 1) { + /* + * Ensure our commands are actually live in the kernel + * before we die. + */ + io_uring_submit(&t->ring); + raise(SIGKILL); + } +} + static int ublk_fault_inject_queue_io(struct ublk_thread *t, struct ublk_queue *q, int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe; + struct fi_opts *opts = q->dev->private_data; struct __kernel_timespec ts = { - .tv_nsec = (long long)q->dev->private_data, + .tv_nsec = opts->delay_ns, }; ublk_io_alloc_sqes(t, &sqe, 1); @@ -77,29 +118,34 @@ static void ublk_fault_inject_cmd_line(struct dev_ctx *ctx, int argc, char *argv { static const struct option longopts[] = { { "delay_us", 1, NULL, 0 }, + { "die_during_fetch", 1, NULL, 0 }, { 0, 0, 0, 0 } }; int option_idx, opt; ctx->fault_inject.delay_us = 0; + ctx->fault_inject.die_during_fetch = false; while ((opt = getopt_long(argc, argv, "", longopts, &option_idx)) != -1) { switch (opt) { case 0: if (!strcmp(longopts[option_idx].name, "delay_us")) ctx->fault_inject.delay_us = strtoll(optarg, NULL, 10); + if (!strcmp(longopts[option_idx].name, "die_during_fetch")) + ctx->fault_inject.die_during_fetch = strtoll(optarg, NULL, 10); } } } static void ublk_fault_inject_usage(const struct ublk_tgt_ops *ops) { - printf("\tfault_inject: [--delay_us us (default 0)]\n"); + printf("\tfault_inject: [--delay_us us (default 0)] [--die_during_fetch 1]\n"); } const struct ublk_tgt_ops fault_inject_tgt_ops = { .name = "fault_inject", .init_tgt = ublk_fault_inject_tgt_init, + .pre_fetch_io = 
ublk_fault_inject_pre_fetch_io, .queue_io = ublk_fault_inject_queue_io, .tgt_io_done = ublk_fault_inject_tgt_io_done, .parse_cmd_line = ublk_fault_inject_cmd_line, diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 228af2580ac6..d28da98f917a 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -27,6 +27,40 @@ static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, return 1; } +/* + * Shared memory zero-copy I/O: when UBLK_IO_F_SHMEM_ZC is set, the + * request's data lives in a registered shared memory buffer. Decode + * index + offset from iod->addr and use the server's mmap of that + * buffer as the I/O buffer for the backing file. + */ +static int loop_queue_shmem_zc_io(struct ublk_thread *t, struct ublk_queue *q, + const struct ublksrv_io_desc *iod, int tag) +{ + unsigned ublk_op = ublksrv_get_op(iod); + enum io_uring_op op = ublk_to_uring_op(iod, 0); + __u64 file_offset = iod->start_sector << 9; + __u32 len = iod->nr_sectors << 9; + __u32 shmem_idx = ublk_shmem_zc_index(iod->addr); + __u32 shmem_off = ublk_shmem_zc_offset(iod->addr); + struct io_uring_sqe *sqe[1]; + void *addr; + + if (shmem_idx >= UBLK_BUF_MAX || !shmem_table[shmem_idx].mmap_base) + return -EINVAL; + + addr = shmem_table[shmem_idx].mmap_base + shmem_off; + + ublk_io_alloc_sqes(t, sqe, 1); + if (!sqe[0]) + return -ENOMEM; + + io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1), + addr, len, file_offset); + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); + return 1; +} + static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) { @@ -41,6 +75,10 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, void *addr = io->buf_addr; unsigned short buf_index = ublk_io_buf_idx(t, q, tag); + /* shared memory 
zero-copy path */ + if (iod->op_flags & UBLK_IO_F_SHMEM_ZC) + return loop_queue_shmem_zc_io(t, q, iod, tag); + if (iod->op_flags & UBLK_IO_F_INTEGRITY) { ublk_io_alloc_sqes(t, sqe, 1); /* Use second backing file for integrity data */ diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index e1c3b3c55e56..fbd9b1e7342a 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -4,6 +4,7 @@ */ #include <linux/fs.h> +#include <sys/un.h> #include "kublk.h" #define MAX_NR_TGT_ARG 64 @@ -796,6 +797,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t) q = &t->dev->q[q_id]; io = &q->ios[tag]; io->buf_index = j++; + if (q->tgt_ops->pre_fetch_io) + q->tgt_ops->pre_fetch_io(t, q, tag, false); ublk_queue_io_cmd(t, io); } } else { @@ -807,6 +810,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t) for (i = 0; i < q->q_depth; i++) { io = &q->ios[i]; io->buf_index = i; + if (q->tgt_ops->pre_fetch_io) + q->tgt_ops->pre_fetch_io(t, q, i, false); ublk_queue_io_cmd(t, io); } } @@ -983,6 +988,9 @@ static void ublk_batch_setup_queues(struct ublk_thread *t) if (t->q_map[i] == 0) continue; + if (q->tgt_ops->pre_fetch_io) + q->tgt_ops->pre_fetch_io(t, q, 0, true); + ret = ublk_batch_queue_prep_io_cmds(t, q); ublk_assert(ret >= 0); } @@ -1085,13 +1093,316 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, } +/* + * Shared memory registration socket listener. + * + * The parent daemon context listens on a per-device unix socket at + * /run/ublk/ublkb<dev_id>.sock for shared memory registration requests + * from clients. Clients send a memfd via SCM_RIGHTS; the server + * registers it with the kernel, mmaps it, and returns the assigned index. 
+ */ +#define UBLK_SHMEM_SOCK_DIR "/run/ublk" + +/* defined in kublk.h, shared with file_backed.c (loop target) */ +struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX]; +int shmem_count; + +static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len) +{ + snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id); +} + +static int ublk_shmem_sock_create(int dev_id) +{ + struct sockaddr_un addr = { .sun_family = AF_UNIX }; + char path[108]; + int fd; + + mkdir(UBLK_SHMEM_SOCK_DIR, 0755); + ublk_shmem_sock_path(dev_id, path, sizeof(path)); + unlink(path); + + fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); + if (fd < 0) + return -1; + + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path); + if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + close(fd); + return -1; + } + + listen(fd, 4); + ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", path); + return fd; +} + +static void ublk_shmem_sock_destroy(int dev_id, int sock_fd) +{ + char path[108]; + + if (sock_fd >= 0) + close(sock_fd); + ublk_shmem_sock_path(dev_id, path, sizeof(path)); + unlink(path); +} + +/* Receive a memfd from a client via SCM_RIGHTS */ +static int ublk_shmem_recv_fd(int client_fd) +{ + char buf[1]; + struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) }; + union { + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + struct cmsghdr align; + } u; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = u.cmsg_buf, + .msg_controllen = sizeof(u.cmsg_buf), + }; + struct cmsghdr *cmsg; + + if (recvmsg(client_fd, &msg, 0) <= 0) + return -1; + + cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg || cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + return -1; + + return *(int *)CMSG_DATA(cmsg); +} + +/* Register a shared memory buffer: store fd, mmap it, return index */ +static int ublk_shmem_register(int shmem_fd) +{ + off_t size; + void *base; + int idx; + + if (shmem_count >= UBLK_BUF_MAX) + return -1; + + size = lseek(shmem_fd, 0, 
SEEK_END); + if (size <= 0) + return -1; + + base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + shmem_fd, 0); + if (base == MAP_FAILED) + return -1; + + idx = shmem_count++; + shmem_table[idx].fd = shmem_fd; + shmem_table[idx].mmap_base = base; + shmem_table[idx].size = size; + + ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n", + idx, shmem_fd, (size_t)size); + return idx; +} + +static void ublk_shmem_unregister_all(void) +{ + int i; + + for (i = 0; i < shmem_count; i++) { + if (shmem_table[i].mmap_base) { + munmap(shmem_table[i].mmap_base, + shmem_table[i].size); + close(shmem_table[i].fd); + shmem_table[i].mmap_base = NULL; + } + } + shmem_count = 0; +} + +static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size, + __u32 flags) +{ + struct ublk_shmem_buf_reg buf_reg = { + .addr = (unsigned long)addr, + .len = size, + .flags = flags, + }; + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_REG_BUF, + .flags = CTRL_CMD_HAS_BUF, + .addr = (unsigned long)&buf_reg, + .len = sizeof(buf_reg), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +/* + * Handle one client connection: receive memfd, mmap it, register + * the VA range with kernel, send back the assigned index. 
+ */ +static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev) +{ + int client_fd, memfd, idx, ret; + int32_t reply; + off_t size; + void *base; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) + return; + + memfd = ublk_shmem_recv_fd(client_fd); + if (memfd < 0) { + reply = -1; + goto out; + } + + /* mmap the memfd in server address space */ + size = lseek(memfd, 0, SEEK_END); + if (size <= 0) { + reply = -1; + close(memfd); + goto out; + } + base = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, memfd, 0); + if (base == MAP_FAILED) { + reply = -1; + close(memfd); + goto out; + } + + /* Register server's VA range with kernel for PFN matching */ + ret = ublk_ctrl_reg_buf(dev, base, size, 0); + if (ret < 0) { + ublk_dbg(UBLK_DBG_DEV, + "shmem_zc: kernel reg failed %d\n", ret); + munmap(base, size); + close(memfd); + reply = ret; + goto out; + } + + /* Store in table for I/O handling */ + idx = ublk_shmem_register(memfd); + if (idx >= 0) { + shmem_table[idx].mmap_base = base; + shmem_table[idx].size = size; + } + reply = idx; +out: + send(client_fd, &reply, sizeof(reply), 0); + close(client_fd); +} + +struct shmem_listener_info { + int dev_id; + int stop_efd; /* eventfd to signal listener to stop */ + int sock_fd; /* listener socket fd (output) */ + struct ublk_dev *dev; +}; + +/* + * Socket listener thread: runs in the parent daemon context alongside + * the I/O threads. Accepts shared memory registration requests from + * clients via SCM_RIGHTS. Exits when stop_efd is signaled. 
+ */ +static void *ublk_shmem_listener_fn(void *data) +{ + struct shmem_listener_info *info = data; + struct pollfd pfds[2]; + + info->sock_fd = ublk_shmem_sock_create(info->dev_id); + if (info->sock_fd < 0) + return NULL; + + pfds[0].fd = info->sock_fd; + pfds[0].events = POLLIN; + pfds[1].fd = info->stop_efd; + pfds[1].events = POLLIN; + + while (1) { + int ret = poll(pfds, 2, -1); + + if (ret < 0) + break; + + /* Stop signal from parent */ + if (pfds[1].revents & POLLIN) + break; + + /* Client connection */ + if (pfds[0].revents & POLLIN) + ublk_shmem_handle_client(info->sock_fd, info->dev); + } + + return NULL; +} + +static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx, + struct ublk_dev *dev) +{ + int fd, idx, ret; + struct stat st; + void *base; + + fd = open(ctx->htlb_path, O_RDWR); + if (fd < 0) { + ublk_err("htlb: can't open %s\n", ctx->htlb_path); + return -errno; + } + + if (fstat(fd, &st) < 0 || st.st_size <= 0) { + ublk_err("htlb: invalid file size\n"); + close(fd); + return -EINVAL; + } + + base = mmap(NULL, st.st_size, + ctx->rdonly_shmem_buf ? PROT_READ : PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (base == MAP_FAILED) { + ublk_err("htlb: mmap failed\n"); + close(fd); + return -ENOMEM; + } + + ret = ublk_ctrl_reg_buf(dev, base, st.st_size, + ctx->rdonly_shmem_buf ? 
UBLK_SHMEM_BUF_READ_ONLY : 0); + if (ret < 0) { + ublk_err("htlb: reg_buf failed: %d\n", ret); + munmap(base, st.st_size); + close(fd); + return ret; + } + + if (shmem_count >= UBLK_BUF_MAX) { + munmap(base, st.st_size); + close(fd); + return -ENOMEM; + } + + idx = shmem_count++; + shmem_table[idx].fd = fd; + shmem_table[idx].mmap_base = base; + shmem_table[idx].size = st.st_size; + + ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n", + idx, (size_t)st.st_size); + return 0; +} + static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) { const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; + struct shmem_listener_info linfo = {}; struct ublk_thread_info *tinfo; unsigned long long extra_flags = 0; cpu_set_t *affinity_buf; unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; + uint64_t stop_val = 1; + pthread_t listener; void *thread_ret; sem_t ready; int ret, i; @@ -1180,15 +1491,44 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) goto fail_start; } + if (ctx->htlb_path) { + ret = ublk_shmem_htlb_setup(ctx, dev); + if (ret < 0) { + ublk_err("htlb setup failed: %d\n", ret); + ublk_ctrl_stop_dev(dev); + goto fail_start; + } + } + ublk_ctrl_get_info(dev); if (ctx->fg) ublk_ctrl_dump(dev); else ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id); fail_start: - /* wait until we are terminated */ - for (i = 0; i < dev->nthreads; i++) + /* + * Wait for I/O threads to exit. While waiting, a listener + * thread accepts shared memory registration requests from + * clients via a per-device unix socket (SCM_RIGHTS fd passing). 
+ */ + linfo.dev_id = dinfo->dev_id; + linfo.dev = dev; + linfo.stop_efd = eventfd(0, 0); + if (linfo.stop_efd >= 0) + pthread_create(&listener, NULL, + ublk_shmem_listener_fn, &linfo); + + for (i = 0; i < (int)dev->nthreads; i++) pthread_join(tinfo[i].thread, &thread_ret); + + /* Signal listener thread to stop and wait for it */ + if (linfo.stop_efd >= 0) { + write(linfo.stop_efd, &stop_val, sizeof(stop_val)); + pthread_join(listener, NULL); + close(linfo.stop_efd); + ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd); + } + ublk_shmem_unregister_all(); free(tinfo); fail: for (i = 0; i < dinfo->nr_hw_queues; i++) @@ -1618,6 +1958,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_SAFE_STOP_DEV), FEAT_NAME(UBLK_F_BATCH_IO), FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN), + FEAT_NAME(UBLK_F_SHMEM_ZC), }; struct ublk_dev *dev; __u64 features = 0; @@ -1790,6 +2131,9 @@ int main(int argc, char *argv[]) { "safe", 0, NULL, 0 }, { "batch", 0, NULL, 'b'}, { "no_auto_part_scan", 0, NULL, 0 }, + { "shmem_zc", 0, NULL, 0 }, + { "htlb", 1, NULL, 0 }, + { "rdonly_shmem_buf", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1905,6 +2249,12 @@ int main(int argc, char *argv[]) ctx.safe_stop = 1; if (!strcmp(longopts[option_idx].name, "no_auto_part_scan")) ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN; + if (!strcmp(longopts[option_idx].name, "shmem_zc")) + ctx.flags |= UBLK_F_SHMEM_ZC; + if (!strcmp(longopts[option_idx].name, "htlb")) + ctx.htlb_path = strdup(optarg); + if (!strcmp(longopts[option_idx].name, "rdonly_shmem_buf")) + ctx.rdonly_shmem_buf = 1; break; case '?': /* diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 02f0c55d006b..742c41d77df1 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -60,6 +60,7 @@ struct stripe_ctx { struct fault_inject_ctx { /* fault_inject */ unsigned long delay_us; + bool die_during_fetch; }; struct dev_ctx { @@ -80,6 +81,7 @@ struct 
dev_ctx { unsigned int no_ublk_fixed_fd:1; unsigned int safe_stop:1; unsigned int no_auto_part_scan:1; + unsigned int rdonly_shmem_buf:1; __u32 integrity_flags; __u8 metadata_size; __u8 pi_offset; @@ -95,6 +97,8 @@ struct dev_ctx { /* for 'update_size' command */ unsigned long long size; + char *htlb_path; + union { struct stripe_ctx stripe; struct fault_inject_ctx fault_inject; @@ -138,6 +142,8 @@ struct ublk_tgt_ops { int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *); void (*deinit_tgt)(struct ublk_dev *); + void (*pre_fetch_io)(struct ublk_thread *t, struct ublk_queue *q, + int tag, bool batch); int (*queue_io)(struct ublk_thread *, struct ublk_queue *, int tag); void (*tgt_io_done)(struct ublk_thread *, struct ublk_queue *, const struct io_uring_cqe *); @@ -599,6 +605,18 @@ static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue * } } +/* shared memory zero-copy support */ +#define UBLK_BUF_MAX 256 + +struct ublk_shmem_entry { + int fd; + void *mmap_base; + size_t size; +}; + +extern struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX]; +extern int shmem_count; + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 163a40007910..af2ea4fa1111 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -88,6 +88,7 @@ _remove_tmp_dir() { _mkfs_mount_test() { local dev=$1 + shift local err_code=0 local mnt_dir; @@ -99,12 +100,17 @@ _mkfs_mount_test() fi mount -t ext4 "$dev" "$mnt_dir" > /dev/null 2>&1 + if [ $# -gt 0 ]; then + cd "$mnt_dir" && "$@" + err_code=$? + cd - > /dev/null + fi umount "$dev" - err_code=$? - _remove_tmp_dir "$mnt_dir" - if [ $err_code -ne 0 ]; then - return $err_code + if [ $err_code -eq 0 ]; then + err_code=$? 
fi + _remove_tmp_dir "$mnt_dir" + return $err_code } _check_root() { @@ -132,6 +138,7 @@ _prep_test() { local base_dir=${TMPDIR:-./ublktest-dir} mkdir -p "$base_dir" UBLK_TEST_DIR=$(mktemp -d ${base_dir}/${TID}.XXXXXX) + UBLK_TEST_DIR=$(realpath ${UBLK_TEST_DIR}) UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg diff --git a/tools/testing/selftests/ublk/test_generic_17.sh b/tools/testing/selftests/ublk/test_generic_17.sh new file mode 100755 index 000000000000..2278b5fc9dba --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_17.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "fault_inject" "teardown after incomplete recovery" + +# First start and stop a ublk server with device configured for recovery +dev_id=$(_add_ublk_dev -t fault_inject -r 1) +_check_add_dev $TID $? +state=$(__ublk_kill_daemon "${dev_id}" "QUIESCED") +if [ "$state" != "QUIESCED" ]; then + echo "device isn't quiesced($state) after $action" + ERR_CODE=255 +fi + +# Then recover the device, but use --die_during_fetch to have the ublk +# server die while a queue has some (but not all) I/Os fetched +${UBLK_PROG} recover -n "${dev_id}" --foreground -t fault_inject --die_during_fetch 1 +RECOVER_RES=$? +# 137 is the result when dying of SIGKILL +if (( RECOVER_RES != 137 )); then + echo "recover command exited with unexpected code ${RECOVER_RES}!" + ERR_CODE=255 +fi + +# Clean up the device. This can only succeed once teardown of the above +# exited ublk server completes. 
So if teardown never completes, we will +# time out here +_ublk_del_dev "${dev_id}" + +_cleanup_test "fault_inject" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_shmemzc_01.sh b/tools/testing/selftests/ublk/test_shmemzc_01.sh new file mode 100755 index 000000000000..47210af2aa20 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_01.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with hugetlbfs buffer on null target +# +# kublk and fio both mmap the same hugetlbfs file (MAP_SHARED), +# so they share physical pages. The kernel PFN match enables +# zero-copy I/O without socket-based fd passing. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "null target hugetlbfs shmem zero-copy test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +dev_id=$(_add_ublk_dev -t null --shmem_zc --htlb "$HTLB_FILE") +_check_add_dev $TID $? + +fio --name=htlb_zc \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=io_uring \ + --rw=randwrite \ + --direct=1 \ + --bs=4k \ + --size=4M \ + --iodepth=32 \ + --mem=mmaphuge:"$HTLB_FILE" \ + > /dev/null 2>&1 +ERR_CODE=$? 
+ +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_shmemzc_02.sh b/tools/testing/selftests/ublk/test_shmemzc_02.sh new file mode 100755 index 000000000000..aed9262494e9 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_02.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with hugetlbfs buffer on loop target +# +# kublk and fio both mmap the same hugetlbfs file (MAP_SHARED), +# so they share physical pages. The kernel PFN match enables +# zero-copy I/O without socket-based fd passing. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "loop target hugetlbfs shmem zero-copy test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +_create_backfile 0 128M +BACKFILE="${UBLK_BACKFILES[0]}" + +dev_id=$(_add_ublk_dev -t loop --shmem_zc --htlb "$HTLB_FILE" "$BACKFILE") +_check_add_dev $TID $? 
+ +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" \ + --size=128M \ + --mem=mmaphuge:"$HTLB_FILE" +ERR_CODE=$? + +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_shmemzc_03.sh b/tools/testing/selftests/ublk/test_shmemzc_03.sh new file mode 100755 index 000000000000..db967a9ffe81 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_03.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with fio verify over filesystem on loop target +# +# mkfs + mount ext4 on the ublk device, then run fio verify on a +# file inside that filesystem. Exercises the full stack: +# filesystem -> block layer -> ublk shmem_zc -> loop target backing file. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "loop target hugetlbfs shmem zero-copy fs verify test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! 
mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +_create_backfile 0 256M +BACKFILE="${UBLK_BACKFILES[0]}" + +dev_id=$(_add_ublk_dev -t loop --shmem_zc --htlb "$HTLB_FILE" "$BACKFILE") +_check_add_dev $TID $? + +_mkfs_mount_test /dev/ublkb"${dev_id}" \ + _run_fio_verify_io --filename=testfile \ + --size=128M \ + --mem=mmaphuge:"$HTLB_FILE" +ERR_CODE=$? + +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_shmemzc_04.sh b/tools/testing/selftests/ublk/test_shmemzc_04.sh new file mode 100755 index 000000000000..899de088ece4 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_04.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with read-only buffer registration on null target +# +# Same as test_shmemzc_01 but with --rdonly_shmem_buf: pages are pinned +# without FOLL_WRITE (UBLK_BUF_F_READ). Write I/O works because +# the server only reads from the shared buffer. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "null target hugetlbfs shmem zero-copy rdonly_buf test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! 
grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +dev_id=$(_add_ublk_dev -t null --shmem_zc --htlb "$HTLB_FILE" --rdonly_shmem_buf) +_check_add_dev $TID $? + +fio --name=htlb_zc_rdonly \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=io_uring \ + --rw=randwrite \ + --direct=1 \ + --bs=4k \ + --size=4M \ + --iodepth=32 \ + --mem=mmaphuge:"$HTLB_FILE" \ + > /dev/null 2>&1 +ERR_CODE=$? + +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE |
