summary refs log tree commit diff
path: root/tools
diff options
context:
space:
mode:
author	Linus Torvalds <torvalds@linux-foundation.org>	2026-04-14 01:51:31 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-04-14 01:51:31 +0300
commit	7fe6ac157b7e15c8976bd62ad7cb98e248884e83 (patch)
tree	64677a680f3bccc7efb8f4cfcb288006e1433cd3 /tools
parent	b8f82cb0d84d00c04cdbdce42f67df71b8507e8b (diff)
parent	36446de0c30c62b9d89502fd36c4904996d86ecd (diff)
download	linux-7fe6ac157b7e15c8976bd62ad7cb98e248884e83.tar.xz
Merge tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe:

 - Add shared memory zero-copy I/O support for ublk, bypassing per-I/O
   copies between kernel and userspace by matching registered buffer
   PFNs at I/O time. Includes selftests.

 - Refactor bio integrity to support filesystem initiated integrity
   operations and arbitrary buffer alignment.

 - Clean up bio allocation, splitting bio_alloc_bioset() into clear
   fast and slow paths. Add bio_await() and bio_submit_or_kill()
   helpers, unify synchronous bi_end_io callbacks.

 - Fix zone write plug refcount handling and plug removal races. Add
   support for serializing zone writes at QD=1 for rotational zoned
   devices, yielding significant throughput improvements.

 - Add SED-OPAL ioctls for Single User Mode management and a
   STACK_RESET command.

 - Add io_uring passthrough (uring_cmd) support to the BSG layer.

 - Replace pp_buf in partition scanning with struct seq_buf.

 - zloop improvements and cleanups.

 - drbd genl cleanup, switching to pre_doit/post_doit.

 - NVMe pull request via Keith:
     - Fabrics authentication updates
     - Enhanced block queue limits support
     - Workqueue usage updates
     - A new write zeroes device quirk
     - Tagset cleanup fix for loop device

 - MD pull requests via Yu Kuai:
     - Fix raid5 soft lockup in retry_aligned_read()
     - Fix raid10 deadlock with check operation and nowait requests
     - Fix raid1 overlapping writes on writemostly disks
     - Fix sysfs deadlock on array_state=clear
     - Proactive RAID-5 parity building with llbitmap, with
       write_zeroes_unmap optimization for initial sync
     - Fix llbitmap barrier ordering, rdev skipping, and bitmap_ops
       version mismatch fallback
     - Fix bcache use-after-free and uninitialized closure
     - Validate raid5 journal metadata payload size
     - Various cleanups

 - Various other fixes, improvements, and cleanups

* tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (146 commits)
  ublk: fix tautological comparison warning in ublk_ctrl_reg_buf
  scsi: bsg: fix buffer overflow in scsi_bsg_uring_cmd()
  block: refactor blkdev_zone_mgmt_ioctl
  MAINTAINERS: update ublk driver maintainer email
  Documentation: ublk: address review comments for SHMEM_ZC docs
  ublk: allow buffer registration before device is started
  ublk: replace xarray with IDA for shmem buffer index allocation
  ublk: simplify PFN range loop in __ublk_ctrl_reg_buf
  ublk: verify all pages in multi-page bvec fall within registered range
  ublk: widen ublk_shmem_buf_reg.len to __u64 for 4GB buffer support
  xfs: use bio_await in xfs_zone_gc_reset_sync
  block: add a bio_submit_or_kill helper
  block: factor out a bio_await helper
  block: unify the synchronous bi_end_io callbacks
  xfs: fix number of GC bvecs
  selftests/ublk: add read-only buffer registration test
  selftests/ublk: add filesystem fio verify test for shmem_zc
  selftests/ublk: add hugetlbfs shmem_zc test for loop target
  selftests/ublk: add shared memory zero-copy test
  selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target
  ...
Diffstat (limited to 'tools')
-rw-r--r--tools/testing/selftests/ublk/Makefile6
-rw-r--r--tools/testing/selftests/ublk/fault_inject.c52
-rw-r--r--tools/testing/selftests/ublk/file_backed.c38
-rw-r--r--tools/testing/selftests/ublk/kublk.c354
-rw-r--r--tools/testing/selftests/ublk/kublk.h18
-rwxr-xr-xtools/testing/selftests/ublk/test_common.sh15
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_17.sh35
-rwxr-xr-xtools/testing/selftests/ublk/test_shmemzc_01.sh72
-rwxr-xr-xtools/testing/selftests/ublk/test_shmemzc_02.sh68
-rwxr-xr-xtools/testing/selftests/ublk/test_shmemzc_03.sh69
-rwxr-xr-xtools/testing/selftests/ublk/test_shmemzc_04.sh72
11 files changed, 790 insertions, 9 deletions
diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 8ac2d4a682a1..ec6a8ce83d38 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -18,6 +18,7 @@ TEST_PROGS += test_generic_10.sh
TEST_PROGS += test_generic_12.sh
TEST_PROGS += test_generic_13.sh
TEST_PROGS += test_generic_16.sh
+TEST_PROGS += test_generic_17.sh
TEST_PROGS += test_batch_01.sh
TEST_PROGS += test_batch_02.sh
@@ -51,6 +52,11 @@ TEST_PROGS += test_stripe_06.sh
TEST_PROGS += test_part_01.sh
TEST_PROGS += test_part_02.sh
+TEST_PROGS += test_shmemzc_01.sh
+TEST_PROGS += test_shmemzc_02.sh
+TEST_PROGS += test_shmemzc_03.sh
+TEST_PROGS += test_shmemzc_04.sh
+
TEST_PROGS += test_stress_01.sh
TEST_PROGS += test_stress_02.sh
TEST_PROGS += test_stress_03.sh
diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c
index 3b897f69c014..150896e02ff8 100644
--- a/tools/testing/selftests/ublk/fault_inject.c
+++ b/tools/testing/selftests/ublk/fault_inject.c
@@ -10,11 +10,17 @@
#include "kublk.h"
+struct fi_opts {
+ long long delay_ns;
+ bool die_during_fetch;
+};
+
static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx,
struct ublk_dev *dev)
{
const struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
unsigned long dev_size = 250UL << 30;
+ struct fi_opts *opts = NULL;
if (ctx->auto_zc_fallback) {
ublk_err("%s: not support auto_zc_fallback\n", __func__);
@@ -35,17 +41,52 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx,
};
ublk_set_integrity_params(ctx, &dev->tgt.params);
- dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000);
+ opts = calloc(1, sizeof(*opts));
+ if (!opts) {
+ ublk_err("%s: couldn't allocate memory for opts\n", __func__);
+ return -ENOMEM;
+ }
+
+ opts->delay_ns = ctx->fault_inject.delay_us * 1000;
+ opts->die_during_fetch = ctx->fault_inject.die_during_fetch;
+ dev->private_data = opts;
+
return 0;
}
+static void ublk_fault_inject_pre_fetch_io(struct ublk_thread *t,
+ struct ublk_queue *q, int tag,
+ bool batch)
+{
+ struct fi_opts *opts = q->dev->private_data;
+
+ if (!opts->die_during_fetch)
+ return;
+
+ /*
+ * Each queue fetches its IOs in increasing order of tags, so
+ * dying just before we're about to fetch tag 1 (regardless of
+ * what queue we're on) guarantees that we've fetched a nonempty
+ * proper subset of the tags on that queue.
+ */
+ if (tag == 1) {
+ /*
+ * Ensure our commands are actually live in the kernel
+ * before we die.
+ */
+ io_uring_submit(&t->ring);
+ raise(SIGKILL);
+ }
+}
+
static int ublk_fault_inject_queue_io(struct ublk_thread *t,
struct ublk_queue *q, int tag)
{
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
struct io_uring_sqe *sqe;
+ struct fi_opts *opts = q->dev->private_data;
struct __kernel_timespec ts = {
- .tv_nsec = (long long)q->dev->private_data,
+ .tv_nsec = opts->delay_ns,
};
ublk_io_alloc_sqes(t, &sqe, 1);
@@ -77,29 +118,34 @@ static void ublk_fault_inject_cmd_line(struct dev_ctx *ctx, int argc, char *argv
{
static const struct option longopts[] = {
{ "delay_us", 1, NULL, 0 },
+ { "die_during_fetch", 1, NULL, 0 },
{ 0, 0, 0, 0 }
};
int option_idx, opt;
ctx->fault_inject.delay_us = 0;
+ ctx->fault_inject.die_during_fetch = false;
while ((opt = getopt_long(argc, argv, "",
longopts, &option_idx)) != -1) {
switch (opt) {
case 0:
if (!strcmp(longopts[option_idx].name, "delay_us"))
ctx->fault_inject.delay_us = strtoll(optarg, NULL, 10);
+ if (!strcmp(longopts[option_idx].name, "die_during_fetch"))
+ ctx->fault_inject.die_during_fetch = strtoll(optarg, NULL, 10);
}
}
}
static void ublk_fault_inject_usage(const struct ublk_tgt_ops *ops)
{
- printf("\tfault_inject: [--delay_us us (default 0)]\n");
+ printf("\tfault_inject: [--delay_us us (default 0)] [--die_during_fetch 1]\n");
}
const struct ublk_tgt_ops fault_inject_tgt_ops = {
.name = "fault_inject",
.init_tgt = ublk_fault_inject_tgt_init,
+ .pre_fetch_io = ublk_fault_inject_pre_fetch_io,
.queue_io = ublk_fault_inject_queue_io,
.tgt_io_done = ublk_fault_inject_tgt_io_done,
.parse_cmd_line = ublk_fault_inject_cmd_line,
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 228af2580ac6..d28da98f917a 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -27,6 +27,40 @@ static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q,
return 1;
}
+/*
+ * Shared memory zero-copy I/O: when UBLK_IO_F_SHMEM_ZC is set, the
+ * request's data lives in a registered shared memory buffer. Decode
+ * index + offset from iod->addr and use the server's mmap of that
+ * buffer as the I/O buffer for the backing file.
+ */
+static int loop_queue_shmem_zc_io(struct ublk_thread *t, struct ublk_queue *q,
+ const struct ublksrv_io_desc *iod, int tag)
+{
+ unsigned ublk_op = ublksrv_get_op(iod);
+ enum io_uring_op op = ublk_to_uring_op(iod, 0);
+ __u64 file_offset = iod->start_sector << 9;
+ __u32 len = iod->nr_sectors << 9;
+ __u32 shmem_idx = ublk_shmem_zc_index(iod->addr);
+ __u32 shmem_off = ublk_shmem_zc_offset(iod->addr);
+ struct io_uring_sqe *sqe[1];
+ void *addr;
+
+ if (shmem_idx >= UBLK_BUF_MAX || !shmem_table[shmem_idx].mmap_base)
+ return -EINVAL;
+
+ addr = shmem_table[shmem_idx].mmap_base + shmem_off;
+
+ ublk_io_alloc_sqes(t, sqe, 1);
+ if (!sqe[0])
+ return -ENOMEM;
+
+ io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1),
+ addr, len, file_offset);
+ io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
+ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
+ return 1;
+}
+
static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
const struct ublksrv_io_desc *iod, int tag)
{
@@ -41,6 +75,10 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
void *addr = io->buf_addr;
unsigned short buf_index = ublk_io_buf_idx(t, q, tag);
+ /* shared memory zero-copy path */
+ if (iod->op_flags & UBLK_IO_F_SHMEM_ZC)
+ return loop_queue_shmem_zc_io(t, q, iod, tag);
+
if (iod->op_flags & UBLK_IO_F_INTEGRITY) {
ublk_io_alloc_sqes(t, sqe, 1);
/* Use second backing file for integrity data */
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index e1c3b3c55e56..fbd9b1e7342a 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -4,6 +4,7 @@
*/
#include <linux/fs.h>
+#include <sys/un.h>
#include "kublk.h"
#define MAX_NR_TGT_ARG 64
@@ -796,6 +797,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t)
q = &t->dev->q[q_id];
io = &q->ios[tag];
io->buf_index = j++;
+ if (q->tgt_ops->pre_fetch_io)
+ q->tgt_ops->pre_fetch_io(t, q, tag, false);
ublk_queue_io_cmd(t, io);
}
} else {
@@ -807,6 +810,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t)
for (i = 0; i < q->q_depth; i++) {
io = &q->ios[i];
io->buf_index = i;
+ if (q->tgt_ops->pre_fetch_io)
+ q->tgt_ops->pre_fetch_io(t, q, i, false);
ublk_queue_io_cmd(t, io);
}
}
@@ -983,6 +988,9 @@ static void ublk_batch_setup_queues(struct ublk_thread *t)
if (t->q_map[i] == 0)
continue;
+ if (q->tgt_ops->pre_fetch_io)
+ q->tgt_ops->pre_fetch_io(t, q, 0, true);
+
ret = ublk_batch_queue_prep_io_cmds(t, q);
ublk_assert(ret >= 0);
}
@@ -1085,13 +1093,316 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev,
}
+/*
+ * Shared memory registration socket listener.
+ *
+ * The parent daemon context listens on a per-device unix socket at
+ * /run/ublk/ublkb<dev_id>.sock for shared memory registration requests
+ * from clients. Clients send a memfd via SCM_RIGHTS; the server
+ * registers it with the kernel, mmaps it, and returns the assigned index.
+ */
+#define UBLK_SHMEM_SOCK_DIR "/run/ublk"
+
+/* defined in kublk.h, shared with file_backed.c (loop target) */
+struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX];
+int shmem_count;
+
+static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len)
+{
+ snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id);
+}
+
+static int ublk_shmem_sock_create(int dev_id)
+{
+ struct sockaddr_un addr = { .sun_family = AF_UNIX };
+ char path[108];
+ int fd;
+
+ mkdir(UBLK_SHMEM_SOCK_DIR, 0755);
+ ublk_shmem_sock_path(dev_id, path, sizeof(path));
+ unlink(path);
+
+ fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return -1;
+
+ snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path);
+ if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+ close(fd);
+ return -1;
+ }
+
+ listen(fd, 4);
+ ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", path);
+ return fd;
+}
+
+static void ublk_shmem_sock_destroy(int dev_id, int sock_fd)
+{
+ char path[108];
+
+ if (sock_fd >= 0)
+ close(sock_fd);
+ ublk_shmem_sock_path(dev_id, path, sizeof(path));
+ unlink(path);
+}
+
+/* Receive a memfd from a client via SCM_RIGHTS */
+static int ublk_shmem_recv_fd(int client_fd)
+{
+ char buf[1];
+ struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
+ union {
+ char cmsg_buf[CMSG_SPACE(sizeof(int))];
+ struct cmsghdr align;
+ } u;
+ struct msghdr msg = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = u.cmsg_buf,
+ .msg_controllen = sizeof(u.cmsg_buf),
+ };
+ struct cmsghdr *cmsg;
+
+ if (recvmsg(client_fd, &msg, 0) <= 0)
+ return -1;
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
+ cmsg->cmsg_type != SCM_RIGHTS)
+ return -1;
+
+ return *(int *)CMSG_DATA(cmsg);
+}
+
+/* Register a shared memory buffer: store fd, mmap it, return index */
+static int ublk_shmem_register(int shmem_fd)
+{
+ off_t size;
+ void *base;
+ int idx;
+
+ if (shmem_count >= UBLK_BUF_MAX)
+ return -1;
+
+ size = lseek(shmem_fd, 0, SEEK_END);
+ if (size <= 0)
+ return -1;
+
+ base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ shmem_fd, 0);
+ if (base == MAP_FAILED)
+ return -1;
+
+ idx = shmem_count++;
+ shmem_table[idx].fd = shmem_fd;
+ shmem_table[idx].mmap_base = base;
+ shmem_table[idx].size = size;
+
+ ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n",
+ idx, shmem_fd, (size_t)size);
+ return idx;
+}
+
+static void ublk_shmem_unregister_all(void)
+{
+ int i;
+
+ for (i = 0; i < shmem_count; i++) {
+ if (shmem_table[i].mmap_base) {
+ munmap(shmem_table[i].mmap_base,
+ shmem_table[i].size);
+ close(shmem_table[i].fd);
+ shmem_table[i].mmap_base = NULL;
+ }
+ }
+ shmem_count = 0;
+}
+
+static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size,
+ __u32 flags)
+{
+ struct ublk_shmem_buf_reg buf_reg = {
+ .addr = (unsigned long)addr,
+ .len = size,
+ .flags = flags,
+ };
+ struct ublk_ctrl_cmd_data data = {
+ .cmd_op = UBLK_U_CMD_REG_BUF,
+ .flags = CTRL_CMD_HAS_BUF,
+ .addr = (unsigned long)&buf_reg,
+ .len = sizeof(buf_reg),
+ };
+
+ return __ublk_ctrl_cmd(dev, &data);
+}
+
+/*
+ * Handle one client connection: receive memfd, mmap it, register
+ * the VA range with kernel, send back the assigned index.
+ */
+static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev)
+{
+ int client_fd, memfd, idx, ret;
+ int32_t reply;
+ off_t size;
+ void *base;
+
+ client_fd = accept(sock_fd, NULL, NULL);
+ if (client_fd < 0)
+ return;
+
+ memfd = ublk_shmem_recv_fd(client_fd);
+ if (memfd < 0) {
+ reply = -1;
+ goto out;
+ }
+
+ /* mmap the memfd in server address space */
+ size = lseek(memfd, 0, SEEK_END);
+ if (size <= 0) {
+ reply = -1;
+ close(memfd);
+ goto out;
+ }
+ base = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, memfd, 0);
+ if (base == MAP_FAILED) {
+ reply = -1;
+ close(memfd);
+ goto out;
+ }
+
+ /* Register server's VA range with kernel for PFN matching */
+ ret = ublk_ctrl_reg_buf(dev, base, size, 0);
+ if (ret < 0) {
+ ublk_dbg(UBLK_DBG_DEV,
+ "shmem_zc: kernel reg failed %d\n", ret);
+ munmap(base, size);
+ close(memfd);
+ reply = ret;
+ goto out;
+ }
+
+ /* Store in table for I/O handling */
+ idx = ublk_shmem_register(memfd);
+ if (idx >= 0) {
+ shmem_table[idx].mmap_base = base;
+ shmem_table[idx].size = size;
+ }
+ reply = idx;
+out:
+ send(client_fd, &reply, sizeof(reply), 0);
+ close(client_fd);
+}
+
+struct shmem_listener_info {
+ int dev_id;
+ int stop_efd; /* eventfd to signal listener to stop */
+ int sock_fd; /* listener socket fd (output) */
+ struct ublk_dev *dev;
+};
+
+/*
+ * Socket listener thread: runs in the parent daemon context alongside
+ * the I/O threads. Accepts shared memory registration requests from
+ * clients via SCM_RIGHTS. Exits when stop_efd is signaled.
+ */
+static void *ublk_shmem_listener_fn(void *data)
+{
+ struct shmem_listener_info *info = data;
+ struct pollfd pfds[2];
+
+ info->sock_fd = ublk_shmem_sock_create(info->dev_id);
+ if (info->sock_fd < 0)
+ return NULL;
+
+ pfds[0].fd = info->sock_fd;
+ pfds[0].events = POLLIN;
+ pfds[1].fd = info->stop_efd;
+ pfds[1].events = POLLIN;
+
+ while (1) {
+ int ret = poll(pfds, 2, -1);
+
+ if (ret < 0)
+ break;
+
+ /* Stop signal from parent */
+ if (pfds[1].revents & POLLIN)
+ break;
+
+ /* Client connection */
+ if (pfds[0].revents & POLLIN)
+ ublk_shmem_handle_client(info->sock_fd, info->dev);
+ }
+
+ return NULL;
+}
+
+static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx,
+ struct ublk_dev *dev)
+{
+ int fd, idx, ret;
+ struct stat st;
+ void *base;
+
+ fd = open(ctx->htlb_path, O_RDWR);
+ if (fd < 0) {
+ ublk_err("htlb: can't open %s\n", ctx->htlb_path);
+ return -errno;
+ }
+
+ if (fstat(fd, &st) < 0 || st.st_size <= 0) {
+ ublk_err("htlb: invalid file size\n");
+ close(fd);
+ return -EINVAL;
+ }
+
+ base = mmap(NULL, st.st_size,
+ ctx->rdonly_shmem_buf ? PROT_READ : PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+ if (base == MAP_FAILED) {
+ ublk_err("htlb: mmap failed\n");
+ close(fd);
+ return -ENOMEM;
+ }
+
+ ret = ublk_ctrl_reg_buf(dev, base, st.st_size,
+ ctx->rdonly_shmem_buf ? UBLK_SHMEM_BUF_READ_ONLY : 0);
+ if (ret < 0) {
+ ublk_err("htlb: reg_buf failed: %d\n", ret);
+ munmap(base, st.st_size);
+ close(fd);
+ return ret;
+ }
+
+ if (shmem_count >= UBLK_BUF_MAX) {
+ munmap(base, st.st_size);
+ close(fd);
+ return -ENOMEM;
+ }
+
+ idx = shmem_count++;
+ shmem_table[idx].fd = fd;
+ shmem_table[idx].mmap_base = base;
+ shmem_table[idx].size = st.st_size;
+
+ ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n",
+ idx, (size_t)st.st_size);
+ return 0;
+}
+
static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
+ struct shmem_listener_info linfo = {};
struct ublk_thread_info *tinfo;
unsigned long long extra_flags = 0;
cpu_set_t *affinity_buf;
unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL;
+ uint64_t stop_val = 1;
+ pthread_t listener;
void *thread_ret;
sem_t ready;
int ret, i;
@@ -1180,15 +1491,44 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
goto fail_start;
}
+ if (ctx->htlb_path) {
+ ret = ublk_shmem_htlb_setup(ctx, dev);
+ if (ret < 0) {
+ ublk_err("htlb setup failed: %d\n", ret);
+ ublk_ctrl_stop_dev(dev);
+ goto fail_start;
+ }
+ }
+
ublk_ctrl_get_info(dev);
if (ctx->fg)
ublk_ctrl_dump(dev);
else
ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
fail_start:
- /* wait until we are terminated */
- for (i = 0; i < dev->nthreads; i++)
+ /*
+ * Wait for I/O threads to exit. While waiting, a listener
+ * thread accepts shared memory registration requests from
+ * clients via a per-device unix socket (SCM_RIGHTS fd passing).
+ */
+ linfo.dev_id = dinfo->dev_id;
+ linfo.dev = dev;
+ linfo.stop_efd = eventfd(0, 0);
+ if (linfo.stop_efd >= 0)
+ pthread_create(&listener, NULL,
+ ublk_shmem_listener_fn, &linfo);
+
+ for (i = 0; i < (int)dev->nthreads; i++)
pthread_join(tinfo[i].thread, &thread_ret);
+
+ /* Signal listener thread to stop and wait for it */
+ if (linfo.stop_efd >= 0) {
+ write(linfo.stop_efd, &stop_val, sizeof(stop_val));
+ pthread_join(listener, NULL);
+ close(linfo.stop_efd);
+ ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd);
+ }
+ ublk_shmem_unregister_all();
free(tinfo);
fail:
for (i = 0; i < dinfo->nr_hw_queues; i++)
@@ -1618,6 +1958,7 @@ static int cmd_dev_get_features(void)
FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
FEAT_NAME(UBLK_F_BATCH_IO),
FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN),
+ FEAT_NAME(UBLK_F_SHMEM_ZC),
};
struct ublk_dev *dev;
__u64 features = 0;
@@ -1790,6 +2131,9 @@ int main(int argc, char *argv[])
{ "safe", 0, NULL, 0 },
{ "batch", 0, NULL, 'b'},
{ "no_auto_part_scan", 0, NULL, 0 },
+ { "shmem_zc", 0, NULL, 0 },
+ { "htlb", 1, NULL, 0 },
+ { "rdonly_shmem_buf", 0, NULL, 0 },
{ 0, 0, 0, 0 }
};
const struct ublk_tgt_ops *ops = NULL;
@@ -1905,6 +2249,12 @@ int main(int argc, char *argv[])
ctx.safe_stop = 1;
if (!strcmp(longopts[option_idx].name, "no_auto_part_scan"))
ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN;
+ if (!strcmp(longopts[option_idx].name, "shmem_zc"))
+ ctx.flags |= UBLK_F_SHMEM_ZC;
+ if (!strcmp(longopts[option_idx].name, "htlb"))
+ ctx.htlb_path = strdup(optarg);
+ if (!strcmp(longopts[option_idx].name, "rdonly_shmem_buf"))
+ ctx.rdonly_shmem_buf = 1;
break;
case '?':
/*
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 02f0c55d006b..742c41d77df1 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -60,6 +60,7 @@ struct stripe_ctx {
struct fault_inject_ctx {
/* fault_inject */
unsigned long delay_us;
+ bool die_during_fetch;
};
struct dev_ctx {
@@ -80,6 +81,7 @@ struct dev_ctx {
unsigned int no_ublk_fixed_fd:1;
unsigned int safe_stop:1;
unsigned int no_auto_part_scan:1;
+ unsigned int rdonly_shmem_buf:1;
__u32 integrity_flags;
__u8 metadata_size;
__u8 pi_offset;
@@ -95,6 +97,8 @@ struct dev_ctx {
/* for 'update_size' command */
unsigned long long size;
+ char *htlb_path;
+
union {
struct stripe_ctx stripe;
struct fault_inject_ctx fault_inject;
@@ -138,6 +142,8 @@ struct ublk_tgt_ops {
int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *);
void (*deinit_tgt)(struct ublk_dev *);
+ void (*pre_fetch_io)(struct ublk_thread *t, struct ublk_queue *q,
+ int tag, bool batch);
int (*queue_io)(struct ublk_thread *, struct ublk_queue *, int tag);
void (*tgt_io_done)(struct ublk_thread *, struct ublk_queue *,
const struct io_uring_cqe *);
@@ -599,6 +605,18 @@ static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *
}
}
+/* shared memory zero-copy support */
+#define UBLK_BUF_MAX 256
+
+struct ublk_shmem_entry {
+ int fd;
+ void *mmap_base;
+ size_t size;
+};
+
+extern struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX];
+extern int shmem_count;
+
extern const struct ublk_tgt_ops null_tgt_ops;
extern const struct ublk_tgt_ops loop_tgt_ops;
extern const struct ublk_tgt_ops stripe_tgt_ops;
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 163a40007910..af2ea4fa1111 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -88,6 +88,7 @@ _remove_tmp_dir() {
_mkfs_mount_test()
{
local dev=$1
+ shift
local err_code=0
local mnt_dir;
@@ -99,12 +100,17 @@ _mkfs_mount_test()
fi
mount -t ext4 "$dev" "$mnt_dir" > /dev/null 2>&1
+ if [ $# -gt 0 ]; then
+ cd "$mnt_dir" && "$@"
+ err_code=$?
+ cd - > /dev/null
+ fi
umount "$dev"
- err_code=$?
- _remove_tmp_dir "$mnt_dir"
- if [ $err_code -ne 0 ]; then
- return $err_code
+ if [ $err_code -eq 0 ]; then
+ err_code=$?
fi
+ _remove_tmp_dir "$mnt_dir"
+ return $err_code
}
_check_root() {
@@ -132,6 +138,7 @@ _prep_test() {
local base_dir=${TMPDIR:-./ublktest-dir}
mkdir -p "$base_dir"
UBLK_TEST_DIR=$(mktemp -d ${base_dir}/${TID}.XXXXXX)
+ UBLK_TEST_DIR=$(realpath ${UBLK_TEST_DIR})
UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX)
[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*"
echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg
diff --git a/tools/testing/selftests/ublk/test_generic_17.sh b/tools/testing/selftests/ublk/test_generic_17.sh
new file mode 100755
index 000000000000..2278b5fc9dba
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_17.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+_prep_test "fault_inject" "teardown after incomplete recovery"
+
+# First start and stop a ublk server with device configured for recovery
+dev_id=$(_add_ublk_dev -t fault_inject -r 1)
+_check_add_dev $TID $?
+state=$(__ublk_kill_daemon "${dev_id}" "QUIESCED")
+if [ "$state" != "QUIESCED" ]; then
+ echo "device isn't quiesced($state) after $action"
+ ERR_CODE=255
+fi
+
+# Then recover the device, but use --die_during_fetch to have the ublk
+# server die while a queue has some (but not all) I/Os fetched
+${UBLK_PROG} recover -n "${dev_id}" --foreground -t fault_inject --die_during_fetch 1
+RECOVER_RES=$?
+# 137 is the result when dying of SIGKILL
+if (( RECOVER_RES != 137 )); then
+ echo "recover command exited with unexpected code ${RECOVER_RES}!"
+ ERR_CODE=255
+fi
+
+# Clean up the device. This can only succeed once teardown of the above
+# exited ublk server completes. So if teardown never completes, we will
+# time out here
+_ublk_del_dev "${dev_id}"
+
+_cleanup_test "fault_inject"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_shmemzc_01.sh b/tools/testing/selftests/ublk/test_shmemzc_01.sh
new file mode 100755
index 000000000000..47210af2aa20
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_shmemzc_01.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Test: shmem_zc with hugetlbfs buffer on null target
+#
+# kublk and fio both mmap the same hugetlbfs file (MAP_SHARED),
+# so they share physical pages. The kernel PFN match enables
+# zero-copy I/O without socket-based fd passing.
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+_prep_test "shmem_zc" "null target hugetlbfs shmem zero-copy test"
+
+if ! _have_program fio; then
+ echo "SKIP: fio not available"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+if ! grep -q hugetlbfs /proc/filesystems; then
+ echo "SKIP: hugetlbfs not supported"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+# Allocate hugepages
+OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages)
+echo 10 > /proc/sys/vm/nr_hugepages
+NR_HP=$(cat /proc/sys/vm/nr_hugepages)
+if [ "$NR_HP" -lt 2 ]; then
+ echo "SKIP: cannot allocate hugepages"
+ echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+ exit "$UBLK_SKIP_CODE"
+fi
+
+# Mount hugetlbfs
+HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX")
+if ! mount -t hugetlbfs none "$HTLB_MNT"; then
+ echo "SKIP: cannot mount hugetlbfs"
+ rmdir "$HTLB_MNT"
+ echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+ exit "$UBLK_SKIP_CODE"
+fi
+
+HTLB_FILE="$HTLB_MNT/ublk_buf"
+fallocate -l 4M "$HTLB_FILE"
+
+dev_id=$(_add_ublk_dev -t null --shmem_zc --htlb "$HTLB_FILE")
+_check_add_dev $TID $?
+
+fio --name=htlb_zc \
+ --filename=/dev/ublkb"${dev_id}" \
+ --ioengine=io_uring \
+ --rw=randwrite \
+ --direct=1 \
+ --bs=4k \
+ --size=4M \
+ --iodepth=32 \
+ --mem=mmaphuge:"$HTLB_FILE" \
+ > /dev/null 2>&1
+ERR_CODE=$?
+
+# Delete device first so daemon releases the htlb mmap
+_ublk_del_dev "${dev_id}"
+
+rm -f "$HTLB_FILE"
+umount "$HTLB_MNT"
+rmdir "$HTLB_MNT"
+echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+
+_cleanup_test "shmem_zc"
+
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_shmemzc_02.sh b/tools/testing/selftests/ublk/test_shmemzc_02.sh
new file mode 100755
index 000000000000..aed9262494e9
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_shmemzc_02.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Test: shmem_zc with hugetlbfs buffer on loop target
+#
+# kublk and fio both mmap the same hugetlbfs file (MAP_SHARED),
+# so they share physical pages. The kernel PFN match enables
+# zero-copy I/O without socket-based fd passing.
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+_prep_test "shmem_zc" "loop target hugetlbfs shmem zero-copy test"
+
+if ! _have_program fio; then
+ echo "SKIP: fio not available"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+if ! grep -q hugetlbfs /proc/filesystems; then
+ echo "SKIP: hugetlbfs not supported"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+# Allocate hugepages
+OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages)
+echo 10 > /proc/sys/vm/nr_hugepages
+NR_HP=$(cat /proc/sys/vm/nr_hugepages)
+if [ "$NR_HP" -lt 2 ]; then
+ echo "SKIP: cannot allocate hugepages"
+ echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+ exit "$UBLK_SKIP_CODE"
+fi
+
+# Mount hugetlbfs
+HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX")
+if ! mount -t hugetlbfs none "$HTLB_MNT"; then
+ echo "SKIP: cannot mount hugetlbfs"
+ rmdir "$HTLB_MNT"
+ echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+ exit "$UBLK_SKIP_CODE"
+fi
+
+HTLB_FILE="$HTLB_MNT/ublk_buf"
+fallocate -l 4M "$HTLB_FILE"
+
+_create_backfile 0 128M
+BACKFILE="${UBLK_BACKFILES[0]}"
+
+dev_id=$(_add_ublk_dev -t loop --shmem_zc --htlb "$HTLB_FILE" "$BACKFILE")
+_check_add_dev $TID $?
+
+_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" \
+ --size=128M \
+ --mem=mmaphuge:"$HTLB_FILE"
+ERR_CODE=$?
+
+# Delete device first so daemon releases the htlb mmap
+_ublk_del_dev "${dev_id}"
+
+rm -f "$HTLB_FILE"
+umount "$HTLB_MNT"
+rmdir "$HTLB_MNT"
+echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+
+_cleanup_test "shmem_zc"
+
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_shmemzc_03.sh b/tools/testing/selftests/ublk/test_shmemzc_03.sh
new file mode 100755
index 000000000000..db967a9ffe81
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_shmemzc_03.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Test: shmem_zc with fio verify over filesystem on loop target
+#
+# mkfs + mount ext4 on the ublk device, then run fio verify on a
+# file inside that filesystem. Exercises the full stack:
+# filesystem -> block layer -> ublk shmem_zc -> loop target backing file.
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+_prep_test "shmem_zc" "loop target hugetlbfs shmem zero-copy fs verify test"
+
+if ! _have_program fio; then
+ echo "SKIP: fio not available"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+if ! grep -q hugetlbfs /proc/filesystems; then
+ echo "SKIP: hugetlbfs not supported"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+# Allocate hugepages
+OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages)
+echo 10 > /proc/sys/vm/nr_hugepages
+NR_HP=$(cat /proc/sys/vm/nr_hugepages)
+if [ "$NR_HP" -lt 2 ]; then
+ echo "SKIP: cannot allocate hugepages"
+ echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+ exit "$UBLK_SKIP_CODE"
+fi
+
+# Mount hugetlbfs
+HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX")
+if ! mount -t hugetlbfs none "$HTLB_MNT"; then
+ echo "SKIP: cannot mount hugetlbfs"
+ rmdir "$HTLB_MNT"
+ echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+ exit "$UBLK_SKIP_CODE"
+fi
+
+HTLB_FILE="$HTLB_MNT/ublk_buf"
+fallocate -l 4M "$HTLB_FILE"
+
+_create_backfile 0 256M
+BACKFILE="${UBLK_BACKFILES[0]}"
+
+dev_id=$(_add_ublk_dev -t loop --shmem_zc --htlb "$HTLB_FILE" "$BACKFILE")
+_check_add_dev $TID $?
+
+_mkfs_mount_test /dev/ublkb"${dev_id}" \
+ _run_fio_verify_io --filename=testfile \
+ --size=128M \
+ --mem=mmaphuge:"$HTLB_FILE"
+ERR_CODE=$?
+
+# Delete device first so daemon releases the htlb mmap
+_ublk_del_dev "${dev_id}"
+
+rm -f "$HTLB_FILE"
+umount "$HTLB_MNT"
+rmdir "$HTLB_MNT"
+echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+
+_cleanup_test "shmem_zc"
+
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_shmemzc_04.sh b/tools/testing/selftests/ublk/test_shmemzc_04.sh
new file mode 100755
index 000000000000..899de088ece4
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_shmemzc_04.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Test: shmem_zc with read-only buffer registration on null target
+#
+# Same as test_shmemzc_01 but with --rdonly_shmem_buf: pages are pinned
+# without FOLL_WRITE (UBLK_BUF_F_READ). Write I/O works because
+# the server only reads from the shared buffer.
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+_prep_test "shmem_zc" "null target hugetlbfs shmem zero-copy rdonly_buf test"
+
+if ! _have_program fio; then
+ echo "SKIP: fio not available"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+if ! grep -q hugetlbfs /proc/filesystems; then
+ echo "SKIP: hugetlbfs not supported"
+ exit "$UBLK_SKIP_CODE"
+fi
+
+# Allocate hugepages
+OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages)
+echo 10 > /proc/sys/vm/nr_hugepages
+NR_HP=$(cat /proc/sys/vm/nr_hugepages)
+if [ "$NR_HP" -lt 2 ]; then
+ echo "SKIP: cannot allocate hugepages"
+ echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+ exit "$UBLK_SKIP_CODE"
+fi
+
+# Mount hugetlbfs
+HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX")
+if ! mount -t hugetlbfs none "$HTLB_MNT"; then
+ echo "SKIP: cannot mount hugetlbfs"
+ rmdir "$HTLB_MNT"
+ echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+ exit "$UBLK_SKIP_CODE"
+fi
+
+HTLB_FILE="$HTLB_MNT/ublk_buf"
+fallocate -l 4M "$HTLB_FILE"
+
+dev_id=$(_add_ublk_dev -t null --shmem_zc --htlb "$HTLB_FILE" --rdonly_shmem_buf)
+_check_add_dev $TID $?
+
+fio --name=htlb_zc_rdonly \
+ --filename=/dev/ublkb"${dev_id}" \
+ --ioengine=io_uring \
+ --rw=randwrite \
+ --direct=1 \
+ --bs=4k \
+ --size=4M \
+ --iodepth=32 \
+ --mem=mmaphuge:"$HTLB_FILE" \
+ > /dev/null 2>&1
+ERR_CODE=$?
+
+# Delete device first so daemon releases the htlb mmap
+_ublk_del_dev "${dev_id}"
+
+rm -f "$HTLB_FILE"
+umount "$HTLB_MNT"
+rmdir "$HTLB_MNT"
+echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages
+
+_cleanup_test "shmem_zc"
+
+_show_result $TID $ERR_CODE