summary | refs | log | tree | commit | diff
path: root/include
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2022-05-23 23:06:15 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2022-05-23 23:06:15 +0300
commit 9836e93c0a7e031ac6a71c56171c229de1eea7cf (patch)
tree f53f3460e86752c50aac9ee16b4426c84d277899 /include
parent e1a8fde7203fa8a3e3f35d4f9df47477d23529c1 (diff)
parent 3fe07bcd800d6e5e4e4263ca2564d69095c157bf (diff)
download linux-9836e93c0a7e031ac6a71c56171c229de1eea7cf.tar.xz
Merge tag 'for-5.19/io_uring-passthrough-2022-05-22' of git://git.kernel.dk/linux-block
Pull io_uring NVMe command passthrough from Jens Axboe:

 "On top of everything else, this adds support for passthrough for
  io_uring. The initial feature for this is NVMe passthrough support,
  which allows non-filesystem based IO commands and admin commands.

  To support this, io_uring grows support for SQE and CQE members that
  are twice as big, allowing to pass in a full NVMe command without
  having to copy data around. And to complete with more than just a
  single 32-bit value as the output"

* tag 'for-5.19/io_uring-passthrough-2022-05-22' of git://git.kernel.dk/linux-block: (22 commits)
  io_uring: cleanup handling of the two task_work lists
  nvme: enable uring-passthrough for admin commands
  nvme: helper for uring-passthrough checks
  blk-mq: fix passthrough plugging
  nvme: add vectored-io support for uring-cmd
  nvme: wire-up uring-cmd support for io-passthru on char-device.
  nvme: refactor nvme_submit_user_cmd()
  block: wire-up support for passthrough plugging
  fs,io_uring: add infrastructure for uring-cmd
  io_uring: support CQE32 for nop operation
  io_uring: enable CQE32
  io_uring: support CQE32 in /proc info
  io_uring: add tracing for additional CQE32 fields
  io_uring: overflow processing for CQE32
  io_uring: flush completions for CQE32
  io_uring: modify io_get_cqe for CQE32
  io_uring: add CQE32 completion processing
  io_uring: add CQE32 setup processing
  io_uring: change ring size calculation for CQE32
  io_uring: store add. return values for CQE32
  ...
Diffstat (limited to 'include')
-rw-r--r-- include/linux/fs.h 2
-rw-r--r-- include/linux/io_uring.h 33
-rw-r--r-- include/trace/events/io_uring.h 18
-rw-r--r-- include/uapi/linux/io_uring.h 24
-rw-r--r-- include/uapi/linux/nvme_ioctl.h 28
5 files changed, 99 insertions, 6 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbde95387a23..87b5af1d9fbe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1953,6 +1953,7 @@ struct dir_context {
#define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN)
struct iov_iter;
+struct io_uring_cmd;
struct file_operations {
struct module *owner;
@@ -1995,6 +1996,7 @@ struct file_operations {
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
+ int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
} __randomize_layout;
struct inode_operations {
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 24651c229ed2..4a2f6cc5a492 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,7 +5,32 @@
#include <linux/sched.h>
#include <linux/xarray.h>
+enum io_uring_cmd_flags {
+ IO_URING_F_COMPLETE_DEFER = 1,
+ IO_URING_F_UNLOCKED = 2,
+ /* int's last bit, sign checks are usually faster than a bit test */
+ IO_URING_F_NONBLOCK = INT_MIN,
+
+ /* ctx state flags, for URING_CMD */
+ IO_URING_F_SQE128 = 4,
+ IO_URING_F_CQE32 = 8,
+ IO_URING_F_IOPOLL = 16,
+};
+
+struct io_uring_cmd {
+ struct file *file;
+ const void *cmd;
+ /* callback to defer completions to task context */
+ void (*task_work_cb)(struct io_uring_cmd *cmd);
+ u32 cmd_op;
+ u32 pad;
+ u8 pdu[32]; /* available inline for free use */
+};
+
#if defined(CONFIG_IO_URING)
+void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2);
+void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *));
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
@@ -30,6 +55,14 @@ static inline void io_uring_free(struct task_struct *tsk)
__io_uring_free(tsk);
}
#else
+static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
+ ssize_t ret2)
+{
+}
+static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+ void (*task_work_cb)(struct io_uring_cmd *))
+{
+}
static inline struct sock *io_uring_get_socket(struct file *file)
{
return NULL;
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 610e493234db..66fcc5a1a5b1 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -321,13 +321,16 @@ TRACE_EVENT(io_uring_fail_link,
* @user_data: user data associated with the request
* @res: result of the request
* @cflags: completion flags
+ * @extra1: extra 64-bit data for CQE32
+ * @extra2: extra 64-bit data for CQE32
*
*/
TRACE_EVENT(io_uring_complete,
- TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags),
+ TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags,
+ u64 extra1, u64 extra2),
- TP_ARGS(ctx, req, user_data, res, cflags),
+ TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -335,6 +338,8 @@ TRACE_EVENT(io_uring_complete,
__field( u64, user_data )
__field( int, res )
__field( unsigned, cflags )
+ __field( u64, extra1 )
+ __field( u64, extra2 )
),
TP_fast_assign(
@@ -343,12 +348,17 @@ TRACE_EVENT(io_uring_complete,
__entry->user_data = user_data;
__entry->res = res;
__entry->cflags = cflags;
+ __entry->extra1 = extra1;
+ __entry->extra2 = extra2;
),
- TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x",
+ TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
+ "extra1 %llu extra2 %llu ",
__entry->ctx, __entry->req,
__entry->user_data,
- __entry->res, __entry->cflags)
+ __entry->res, __entry->cflags,
+ (unsigned long long) __entry->extra1,
+ (unsigned long long) __entry->extra2)
);
/**
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index cc9544629eee..53e7dae92e42 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -22,6 +22,7 @@ struct io_uring_sqe {
union {
__u64 off; /* offset into file */
__u64 addr2;
+ __u32 cmd_op;
};
union {
__u64 addr; /* pointer to buffer or iovecs */
@@ -61,8 +62,17 @@ struct io_uring_sqe {
__s32 splice_fd_in;
__u32 file_index;
};
- __u64 addr3;
- __u64 __pad2[1];
+ union {
+ struct {
+ __u64 addr3;
+ __u64 __pad2[1];
+ };
+ /*
+ * If the ring is initialized with IORING_SETUP_SQE128, then
+ * this field is used for 80 bytes of arbitrary command data
+ */
+ __u8 cmd[0];
+ };
};
/*
@@ -128,6 +138,9 @@ enum {
*/
#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
+#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */
+#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -175,6 +188,7 @@ enum io_uring_op {
IORING_OP_FGETXATTR,
IORING_OP_GETXATTR,
IORING_OP_SOCKET,
+ IORING_OP_URING_CMD,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -251,6 +265,12 @@ struct io_uring_cqe {
__u64 user_data; /* sqe->data submission passed back */
__s32 res; /* result code for this event */
__u32 flags;
+
+ /*
+ * If the ring is initialized with IORING_SETUP_CQE32, then this field
+ * contains 16-bytes of padding, doubling the size of the CQE.
+ */
+ __u64 big_cqe[];
};
/*
diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h
index b2e43185e3b5..2f76cba67166 100644
--- a/include/uapi/linux/nvme_ioctl.h
+++ b/include/uapi/linux/nvme_ioctl.h
@@ -70,6 +70,28 @@ struct nvme_passthru_cmd64 {
__u64 result;
};
+/* same as struct nvme_passthru_cmd64, minus the 8b result field */
+struct nvme_uring_cmd {
+ __u8 opcode;
+ __u8 flags;
+ __u16 rsvd1;
+ __u32 nsid;
+ __u32 cdw2;
+ __u32 cdw3;
+ __u64 metadata;
+ __u64 addr;
+ __u32 metadata_len;
+ __u32 data_len;
+ __u32 cdw10;
+ __u32 cdw11;
+ __u32 cdw12;
+ __u32 cdw13;
+ __u32 cdw14;
+ __u32 cdw15;
+ __u32 timeout_ms;
+ __u32 rsvd2;
+};
+
#define nvme_admin_cmd nvme_passthru_cmd
#define NVME_IOCTL_ID _IO('N', 0x40)
@@ -83,4 +105,10 @@ struct nvme_passthru_cmd64 {
#define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64)
#define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64)
+/* io_uring async commands: */
+#define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd)
+#define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd)
+#define NVME_URING_CMD_ADMIN _IOWR('N', 0x82, struct nvme_uring_cmd)
+#define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd)
+
#endif /* _UAPI_LINUX_NVME_IOCTL_H */