diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-03-30 22:18:49 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-03-30 22:18:49 +0300 |
commit | e59cd88028dbd41472453e5883f78330aa73c56e (patch) | |
tree | 5576a15b6cf3e70df7cf6d0a2d7711b4bc0417ed /include | |
parent | 1592614838cb52f4313ceff64894e2ca78591498 (diff) | |
parent | 3d9932a8b240c9019f48358e8a6928c53c2c7f6b (diff) | |
download | linux-e59cd88028dbd41472453e5883f78330aa73c56e.tar.xz |
Merge tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe:
"Here are the io_uring changes for this merge window. Light on new
features this time around (just splice + buffer selection), lots of
cleanups, fixes, and improvements to existing support. In particular,
this contains:
- Cleanup fixed file update handling for stack fallback (Hillf)
- Re-work of how pollable async IO is handled, we no longer require
thread offload to handle that. Instead we rely using poll to drive
this, with task_work execution.
- In conjunction with the above, allow expendable buffer selection,
so that poll+recv (for example) no longer has to be a split
operation.
- Make sure we honor RLIMIT_FSIZE for buffered writes
- Add support for splice (Pavel)
- Linked work inheritance fixes and optimizations (Pavel)
- Async work fixes and cleanups (Pavel)
- Improve io-wq locking (Pavel)
- Hashed link write improvements (Pavel)
- SETUP_IOPOLL|SETUP_SQPOLL improvements (Xiaoguang)"
* tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block: (54 commits)
io_uring: cleanup io_alloc_async_ctx()
io_uring: fix missing 'return' in comment
io-wq: handle hashed writes in chains
io-uring: drop 'free_pfile' in struct io_file_put
io-uring: drop completion when removing file
io_uring: Fix ->data corruption on re-enqueue
io-wq: close cancel gap for hashed linked work
io_uring: make spdxcheck.py happy
io_uring: honor original task RLIMIT_FSIZE
io-wq: hash dependent work
io-wq: split hashing and enqueueing
io-wq: don't resched if there is no work
io-wq: remove duplicated cancel code
io_uring: fix truncated async read/readv and write/writev retry
io_uring: dual license io_uring.h uapi header
io_uring: io_uring_enter(2) don't poll while SETUP_IOPOLL|SETUP_SQPOLL enabled
io_uring: Fix unused function warnings
io_uring: add end-of-bits marker and build time verify it
io_uring: provide means of removing buffers
io_uring: add IOSQE_BUFFER_SELECT support for IORING_OP_RECVMSG
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/socket.h | 4 | ||||
-rw-r--r-- | include/linux/splice.h | 3 | ||||
-rw-r--r-- | include/net/compat.h | 3 | ||||
-rw-r--r-- | include/trace/events/io_uring.h | 103 | ||||
-rw-r--r-- | include/uapi/linux/io_uring.h | 42 |
5 files changed, 151 insertions, 4 deletions
diff --git a/include/linux/socket.h b/include/linux/socket.h index 15f3412d481e..54338fac45cb 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -391,6 +391,10 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov); +extern int __copy_msghdr_from_user(struct msghdr *kmsg, + struct user_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec __user **uiov, size_t *nsegs); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, diff --git a/include/linux/splice.h b/include/linux/splice.h index 74b4911ac16d..ebbbfea48aa0 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -78,6 +78,9 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *, struct pipe_buffer *); extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); +extern long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags); /* * for dynamic pipe sizing diff --git a/include/net/compat.h b/include/net/compat.h index f277653c7e17..e341260642fe 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -38,6 +38,9 @@ struct compat_cmsghdr { #define compat_mmsghdr mmsghdr #endif /* defined(CONFIG_COMPAT) */ +int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg, + struct sockaddr __user **save_addr, compat_uptr_t *ptr, + compat_size_t *len); int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, struct sockaddr __user **, struct iovec **); struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval); diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 27bd9e4f927b..9f0d3b7d56b0 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -357,6 +357,109 @@ TRACE_EVENT(io_uring_submit_sqe, __entry->force_nonblock, __entry->sq_thread) ); +TRACE_EVENT(io_uring_poll_arm, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events), + + TP_ARGS(ctx, opcode, user_data, mask, events), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + __field( int, events ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + __entry->events = events; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask, __entry->events) +); + +TRACE_EVENT(io_uring_poll_wake, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), + + TP_ARGS(ctx, opcode, user_data, mask), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask) +); + +TRACE_EVENT(io_uring_task_add, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), + + TP_ARGS(ctx, opcode, user_data, mask), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + __field( int, mask ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + __entry->mask = mask; + ), + + TP_printk("ring %p, op %d, data 0x%llx, mask %x", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, + __entry->mask) +); + +TRACE_EVENT(io_uring_task_run, + + TP_PROTO(void *ctx, u8 opcode, u64 user_data), + + TP_ARGS(ctx, opcode, user_data), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u8, opcode ) + __field( u64, user_data ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->user_data = user_data; + ), + + TP_printk("ring %p, op %d, data 0x%llx", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 3f7961c1c243..e48d746b8e2a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ /* * Header file for the io_uring interface. * @@ -23,7 +23,10 @@ struct io_uring_sqe { __u64 off; /* offset into file */ __u64 addr2; }; - __u64 addr; /* pointer to buffer or iovecs */ + union { + __u64 addr; /* pointer to buffer or iovecs */ + __u64 splice_off_in; + }; __u32 len; /* buffer size or number of iovecs */ union { __kernel_rwf_t rw_flags; @@ -37,14 +40,21 @@ struct io_uring_sqe { __u32 open_flags; __u32 statx_flags; __u32 fadvise_advice; + __u32 splice_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { struct { - /* index into fixed buffers, if used */ - __u16 buf_index; + /* pack this to avoid bogus arm OABI complaints */ + union { + /* index into fixed buffers, if used */ + __u16 buf_index; + /* for grouped buffer selection */ + __u16 buf_group; + } __attribute__((packed)); /* personality to use, if used */ __u16 personality; + __s32 splice_fd_in; }; __u64 __pad2[3]; }; @@ -56,6 +66,7 @@ enum { IOSQE_IO_LINK_BIT, IOSQE_IO_HARDLINK_BIT, IOSQE_ASYNC_BIT, + IOSQE_BUFFER_SELECT_BIT, }; /* @@ -71,6 +82,8 @@ enum { #define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT) /* always go async */ #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) +/* select buffer from sqe->buf_group */ +#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT) /* * io_uring_setup() flags @@ -113,6 +126,9 @@ enum { IORING_OP_RECV, IORING_OP_OPENAT2, IORING_OP_EPOLL_CTL, + IORING_OP_SPLICE, + IORING_OP_PROVIDE_BUFFERS, + IORING_OP_REMOVE_BUFFERS, /* this goes last, obviously */ IORING_OP_LAST, @@ -129,6 +145,12 @@ enum { #define IORING_TIMEOUT_ABS (1U << 0) /* + * sqe->splice_flags + * extends splice(2) flags + */ +#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ + +/* * IO completion data structure (Completion Queue Entry) */ struct io_uring_cqe { @@ -138,6 +160,17 @@ struct io_uring_cqe { }; /* + * cqe->flags + * + * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID + */ +#define IORING_CQE_F_BUFFER (1U << 0) + +enum { + IORING_CQE_BUFFER_SHIFT = 16, +}; + +/* * Magic offsets for the application to mmap the data it needs */ #define IORING_OFF_SQ_RING 0ULL @@ -204,6 +237,7 @@ struct io_uring_params { #define IORING_FEAT_SUBMIT_STABLE (1U << 2) #define IORING_FEAT_RW_CUR_POS (1U << 3) #define IORING_FEAT_CUR_PERSONALITY (1U << 4) +#define IORING_FEAT_FAST_POLL (1U << 5) /* * io_uring_register(2) opcodes and arguments |