author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-11 21:35:31 +0300
---|---|---
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-11 21:35:31 +0300
commit | d2c84bdce25a678c1e1f116d65b58790bd241af0 (patch) |
tree | 45499e5ded0bec5bc0ac7e305ee19198374a02c4 |
parent | 0f1a876682f0979d6a1e5f86861dd562d1758936 (diff) |
parent | 606559dc4fa36a954a51fbf1c6c0cc320f551fe0 (diff) |
download | linux-d2c84bdce25a678c1e1f116d65b58790bd241af0.tar.xz |
Merge tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
- Make running of task_work internal loops more fair, and unify how the
different methods deal with them (me)
- Support for per-ring NAPI (a registration sketch follows this list). The
  two minor networking patches are in a shared branch with netdev (Stefan)
- Add support for truncate (Tony)
- Export SQPOLL utilization stats (Xiaobing)
- Multishot fixes (Pavel)
- Fix for a race in manipulating the request flags via poll (Pavel)
- Cleanup the multishot checking by making it generic, moving it out of
opcode handlers (Pavel)
- Various tweaks and cleanups (me, Kunwu, Alexander)
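
As a rough illustration of the per-ring NAPI item above: the pull adds the IORING_REGISTER_NAPI / IORING_UNREGISTER_NAPI opcodes and a struct io_uring_napi to the uapi header (visible in the diff further down), driven through the io_uring_register() syscall. The sketch below is a minimal, hedged example of enabling busy polling on an existing ring; `ring_fd` and the 100 microsecond timeout are illustrative, it assumes 6.9-era uapi headers, and it assumes the opcode expects nr_args == 1 (liburing 2.6+ wraps the same operation as io_uring_register_napi()).

```c
/*
 * Sketch: enable per-ring NAPI busy polling on an already created ring.
 * Assumes ring_fd is a valid io_uring fd, 6.9-era <linux/io_uring.h>
 * headers, and that IORING_REGISTER_NAPI takes one struct io_uring_napi
 * argument (nr_args == 1).
 */
#include <linux/io_uring.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int ring_register(int ring_fd, unsigned opcode, void *arg, unsigned nr_args)
{
	return (int) syscall(__NR_io_uring_register, ring_fd, opcode, arg, nr_args);
}

int enable_ring_napi(int ring_fd)
{
	struct io_uring_napi napi;

	memset(&napi, 0, sizeof(napi));	/* pad and resv must be zero */
	napi.busy_poll_to = 100;	/* busy poll for up to 100 usec */
	napi.prefer_busy_poll = 1;

	if (ring_register(ring_fd, IORING_REGISTER_NAPI, &napi, 1) < 0) {
		perror("IORING_REGISTER_NAPI");
		return -1;
	}
	/* the register handler writes the previous settings back into 'napi' */
	printf("previous busy_poll_to=%u prefer=%u\n",
	       napi.busy_poll_to, napi.prefer_busy_poll);
	return 0;
}
```

Per the io_register_napi() handler added in this series, the kernel copies the ring's previous busy-poll settings back into the passed structure before applying the new ones, which is why the example prints them after the call.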
* tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux: (53 commits)
io_uring: Fix sqpoll utilization check racing with dying sqpoll
io_uring/net: dedup io_recv_finish req completion
io_uring: refactor DEFER_TASKRUN multishot checks
io_uring: fix mshot io-wq checks
io_uring/net: add io_req_msg_cleanup() helper
io_uring/net: simplify msghd->msg_inq checking
io_uring/kbuf: rename REQ_F_PARTIAL_IO to REQ_F_BL_NO_RECYCLE
io_uring/net: remove dependency on REQ_F_PARTIAL_IO for sr->done_io
io_uring/net: correctly handle multishot recvmsg retry setup
io_uring/net: clear REQ_F_BL_EMPTY in the multishot retry handler
io_uring: fix io_queue_proc modifying req->flags
io_uring: fix mshot read defer taskrun cqe posting
io_uring/net: fix overflow check in io_recvmsg_mshot_prep()
io_uring/net: correct the type of variable
io_uring/sqpoll: statistics of the true utilization of sq threads
io_uring/net: move recv/recvmsg flags out of retry loop
io_uring/kbuf: flag request if buffer pool is empty after buffer pick
io_uring/net: improve the usercopy for sendmsg/recvmsg
io_uring/net: move receive multishot out of the generic msghdr path
io_uring/net: unify how recvmsg and sendmsg copy in the msghdr
...
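
To show what the new truncate support (IORING_OP_FTRUNCATE, backed by the do_ftruncate() helper split out of fs/open.c in the diff below) looks like from userspace, here is a hedged sketch. It assumes liburing for ring and SQE management plus 6.9-era headers; filling the SQE via io_uring_prep_rw() with the target length in the off field mirrors what the kernel-side prep for this opcode appears to read, but treat that layout as an assumption rather than documentation.

```c
/*
 * Sketch: truncate an open regular file through io_uring.
 * Assumes liburing and 6.9-era uapi headers that define
 * IORING_OP_FTRUNCATE; the length-in-off SQE layout is an assumption
 * taken from the kernel side of this series.
 */
#include <errno.h>
#include <liburing.h>

static int ring_ftruncate(struct io_uring *ring, int fd, __u64 len)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;

	/* opcode, fd and target length; no buffer is involved */
	io_uring_prep_rw(IORING_OP_FTRUNCATE, sqe, fd, NULL, 0, len);

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;

	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;

	ret = cqe->res;			/* 0 on success, -errno otherwise */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}
```

On kernels that predate the opcode the request should complete with -EINVAL, so a caller can fall back to plain ftruncate(2).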
-rw-r--r-- | fs/internal.h | 1
-rw-r--r-- | fs/open.c | 53
-rw-r--r-- | include/linux/io_uring_types.h | 137
-rw-r--r-- | include/net/busy_poll.h | 4
-rw-r--r-- | include/trace/events/io_uring.h | 30
-rw-r--r-- | include/uapi/linux/io_uring.h | 13
-rw-r--r-- | io_uring/Makefile | 3
-rw-r--r-- | io_uring/cancel.c | 3
-rw-r--r-- | io_uring/cancel.h | 10
-rw-r--r-- | io_uring/fdinfo.c | 18
-rw-r--r-- | io_uring/filetable.h | 2
-rw-r--r-- | io_uring/io_uring.c | 249
-rw-r--r-- | io_uring/io_uring.h | 77
-rw-r--r-- | io_uring/kbuf.c | 35
-rw-r--r-- | io_uring/kbuf.h | 61
-rw-r--r-- | io_uring/napi.c | 332
-rw-r--r-- | io_uring/napi.h | 104
-rw-r--r-- | io_uring/net.c | 382
-rw-r--r-- | io_uring/opdef.c | 10
-rw-r--r-- | io_uring/poll.c | 33
-rw-r--r-- | io_uring/register.c | 13
-rw-r--r-- | io_uring/rsrc.h | 2
-rw-r--r-- | io_uring/rw.c | 13
-rw-r--r-- | io_uring/sqpoll.c | 59
-rw-r--r-- | io_uring/sqpoll.h | 1
-rw-r--r-- | io_uring/truncate.c | 48
-rw-r--r-- | io_uring/truncate.h | 4
-rw-r--r-- | io_uring/uring_cmd.c | 1
-rw-r--r-- | io_uring/xattr.c | 2
-rw-r--r-- | net/core/dev.c | 57
30 files changed, 1253 insertions, 504 deletions
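
The exported SQPOLL utilization statistics surface as two new lines in the ring's fdinfo output, SqTotalTime and SqWorkTime, added by the io_uring/fdinfo.c hunk in the diff that follows. A hedged sketch of consuming them from userspace is below; the two field names come from this pull, while the /proc path handling, the microsecond interpretation of SqWorkTime, and the percentage calculation are assumptions of the example.

```c
/*
 * Sketch: compute SQPOLL thread utilization for a ring fd by reading
 * the new SqTotalTime / SqWorkTime lines from its fdinfo entry.
 * Only the field names come from the fdinfo.c change in this pull;
 * path construction and the ratio interpretation are assumptions.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int print_sqpoll_util(int ring_fd)
{
	char path[64], line[256];
	uint64_t total = 0, work = 0;
	FILE *fp;

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", ring_fd);
	fp = fopen(path, "r");
	if (!fp)
		return -1;

	while (fgets(line, sizeof(line), fp)) {
		/* only the matching line updates each counter */
		sscanf(line, "SqTotalTime:\t%" SCNu64, &total);
		sscanf(line, "SqWorkTime:\t%" SCNu64, &work);
	}
	fclose(fp);

	if (!total)
		return -1;
	printf("sqpoll utilization: %.1f%%\n",
	       100.0 * (double)work / (double)total);
	return 0;
}
```

Note that the same fdinfo change also checks for a dying SQPOLL thread, so both values can legitimately read zero if the thread has already exited.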
diff --git a/fs/internal.h b/fs/internal.h index 7d3edcdf59cc..49c1fcfee4b3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -183,6 +183,7 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); +long do_ftruncate(struct file *file, loff_t length, int small); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, diff --git a/fs/open.c b/fs/open.c index 0a73afe04d34..a7d4bb2c725f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -154,49 +154,52 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +long do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; - struct fd f; int error; - error = -EINVAL; - if (length < 0) - goto out; - error = -EBADF; - f = fdget(fd); - if (!f.file) - goto out; - /* explicitly opened as large or we are on 64-bit box */ - if (f.file->f_flags & O_LARGEFILE) + if (file->f_flags & O_LARGEFILE) small = 0; - dentry = f.file->f_path.dentry; + dentry = file->f_path.dentry; inode = dentry->d_inode; - error = -EINVAL; - if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) - goto out_putf; + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + return -EINVAL; - error = -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) - goto out_putf; + return -EINVAL; - error = -EPERM; /* Check IS_APPEND on real upper inode */ - if (IS_APPEND(file_inode(f.file))) - goto out_putf; + if (IS_APPEND(file_inode(file))) + return -EPERM; sb_start_write(inode->i_sb); - error = security_file_truncate(f.file); + error = security_file_truncate(file); if (!error) - error = do_truncate(file_mnt_idmap(f.file), dentry, length, - ATTR_MTIME | ATTR_CTIME, f.file); + error = do_truncate(file_mnt_idmap(file), dentry, length, + ATTR_MTIME | ATTR_CTIME, file); sb_end_write(inode->i_sb); -out_putf: + + return error; +} + +long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +{ + struct fd f; + int error; + + if (length < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + + error = do_ftruncate(f.file, length, small); + fdput(f); -out: return error; } diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 854ad67a5f70..e24893625085 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -2,6 +2,7 @@ #define IO_URING_TYPES_H #include <linux/blkdev.h> +#include <linux/hashtable.h> #include <linux/task_work.h> #include <linux/bitmap.h> #include <linux/llist.h> @@ -240,12 +241,14 @@ struct io_ring_ctx { unsigned int poll_activated: 1; unsigned int drain_disabled: 1; unsigned int compat: 1; + unsigned int iowq_limits_set : 1; struct task_struct *submitter_task; struct io_rings *rings; struct percpu_ref refs; enum task_work_notify_mode notify_method; + unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; /* submission data */ @@ -274,10 +277,20 @@ struct io_ring_ctx { */ struct io_rsrc_node *rsrc_node; atomic_t cancel_seq; + + /* + * ->iopoll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. 
+ * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. + */ + bool poll_multi_queue; + struct io_wq_work_list iopoll_list; + struct io_file_table file_table; + struct io_mapped_ubuf **user_bufs; unsigned nr_user_files; unsigned nr_user_bufs; - struct io_mapped_ubuf **user_bufs; struct io_submit_state submit_state; @@ -289,15 +302,6 @@ struct io_ring_ctx { struct io_alloc_cache netmsg_cache; /* - * ->iopoll_list is protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. - */ - struct io_wq_work_list iopoll_list; - bool poll_multi_queue; - - /* * Any cancelable uring_cmd is added to this list in * ->uring_cmd() by io_uring_cmd_insert_cancelable() */ @@ -343,8 +347,8 @@ struct io_ring_ctx { spinlock_t completion_lock; /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; + struct io_wq_work_list locked_free_list; struct list_head io_buffers_comp; struct list_head cq_overflow_list; @@ -366,9 +370,6 @@ struct io_ring_ctx { unsigned int file_alloc_start; unsigned int file_alloc_end; - struct xarray personalities; - u32 pers_next; - struct list_head io_buffers_cache; /* deferred free list, protected by ->uring_lock */ @@ -389,6 +390,9 @@ struct io_ring_ctx { struct wait_queue_head rsrc_quiesce_wq; unsigned rsrc_quiesce; + u32 pers_next; + struct xarray personalities; + /* hashed buffered write serialization */ struct io_wq_hash *hash_map; @@ -405,11 +409,22 @@ struct io_ring_ctx { /* io-wq management, e.g. thread count */ u32 iowq_limits[2]; - bool iowq_limits_set; struct callback_head poll_wq_task_work; struct list_head defer_list; - unsigned sq_thread_idle; + +#ifdef CONFIG_NET_RX_BUSY_POLL + struct list_head napi_list; /* track busy poll napi_id */ + spinlock_t napi_lock; /* napi_list lock */ + + /* napi busy poll default timeout */ + unsigned int napi_busy_poll_to; + bool napi_prefer_busy_poll; + bool napi_enabled; + + DECLARE_HASHTABLE(napi_ht, 4); +#endif + /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; @@ -455,7 +470,6 @@ enum { REQ_F_SKIP_LINK_CQES_BIT, REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, - REQ_F_PARTIAL_IO_BIT, REQ_F_APOLL_MULTISHOT_BIT, REQ_F_CLEAR_POLLIN_BIT, REQ_F_HASH_LOCKED_BIT, @@ -463,75 +477,88 @@ enum { REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, REQ_F_POLL_NO_LAZY_BIT, + REQ_F_CANCEL_SEQ_BIT, + REQ_F_CAN_POLL_BIT, + REQ_F_BL_EMPTY_BIT, + REQ_F_BL_NO_RECYCLE_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, }; +typedef u64 __bitwise io_req_flags_t; +#define IO_REQ_FLAG(bitno) ((__force io_req_flags_t) BIT_ULL((bitno))) + enum { /* ctx owns file */ - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + REQ_F_FIXED_FILE = IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT), /* drain existing IO first */ - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + REQ_F_IO_DRAIN = IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT), /* linked sqes */ - REQ_F_LINK = BIT(REQ_F_LINK_BIT), + REQ_F_LINK = IO_REQ_FLAG(REQ_F_LINK_BIT), /* doesn't sever on completion < 0 */ - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + REQ_F_HARDLINK = IO_REQ_FLAG(REQ_F_HARDLINK_BIT), /* IOSQE_ASYNC */ - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + REQ_F_FORCE_ASYNC = IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT), /* IOSQE_BUFFER_SELECT */ - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), + 
REQ_F_BUFFER_SELECT = IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT), /* IOSQE_CQE_SKIP_SUCCESS */ - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), + REQ_F_CQE_SKIP = IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT), /* fail rest of links */ - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), + REQ_F_FAIL = IO_REQ_FLAG(REQ_F_FAIL_BIT), /* on inflight list, should be cancelled and waited on exit reliably */ - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + REQ_F_INFLIGHT = IO_REQ_FLAG(REQ_F_INFLIGHT_BIT), /* read/write uses file position */ - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + REQ_F_CUR_POS = IO_REQ_FLAG(REQ_F_CUR_POS_BIT), /* must not punt to workers */ - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + REQ_F_NOWAIT = IO_REQ_FLAG(REQ_F_NOWAIT_BIT), /* has or had linked timeout */ - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + REQ_F_LINK_TIMEOUT = IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT), /* needs cleanup */ - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), + REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT), /* already went through poll handler */ - REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT), /* buffer already selected */ - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT), /* buffer selected from ring, needs commit */ - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), + REQ_F_BUFFER_RING = IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT), /* caller should reissue async */ - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), + REQ_F_REISSUE = IO_REQ_FLAG(REQ_F_REISSUE_BIT), /* supports async reads/writes */ - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), + REQ_F_SUPPORT_NOWAIT = IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT), /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + REQ_F_ISREG = IO_REQ_FLAG(REQ_F_ISREG_BIT), /* has creds assigned */ - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + REQ_F_CREDS = IO_REQ_FLAG(REQ_F_CREDS_BIT), /* skip refcounting if not set */ - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + REQ_F_REFCOUNT = IO_REQ_FLAG(REQ_F_REFCOUNT_BIT), /* there is a linked timeout that has to be armed */ - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + REQ_F_ARM_LTIMEOUT = IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT), /* ->async_data allocated */ - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), + REQ_F_ASYNC_DATA = IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT), /* don't post CQEs while failing linked requests */ - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + REQ_F_SKIP_LINK_CQES = IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT), /* single poll may be active */ - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT), /* double poll may active */ - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), - /* request has already done partial IO */ - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT), /* fast poll multishot mode */ - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), + REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT), /* recvmsg special flag, clear EPOLLIN */ - REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), + REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT), /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ - REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), + REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), /* don't use lazy poll wake for this request */ - REQ_F_POLL_NO_LAZY = BIT(REQ_F_POLL_NO_LAZY_BIT), + REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), + /* cancel sequence is set and 
valid */ + REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT), + /* file is pollable */ + REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT), + /* buffer list was empty after selection of buffer */ + REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT), + /* don't recycle provided buffers for this request */ + REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); @@ -592,15 +619,17 @@ struct io_kiocb { * and after selection it points to the buffer ID itself. */ u16 buf_index; - unsigned int flags; + + unsigned nr_tw; + + /* REQ_F_* flags */ + io_req_flags_t flags; struct io_cqe cqe; struct io_ring_ctx *ctx; struct task_struct *task; - struct io_rsrc_node *rsrc_node; - union { /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; @@ -621,10 +650,12 @@ struct io_kiocb { /* cache ->apoll->events */ __poll_t apoll_events; }; + + struct io_rsrc_node *rsrc_node; + atomic_t refs; atomic_t poll_refs; struct io_task_work io_task_work; - unsigned nr_tw; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; /* internal polling, see IORING_FEAT_FAST_POLL */ diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 4dabeb6c76d3..9b09acac538e 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -48,6 +48,10 @@ void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget); + #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 69454f1f98b0..e948df7ce625 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -148,7 +148,7 @@ TRACE_EVENT(io_uring_queue_async_work, __field( void *, req ) __field( u64, user_data ) __field( u8, opcode ) - __field( unsigned int, flags ) + __field( unsigned long long, flags ) __field( struct io_wq_work *, work ) __field( int, rw ) @@ -159,7 +159,7 @@ TRACE_EVENT(io_uring_queue_async_work, __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; - __entry->flags = req->flags; + __entry->flags = (__force unsigned long long) req->flags; __entry->opcode = req->opcode; __entry->work = &req->work; __entry->rw = rw; @@ -167,10 +167,10 @@ TRACE_EVENT(io_uring_queue_async_work, __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), - TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", + TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p", __entry->ctx, __entry->req, __entry->user_data, - __get_str(op_str), - __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) + __get_str(op_str), __entry->flags, + __entry->rw ? 
"hashed" : "normal", __entry->work) ); /** @@ -378,7 +378,7 @@ TRACE_EVENT(io_uring_submit_req, __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) - __field( u32, flags ) + __field( unsigned long long, flags ) __field( bool, sq_thread ) __string( op_str, io_uring_get_opcode(req->opcode) ) @@ -389,16 +389,16 @@ TRACE_EVENT(io_uring_submit_req, __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; - __entry->flags = req->flags; + __entry->flags = (__force unsigned long long) req->flags; __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), - TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " + TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, " "sq_thread %d", __entry->ctx, __entry->req, - __entry->user_data, __get_str(op_str), - __entry->flags, __entry->sq_thread) + __entry->user_data, __get_str(op_str), __entry->flags, + __entry->sq_thread) ); /* @@ -602,29 +602,25 @@ TRACE_EVENT(io_uring_cqe_overflow, * * @tctx: pointer to a io_uring_task * @count: how many functions it ran - * @loops: how many loops it ran * */ TRACE_EVENT(io_uring_task_work_run, - TP_PROTO(void *tctx, unsigned int count, unsigned int loops), + TP_PROTO(void *tctx, unsigned int count), - TP_ARGS(tctx, count, loops), + TP_ARGS(tctx, count), TP_STRUCT__entry ( __field( void *, tctx ) __field( unsigned int, count ) - __field( unsigned int, loops ) ), TP_fast_assign( __entry->tctx = tctx; __entry->count = count; - __entry->loops = loops; ), - TP_printk("tctx %p, count %u, loops %u", - __entry->tctx, __entry->count, __entry->loops) + TP_printk("tctx %p, count %u", __entry->tctx, __entry->count) ); TRACE_EVENT(io_uring_short_write, diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7a673b52827b..7bd10201a02b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -255,6 +255,7 @@ enum io_uring_op { IORING_OP_FUTEX_WAKE, IORING_OP_FUTEX_WAITV, IORING_OP_FIXED_FD_INSTALL, + IORING_OP_FTRUNCATE, /* this goes last, obviously */ IORING_OP_LAST, @@ -570,6 +571,10 @@ enum { /* return status information for a buffer group */ IORING_REGISTER_PBUF_STATUS = 26, + /* set/clear busy poll settings */ + IORING_REGISTER_NAPI = 27, + IORING_UNREGISTER_NAPI = 28, + /* this goes last */ IORING_REGISTER_LAST, @@ -703,6 +708,14 @@ struct io_uring_buf_status { __u32 resv[8]; }; +/* argument for IORING_(UN)REGISTER_NAPI */ +struct io_uring_napi { + __u32 busy_poll_to; + __u8 prefer_busy_poll; + __u8 pad[3]; + __u64 resv; +}; + /* * io_uring_restriction->opcode values */ diff --git a/io_uring/Makefile b/io_uring/Makefile index 2cdc51825405..2e1d4e03799c 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ cancel.o kbuf.o rsrc.o rw.o opdef.o \ - notif.o waitid.o register.o + notif.o waitid.o register.o truncate.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o +obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 8a8b07dfc444..acfcdd7f059a 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -58,9 +58,8 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) return false; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { check_seq: - if (cd->seq == req->work.cancel_seq) + if 
(io_cancel_match_sequence(req, cd->seq)) return false; - req->work.cancel_seq = cd->seq; } return true; diff --git a/io_uring/cancel.h b/io_uring/cancel.h index c0a8e7c520b6..76b32e65c03c 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -25,4 +25,14 @@ void init_hash_table(struct io_hash_table *table, unsigned size); int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); +static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) +{ + if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq) + return true; + + req->flags |= REQ_F_CANCEL_SEQ; + req->work.cancel_seq = sequence; + return false; +} + #endif diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 976e9500f651..8d444dd1b0a7 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -55,6 +55,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) struct io_ring_ctx *ctx = f->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; + struct rusage sq_usage; unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; unsigned int sq_head = READ_ONCE(r->sq.head); unsigned int sq_tail = READ_ONCE(r->sq.tail); @@ -64,6 +65,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) unsigned int sq_shift = 0; unsigned int sq_entries, cq_entries; int sq_pid = -1, sq_cpu = -1; + u64 sq_total_time = 0, sq_work_time = 0; bool has_lock; unsigned int i; @@ -145,12 +147,24 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { struct io_sq_data *sq = ctx->sq_data; - sq_pid = sq->task_pid; - sq_cpu = sq->sq_cpu; + /* + * sq->thread might be NULL if we raced with the sqpoll + * thread termination. 
+ */ + if (sq->thread) { + sq_pid = sq->task_pid; + sq_cpu = sq->sq_cpu; + getrusage(sq->thread, RUSAGE_SELF, &sq_usage); + sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000 + + sq_usage.ru_stime.tv_usec); + sq_work_time = sq->work_time; + } } seq_printf(m, "SqThread:\t%d\n", sq_pid); seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu); + seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); + seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); for (i = 0; has_lock && i < ctx->nr_user_files; i++) { struct file *f = io_file_from_index(&ctx->file_table, i); diff --git a/io_uring/filetable.h b/io_uring/filetable.h index b47adf170c31..b2435c4dca1f 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -17,7 +17,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); int io_register_file_alloc_range(struct io_ring_ctx *ctx, struct io_uring_file_index_range __user *arg); -unsigned int io_file_get_flags(struct file *file); +io_req_flags_t io_file_get_flags(struct file *file); static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cd9a137ad6ce..cf348c33f485 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -59,7 +59,6 @@ #include <linux/bvec.h> #include <linux/net.h> #include <net/sock.h> -#include <net/af_unix.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> @@ -95,6 +94,7 @@ #include "notif.h" #include "waitid.h" #include "futex.h" +#include "napi.h" #include "timeout.h" #include "poll.h" @@ -122,11 +122,6 @@ #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 -enum { - IO_CHECK_CQ_OVERFLOW_BIT, - IO_CHECK_CQ_DROPPED_BIT, -}; - struct io_defer_entry { struct list_head list; struct io_kiocb *req; @@ -349,6 +344,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); + io_napi_init(ctx); + return ctx; err: kfree(ctx->cancel_table.hbs); @@ -463,7 +460,6 @@ static void io_prep_async_work(struct io_kiocb *req) req->work.list.next = NULL; req->work.flags = 0; - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -670,7 +666,6 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) io_commit_cqring_flush(ctx); } -/* Returns true if there are no backlogged entries after the flush */ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) { struct io_overflow_cqe *ocqe; @@ -949,6 +944,8 @@ bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; + lockdep_assert(!io_wq_current_is_worker()); + if (!defer) return __io_post_aux_cqe(ctx, user_data, res, cflags, false); @@ -1025,15 +1022,15 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { - if (req->ctx->task_complete && req->ctx->submitter_task != current) { + struct io_ring_ctx *ctx = req->ctx; + + if (ctx->task_complete && ctx->submitter_task != current) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } else if (!(issue_flags & IO_URING_F_UNLOCKED) || - !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + !(ctx->flags & IORING_SETUP_IOPOLL)) { __io_req_complete_post(req, 
issue_flags); } else { - struct io_ring_ctx *ctx = req->ctx; - mutex_lock(&ctx->uring_lock); __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); mutex_unlock(&ctx->uring_lock); @@ -1174,40 +1171,44 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) percpu_ref_put(&ctx->refs); } -static unsigned int handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, - struct io_tw_state *ts, - struct llist_node *last) +/* + * Run queued task_work, returning the number of entries processed in *count. + * If more entries than max_entries are available, stop processing once this + * is reached and return the rest of the list. + */ +struct llist_node *io_handle_tw_list(struct llist_node *node, + unsigned int *count, + unsigned int max_entries) { - unsigned int count = 0; + struct io_ring_ctx *ctx = NULL; + struct io_tw_state ts = { }; - while (node && node != last) { + do { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - - if (req->ctx != *ctx) { - ctx_flush_and_put(*ctx, ts); - *ctx = req->ctx; + if (req->ctx != ctx) { + ctx_flush_and_put(ctx, &ts); + ctx = req->ctx; /* if not contended, grab and improve batching */ - ts->locked = mutex_trylock(&(*ctx)->uring_lock); - percpu_ref_get(&(*ctx)->refs); + ts.locked = mutex_trylock(&ctx->uring_lock); + percpu_ref_get(&ctx->refs); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, ts); + req, &ts); node = next; - count++; + (*count)++; if (unlikely(need_resched())) { - ctx_flush_and_put(*ctx, ts); - *ctx = NULL; + ctx_flush_and_put(ctx, &ts); + ctx = NULL; cond_resched(); } - } + } while (node && *count < max_entries); - return count; + ctx_flush_and_put(ctx, &ts); + return node; } /** @@ -1224,22 +1225,6 @@ static inline struct llist_node *io_llist_xchg(struct llist_head *head, return xchg(&head->first, new); } -/** - * io_llist_cmpxchg - possibly swap all entries in a lock-less list - * @head: the head of lock-less list to delete all entries - * @old: expected old value of the first entry of the list - * @new: new entry as the head of the list - * - * perform a cmpxchg on the first entry of the list. 
- */ - -static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, - struct llist_node *old, - struct llist_node *new) -{ - return cmpxchg(&head->first, old, new); -} - static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) { struct llist_node *node = llist_del_all(&tctx->task_list); @@ -1268,45 +1253,41 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) } } -void tctx_task_work(struct callback_head *cb) +struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, + unsigned int max_entries, + unsigned int *count) { - struct io_tw_state ts = {}; - struct io_ring_ctx *ctx = NULL; - struct io_uring_task *tctx = container_of(cb, struct io_uring_task, - task_work); - struct llist_node fake = {}; struct llist_node *node; - unsigned int loops = 0; - unsigned int count = 0; if (unlikely(current->flags & PF_EXITING)) { io_fallback_tw(tctx, true); - return; + return NULL; } - do { - loops++; - node = io_llist_xchg(&tctx->task_list, &fake); - count += handle_tw_list(node, &ctx, &ts, &fake); - - /* skip expensive cmpxchg if there are items in the list */ - if (READ_ONCE(tctx->task_list.first) != &fake) - continue; - if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { - io_submit_flush_completions(ctx); - if (READ_ONCE(tctx->task_list.first) != &fake) - continue; - } - node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - } while (node != &fake); - - ctx_flush_and_put(ctx, &ts); + node = llist_del_all(&tctx->task_list); + if (node) { + node = llist_reverse_order(node); + node = io_handle_tw_list(node, count, max_entries); + } /* relaxed read is enough as only the task itself sets ->in_cancel */ if (unlikely(atomic_read(&tctx->in_cancel))) io_uring_drop_tctx_refs(current); - trace_io_uring_task_work_run(tctx, count, loops); + trace_io_uring_task_work_run(tctx, *count); + return node; +} + +void tctx_task_work(struct callback_head *cb) +{ + struct io_uring_task *tctx; + struct llist_node *ret; + unsigned int count = 0; + + tctx = container_of(cb, struct io_uring_task, task_work); + ret = tctx_task_work_run(tctx, UINT_MAX, &count); + /* can't happen */ + WARN_ON_ONCE(ret); } static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) @@ -1389,6 +1370,15 @@ static void io_req_normal_work_add(struct io_kiocb *req) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + /* SQPOLL doesn't need the task_work added, it'll run it itself */ + if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_sq_data *sqd = ctx->sq_data; + + if (wq_has_sleeper(&sqd->wait)) + wake_up(&sqd->wait); + return; + } + if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; @@ -1420,7 +1410,20 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) } } -static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, + int min_events) +{ + if (llist_empty(&ctx->work_llist)) + return false; + if (events < min_events) + return true; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + return false; +} + +static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, + int min_events) { struct llist_node *node; unsigned int loops = 0; @@ -1440,7 +1443,6 @@ again: struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); - 
prefetch(container_of(next, struct io_kiocb, io_task_work.node)); INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, req, ts); @@ -1449,18 +1451,20 @@ again: } loops++; - if (!llist_empty(&ctx->work_llist)) + if (io_run_local_work_continue(ctx, ret, min_events)) goto again; if (ts->locked) { io_submit_flush_completions(ctx); - if (!llist_empty(&ctx->work_llist)) + if (io_run_local_work_continue(ctx, ret, min_events)) goto again; } + trace_io_uring_local_work_run(ctx, ret, loops); return ret; } -static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) +static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, + int min_events) { struct io_tw_state ts = { .locked = true, }; int ret; @@ -1468,20 +1472,20 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) if (llist_empty(&ctx->work_llist)) return 0; - ret = __io_run_local_work(ctx, &ts); + ret = __io_run_local_work(ctx, &ts, min_events); /* shouldn't happen! */ if (WARN_ON_ONCE(!ts.locked)) mutex_lock(&ctx->uring_lock); return ret; } -static int io_run_local_work(struct io_ring_ctx *ctx) +static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) { struct io_tw_state ts = {}; int ret; ts.locked = mutex_trylock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &ts); + ret = __io_run_local_work(ctx, &ts, min_events); if (ts.locked) mutex_unlock(&ctx->uring_lock); @@ -1677,7 +1681,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; - (void) io_run_local_work_locked(ctx); + (void) io_run_local_work_locked(ctx, min); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { @@ -1768,9 +1772,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -unsigned int io_file_get_flags(struct file *file) +io_req_flags_t io_file_get_flags(struct file *file) { - unsigned int res = 0; + io_req_flags_t res = 0; if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; @@ -1966,10 +1970,28 @@ fail: goto fail; } + /* + * If DEFER_TASKRUN is set, it's only allowed to post CQEs from the + * submitter task context. Final request completions are handed to the + * right context, however this is not the case of auxiliary CQEs, + * which is the main mean of operation for multishot requests. + * Don't allow any multishot execution from io-wq. It's more restrictive + * than necessary and also cleaner. 
+ */ + if (req->flags & REQ_F_APOLL_MULTISHOT) { + err = -EBADFD; + if (!io_file_can_poll(req)) + goto fail; + err = -ECANCELED; + if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK) + goto fail; + return; + } + if (req->flags & REQ_F_FORCE_ASYNC) { bool opcode_poll = def->pollin || def->pollout; - if (opcode_poll && file_can_poll(req->file)) { + if (opcode_poll && io_file_can_poll(req)) { needs_poll = true; issue_flags |= IO_URING_F_NONBLOCK; } @@ -2171,7 +2193,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* req is partially pre-initialised, see io_preinit_req() */ req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags = sqe_flags = READ_ONCE(sqe->flags); + sqe_flags = READ_ONCE(sqe->flags); + req->flags = (io_req_flags_t) sqe_flags; req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->rsrc_node = NULL; @@ -2475,33 +2498,6 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) return ret; } -struct io_wait_queue { - struct wait_queue_entry wq; - struct io_ring_ctx *ctx; - unsigned cq_tail; - unsigned nr_timeouts; - ktime_t timeout; -}; - -static inline bool io_has_work(struct io_ring_ctx *ctx) -{ - return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || - !llist_empty(&ctx->work_llist); -} - -static inline bool io_should_wake(struct io_wait_queue *iowq) -{ - struct io_ring_ctx *ctx = iowq->ctx; - int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; - - /* - * Wake up if we have enough events, or if a timeout occurred since we - * started waiting. For timeouts, we always want to return to userspace, - * regardless of event count. - */ - return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; -} - static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { @@ -2520,7 +2516,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) { if (!llist_empty(&ctx->work_llist)) { __set_current_state(TASK_RUNNING); - if (io_run_local_work(ctx) > 0) + if (io_run_local_work(ctx, INT_MAX) > 0) return 0; } if (io_run_task_work() > 0) @@ -2588,7 +2584,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (!io_allowed_run_tw(ctx)) return -EEXIST; if (!llist_empty(&ctx->work_llist)) - io_run_local_work(ctx); + io_run_local_work(ctx, min_events); io_run_task_work(); io_cqring_overflow_flush(ctx); /* if user messes with these they will just get an early return */ @@ -2621,16 +2617,19 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (get_timespec64(&ts, uts)) return -EFAULT; + iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + io_napi_adjust_timeout(ctx, &iowq, &ts); } + io_napi_busy_loop(ctx, &iowq); + trace_io_uring_cqring_wait(ctx, min_events); do { + int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); unsigned long check_cq; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); - atomic_set(&ctx->cq_wait_nr, nr_wait); set_current_state(TASK_INTERRUPTIBLE); } else { @@ -2649,7 +2648,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, */ io_run_task_work(); if (!llist_empty(&ctx->work_llist)) - io_run_local_work(ctx); + io_run_local_work(ctx, nr_wait); /* * Non-local task_work will be run on exit to userspace, but @@ -2917,6 +2916,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_req_caches_free(ctx); if (ctx->hash_map) 
io_wq_put_hash(ctx->hash_map); + io_napi_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); @@ -3304,7 +3304,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) - ret |= io_run_local_work(ctx) > 0; + ret |= io_run_local_work(ctx, INT_MAX) > 0; ret |= io_cancel_defer_files(ctx, task, cancel_all); mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, task, cancel_all); @@ -3666,7 +3666,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, * it should handle ownership problems if any. */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) - (void)io_run_local_work_locked(ctx); + (void)io_run_local_work_locked(ctx, min_complete); } mutex_unlock(&ctx->uring_lock); } @@ -4153,7 +4153,7 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); - BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); + BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof_field(struct io_kiocb, flags)); BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); @@ -4175,9 +4175,8 @@ static int __init io_uring_init(void) SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, offsetof(struct io_kiocb, cmd.data), sizeof_field(struct io_kiocb, cmd.data), NULL); - io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, - NULL); + io_buf_cachep = KMEM_CACHE(io_buffer, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index d5495710c178..6426ee382276 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,6 +5,7 @@ #include <linux/lockdep.h> #include <linux/resume_user_mode.h> #include <linux/kasan.h> +#include <linux/poll.h> #include <linux/io_uring_types.h> #include <uapi/linux/eventpoll.h> #include "io-wq.h" @@ -34,6 +35,32 @@ enum { IOU_STOP_MULTISHOT = -ECANCELED, }; +struct io_wait_queue { + struct wait_queue_entry wq; + struct io_ring_ctx *ctx; + unsigned cq_tail; + unsigned nr_timeouts; + ktime_t timeout; + +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_busy_poll_to; + bool napi_prefer_busy_poll; +#endif +}; + +static inline bool io_should_wake(struct io_wait_queue *iowq) +{ + struct io_ring_ctx *ctx = iowq->ctx; + int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; + + /* + * Wake up if we have enough events, or if a timeout occurred since we + * started waiting. For timeouts, we always want to return to userspace, + * regardless of event count. 
+ */ + return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; +} + bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); void io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); @@ -56,6 +83,8 @@ void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); +struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); +struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, @@ -207,7 +236,7 @@ static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) { lockdep_assert_held(&ctx->uring_lock); - if (issue_flags & IO_URING_F_UNLOCKED) + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) mutex_unlock(&ctx->uring_lock); } @@ -220,7 +249,7 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, * The only exception is when we've detached the request and issue it * from an async worker thread, grab the lock for that case. */ - if (issue_flags & IO_URING_F_UNLOCKED) + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) mutex_lock(&ctx->uring_lock); lockdep_assert_held(&ctx->uring_lock); } @@ -274,6 +303,8 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline int io_run_task_work(void) { + bool ret = false; + /* * Always check-and-clear the task_work notification signal. With how * signaling works for task_work, we can find it set with nothing to @@ -285,18 +316,26 @@ static inline int io_run_task_work(void) * PF_IO_WORKER never returns to userspace, so check here if we have * notify work that needs processing. 
*/ - if (current->flags & PF_IO_WORKER && - test_thread_flag(TIF_NOTIFY_RESUME)) { - __set_current_state(TASK_RUNNING); - resume_user_mode_work(NULL); + if (current->flags & PF_IO_WORKER) { + if (test_thread_flag(TIF_NOTIFY_RESUME)) { + __set_current_state(TASK_RUNNING); + resume_user_mode_work(NULL); + } + if (current->io_uring) { + unsigned int count = 0; + + tctx_task_work_run(current->io_uring, UINT_MAX, &count); + if (count) + ret = true; + } } if (task_work_pending(current)) { __set_current_state(TASK_RUNNING); task_work_run(); - return 1; + ret = true; } - return 0; + return ret; } static inline bool io_task_work_pending(struct io_ring_ctx *ctx) @@ -398,4 +437,26 @@ static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) return 2 * sizeof(struct io_uring_sqe); return sizeof(struct io_uring_sqe); } + +static inline bool io_file_can_poll(struct io_kiocb *req) +{ + if (req->flags & REQ_F_CAN_POLL) + return true; + if (file_can_poll(req->file)) { + req->flags |= REQ_F_CAN_POLL; + return true; + } + return false; +} + +enum { + IO_CHECK_CQ_OVERFLOW_BIT, + IO_CHECK_CQ_DROPPED_BIT, +}; + +static inline bool io_has_work(struct io_ring_ctx *ctx) +{ + return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || + !llist_empty(&ctx->work_llist); +} #endif diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 18df5a9d2f5e..9be42bff936b 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -81,15 +81,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) struct io_buffer_list *bl; struct io_buffer *buf; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if (req->flags & REQ_F_PARTIAL_IO) - return false; - io_ring_submit_lock(ctx, issue_flags); buf = req->kbuf; @@ -102,10 +93,8 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) return true; } -unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) +void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { - unsigned int cflags; - /* * We can add this buffer back to two lists: * @@ -118,21 +107,17 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) * We migrate buffers from the comp_list to the issue cache list * when we need one. 
*/ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf_list(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { + if (issue_flags & IO_URING_F_UNLOCKED) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp); + __io_put_kbuf_list(req, &ctx->io_buffers_comp); spin_unlock(&ctx->completion_lock); } else { lockdep_assert_held(&req->ctx->uring_lock); - cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); + __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); } - return cflags; } static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, @@ -145,6 +130,8 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, list_del(&kbuf->list); if (*len == 0 || *len > kbuf->len) *len = kbuf->len; + if (list_empty(&bl->buf_list)) + req->flags |= REQ_F_BL_EMPTY; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; req->buf_index = kbuf->bid; @@ -158,12 +145,16 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags) { struct io_uring_buf_ring *br = bl->buf_ring; + __u16 tail, head = bl->head; struct io_uring_buf *buf; - __u16 head = bl->head; - if (unlikely(smp_load_acquire(&br->tail) == head)) + tail = smp_load_acquire(&br->tail); + if (unlikely(tail == head)) return NULL; + if (head + 1 == tail) + req->flags |= REQ_F_BL_EMPTY; + head &= bl->mask; /* mmaped buffers are always contig */ if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { @@ -180,7 +171,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->buf_list = bl; req->buf_index = buf->bid; - if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { + if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { /* * If we came in unlocked, we have no choice but to consume the * buffer here, otherwise nothing ensures that the buffer won't diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 53dfaa71a397..5218bfd79e87 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -57,7 +57,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); -unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); +void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); @@ -73,21 +73,9 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) * to monopolize the buffer. */ if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - /* - * If we end up here, then the io_uring_lock has - * been kept held since we retrieved the buffer. - * For the io-wq case, we already cleared - * req->buf_list when the buffer was retrieved, - * hence it cannot be set here for that case. 
- */ - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - return true; - } + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + return true; } return false; } @@ -101,6 +89,8 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { + if (req->flags & REQ_F_BL_NO_RECYCLE) + return false; if (req->flags & REQ_F_BUFFER_SELECTED) return io_kbuf_recycle_legacy(req, issue_flags); if (req->flags & REQ_F_BUFFER_RING) @@ -108,41 +98,54 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return false; } -static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, - struct list_head *list) +static inline void __io_put_kbuf_ring(struct io_kiocb *req) { - unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + if (req->buf_list) { + req->buf_index = req->buf_list->bgid; + req->buf_list->head++; + } + req->flags &= ~REQ_F_BUFFER_RING; +} +static inline void __io_put_kbuf_list(struct io_kiocb *req, + struct list_head *list) +{ if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - req->buf_index = req->buf_list->bgid; - req->buf_list->head++; - } - req->flags &= ~REQ_F_BUFFER_RING; + __io_put_kbuf_ring(req); } else { req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); req->flags &= ~REQ_F_BUFFER_SELECTED; } - - return ret; } static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { + unsigned int ret; + lockdep_assert_held(&req->ctx->completion_lock); if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; - return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); + + ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); + return ret; } static inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { + unsigned int ret; - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) return 0; - return __io_put_kbuf(req, issue_flags); + + ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + if (req->flags & REQ_F_BUFFER_RING) + __io_put_kbuf_ring(req); + else + __io_put_kbuf(req, issue_flags); + return ret; } #endif diff --git a/io_uring/napi.c b/io_uring/napi.c new file mode 100644 index 000000000000..883a1a665907 --- /dev/null +++ b/io_uring/napi.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "io_uring.h" +#include "napi.h" + +#ifdef CONFIG_NET_RX_BUSY_POLL + +/* Timeout for cleanout of stale entries. */ +#define NAPI_TIMEOUT (60 * SEC_CONVERSION) + +struct io_napi_entry { + unsigned int napi_id; + struct list_head list; + + unsigned long timeout; + struct hlist_node node; + + struct rcu_head rcu; +}; + +static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, + unsigned int napi_id) +{ + struct io_napi_entry *e; + + hlist_for_each_entry_rcu(e, hash_list, node) { + if (e->napi_id != napi_id) + continue; + e->timeout = jiffies + NAPI_TIMEOUT; + return e; + } + + return NULL; +} + +void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) +{ + struct hlist_head *hash_list; + unsigned int napi_id; + struct sock *sk; + struct io_napi_entry *e; + + sk = sock->sk; + if (!sk) + return; + + napi_id = READ_ONCE(sk->sk_napi_id); + + /* Non-NAPI IDs can be rejected. 
*/ + if (napi_id < MIN_NAPI_ID) + return; + + hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; + + rcu_read_lock(); + e = io_napi_hash_find(hash_list, napi_id); + if (e) { + e->timeout = jiffies + NAPI_TIMEOUT; + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + e = kmalloc(sizeof(*e), GFP_NOWAIT); + if (!e) + return; + + e->napi_id = napi_id; + e->timeout = jiffies + NAPI_TIMEOUT; + + spin_lock(&ctx->napi_lock); + if (unlikely(io_napi_hash_find(hash_list, napi_id))) { + spin_unlock(&ctx->napi_lock); + kfree(e); + return; + } + + hlist_add_tail_rcu(&e->node, hash_list); + list_add_tail(&e->list, &ctx->napi_list); + spin_unlock(&ctx->napi_lock); +} + +static void __io_napi_remove_stale(struct io_ring_ctx *ctx) +{ + struct io_napi_entry *e; + unsigned int i; + + spin_lock(&ctx->napi_lock); + hash_for_each(ctx->napi_ht, i, e, node) { + if (time_after(jiffies, e->timeout)) { + list_del(&e->list); + hash_del_rcu(&e->node); + kfree_rcu(e, rcu); + } + } + spin_unlock(&ctx->napi_lock); +} + +static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) +{ + if (is_stale) + __io_napi_remove_stale(ctx); +} + +static inline bool io_napi_busy_loop_timeout(unsigned long start_time, + unsigned long bp_usec) +{ + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); + + return time_after(now, end_time); + } + + return true; +} + +static bool io_napi_busy_loop_should_end(void *data, + unsigned long start_time) +{ + struct io_wait_queue *iowq = data; + + if (signal_pending(current)) + return true; + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) + return true; + if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to)) + return true; + + return false; +} + +static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, + void *loop_end_arg) +{ + struct io_napi_entry *e; + bool (*loop_end)(void *, unsigned long) = NULL; + bool is_stale = false; + + if (loop_end_arg) + loop_end = io_napi_busy_loop_should_end; + + list_for_each_entry_rcu(e, &ctx->napi_list, list) { + napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, + ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); + + if (time_after(jiffies, e->timeout)) + is_stale = true; + } + + return is_stale; +} + +static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) +{ + unsigned long start_time = busy_loop_current_time(); + void *loop_end_arg = NULL; + bool is_stale = false; + + /* Singular lists use a different napi loop end check function and are + * only executed once. + */ + if (list_is_singular(&ctx->napi_list)) + loop_end_arg = iowq; + + rcu_read_lock(); + do { + is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg); + } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg); + rcu_read_unlock(); + + io_napi_remove_stale(ctx, is_stale); +} + +/* + * io_napi_init() - Init napi settings + * @ctx: pointer to io-uring context structure + * + * Init napi settings in the io-uring context. + */ +void io_napi_init(struct io_ring_ctx *ctx) +{ + INIT_LIST_HEAD(&ctx->napi_list); + spin_lock_init(&ctx->napi_lock); + ctx->napi_prefer_busy_poll = false; + ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll); +} + +/* + * io_napi_free() - Deallocate napi + * @ctx: pointer to io-uring context structure + * + * Free the napi list and the hash table in the io-uring context. 
+ */ +void io_napi_free(struct io_ring_ctx *ctx) +{ + struct io_napi_entry *e; + LIST_HEAD(napi_list); + unsigned int i; + + spin_lock(&ctx->napi_lock); + hash_for_each(ctx->napi_ht, i, e, node) { + hash_del_rcu(&e->node); + kfree_rcu(e, rcu); + } + spin_unlock(&ctx->napi_lock); +} + +/* + * io_napi_register() - Register napi with io-uring + * @ctx: pointer to io-uring context structure + * @arg: pointer to io_uring_napi structure + * + * Register napi in the io-uring context. + */ +int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + const struct io_uring_napi curr = { + .busy_poll_to = ctx->napi_busy_poll_to, + .prefer_busy_poll = ctx->napi_prefer_busy_poll + }; + struct io_uring_napi napi; + + if (copy_from_user(&napi, arg, sizeof(napi))) + return -EFAULT; + if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) + return -EINVAL; + + if (copy_to_user(arg, &curr, sizeof(curr))) + return -EFAULT; + + WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to); + WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll); + WRITE_ONCE(ctx->napi_enabled, true); + return 0; +} + +/* + * io_napi_unregister() - Unregister napi with io-uring + * @ctx: pointer to io-uring context structure + * @arg: pointer to io_uring_napi structure + * + * Unregister napi. If arg has been specified copy the busy poll timeout and + * prefer busy poll setting to the passed in structure. + */ +int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + const struct io_uring_napi curr = { + .busy_poll_to = ctx->napi_busy_poll_to, + .prefer_busy_poll = ctx->napi_prefer_busy_poll + }; + + if (arg && copy_to_user(arg, &curr, sizeof(curr))) + return -EFAULT; + + WRITE_ONCE(ctx->napi_busy_poll_to, 0); + WRITE_ONCE(ctx->napi_prefer_busy_poll, false); + WRITE_ONCE(ctx->napi_enabled, false); + return 0; +} + +/* + * __io_napi_adjust_timeout() - Add napi id to the busy poll list + * @ctx: pointer to io-uring context structure + * @iowq: pointer to io wait queue + * @ts: pointer to timespec or NULL + * + * Adjust the busy loop timeout according to timespec and busy poll timeout. + */ +void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct timespec64 *ts) +{ + unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); + + if (ts) { + struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); + + if (timespec64_compare(ts, &poll_to_ts) > 0) { + *ts = timespec64_sub(*ts, poll_to_ts); + } else { + u64 to = timespec64_to_ns(ts); + + do_div(to, 1000); + ts->tv_sec = 0; + ts->tv_nsec = 0; + } + } + + iowq->napi_busy_poll_to = poll_to; +} + +/* + * __io_napi_busy_loop() - execute busy poll loop + * @ctx: pointer to io-uring context structure + * @iowq: pointer to io wait queue + * + * Execute the busy poll loop and merge the spliced off list. + */ +void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) +{ + iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); + + if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled) + io_napi_blocking_busy_loop(ctx, iowq); +} + +/* + * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll + * @ctx: pointer to io-uring context structure + * + * Splice of the napi list and execute the napi busy poll loop. 
+ */ +int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) +{ + LIST_HEAD(napi_list); + bool is_stale = false; + + if (!READ_ONCE(ctx->napi_busy_poll_to)) + return 0; + if (list_empty_careful(&ctx->napi_list)) + return 0; + + rcu_read_lock(); + is_stale = __io_napi_do_busy_loop(ctx, NULL); + rcu_read_unlock(); + + io_napi_remove_stale(ctx, is_stale); + return 1; +} + +#endif diff --git a/io_uring/napi.h b/io_uring/napi.h new file mode 100644 index 000000000000..6fc0393d0dbe --- /dev/null +++ b/io_uring/napi.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef IOU_NAPI_H +#define IOU_NAPI_H + +#include <linux/kernel.h> +#include <linux/io_uring.h> +#include <net/busy_poll.h> + +#ifdef CONFIG_NET_RX_BUSY_POLL + +void io_napi_init(struct io_ring_ctx *ctx); +void io_napi_free(struct io_ring_ctx *ctx); + +int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); +int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); + +void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); + +void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, struct timespec64 *ts); +void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); +int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); + +static inline bool io_napi(struct io_ring_ctx *ctx) +{ + return !list_empty(&ctx->napi_list); +} + +static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + struct timespec64 *ts) +{ + if (!io_napi(ctx)) + return; + __io_napi_adjust_timeout(ctx, iowq, ts); +} + +static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) +{ + if (!io_napi(ctx)) + return; + __io_napi_busy_loop(ctx, iowq); +} + +/* + * io_napi_add() - Add napi id to the busy poll list + * @req: pointer to io_kiocb request + * + * Add the napi id of the socket to the napi busy poll list and hash table. + */ +static inline void io_napi_add(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct socket *sock; + + if (!READ_ONCE(ctx->napi_busy_poll_to)) + return; + + sock = sock_from_file(req->file); + if (sock) + __io_napi_add(ctx, sock); +} + +#else + +static inline void io_napi_init(struct io_ring_ctx *ctx) +{ +} +static inline void io_napi_free(struct io_ring_ctx *ctx) +{ +} +static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + return -EOPNOTSUPP; +} +static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + return -EOPNOTSUPP; +} +static inline bool io_napi(struct io_ring_ctx *ctx) +{ + return false; +} +static inline void io_napi_add(struct io_kiocb *req) +{ +} +static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + struct timespec64 *ts) +{ +} +static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) +{ +} +static inline int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) +{ + return 0; +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +#endif diff --git a/io_uring/net.c b/io_uring/net.c index 161622029147..19451f0dbf81 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -78,19 +78,6 @@ struct io_sr_msg { */ #define MULTISHOT_MAX_RETRY 32 -static inline bool io_check_multishot(struct io_kiocb *req, - unsigned int issue_flags) -{ - /* - * When ->locked_cq is set we only allow to post CQEs from the original - * task context. Usual request completions will be handled in other - * generic paths but multipoll may decide to post extra cqes. 
- */ - return !(issue_flags & IO_URING_F_IOWQ) || - !(issue_flags & IO_URING_F_MULTISHOT) || - !req->ctx->task_complete; -} - int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -204,16 +191,130 @@ static int io_setup_async_msg(struct io_kiocb *req, return -EAGAIN; } +#ifdef CONFIG_COMPAT +static int io_compat_msg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg, + struct compat_msghdr *msg, int ddir) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct compat_iovec __user *uiov; + int ret; + + if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) + return -EFAULT; + + uiov = compat_ptr(msg->msg_iov); + if (req->flags & REQ_F_BUFFER_SELECT) { + compat_ssize_t clen; + + iomsg->free_iov = NULL; + if (msg->msg_iovlen == 0) { + sr->len = 0; + } else if (msg->msg_iovlen > 1) { + return -EINVAL; + } else { + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = clen; + } + + return 0; + } + + iomsg->free_iov = iomsg->fast_iov; + ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, + UIO_FASTIOV, &iomsg->free_iov, + &iomsg->msg.msg_iter, true); + if (unlikely(ret < 0)) + return ret; + + return 0; +} +#endif + +static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, + struct user_msghdr *msg, int ddir) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + int ret; + + if (!user_access_begin(sr->umsg, sizeof(*sr->umsg))) + return -EFAULT; + + ret = -EFAULT; + unsafe_get_user(msg->msg_name, &sr->umsg->msg_name, ua_end); + unsafe_get_user(msg->msg_namelen, &sr->umsg->msg_namelen, ua_end); + unsafe_get_user(msg->msg_iov, &sr->umsg->msg_iov, ua_end); + unsafe_get_user(msg->msg_iovlen, &sr->umsg->msg_iovlen, ua_end); + unsafe_get_user(msg->msg_control, &sr->umsg->msg_control, ua_end); + unsafe_get_user(msg->msg_controllen, &sr->umsg->msg_controllen, ua_end); + msg->msg_flags = 0; + + if (req->flags & REQ_F_BUFFER_SELECT) { + if (msg->msg_iovlen == 0) { + sr->len = iomsg->fast_iov[0].iov_len = 0; + iomsg->fast_iov[0].iov_base = NULL; + iomsg->free_iov = NULL; + } else if (msg->msg_iovlen > 1) { + ret = -EINVAL; + goto ua_end; + } else { + /* we only need the length for provided buffers */ + if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t))) + goto ua_end; + unsafe_get_user(iomsg->fast_iov[0].iov_len, + &msg->msg_iov[0].iov_len, ua_end); + sr->len = iomsg->fast_iov[0].iov_len; + iomsg->free_iov = NULL; + } + ret = 0; +ua_end: + user_access_end(); + return ret; + } + + user_access_end(); + iomsg->free_iov = iomsg->fast_iov; + ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV, + &iomsg->free_iov, &iomsg->msg.msg_iter, false); + if (unlikely(ret < 0)) + return ret; + + return 0; +} + static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct user_msghdr msg; int ret; iomsg->msg.msg_name = &iomsg->addr; - iomsg->free_iov = iomsg->fast_iov; - ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, - &iomsg->free_iov); + iomsg->msg.msg_iter.nr_segs = 0; + +#ifdef CONFIG_COMPAT + if (unlikely(req->ctx->compat)) { + struct compat_msghdr cmsg; + + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); + if (unlikely(ret)) + return ret; + + return 
__get_compat_msghdr(&iomsg->msg, &cmsg, NULL); + } +#endif + + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE); + if (unlikely(ret)) + return ret; + + ret = __copy_msghdr(&iomsg->msg, &msg, NULL); + /* save msg_control as sys_sendmsg() overwrites it */ sr->msg_control = iomsg->msg.msg_control_user; return ret; @@ -273,6 +374,8 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + sr->done_io = 0; + if (req->opcode == IORING_OP_SEND) { if (READ_ONCE(sqe->__pad3[0])) return -EINVAL; @@ -295,10 +398,20 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - sr->done_io = 0; return 0; } +static void io_req_msg_cleanup(struct io_kiocb *req, + struct io_async_msghdr *kmsg, + unsigned int issue_flags) +{ + req->flags &= ~REQ_F_NEED_CLEANUP; + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + io_netmsg_recycle(req, issue_flags); +} + int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -341,18 +454,14 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; + req->flags |= REQ_F_BL_NO_RECYCLE; return io_setup_async_msg(req, kmsg, issue_flags); } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - io_netmsg_recycle(req, issue_flags); + io_req_msg_cleanup(req, kmsg, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -420,7 +529,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; + req->flags |= REQ_F_BL_NO_RECYCLE; return io_setup_async_addr(req, &__address, issue_flags); } if (ret == -ERESTARTSYS) @@ -435,142 +544,77 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) +static int io_recvmsg_mshot_prep(struct io_kiocb *req, + struct io_async_msghdr *iomsg, + int namelen, size_t controllen) { - int hdr; - - if (iomsg->namelen < 0) - return true; - if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out), - iomsg->namelen, &hdr)) - return true; - if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr)) - return true; + if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) == + (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) { + int hdr; + + if (unlikely(namelen < 0)) + return -EOVERFLOW; + if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), + namelen, &hdr)) + return -EOVERFLOW; + if (check_add_overflow(hdr, controllen, &hdr)) + return -EOVERFLOW; + + iomsg->namelen = namelen; + iomsg->controllen = controllen; + return 0; + } - return false; + return 0; } -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct user_msghdr msg; int ret; - if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg))) - return -EFAULT; - - ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); - if (ret) - return 
ret; - - if (req->flags & REQ_F_BUFFER_SELECT) { - if (msg.msg_iovlen == 0) { - sr->len = iomsg->fast_iov[0].iov_len = 0; - iomsg->fast_iov[0].iov_base = NULL; - iomsg->free_iov = NULL; - } else if (msg.msg_iovlen > 1) { - return -EINVAL; - } else { - if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov))) - return -EFAULT; - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; - } - - if (req->flags & REQ_F_APOLL_MULTISHOT) { - iomsg->namelen = msg.msg_namelen; - iomsg->controllen = msg.msg_controllen; - if (io_recvmsg_multishot_overflow(iomsg)) - return -EOVERFLOW; - } - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(ITER_DEST, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, - &iomsg->free_iov, &iomsg->msg.msg_iter, - false); - if (ret > 0) - ret = 0; - } - - return ret; -} + iomsg->msg.msg_name = &iomsg->addr; + iomsg->msg.msg_iter.nr_segs = 0; #ifdef CONFIG_COMPAT -static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct compat_msghdr msg; - struct compat_iovec __user *uiov; - int ret; + if (unlikely(req->ctx->compat)) { + struct compat_msghdr cmsg; - if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg))) - return -EFAULT; - - ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); - if (ret) - return ret; - - uiov = compat_ptr(msg.msg_iov); - if (req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; - - iomsg->free_iov = NULL; - if (msg.msg_iovlen == 0) { - sr->len = 0; - } else if (msg.msg_iovlen > 1) { - return -EINVAL; - } else { - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; - } + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); + if (unlikely(ret)) + return ret; - if (req->flags & REQ_F_APOLL_MULTISHOT) { - iomsg->namelen = msg.msg_namelen; - iomsg->controllen = msg.msg_controllen; - if (io_recvmsg_multishot_overflow(iomsg)) - return -EOVERFLOW; - } - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(ITER_DEST, (struct iovec __user *)uiov, msg.msg_iovlen, - UIO_FASTIOV, &iomsg->free_iov, - &iomsg->msg.msg_iter, true); - if (ret < 0) + ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr); + if (unlikely(ret)) return ret; - } - return 0; -} + return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, + cmsg.msg_controllen); + } #endif -static int io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - iomsg->msg.msg_name = &iomsg->addr; - iomsg->msg.msg_iter.nr_segs = 0; + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST); + if (unlikely(ret)) + return ret; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, iomsg); -#endif + ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); + if (unlikely(ret)) + return ret; - return __io_recvmsg_copy_hdr(req, iomsg); + return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, + msg.msg_controllen); } int io_recvmsg_prep_async(struct io_kiocb *req) { + struct io_async_msghdr *iomsg; int ret; if (!io_msg_alloc_async_prep(req)) return -ENOMEM; - ret = io_recvmsg_copy_hdr(req, req->async_data); + iomsg = req->async_data; + ret = io_recvmsg_copy_hdr(req, iomsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -582,6 +626,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct 
io_sr_msg); + sr->done_io = 0; + if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -618,7 +664,6 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - sr->done_io = 0; sr->nr_multishot_loops = 0; return 0; } @@ -627,6 +672,7 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; sr->len = 0; /* get from the provided buffer */ req->buf_index = sr->buf_group; @@ -645,30 +691,22 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags; cflags = io_put_kbuf(req, issue_flags); - if (msg->msg_inq && msg->msg_inq != -1) + if (msg->msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, *ret, cflags); - *ret = IOU_OK; - return true; - } - - if (mshot_finished) - goto finish; - /* * Fill CQE for this receive and see if we should keep trying to * receive from this socket. */ - if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, + if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && + io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, *ret, cflags | IORING_CQE_F_MORE)) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; io_recv_prep_retry(req); /* Known not-empty or unknown state, retry */ - if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) { + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq < 0) { if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) return false; /* mshot retries exceeded, force a requeue */ @@ -681,8 +719,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, *ret = -EAGAIN; return true; } - /* Otherwise stop multishot but use the current result. */ -finish: + + /* Finish the request / stop multishot. 
*/ io_req_set_res(req, *ret, cflags); if (issue_flags & IO_URING_F_MULTISHOT) @@ -803,8 +841,9 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg, issue_flags); - if (!io_check_multishot(req, issue_flags)) - return io_setup_async_msg(req, kmsg, issue_flags); + flags = sr->msg_flags; + if (force_nonblock) + flags |= MSG_DONTWAIT; retry_multishot: if (io_do_buffer_select(req)) { @@ -826,10 +865,6 @@ retry_multishot: iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); } - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - kmsg->msg.msg_get_inq = 1; kmsg->msg.msg_inq = -1; if (req->flags & REQ_F_APOLL_MULTISHOT) { @@ -855,7 +890,7 @@ retry_multishot: } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; + req->flags |= REQ_F_BL_NO_RECYCLE; return io_setup_async_msg(req, kmsg, issue_flags); } if (ret == -ERESTARTSYS) @@ -875,13 +910,10 @@ retry_multishot: if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) goto retry_multishot; - if (mshot_finished) { - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - io_netmsg_recycle(req, issue_flags); - req->flags &= ~REQ_F_NEED_CLEANUP; - } + if (mshot_finished) + io_req_msg_cleanup(req, kmsg, issue_flags); + else if (ret == -EAGAIN) + return io_setup_async_msg(req, kmsg, issue_flags); return ret; } @@ -900,9 +932,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; - if (!io_check_multishot(req, issue_flags)) - return -EAGAIN; - sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -915,6 +944,10 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) msg.msg_iocb = NULL; msg.msg_ubuf = NULL; + flags = sr->msg_flags; + if (force_nonblock) + flags |= MSG_DONTWAIT; + retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; @@ -933,9 +966,6 @@ retry_multishot: msg.msg_inq = -1; msg.msg_flags = 0; - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&msg.msg_iter); @@ -953,7 +983,7 @@ retry_multishot: sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; + req->flags |= REQ_F_BL_NO_RECYCLE; return -EAGAIN; } if (ret == -ERESTARTSYS) @@ -1003,6 +1033,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *notif; + zc->done_io = 0; + if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ @@ -1055,8 +1087,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; - zc->done_io = 0; - #ifdef CONFIG_COMPAT if (req->ctx->compat) zc->msg_flags |= MSG_CMSG_COMPAT; @@ -1196,7 +1226,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) zc->len -= ret; zc->buf += ret; zc->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; + req->flags |= REQ_F_BL_NO_RECYCLE; return io_setup_async_addr(req, &__address, issue_flags); } if (ret == -ERESTARTSYS) @@ -1266,7 +1296,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; + req->flags |= REQ_F_BL_NO_RECYCLE; return 
io_setup_async_msg(req, kmsg, issue_flags); } if (ret == -ERESTARTSYS) @@ -1301,7 +1331,7 @@ void io_sendrecv_fail(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - if (req->flags & REQ_F_PARTIAL_IO) + if (sr->done_io) req->cqe.res = sr->done_io; if ((req->flags & REQ_F_NEED_CLEANUP) && @@ -1351,8 +1381,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; - if (!io_check_multishot(req, issue_flags)) - return -EAGAIN; retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index b1ee3a9c3807..9c080aadc5a6 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -35,6 +35,7 @@ #include "rw.h" #include "waitid.h" #include "futex.h" +#include "truncate.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -474,6 +475,12 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_install_fixed_fd_prep, .issue = io_install_fixed_fd, }, + [IORING_OP_FTRUNCATE] = { + .needs_file = 1, + .hash_reg_file = 1, + .prep = io_ftruncate_prep, + .issue = io_ftruncate, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -712,6 +719,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_FIXED_FD_INSTALL] = { .name = "FIXED_FD_INSTALL", }, + [IORING_OP_FTRUNCATE] = { + .name = "FTRUNCATE", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/poll.c b/io_uring/poll.c index 7513afc7b702..5f779139cae1 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -15,6 +15,7 @@ #include "io_uring.h" #include "refs.h" +#include "napi.h" #include "opdef.h" #include "kbuf.h" #include "poll.h" @@ -343,8 +344,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) * Release all references, retry if someone tried to restart * task_work while we were executing it. */ - } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) & - IO_POLL_REF_MASK); + v &= IO_POLL_REF_MASK; + } while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK); return IOU_POLL_NO_ACTION; } @@ -539,14 +540,6 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, poll->wait.private = (void *) wqe_private; if (poll->events & EPOLLEXCLUSIVE) { - /* - * Exclusive waits may only wake a limited amount of entries - * rather than all of them, this may interfere with lazy - * wake if someone does wait(events > 1). Ensure we don't do - * lazy wake for those, as we need to process each one as they - * come in. - */ - req->flags |= REQ_F_POLL_NO_LAZY; add_wait_queue_exclusive(head, &poll->wait); } else { add_wait_queue(head, &poll->wait); @@ -588,10 +581,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll_table *ipt, __poll_t mask, unsigned issue_flags) { - struct io_ring_ctx *ctx = req->ctx; - INIT_HLIST_NODE(&req->hash_node); - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); io_init_poll_iocb(poll, mask); poll->file = req->file; req->apoll_events = poll->events; @@ -618,6 +608,17 @@ static int __io_arm_poll_handler(struct io_kiocb *req, if (issue_flags & IO_URING_F_UNLOCKED) req->flags &= ~REQ_F_HASH_LOCKED; + + /* + * Exclusive waits may only wake a limited amount of entries + * rather than all of them, this may interfere with lazy + * wake if someone does wait(events > 1). Ensure we don't do + * lazy wake for those, as we need to process each one as they + * come in. 
+ */ + if (poll->events & EPOLLEXCLUSIVE) + req->flags |= REQ_F_POLL_NO_LAZY; + mask = vfs_poll(req->file, &ipt->pt) & poll->events; if (unlikely(ipt->error || !ipt->nr_entries)) { @@ -652,6 +653,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, __io_poll_execute(req, mask); return 0; } + io_napi_add(req); if (ipt->owning) { /* @@ -727,7 +729,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; - if (!file_can_poll(req->file)) + if (!io_file_can_poll(req)) return IO_APOLL_ABORTED; if (!(req->flags & REQ_F_APOLL_MULTISHOT)) mask |= EPOLLONESHOT; @@ -818,9 +820,8 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { - if (cd->seq == req->work.cancel_seq) + if (io_cancel_match_sequence(req, cd->seq)) continue; - req->work.cancel_seq = cd->seq; } *out_bucket = hb; return req; diff --git a/io_uring/register.c b/io_uring/register.c index 5e62c1208996..99c37775f974 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -26,6 +26,7 @@ #include "register.h" #include "cancel.h" #include "kbuf.h" +#include "napi.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -550,6 +551,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_pbuf_status(ctx, arg); break; + case IORING_REGISTER_NAPI: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_napi(ctx, arg); + break; + case IORING_UNREGISTER_NAPI: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_unregister_napi(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index c6f199bbee28..e21000238954 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,8 +2,6 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H -#include <net/af_unix.h> - #include "alloc_cache.h" #define IO_NODE_ALLOC_CACHE_MAX 32 diff --git a/io_uring/rw.c b/io_uring/rw.c index d5e79d9bdc71..47e097ab5d7e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -11,6 +11,7 @@ #include <linux/nospec.h> #include <linux/compat.h> #include <linux/io_uring/cmd.h> +#include <linux/indirect_call_wrapper.h> #include <uapi/linux/io_uring.h> @@ -274,7 +275,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) * current cycle. 
*/ io_req_io_end(req); - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; return true; } req_set_fail(req); @@ -341,7 +342,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) io_req_end_write(req); if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; return; } req->cqe.res = res; @@ -682,7 +683,7 @@ static bool io_rw_should_retry(struct io_kiocb *req) * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks */ - if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + if (io_file_can_poll(req) || !(req->file->f_mode & FMODE_BUF_RASYNC)) return false; wait->wait.func = io_async_buf_func; @@ -721,7 +722,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) struct file *file = req->file; int ret; - if (unlikely(!file || !(file->f_mode & mode))) + if (unlikely(!(file->f_mode & mode))) return -EBADF; if (!(req->flags & REQ_F_FIXED_FILE)) @@ -831,7 +832,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * If we can poll, just do that. For a vectored read, we'll * need to copy state first. */ - if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored) + if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -930,7 +931,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) /* * Multishot MUST be used on a pollable file */ - if (!file_can_poll(req->file)) + if (!io_file_can_poll(req)) return -EBADFD; ret = __io_read(req, issue_flags); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 65b5dbe3c850..363052b4ea76 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -15,9 +15,11 @@ #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "napi.h" #include "sqpoll.h" #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 +#define IORING_TW_CAP_ENTRIES_VALUE 8 enum { IO_SQ_THREAD_SHOULD_STOP = 0, @@ -193,6 +195,9 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); + if (io_napi(ctx)) + ret += io_napi_sqpoll_busy_poll(ctx); + if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) wake_up(&ctx->sqo_sq_wait); if (creds) @@ -219,10 +224,52 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd) return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); } +/* + * Run task_work, processing the retry_list first. The retry_list holds + * entries that we passed on in the previous run, if we had more task_work + * than we were asked to process. Newly queued task_work isn't run until the + * retry list has been fully processed. 
+ */ +static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries) +{ + struct io_uring_task *tctx = current->io_uring; + unsigned int count = 0; + + if (*retry_list) { + *retry_list = io_handle_tw_list(*retry_list, &count, max_entries); + if (count >= max_entries) + return count; + max_entries -= count; + } + + *retry_list = tctx_task_work_run(tctx, max_entries, &count); + return count; +} + +static bool io_sq_tw_pending(struct llist_node *retry_list) +{ + struct io_uring_task *tctx = current->io_uring; + + return retry_list || !llist_empty(&tctx->task_list); +} + +static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start) +{ + struct rusage end; + + getrusage(current, RUSAGE_SELF, &end); + end.ru_stime.tv_sec -= start->ru_stime.tv_sec; + end.ru_stime.tv_usec -= start->ru_stime.tv_usec; + + sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000; +} + static int io_sq_thread(void *data) { + struct llist_node *retry_list = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; + struct rusage start; unsigned long timeout = 0; char buf[TASK_COMM_LEN]; DEFINE_WAIT(wait); @@ -251,18 +298,21 @@ static int io_sq_thread(void *data) } cap_entries = !list_is_singular(&sqd->ctx_list); + getrusage(current, RUSAGE_SELF, &start); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { int ret = __io_sq_thread(ctx, cap_entries); if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) sqt_spin = true; } - if (io_run_task_work()) + if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) sqt_spin = true; if (sqt_spin || !time_after(jiffies, timeout)) { - if (sqt_spin) + if (sqt_spin) { + io_sq_update_worktime(sqd, &start); timeout = jiffies + sqd->sq_thread_idle; + } if (unlikely(need_resched())) { mutex_unlock(&sqd->lock); cond_resched(); @@ -273,7 +323,7 @@ static int io_sq_thread(void *data) } prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { + if (!io_sqd_events_pending(sqd) && !io_sq_tw_pending(retry_list)) { bool needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { @@ -312,6 +362,9 @@ static int io_sq_thread(void *data) timeout = jiffies + sqd->sq_thread_idle; } + if (retry_list) + io_sq_tw(&retry_list, UINT_MAX); + io_uring_cancel_generic(true, sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index 8df37e8c9149..4171666b1cf4 100644 --- a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -16,6 +16,7 @@ struct io_sq_data { pid_t task_pid; pid_t task_tgid; + u64 work_time; unsigned long state; struct completion exited; }; diff --git a/io_uring/truncate.c b/io_uring/truncate.c new file mode 100644 index 000000000000..62ee73d34d72 --- /dev/null +++ b/io_uring/truncate.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "../fs/internal.h" + +#include "io_uring.h" +#include "truncate.h" + +struct io_ftrunc { + struct file *file; + loff_t len; +}; + +int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); + + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->buf_index || + sqe->splice_fd_in || sqe->addr3) + return -EINVAL; + + ft->len 
= READ_ONCE(sqe->off); + + req->flags |= REQ_F_FORCE_ASYNC; + return 0; +} + +int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); + int ret; + + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); + + ret = do_ftruncate(req->file, ft->len, 1); + + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/truncate.h b/io_uring/truncate.h new file mode 100644 index 000000000000..ec088293a478 --- /dev/null +++ b/io_uring/truncate.h @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index c33fca585dde..42f63adfa54a 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -5,6 +5,7 @@ #include <linux/io_uring/cmd.h> #include <linux/security.h> #include <linux/nospec.h> +#include <net/sock.h> #include <uapi/linux/io_uring.h> #include <asm/ioctls.h> diff --git a/io_uring/xattr.c b/io_uring/xattr.c index e1c810e0b85a..44905b82eea8 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -112,7 +112,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = do_getxattr(mnt_idmap(req->file->f_path.mnt), + ret = do_getxattr(file_mnt_idmap(req->file), req->file->f_path.dentry, &ix->ctx); diff --git a/net/core/dev.c b/net/core/dev.c index 76e6438f4858..a892f7265189 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6177,8 +6177,13 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) clear_bit(NAPI_STATE_SCHED, &napi->state); } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, - u16 budget) +enum { + NAPI_F_PREFER_BUSY_POLL = 1, + NAPI_F_END_ON_RESCHED = 2, +}; + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, + unsigned flags, u16 budget) { bool skip_schedule = false; unsigned long timeout; @@ -6198,7 +6203,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool local_bh_disable(); - if (prefer_busy_poll) { + if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); timeout = READ_ONCE(napi->dev->gro_flush_timeout); if (napi->defer_hard_irqs_count && timeout) { @@ -6222,23 +6227,23 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool local_bh_enable(); } -void napi_busy_loop(unsigned int napi_id, - bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll, u16 budget) +static void __napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? 
busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; + WARN_ON_ONCE(!rcu_read_lock_held()); + restart: napi_poll = NULL; - rcu_read_lock(); - napi = napi_by_id(napi_id); if (!napi) - goto out; + return; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); @@ -6254,14 +6259,14 @@ restart: */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } @@ -6281,12 +6286,15 @@ count: break; if (unlikely(need_resched())) { + if (flags & NAPI_F_END_ON_RESCHED) + break; if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); + rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; @@ -6294,10 +6302,31 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); -out: +} + +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = NAPI_F_END_ON_RESCHED; + + if (prefer_busy_poll) + flags |= NAPI_F_PREFER_BUSY_POLL; + + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); +} + +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; + + rcu_read_lock(); + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); |
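
For readers who want to exercise the per-ring NAPI support added above from userspace, here is a minimal, hypothetical sketch of driving the new IORING_REGISTER_NAPI opcode with the raw io_uring_register(2) syscall. It assumes the struct io_uring_napi layout from the updated include/uapi/linux/io_uring.h (a busy-poll timeout in microseconds plus a prefer-busy-poll flag, with padding and reserved fields that must be zero, as io_register_napi() enforces); enable_napi_busy_poll and ring_fd are illustrative names, not part of the series, and recent liburing releases provide a comparable helper.

/*
 * Hypothetical userspace sketch (not from the patch series): enable
 * per-ring NAPI busy polling on an already set up io_uring fd.
 */
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int enable_napi_busy_poll(int ring_fd, unsigned int timeout_usec,
				 int prefer_busy_poll)
{
	struct io_uring_napi napi;

	/* pad[] and resv must be zero, or io_register_napi() returns -EINVAL */
	memset(&napi, 0, sizeof(napi));
	napi.busy_poll_to = timeout_usec;	/* busy poll timeout in usec */
	napi.prefer_busy_poll = prefer_busy_poll ? 1 : 0;

	/* on success the kernel copies the previous settings back into 'napi' */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_NAPI, &napi, 1);
}

IORING_UNREGISTER_NAPI works the same way: pass an optional struct to receive the settings being cleared, with nr_args set to 1.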
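The new IORING_OP_FTRUNCATE opcode is similarly narrow: io_ftruncate_prep() consumes only the fd and the off field (interpreted as the new file length) and rejects anything else being set. A small illustrative sketch of filling a raw SQE follows; prep_ftruncate_sqe is an assumed helper name, and liburing prep helpers can be used instead where available.

/*
 * Hypothetical sketch: fill a raw SQE for the new IORING_OP_FTRUNCATE.
 * Only fd and off (the new length) are consumed by io_ftruncate_prep();
 * the remaining fields must be left zero.
 */
#include <string.h>
#include <linux/io_uring.h>

static void prep_ftruncate_sqe(struct io_uring_sqe *sqe, int fd, __u64 new_len)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_FTRUNCATE;
	sqe->fd = fd;
	sqe->off = new_len;	/* read by the kernel via READ_ONCE(sqe->off) */
}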