summaryrefslogtreecommitdiff
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c4342
1 files changed, 2589 insertions, 1753 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index bc18af5e0a93..e0823f58f795 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -57,7 +57,7 @@
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
@@ -78,7 +78,8 @@
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
-#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <linux/security.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -103,11 +104,14 @@
#define IORING_MAX_REG_BUFFERS (1U << 14)
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
+#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+
+#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
+ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
+
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+ REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -195,8 +199,10 @@ struct io_rings {
};
enum io_uring_cmd_flags {
- IO_URING_F_NONBLOCK = 1,
- IO_URING_F_COMPLETE_DEFER = 2,
+ IO_URING_F_COMPLETE_DEFER = 1,
+ IO_URING_F_UNLOCKED = 2,
+ /* int's last bit, sign checks are usually faster than a bit test */
+ IO_URING_F_NONBLOCK = INT_MIN,
};
struct io_mapped_ubuf {
@@ -255,11 +261,18 @@ struct io_rsrc_data {
bool quiesce;
};
+struct io_buffer_list {
+ struct list_head list;
+ struct list_head buf_list;
+ __u16 bgid;
+};
+
struct io_buffer {
struct list_head list;
__u64 addr;
__u32 len;
__u16 bid;
+ __u16 bgid;
};
struct io_restriction {
@@ -305,28 +318,27 @@ struct io_submit_link {
};
struct io_submit_state {
- struct blk_plug plug;
+ /* inline/task_work completion list, under ->uring_lock */
+ struct io_wq_work_node free_list;
+ /* batch completion logic */
+ struct io_wq_work_list compl_reqs;
struct io_submit_link link;
- /*
- * io_kiocb alloc cache
- */
- void *reqs[IO_REQ_CACHE_SIZE];
- unsigned int free_reqs;
-
bool plug_started;
+ bool need_plug;
+ bool flush_cqes;
+ unsigned short submit_nr;
+ struct blk_plug plug;
+};
- /*
- * Batch completion logic
- */
- struct io_kiocb *compl_reqs[IO_COMPL_BATCH];
- unsigned int compl_nr;
- /* inline/task_work completion list, under ->uring_lock */
- struct list_head free_list;
-
- unsigned int ios_left;
+struct io_ev_fd {
+ struct eventfd_ctx *cq_ev_fd;
+ unsigned int eventfd_async: 1;
+ struct rcu_head rcu;
};
+#define IO_BUFFERS_HASH_BITS 5
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -336,10 +348,11 @@ struct io_ring_ctx {
unsigned int flags;
unsigned int compat: 1;
unsigned int drain_next: 1;
- unsigned int eventfd_async: 1;
unsigned int restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
+ unsigned int drain_disabled: 1;
+ unsigned int has_evfd: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -368,6 +381,7 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
+ int rsrc_cached_refs;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
@@ -377,14 +391,16 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct xarray io_buffers;
+ struct list_head *io_buffers;
+ struct list_head io_buffers_cache;
+ struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
unsigned sq_thread_idle;
} ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
- struct list_head locked_free_list;
+ struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
@@ -398,8 +414,7 @@ struct io_ring_ctx {
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- struct eventfd_ctx *cq_ev_fd;
- struct wait_queue_head poll_wait;
+ struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -417,10 +432,12 @@ struct io_ring_ctx {
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
- struct list_head iopoll_list;
+ struct io_wq_work_list iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_queue;
+
+ struct list_head io_buffers_comp;
} ____cacheline_aligned_in_smp;
struct io_restriction restrictions;
@@ -436,6 +453,8 @@ struct io_ring_ctx {
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
+
+ struct list_head io_buffers_pages;
};
/* Keep this last, we don't need it for the fast path */
@@ -461,6 +480,11 @@ struct io_ring_ctx {
};
};
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
struct io_uring_task {
/* submission side */
int cached_refs;
@@ -469,12 +493,13 @@ struct io_uring_task {
const struct io_ring_ctx *last;
struct io_wq *io_wq;
struct percpu_counter inflight;
- atomic_t inflight_tracked;
atomic_t in_idle;
spinlock_t task_lock;
struct io_wq_work_list task_list;
+ struct io_wq_work_list prior_task_list;
struct callback_head task_work;
+ struct file **registered_rings;
bool task_running;
};
@@ -486,8 +511,6 @@ struct io_poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool done;
- bool canceled;
struct wait_queue_entry wait;
};
@@ -561,7 +584,8 @@ struct io_rw {
/* NOTE: kiocb has the file as the first member, so don't do it here */
struct kiocb kiocb;
u64 addr;
- u64 len;
+ u32 len;
+ u32 flags;
};
struct io_connect {
@@ -580,7 +604,7 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
- struct io_buffer *kbuf;
+ size_t done_io;
};
struct io_open {
@@ -623,10 +647,10 @@ struct io_epoll {
struct io_splice {
struct file *file_out;
- struct file *file_in;
loff_t off_out;
loff_t off_in;
u64 len;
+ int splice_fd_in;
unsigned int flags;
};
@@ -644,7 +668,7 @@ struct io_statx {
int dfd;
unsigned int mask;
unsigned int flags;
- const char __user *filename;
+ struct filename *filename;
struct statx __user *buffer;
};
@@ -692,9 +716,10 @@ struct io_hardlink {
int flags;
};
-struct io_completion {
+struct io_msg {
struct file *file;
- u32 cflags;
+ u64 user_data;
+ u32 len;
};
struct io_async_connect {
@@ -710,11 +735,15 @@ struct io_async_msghdr {
struct sockaddr_storage addr;
};
-struct io_async_rw {
- struct iovec fast_iov[UIO_FASTIOV];
- const struct iovec *free_iovec;
+struct io_rw_state {
struct iov_iter iter;
struct iov_iter_state iter_state;
+ struct iovec fast_iov[UIO_FASTIOV];
+};
+
+struct io_async_rw {
+ struct io_rw_state s;
+ const struct iovec *free_iovec;
size_t bytes_done;
struct wait_page_queue wpq;
};
@@ -726,6 +755,7 @@ enum {
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+ REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,
/* first byte is taken by user flags, shift it to not overlap */
REQ_F_FAIL_BIT = 8,
@@ -741,9 +771,13 @@ enum {
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
+ REQ_F_ASYNC_DATA_BIT,
+ REQ_F_SKIP_LINK_CQES_BIT,
+ REQ_F_SINGLE_POLL_BIT,
+ REQ_F_DOUBLE_POLL_BIT,
+ REQ_F_PARTIAL_IO_BIT,
/* keep async read/write and isreg together and in order */
- REQ_F_NOWAIT_READ_BIT,
- REQ_F_NOWAIT_WRITE_BIT,
+ REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
/* not a real bit, just to check we're not overflowing the space */
@@ -763,6 +797,8 @@ enum {
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
+ /* IOSQE_CQE_SKIP_SUCCESS */
+ REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
/* fail rest of links */
REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
@@ -784,10 +820,8 @@ enum {
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
- /* supports async reads */
- REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
- /* supports async writes */
- REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT),
+ /* supports async reads/writes */
+ REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* has creds assigned */
@@ -796,6 +830,16 @@ enum {
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+ /* ->async_data allocated */
+ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
+ /* don't post CQEs while failing linked requests */
+ REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+ /* single poll may be active */
+ REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
+ /* double poll may active */
+ REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
+ /* request has already done partial IO */
+ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
};
struct async_poll {
@@ -822,7 +866,7 @@ enum {
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
* access the file pointer through any of the sub-structs,
- * or directly as just 'ki_filp' in this struct.
+ * or directly as just 'file' in this struct.
*/
struct io_kiocb {
union {
@@ -852,39 +896,52 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
- /* use only after cleaning per-op data, see io_clean_op() */
- struct io_completion compl;
+ struct io_msg msg;
};
- /* opcode allocated if it needs to store data for async defer */
- void *async_data;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
-
u16 buf_index;
+ unsigned int flags;
+
+ u64 user_data;
u32 result;
+ /* fd initially, then cflags for completion */
+ union {
+ u32 cflags;
+ int fd;
+ };
struct io_ring_ctx *ctx;
- unsigned int flags;
- atomic_t refs;
struct task_struct *task;
- u64 user_data;
- struct io_kiocb *link;
struct percpu_ref *fixed_rsrc_refs;
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
- /* used with ctx->iopoll_list with reads/writes */
- struct list_head inflight_entry;
+ union {
+ /* used by request caches, completion batching and iopoll */
+ struct io_wq_work_node comp_list;
+ /* cache ->apoll->events */
+ int apoll_events;
+ };
+ atomic_t refs;
+ atomic_t poll_refs;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
+ /* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
- struct io_wq_work work;
+ /* opcode allocated if it needs to store data for async defer */
+ void *async_data;
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
+ /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
+ struct io_kiocb *link;
+ /* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds;
-
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+ struct io_wq_work work;
};
struct io_tctx_node {
@@ -902,21 +959,24 @@ struct io_defer_entry {
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
- /* opcode is not supported by this kernel */
- unsigned not_supported : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
+ unsigned poll_exclusive : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
/* do prep async if is going to be punted */
unsigned needs_async_setup : 1;
- /* should block plug */
- unsigned plug : 1;
+ /* opcode is not supported by this kernel */
+ unsigned not_supported : 1;
+ /* skip auditing */
+ unsigned audit_skip : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
@@ -930,6 +990,7 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_setup = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
@@ -939,16 +1000,19 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_setup = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
@@ -957,15 +1021,20 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
+ },
+ [IORING_OP_POLL_REMOVE] = {
+ .audit_skip = 1,
},
- [IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SENDMSG] = {
.needs_file = 1,
@@ -983,18 +1052,24 @@ static const struct io_op_def io_op_defs[] = {
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_TIMEOUT] = {
+ .audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */
+ .audit_skip = 1,
},
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .poll_exclusive = 1,
+ },
+ [IORING_OP_ASYNC_CANCEL] = {
+ .audit_skip = 1,
},
- [IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
+ .audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
},
[IORING_OP_CONNECT] = {
@@ -1009,14 +1084,19 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT] = {},
[IORING_OP_CLOSE] = {},
- [IORING_OP_FILES_UPDATE] = {},
- [IORING_OP_STATX] = {},
+ [IORING_OP_FILES_UPDATE] = {
+ .audit_skip = 1,
+ },
+ [IORING_OP_STATX] = {
+ .audit_skip = 1,
+ },
[IORING_OP_READ] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
@@ -1025,39 +1105,50 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
+ .audit_skip = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_MADVISE] = {},
[IORING_OP_SEND] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .audit_skip = 1,
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .audit_skip = 1,
},
[IORING_OP_OPENAT2] = {
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
+ },
+ [IORING_OP_PROVIDE_BUFFERS] = {
+ .audit_skip = 1,
+ },
+ [IORING_OP_REMOVE_BUFFERS] = {
+ .audit_skip = 1,
},
- [IORING_OP_PROVIDE_BUFFERS] = {},
- [IORING_OP_REMOVE_BUFFERS] = {},
[IORING_OP_TEE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .audit_skip = 1,
},
[IORING_OP_SHUTDOWN] = {
.needs_file = 1,
@@ -1067,6 +1158,9 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_MKDIRAT] = {},
[IORING_OP_SYMLINKAT] = {},
[IORING_OP_LINKAT] = {},
+ [IORING_OP_MSG_RING] = {
+ .needs_file = 1,
+ },
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -1079,8 +1173,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags);
+static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);
+
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
@@ -1089,13 +1183,16 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
struct io_uring_rsrc_update2 *up,
unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
-static struct file *io_file_get(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd, bool fixed);
+static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
+ unsigned issue_flags);
+static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
+static void io_drop_inflight_file(struct io_kiocb *req);
+static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_ring_ctx *ctx);
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
@@ -1103,6 +1200,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
+static void io_eventfd_signal(struct io_ring_ctx *ctx);
static struct kmem_cache *req_cachep;
@@ -1154,12 +1252,6 @@ static inline bool req_ref_put_and_test(struct io_kiocb *req)
return atomic_dec_and_test(&req->refs);
}
-static inline void req_ref_put(struct io_kiocb *req)
-{
- WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
- WARN_ON_ONCE(req_ref_put_and_test(req));
-}
-
static inline void req_ref_get(struct io_kiocb *req)
{
WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
@@ -1167,6 +1259,12 @@ static inline void req_ref_get(struct io_kiocb *req)
atomic_inc(&req->refs);
}
+static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
+{
+ if (!wq_list_empty(&ctx->submit_state.compl_reqs))
+ __io_submit_flush_completions(ctx);
+}
+
static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
if (!(req->flags & REQ_F_REFCOUNT)) {
@@ -1180,48 +1278,192 @@ static inline void io_req_set_refcount(struct io_kiocb *req)
__io_req_set_refcount(req, 1);
}
-static inline void io_req_set_rsrc_node(struct io_kiocb *req)
+#define IO_RSRC_REF_BATCH 100
+
+static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
+ struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct percpu_ref *ref = req->fixed_rsrc_refs;
+
+ if (ref) {
+ if (ref == &ctx->rsrc_node->refs)
+ ctx->rsrc_cached_refs++;
+ else
+ percpu_ref_put(ref);
+ }
+}
+
+static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
+{
+ if (req->fixed_rsrc_refs)
+ percpu_ref_put(req->fixed_rsrc_refs);
+}
+
+static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ if (ctx->rsrc_cached_refs) {
+ percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
+ ctx->rsrc_cached_refs = 0;
+ }
+}
+
+static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
+ percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
+}
+static inline void io_req_set_rsrc_node(struct io_kiocb *req,
+ struct io_ring_ctx *ctx,
+ unsigned int issue_flags)
+{
if (!req->fixed_rsrc_refs) {
req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
- percpu_ref_get(req->fixed_rsrc_refs);
+
+ if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+ lockdep_assert_held(&ctx->uring_lock);
+ ctx->rsrc_cached_refs--;
+ if (unlikely(ctx->rsrc_cached_refs < 0))
+ io_rsrc_refs_refill(ctx);
+ } else {
+ percpu_ref_get(req->fixed_rsrc_refs);
+ }
+ }
+}
+
+static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
+{
+ struct io_buffer *kbuf = req->kbuf;
+ unsigned int cflags;
+
+ cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ list_add(&kbuf->list, list);
+ req->kbuf = NULL;
+ return cflags;
+}
+
+static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
+{
+ lockdep_assert_held(&req->ctx->completion_lock);
+
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return 0;
+ return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
+}
+
+static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+ unsigned issue_flags)
+{
+ unsigned int cflags;
+
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return 0;
+
+ /*
+ * We can add this buffer back to two lists:
+ *
+ * 1) The io_buffers_cache list. This one is protected by the
+ * ctx->uring_lock. If we already hold this lock, add back to this
+ * list as we can grab it from issue as well.
+ * 2) The io_buffers_comp list. This one is protected by the
+ * ctx->completion_lock.
+ *
+ * We migrate buffers from the comp_list to the issue cache list
+ * when we need one.
+ */
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ lockdep_assert_held(&req->ctx->uring_lock);
+
+ cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
}
+
+ return cflags;
}
-static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
+ unsigned int bgid)
{
- bool got = percpu_ref_tryget(ref);
+ struct list_head *hash_list;
+ struct io_buffer_list *bl;
+
+ hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ list_for_each_entry(bl, hash_list, list)
+ if (bl->bgid == bgid || bgid == -1U)
+ return bl;
- /* already at zero, wait for ->release() */
- if (!got)
- wait_for_completion(compl);
- percpu_ref_resurrect(ref);
- if (got)
- percpu_ref_put(ref);
+ return NULL;
+}
+
+static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ struct io_buffer *buf;
+
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return;
+ /* don't recycle if we already did IO to this buffer */
+ if (req->flags & REQ_F_PARTIAL_IO)
+ return;
+
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_lock(&ctx->uring_lock);
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ buf = req->kbuf;
+ bl = io_buffer_get_list(ctx, buf->bgid);
+ list_add(&buf->list, &bl->buf_list);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ req->kbuf = NULL;
+
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
}
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
bool cancel_all)
+ __must_hold(&req->ctx->timeout_lock)
{
- struct io_kiocb *req;
+ if (task && head->task != task)
+ return false;
+ return cancel_all;
+}
+/*
+ * As io_match_task() but protected against racing with linked timeouts.
+ * User must not hold timeout_lock.
+ */
+static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
+ bool cancel_all)
+{
if (task && head->task != task)
return false;
- if (cancel_all)
- return true;
+ return cancel_all;
+}
- io_for_each_link(req, head) {
- if (req->flags & REQ_F_INFLIGHT)
- return true;
- }
- return false;
+static inline bool req_has_async_data(struct io_kiocb *req)
+{
+ return req->flags & REQ_F_ASYNC_DATA;
}
static inline void req_set_fail(struct io_kiocb *req)
{
req->flags |= REQ_F_FAIL;
+ if (req->flags & REQ_F_CQE_SKIP) {
+ req->flags &= ~REQ_F_CQE_SKIP;
+ req->flags |= REQ_F_SKIP_LINK_CQES;
+ }
}
static inline void req_fail_link_node(struct io_kiocb *req, int res)
@@ -1230,7 +1472,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res)
req->result = res;
}
-static void io_ring_ctx_ref_free(struct percpu_ref *ref)
+static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1242,7 +1484,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req)
return !req->timeout.off;
}
-static void io_fallback_req_func(struct work_struct *work)
+static __cold void io_fallback_req_func(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
fallback_work.work);
@@ -1255,18 +1497,16 @@ static void io_fallback_req_func(struct work_struct *work)
req->io_task_work.func(req, &locked);
if (locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
}
percpu_ref_put(&ctx->refs);
-
}
-static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
+static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int hash_bits;
+ int i, hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -1293,6 +1533,13 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
+ ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
+ sizeof(struct list_head), GFP_KERNEL);
+ if (!ctx->io_buffers)
+ goto err;
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
+ INIT_LIST_HEAD(&ctx->io_buffers[i]);
+
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1300,16 +1547,18 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
- init_waitqueue_head(&ctx->poll_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_cache);
+ INIT_LIST_HEAD(&ctx->apoll_cache);
init_completion(&ctx->ref_comp);
- xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
- INIT_LIST_HEAD(&ctx->iopoll_list);
+ INIT_WQ_LIST(&ctx->iopoll_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_pages);
+ INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1318,13 +1567,15 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_llist_head(&ctx->rsrc_put_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
- INIT_LIST_HEAD(&ctx->submit_state.free_list);
- INIT_LIST_HEAD(&ctx->locked_free_list);
+ ctx->submit_state.free_list.next = NULL;
+ INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
+ INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
return ctx;
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
+ kfree(ctx->io_buffers);
kfree(ctx);
return NULL;
}
@@ -1348,26 +1599,13 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-#define FFS_ASYNC_READ 0x1UL
-#define FFS_ASYNC_WRITE 0x2UL
-#ifdef CONFIG_64BIT
-#define FFS_ISREG 0x4UL
-#else
-#define FFS_ISREG 0x0UL
-#endif
-#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+#define FFS_NOWAIT 0x1UL
+#define FFS_ISREG 0x2UL
+#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
- return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
-}
-
-static void io_req_track_inflight(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_INFLIGHT)) {
- req->flags |= REQ_F_INFLIGHT;
- atomic_inc(&current->io_uring->inflight_tracked);
- }
+ return req->flags & REQ_F_FIXED_FILE;
}
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
@@ -1413,14 +1651,6 @@ static void io_prep_async_work(struct io_kiocb *req)
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
-
- switch (req->opcode) {
- case IORING_OP_SPLICE:
- case IORING_OP_TEE:
- if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
- req->work.flags |= IO_WQ_WORK_UNBOUND;
- break;
- }
}
static void io_prep_async_link(struct io_kiocb *req)
@@ -1430,25 +1660,32 @@ static void io_prep_async_link(struct io_kiocb *req)
if (req->flags & REQ_F_LINK_TIMEOUT) {
struct io_ring_ctx *ctx = req->ctx;
- spin_lock(&ctx->completion_lock);
+ spin_lock_irq(&ctx->timeout_lock);
io_for_each_link(cur, req)
io_prep_async_work(cur);
- spin_unlock(&ctx->completion_lock);
+ spin_unlock_irq(&ctx->timeout_lock);
} else {
io_for_each_link(cur, req)
io_prep_async_work(cur);
}
}
-static void io_queue_async_work(struct io_kiocb *req, bool *locked)
+static inline void io_req_add_compl_list(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_submit_state *state = &ctx->submit_state;
+
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ ctx->submit_state.flush_cqes = true;
+ wq_list_add_tail(&req->comp_list, &state->compl_reqs);
+}
+
+static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
- /* must not take the lock, NULL it as a precaution */
- locked = NULL;
-
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
@@ -1465,8 +1702,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *locked)
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
- trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
- &req->work, req->flags);
+ trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
+ &req->work, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -1484,12 +1721,12 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- io_cqring_fill_event(req->ctx, req->user_data, status, 0);
+ io_fill_cqe_req(req, status, 0);
io_put_req_deferred(req);
}
}
-static void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
@@ -1503,16 +1740,15 @@ static void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-static void io_flush_timeouts(struct io_ring_ctx *ctx)
+static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
__must_hold(&ctx->completion_lock)
{
u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
+ struct io_kiocb *req, *tmp;
spin_lock_irq(&ctx->timeout_lock);
- while (!list_empty(&ctx->timeout_list)) {
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
u32 events_needed, events_got;
- struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
- struct io_kiocb, timeout.list);
if (io_is_timeout_noseq(req))
break;
@@ -1529,29 +1765,33 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
if (events_got < events_needed)
break;
- list_del_init(&req->timeout.list);
io_kill_timeout(req, 0);
}
ctx->cq_last_tm_flush = seq;
spin_unlock_irq(&ctx->timeout_lock);
}
-static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
- if (ctx->off_timeout_used)
- io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
-}
-
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active))
- __io_commit_cqring_flush(ctx);
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
+static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+{
+ if (ctx->off_timeout_used || ctx->drain_active) {
+ spin_lock(&ctx->completion_lock);
+ if (ctx->off_timeout_used)
+ io_flush_timeouts(ctx);
+ if (ctx->drain_active)
+ io_queue_deferred(ctx);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ }
+ if (ctx->has_evfd)
+ io_eventfd_signal(ctx);
+}
+
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
@@ -1581,23 +1821,34 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
return &rings->cqes[tail & mask];
}
-static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
- if (likely(!ctx->cq_ev_fd))
- return false;
+ struct io_ev_fd *ev_fd;
+
+ rcu_read_lock();
+ /*
+ * rcu_dereference ctx->io_ev_fd once and use it for both for checking
+ * and eventfd_signal
+ */
+ ev_fd = rcu_dereference(ctx->io_ev_fd);
+
+ /*
+ * Check again if ev_fd exists incase an io_eventfd_unregister call
+ * completed between the NULL check of ctx->io_ev_fd at the start of
+ * the function and rcu_read_lock.
+ */
+ if (unlikely(!ev_fd))
+ goto out;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return false;
- return !ctx->eventfd_async || io_wq_current_is_worker();
+ goto out;
+
+ if (!ev_fd->eventfd_async || io_wq_current_is_worker())
+ eventfd_signal(ev_fd->cq_ev_fd, 1);
+out:
+ rcu_read_unlock();
}
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
* wake_up_all() may seem excessive, but io_wake_function() and
@@ -1606,27 +1857,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait))
- wake_up_interruptible(&ctx->poll_wait);
+}
+
+/*
+ * This should only get called when at least one event has been posted.
+ * Some applications rely on the eventfd notification count only changing
+ * IFF a new CQE has been added to the CQ ring. There's no depedency on
+ * 1:1 relationship between how many times this function is called (and
+ * hence the eventfd count) and number of CQEs posted to the CQ ring.
+ */
+static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
+
+ io_cqring_wake(ctx);
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
- /* see waitqueue_active() comment */
- smp_mb();
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (waitqueue_active(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
- }
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->poll_wait))
- wake_up_interruptible(&ctx->poll_wait);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ io_cqring_wake(ctx);
}
/* Returns true if there are no backlogged entries after the flush */
@@ -1721,8 +1977,20 @@ static inline void io_get_task_refs(int nr)
io_task_refs_refill(tctx);
}
+static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
+{
+ struct io_uring_task *tctx = task->io_uring;
+ unsigned int refs = tctx->cached_refs;
+
+ if (refs) {
+ tctx->cached_refs = 0;
+ percpu_counter_sub(&tctx->inflight, refs);
+ put_task_struct_many(task, refs);
+ }
+}
+
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+ s32 res, u32 cflags)
{
struct io_overflow_cqe *ocqe;
@@ -1749,13 +2017,11 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
return true;
}
-static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
+ s32 res, u32 cflags)
{
struct io_uring_cqe *cqe;
- trace_io_uring_complete(ctx, user_data, res, cflags);
-
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
@@ -1771,20 +2037,33 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
return io_cqring_event_overflow(ctx, user_data, res, cflags);
}
-/* not as hot to bloat with inlining */
-static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
- long res, unsigned int cflags)
+static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
- return __io_cqring_fill_event(ctx, user_data, res, cflags);
+ trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
+ return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
}
-static void io_req_complete_post(struct io_kiocb *req, long res,
- unsigned int cflags)
+static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+{
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(req, res, cflags);
+}
+
+static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
+ s32 res, u32 cflags)
+{
+ ctx->cq_extra++;
+ trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
+ return __io_fill_cqe(ctx, user_data, res, cflags);
+}
+
+static void __io_req_complete_post(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
- spin_lock(&ctx->completion_lock);
- __io_cqring_fill_event(ctx, req->user_data, res, cflags);
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(req, res, cflags);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
@@ -1798,40 +2077,42 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
req->link = NULL;
}
}
+ io_req_put_rsrc(req, ctx);
+ /*
+ * Selected buffer deallocation in io_clean_op() assumes that
+ * we don't hold ->completion_lock. Clean them here to avoid
+ * deadlocks.
+ */
+ io_put_kbuf_comp(req);
io_dismantle_req(req);
io_put_task(req->task, 1);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
- } else {
- if (!percpu_ref_tryget(&ctx->refs))
- req = NULL;
- }
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
-
- if (req) {
- io_cqring_ev_posted(ctx);
- percpu_ref_put(&ctx->refs);
}
}
-static inline bool io_req_needs_clean(struct io_kiocb *req)
+static void io_req_complete_post(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
- return req->flags & IO_REQ_CLEAN_FLAGS;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ __io_req_complete_post(req, res, cflags);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
}
-static void io_req_complete_state(struct io_kiocb *req, long res,
- unsigned int cflags)
+static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
+ u32 cflags)
{
- if (io_req_needs_clean(req))
- io_clean_op(req);
req->result = res;
- req->compl.cflags = cflags;
+ req->cflags = cflags;
req->flags |= REQ_F_COMPLETE_INLINE;
}
static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
- long res, unsigned cflags)
+ s32 res, u32 cflags)
{
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_state(req, res, cflags);
@@ -1839,15 +2120,15 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
-static inline void io_req_complete(struct io_kiocb *req, long res)
+static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
__io_req_complete(req, 0, res, 0);
}
-static void io_req_complete_failed(struct io_kiocb *req, long res)
+static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
req_set_fail(req);
- io_req_complete_post(req, res, 0);
+ io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
static void io_req_complete_fail_submit(struct io_kiocb *req)
@@ -1878,7 +2159,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
spin_lock(&ctx->completion_lock);
- list_splice_init(&ctx->locked_free_list, &state->free_list);
+ wq_list_splice(&ctx->locked_free_list, &state->free_list);
ctx->locked_free_nr = 0;
spin_unlock(&ctx->completion_lock);
}
@@ -1887,7 +2168,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- int nr;
/*
* If we have more than a batch's worth of requests in our IRQ side
@@ -1896,20 +2176,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
*/
if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
io_flush_cached_locked_reqs(ctx, state);
-
- nr = state->free_reqs;
- while (!list_empty(&state->free_list)) {
- struct io_kiocb *req = list_first_entry(&state->free_list,
- struct io_kiocb, inflight_entry);
-
- list_del(&req->inflight_entry);
- state->reqs[nr++] = req;
- if (nr == ARRAY_SIZE(state->reqs))
- break;
- }
-
- state->free_reqs = nr;
- return nr != 0;
+ return !!state->free_list.next;
}
/*
@@ -1918,38 +2185,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
* Because of that, io_alloc_req() should be called only under ->uring_lock
* and with extra caution to not get a request that is still worked on.
*/
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_submit_state *state = &ctx->submit_state;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ void *reqs[IO_REQ_ALLOC_BATCH];
+ struct io_kiocb *req;
int ret, i;
- BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
-
- if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
- goto got_req;
+ if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
+ return true;
- ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
- state->reqs);
+ ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
/*
* Bulk alloc is all-or-nothing. If we fail to get a batch,
* retry single alloc to be on the safe side.
*/
if (unlikely(ret <= 0)) {
- state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
- if (!state->reqs[0])
- return NULL;
+ reqs[0] = kmem_cache_alloc(req_cachep, gfp);
+ if (!reqs[0])
+ return false;
ret = 1;
}
- for (i = 0; i < ret; i++)
- io_preinit_req(state->reqs[i], ctx);
- state->free_reqs = ret;
-got_req:
- state->free_reqs--;
- return state->reqs[state->free_reqs];
+ percpu_ref_get_many(&ctx->refs, ret);
+ for (i = 0; i < ret; i++) {
+ req = reqs[i];
+
+ io_preinit_req(req, ctx);
+ wq_stack_add_head(&req->comp_list, &state->free_list);
+ }
+ return true;
+}
+
+static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
+{
+ if (unlikely(!ctx->submit_state.free_list.next))
+ return __io_alloc_req_refill(ctx);
+ return true;
+}
+
+static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
+{
+ struct io_wq_work_node *node;
+
+ node = wq_stack_extract(&ctx->submit_state.free_list);
+ return container_of(node, struct io_kiocb, comp_list);
}
static inline void io_put_file(struct file *file)
@@ -1958,35 +2241,28 @@ static inline void io_put_file(struct file *file)
fput(file);
}
-static void io_dismantle_req(struct io_kiocb *req)
+static inline void io_dismantle_req(struct io_kiocb *req)
{
unsigned int flags = req->flags;
- if (io_req_needs_clean(req))
+ if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req);
if (!(flags & REQ_F_FIXED_FILE))
io_put_file(req->file);
- if (req->fixed_rsrc_refs)
- percpu_ref_put(req->fixed_rsrc_refs);
- if (req->async_data) {
- kfree(req->async_data);
- req->async_data = NULL;
- }
}
-static void __io_free_req(struct io_kiocb *req)
+static __cold void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ io_req_put_rsrc(req, ctx);
io_dismantle_req(req);
io_put_task(req->task, 1);
spin_lock(&ctx->completion_lock);
- list_add(&req->inflight_entry, &ctx->locked_free_list);
+ wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
spin_unlock(&ctx->completion_lock);
-
- percpu_ref_put(&ctx->refs);
}
static inline void io_remove_next_linked(struct io_kiocb *req)
@@ -2010,8 +2286,8 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
link->timeout.head = NULL;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
list_del(&link->timeout.list);
- io_cqring_fill_event(link->ctx, link->user_data,
- -ECANCELED, 0);
+ /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
+ io_fill_cqe_req(link, -ECANCELED, 0);
io_put_req_deferred(link);
return true;
}
@@ -2023,6 +2299,7 @@ static void io_fail_links(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
{
struct io_kiocb *nxt, *link = req->link;
+ bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
req->link = NULL;
while (link) {
@@ -2034,8 +2311,13 @@ static void io_fail_links(struct io_kiocb *req)
nxt = link->link;
link->link = NULL;
- trace_io_uring_fail_link(req, link);
- io_cqring_fill_event(link->ctx, link->user_data, res, 0);
+ trace_io_uring_fail_link(req->ctx, req, req->user_data,
+ req->opcode, link);
+
+ if (!ignore_cqes) {
+ link->flags &= ~REQ_F_CQE_SKIP;
+ io_fill_cqe_req(link, res, 0);
+ }
io_put_req_deferred(link);
link = nxt;
}
@@ -2052,8 +2334,8 @@ static bool io_disarm_next(struct io_kiocb *req)
req->flags &= ~REQ_F_ARM_LTIMEOUT;
if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
io_remove_next_linked(req);
- io_cqring_fill_event(link->ctx, link->user_data,
- -ECANCELED, 0);
+ /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
+ io_fill_cqe_req(link, -ECANCELED, 0);
io_put_req_deferred(link);
posted = true;
}
@@ -2072,98 +2354,158 @@ static bool io_disarm_next(struct io_kiocb *req)
return posted;
}
-static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
+static void __io_req_find_next_prep(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool posted;
+
+ spin_lock(&ctx->completion_lock);
+ posted = io_disarm_next(req);
+ if (posted)
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (posted)
+ io_cqring_ev_posted(ctx);
+}
+
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt;
+ if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
+ return NULL;
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & IO_DISARM_MASK) {
- struct io_ring_ctx *ctx = req->ctx;
- bool posted;
-
- spin_lock(&ctx->completion_lock);
- posted = io_disarm_next(req);
- if (posted)
- io_commit_cqring(req->ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
- io_cqring_ev_posted(ctx);
- }
+ if (unlikely(req->flags & IO_DISARM_MASK))
+ __io_req_find_next_prep(req);
nxt = req->link;
req->link = NULL;
return nxt;
}
-static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
-{
- if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
- return NULL;
- return __io_req_find_next(req);
-}
-
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
if (*locked) {
- if (ctx->submit_state.compl_nr)
- io_submit_flush_completions(ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
*locked = false;
}
percpu_ref_put(&ctx->refs);
}
+static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
+{
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
+}
+
+static void handle_prev_tw_list(struct io_wq_work_node *node,
+ struct io_ring_ctx **ctx, bool *uring_locked)
+{
+ if (*ctx && !*uring_locked)
+ spin_lock(&(*ctx)->completion_lock);
+
+ do {
+ struct io_wq_work_node *next = node->next;
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ io_task_work.node);
+
+ prefetch(container_of(next, struct io_kiocb, io_task_work.node));
+
+ if (req->ctx != *ctx) {
+ if (unlikely(!*uring_locked && *ctx))
+ ctx_commit_and_unlock(*ctx);
+
+ ctx_flush_and_put(*ctx, uring_locked);
+ *ctx = req->ctx;
+ /* if not contended, grab and improve batching */
+ *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
+ percpu_ref_get(&(*ctx)->refs);
+ if (unlikely(!*uring_locked))
+ spin_lock(&(*ctx)->completion_lock);
+ }
+ if (likely(*uring_locked))
+ req->io_task_work.func(req, uring_locked);
+ else
+ __io_req_complete_post(req, req->result,
+ io_put_kbuf_comp(req));
+ node = next;
+ } while (node);
+
+ if (unlikely(!*uring_locked))
+ ctx_commit_and_unlock(*ctx);
+}
+
+static void handle_tw_list(struct io_wq_work_node *node,
+ struct io_ring_ctx **ctx, bool *locked)
+{
+ do {
+ struct io_wq_work_node *next = node->next;
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ io_task_work.node);
+
+ prefetch(container_of(next, struct io_kiocb, io_task_work.node));
+
+ if (req->ctx != *ctx) {
+ ctx_flush_and_put(*ctx, locked);
+ *ctx = req->ctx;
+ /* if not contended, grab and improve batching */
+ *locked = mutex_trylock(&(*ctx)->uring_lock);
+ percpu_ref_get(&(*ctx)->refs);
+ }
+ req->io_task_work.func(req, locked);
+ node = next;
+ } while (node);
+}
+
static void tctx_task_work(struct callback_head *cb)
{
- bool locked = false;
+ bool uring_locked = false;
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
while (1) {
- struct io_wq_work_node *node;
+ struct io_wq_work_node *node1, *node2;
- if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+ if (!tctx->task_list.first &&
+ !tctx->prior_task_list.first && uring_locked)
io_submit_flush_completions(ctx);
spin_lock_irq(&tctx->task_lock);
- node = tctx->task_list.first;
+ node1 = tctx->prior_task_list.first;
+ node2 = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
- if (!node)
+ INIT_WQ_LIST(&tctx->prior_task_list);
+ if (!node2 && !node1)
tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
- if (!node)
+ if (!node2 && !node1)
break;
- do {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- if (req->ctx != ctx) {
- ctx_flush_and_put(ctx, &locked);
- ctx = req->ctx;
- /* if not contended, grab and improve batching */
- locked = mutex_trylock(&ctx->uring_lock);
- percpu_ref_get(&ctx->refs);
- }
- req->io_task_work.func(req, &locked);
- node = next;
- } while (node);
+ if (node1)
+ handle_prev_tw_list(node1, &ctx, &uring_locked);
+ if (node2)
+ handle_tw_list(node2, &ctx, &uring_locked);
cond_resched();
}
- ctx_flush_and_put(ctx, &locked);
+ ctx_flush_and_put(ctx, &uring_locked);
+
+ /* relaxed read is enough as only the task itself sets ->in_idle */
+ if (unlikely(atomic_read(&tctx->in_idle)))
+ io_uring_drop_tctx_refs(current);
}
-static void io_req_task_work_add(struct io_kiocb *req)
+static void io_req_task_work_add(struct io_kiocb *req, bool priority)
{
struct task_struct *tsk = req->task;
struct io_uring_task *tctx = tsk->io_uring;
@@ -2174,8 +2516,13 @@ static void io_req_task_work_add(struct io_kiocb *req)
WARN_ON_ONCE(!tctx);
+ io_drop_inflight_file(req);
+
spin_lock_irqsave(&tctx->task_lock, flags);
- wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+ if (priority)
+ wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
+ else
+ wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
running = tctx->task_running;
if (!running)
tctx->task_running = true;
@@ -2192,15 +2539,15 @@ static void io_req_task_work_add(struct io_kiocb *req)
* will do the job.
*/
notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (!task_work_add(tsk, &tctx->task_work, notify)) {
- wake_up_process(tsk);
+ if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
+ if (notify == TWA_NONE)
+ wake_up_process(tsk);
return;
}
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
- node = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
+ node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
while (node) {
@@ -2237,19 +2584,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
req->result = ret;
req->io_task_work.func = io_req_task_cancel;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static void io_req_task_queue(struct io_kiocb *req)
{
req->io_task_work.func = io_req_task_submit;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
req->io_task_work.func = io_queue_async_work;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
static inline void io_queue_next(struct io_kiocb *req)
@@ -2271,77 +2618,75 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked)
io_free_req(req);
}
-struct req_batch {
- struct task_struct *task;
- int task_refs;
- int ctx_refs;
-};
-
-static inline void io_init_req_batch(struct req_batch *rb)
+static void io_free_batch_list(struct io_ring_ctx *ctx,
+ struct io_wq_work_node *node)
+ __must_hold(&ctx->uring_lock)
{
- rb->task_refs = 0;
- rb->ctx_refs = 0;
- rb->task = NULL;
-}
+ struct task_struct *task = NULL;
+ int task_refs = 0;
-static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
- struct req_batch *rb)
-{
- if (rb->ctx_refs)
- percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
-}
+ do {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
-static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
- struct io_submit_state *state)
-{
- io_queue_next(req);
- io_dismantle_req(req);
+ if (unlikely(req->flags & REQ_F_REFCOUNT)) {
+ node = req->comp_list.next;
+ if (!req_ref_put_and_test(req))
+ continue;
+ }
- if (req->task != rb->task) {
- if (rb->task)
- io_put_task(rb->task, rb->task_refs);
- rb->task = req->task;
- rb->task_refs = 0;
- }
- rb->task_refs++;
- rb->ctx_refs++;
+ io_req_put_rsrc_locked(req, ctx);
+ io_queue_next(req);
+ io_dismantle_req(req);
- if (state->free_reqs != ARRAY_SIZE(state->reqs))
- state->reqs[state->free_reqs++] = req;
- else
- list_add(&req->inflight_entry, &state->free_list);
+ if (req->task != task) {
+ if (task)
+ io_put_task(task, task_refs);
+ task = req->task;
+ task_refs = 0;
+ }
+ task_refs++;
+ node = req->comp_list.next;
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
+ } while (node);
+
+ if (task)
+ io_put_task(task, task_refs);
}
-static void io_submit_flush_completions(struct io_ring_ctx *ctx)
+static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
+ struct io_wq_work_node *node, *prev;
struct io_submit_state *state = &ctx->submit_state;
- int i, nr = state->compl_nr;
- struct req_batch rb;
-
- spin_lock(&ctx->completion_lock);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
-
- __io_cqring_fill_event(ctx, req->user_data, req->result,
- req->compl.cflags);
- }
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
- io_init_req_batch(&rb);
- for (i = 0; i < nr; i++) {
- struct io_kiocb *req = state->compl_reqs[i];
+ if (state->flush_cqes) {
+ spin_lock(&ctx->completion_lock);
+ wq_list_for_each(node, prev, &state->compl_reqs) {
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ comp_list);
+
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(req, req->result, req->cflags);
+ if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ struct async_poll *apoll = req->apoll;
+
+ if (apoll->double_poll)
+ kfree(apoll->double_poll);
+ list_add(&apoll->poll.wait.entry,
+ &ctx->apoll_cache);
+ req->flags &= ~REQ_F_POLLED;
+ }
+ }
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
+ state->flush_cqes = false;
}
- io_req_free_batch_finish(ctx, &rb);
- state->compl_nr = 0;
+ io_free_batch_list(ctx, state->compl_reqs.first);
+ INIT_WQ_LIST(&state->compl_reqs);
}
/*
@@ -2369,7 +2714,7 @@ static inline void io_put_req_deferred(struct io_kiocb *req)
{
if (req_ref_put_and_test(req)) {
req->io_task_work.func = io_free_req_work;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
}
}
@@ -2388,82 +2733,35 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
-static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
-{
- unsigned int cflags;
-
- cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
- cflags |= IORING_CQE_F_BUFFER;
- req->flags &= ~REQ_F_BUFFER_SELECTED;
- kfree(kbuf);
- return cflags;
-}
-
-static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
-{
- struct io_buffer *kbuf;
-
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
- return 0;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
- return io_put_kbuf(req, kbuf);
-}
-
static inline bool io_run_task_work(void)
{
- if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
+ if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
__set_current_state(TASK_RUNNING);
- tracehook_notify_signal();
+ clear_notify_signal();
+ if (task_work_pending(current))
+ task_work_run();
return true;
}
return false;
}
-/*
- * Find and free completed poll iocbs
- */
-static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- struct list_head *done)
+static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
- struct req_batch rb;
- struct io_kiocb *req;
-
- /* order with ->result store in io_complete_rw_iopoll() */
- smp_rmb();
-
- io_init_req_batch(&rb);
- while (!list_empty(done)) {
- req = list_first_entry(done, struct io_kiocb, inflight_entry);
- list_del(&req->inflight_entry);
-
- __io_cqring_fill_event(ctx, req->user_data, req->result,
- io_put_rw_kbuf(req));
- (*nr_events)++;
-
- if (req_ref_put_and_test(req))
- io_req_free_batch(&rb, req, &ctx->submit_state);
- }
-
- io_commit_cqring(ctx);
- io_cqring_ev_posted_iopoll(ctx);
- io_req_free_batch_finish(ctx, &rb);
-}
-
-static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min)
-{
- struct io_kiocb *req, *tmp;
- LIST_HEAD(done);
- bool spin;
+ struct io_wq_work_node *pos, *start, *prev;
+ unsigned int poll_flags = BLK_POLL_NOSLEEP;
+ DEFINE_IO_COMP_BATCH(iob);
+ int nr_events = 0;
/*
* Only spin for completions if we don't have multiple devices hanging
- * off our complete list, and we're under the requested amount.
+ * off our complete list.
*/
- spin = !ctx->poll_multi_queue && *nr_events < min;
+ if (ctx->poll_multi_queue || force_nonspin)
+ poll_flags |= BLK_POLL_ONESHOT;
- list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
+ wq_list_for_each(pos, start, &ctx->iopoll_list) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
struct kiocb *kiocb = &req->rw.kiocb;
int ret;
@@ -2472,47 +2770,63 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* If we find a request that requires polling, break out
* and complete those lists first, if we have entries there.
*/
- if (READ_ONCE(req->iopoll_completed)) {
- list_move_tail(&req->inflight_entry, &done);
- continue;
- }
- if (!list_empty(&done))
+ if (READ_ONCE(req->iopoll_completed))
break;
- ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
+ ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
if (unlikely(ret < 0))
return ret;
else if (ret)
- spin = false;
+ poll_flags |= BLK_POLL_ONESHOT;
/* iopoll may have completed current req */
- if (READ_ONCE(req->iopoll_completed))
- list_move_tail(&req->inflight_entry, &done);
+ if (!rq_list_empty(iob.req_list) ||
+ READ_ONCE(req->iopoll_completed))
+ break;
}
- if (!list_empty(&done))
- io_iopoll_complete(ctx, nr_events, &done);
+ if (!rq_list_empty(iob.req_list))
+ iob.complete(&iob);
+ else if (!pos)
+ return 0;
- return 0;
+ prev = start;
+ wq_list_for_each_resume(pos, prev) {
+ struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
+
+ /* order with io_complete_rw_iopoll(), e.g. ->result updates */
+ if (!smp_load_acquire(&req->iopoll_completed))
+ break;
+ nr_events++;
+ if (unlikely(req->flags & REQ_F_CQE_SKIP))
+ continue;
+ __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
+ }
+
+ if (unlikely(!nr_events))
+ return 0;
+
+ io_commit_cqring(ctx);
+ io_cqring_ev_posted_iopoll(ctx);
+ pos = start ? start->next : ctx->iopoll_list.first;
+ wq_list_cut(&ctx->iopoll_list, prev, start);
+ io_free_batch_list(ctx, pos);
+ return nr_events;
}
/*
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
*/
-static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
+static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_IOPOLL))
return;
mutex_lock(&ctx->uring_lock);
- while (!list_empty(&ctx->iopoll_list)) {
- unsigned int nr_events = 0;
-
- io_do_iopoll(ctx, &nr_events, 0);
-
+ while (!wq_list_empty(&ctx->iopoll_list)) {
/* let it sleep and repeat later if can't complete a request */
- if (nr_events == 0)
+ if (io_do_iopoll(ctx, true) == 0)
break;
/*
* Ensure we allow local-to-the-cpu processing to take place,
@@ -2559,7 +2873,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* forever, while the workqueue is stuck trying to acquire the
* very same mutex.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
u32 tail = ctx->cached_cq_tail;
mutex_unlock(&ctx->uring_lock);
@@ -2568,11 +2882,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
/* some requests don't go through iopoll_list */
if (tail != ctx->cached_cq_tail ||
- list_empty(&ctx->iopoll_list))
+ wq_list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, &nr_events, min);
- } while (!ret && nr_events < min && !need_resched());
+ ret = io_do_iopoll(ctx, !min);
+ if (ret < 0)
+ break;
+ nr_events += ret;
+ ret = 0;
+ } while (nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
return ret;
@@ -2597,9 +2915,9 @@ static bool io_resubmit_prep(struct io_kiocb *req)
{
struct io_async_rw *rw = req->async_data;
- if (!rw)
+ if (!req_has_async_data(req))
return !io_req_prep_async(req);
- iov_iter_restore(&rw->iter, &rw->iter_state);
+ iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
return true;
}
@@ -2641,9 +2959,13 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
- if (req->rw.kiocb.ki_flags & IOCB_WRITE)
+ if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
kiocb_end_write(req);
- if (res != req->result) {
+ fsnotify_modify(req->file);
+ } else {
+ fsnotify_access(req->file);
+ }
+ if (unlikely(res != req->result)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
@@ -2655,33 +2977,29 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
return false;
}
-static void io_req_task_complete(struct io_kiocb *req, bool *locked)
+static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
- unsigned int cflags = io_put_rw_kbuf(req);
- long res = req->result;
+ int res = req->result;
if (*locked) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
- io_req_complete_state(req, res, cflags);
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
+ io_req_complete_state(req, res, io_put_kbuf(req, 0));
+ io_req_add_compl_list(req);
} else {
- io_req_complete_post(req, res, cflags);
+ io_req_complete_post(req, res,
+ io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
}
-static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+static void __io_complete_rw(struct io_kiocb *req, long res,
unsigned int issue_flags)
{
if (__io_complete_rw_common(req, res))
return;
- __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
+ __io_req_complete(req, issue_flags, req->result,
+ io_put_kbuf(req, issue_flags));
}
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -2689,10 +3007,10 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
return;
req->result = res;
req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
}
-static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -2703,12 +3021,11 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
req->flags |= REQ_F_REISSUE;
return;
}
+ req->result = res;
}
- WRITE_ONCE(req->result, res);
- /* order with io_iopoll_complete() checking ->result */
- smp_wmb();
- WRITE_ONCE(req->iopoll_completed, 1);
+ /* order with io_iopoll_complete() checking ->iopoll_completed */
+ smp_store_release(&req->iopoll_completed, 1);
}
/*
@@ -2717,13 +3034,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- const bool in_async = io_wq_current_is_worker();
+ const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
/* workqueue context doesn't hold uring_lock, grab it now */
- if (unlikely(in_async))
+ if (unlikely(needs_lock))
mutex_lock(&ctx->uring_lock);
/*
@@ -2731,23 +3048,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* how we do polling eventually, not spinning if we're on potentially
* different devices.
*/
- if (list_empty(&ctx->iopoll_list)) {
+ if (wq_list_empty(&ctx->iopoll_list)) {
ctx->poll_multi_queue = false;
} else if (!ctx->poll_multi_queue) {
struct io_kiocb *list_req;
- unsigned int queue_num0, queue_num1;
- list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
- inflight_entry);
-
- if (list_req->file != req->file) {
+ list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
+ comp_list);
+ if (list_req->file != req->file)
ctx->poll_multi_queue = true;
- } else {
- queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
- queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
- if (queue_num0 != queue_num1)
- ctx->poll_multi_queue = true;
- }
}
/*
@@ -2755,11 +3064,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* it to the front so we find it first.
*/
if (READ_ONCE(req->iopoll_completed))
- list_add(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
else
- list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
+ wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
- if (unlikely(in_async)) {
+ if (unlikely(needs_lock)) {
/*
* If IORING_SETUP_SQPOLL is enabled, sqes are either handle
* in sq thread task context or in io worker task context. If
@@ -2784,10 +3093,8 @@ static bool io_bdev_nowait(struct block_device *bdev)
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
-static bool __io_file_supports_nowait(struct file *file, int rw)
+static bool __io_file_supports_nowait(struct file *file, umode_t mode)
{
- umode_t mode = file_inode(file)->i_mode;
-
if (S_ISBLK(mode)) {
if (IS_ENABLED(CONFIG_BLOCK) &&
io_bdev_nowait(I_BDEV(file->f_mapping->host)))
@@ -2807,57 +3114,38 @@ static bool __io_file_supports_nowait(struct file *file, int rw)
/* any ->read/write should understand O_NONBLOCK */
if (file->f_flags & O_NONBLOCK)
return true;
+ return file->f_mode & FMODE_NOWAIT;
+}
- if (!(file->f_mode & FMODE_NOWAIT))
- return false;
-
- if (rw == READ)
- return file->f_op->read_iter != NULL;
+/*
+ * If we tracked the file through the SCM inflight mechanism, we could support
+ * any file. For now, just ensure that anything potentially problematic is done
+ * inline.
+ */
+static unsigned int io_file_get_flags(struct file *file)
+{
+ umode_t mode = file_inode(file)->i_mode;
+ unsigned int res = 0;
- return file->f_op->write_iter != NULL;
+ if (S_ISREG(mode))
+ res |= FFS_ISREG;
+ if (__io_file_supports_nowait(file, mode))
+ res |= FFS_NOWAIT;
+ return res;
}
-static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
+static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
- if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
- return true;
- else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
- return true;
-
- return __io_file_supports_nowait(req->file, rw);
+ return req->flags & REQ_F_SUPPORT_NOWAIT;
}
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- int rw)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw.kiocb;
- struct file *file = req->file;
unsigned ioprio;
int ret;
- if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
- req->flags |= REQ_F_ISREG;
-
kiocb->ki_pos = READ_ONCE(sqe->off);
- if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
- req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = file->f_pos;
- }
- kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
- kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
- ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
- if (unlikely(ret))
- return ret;
-
- /*
- * If the file is marked O_NONBLOCK, still allow retry for it if it
- * supports async. Otherwise it's impossible to use O_NONBLOCK files
- * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
- */
- if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
- req->flags |= REQ_F_NOWAIT;
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
@@ -2866,31 +3154,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return ret;
kiocb->ki_ioprio = ioprio;
- } else
- kiocb->ki_ioprio = get_current_ioprio();
-
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (!(kiocb->ki_flags & IOCB_DIRECT) ||
- !kiocb->ki_filp->f_op->iopoll)
- return -EOPNOTSUPP;
-
- kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
- kiocb->ki_complete = io_complete_rw_iopoll;
- req->iopoll_completed = 0;
} else {
- if (kiocb->ki_flags & IOCB_HIPRI)
- return -EINVAL;
- kiocb->ki_complete = io_complete_rw;
- }
-
- if (req->opcode == IORING_OP_READ_FIXED ||
- req->opcode == IORING_OP_WRITE_FIXED) {
- req->imu = NULL;
- io_req_set_rsrc_node(req);
+ kiocb->ki_ioprio = get_current_ioprio();
}
+ req->imu = NULL;
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
+ req->rw.flags = READ_ONCE(sqe->rw_flags);
req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -2912,18 +3183,34 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
ret = -EINTR;
fallthrough;
default:
- kiocb->ki_complete(kiocb, ret, 0);
+ kiocb->ki_complete(kiocb, ret);
+ }
+}
+
+static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+
+ if (kiocb->ki_pos != -1)
+ return &kiocb->ki_pos;
+
+ if (!(req->file->f_mode & FMODE_STREAM)) {
+ req->flags |= REQ_F_CUR_POS;
+ kiocb->ki_pos = req->file->f_pos;
+ return &kiocb->ki_pos;
}
+
+ kiocb->ki_pos = 0;
+ return NULL;
}
-static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
+static void kiocb_done(struct io_kiocb *req, ssize_t ret,
unsigned int issue_flags)
{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
struct io_async_rw *io = req->async_data;
/* add previously done IO, if any */
- if (io && io->bytes_done > 0) {
+ if (req_has_async_data(req) && io->bytes_done > 0) {
if (ret < 0)
ret = io->bytes_done;
else
@@ -2931,29 +3218,18 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
}
if (req->flags & REQ_F_CUR_POS)
- req->file->f_pos = kiocb->ki_pos;
- if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
- __io_complete_rw(req, ret, 0, issue_flags);
+ req->file->f_pos = req->rw.kiocb.ki_pos;
+ if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
+ __io_complete_rw(req, ret, issue_flags);
else
- io_rw_done(kiocb, ret);
+ io_rw_done(&req->rw.kiocb, ret);
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
- if (io_resubmit_prep(req)) {
+ if (io_resubmit_prep(req))
io_req_task_queue_reissue(req);
- } else {
- unsigned int cflags = io_put_rw_kbuf(req);
- struct io_ring_ctx *ctx = req->ctx;
-
- req_set_fail(req);
- if (!(issue_flags & IO_URING_F_NONBLOCK)) {
- mutex_lock(&ctx->uring_lock);
- __io_req_complete(req, issue_flags, ret, cflags);
- mutex_unlock(&ctx->uring_lock);
- } else {
- __io_req_complete(req, issue_flags, ret, cflags);
- }
- }
+ else
+ io_req_task_queue_fail(req, ret);
}
}
@@ -3015,15 +3291,18 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
return 0;
}
-static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
+ unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_mapped_ubuf *imu = req->imu;
u16 index, buf_index = req->buf_index;
if (likely(!imu)) {
+ struct io_ring_ctx *ctx = req->ctx;
+
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
+ io_req_set_rsrc_node(req, ctx, issue_flags);
index = array_index_nospec(buf_index, ctx->nr_user_bufs);
imu = READ_ONCE(ctx->user_bufs[index]);
req->imu = imu;
@@ -3049,59 +3328,64 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
mutex_lock(&ctx->uring_lock);
}
+static void io_buffer_add_list(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned int bgid)
+{
+ struct list_head *list;
+
+ list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ INIT_LIST_HEAD(&bl->buf_list);
+ bl->bgid = bgid;
+ list_add(&bl->list, list);
+}
+
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, struct io_buffer *kbuf,
- bool needs_lock)
+ int bgid, unsigned int issue_flags)
{
- struct io_buffer *head;
+ struct io_buffer *kbuf = req->kbuf;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
if (req->flags & REQ_F_BUFFER_SELECTED)
return kbuf;
- io_ring_submit_lock(req->ctx, needs_lock);
+ io_ring_submit_lock(ctx, needs_lock);
- lockdep_assert_held(&req->ctx->uring_lock);
+ lockdep_assert_held(&ctx->uring_lock);
- head = xa_load(&req->ctx->io_buffers, bgid);
- if (head) {
- if (!list_empty(&head->list)) {
- kbuf = list_last_entry(&head->list, struct io_buffer,
- list);
- list_del(&kbuf->list);
- } else {
- kbuf = head;
- xa_erase(&req->ctx->io_buffers, bgid);
- }
+ bl = io_buffer_get_list(ctx, bgid);
+ if (bl && !list_empty(&bl->buf_list)) {
+ kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
+ list_del(&kbuf->list);
if (*len > kbuf->len)
*len = kbuf->len;
+ req->flags |= REQ_F_BUFFER_SELECTED;
+ req->kbuf = kbuf;
} else {
kbuf = ERR_PTR(-ENOBUFS);
}
io_ring_submit_unlock(req->ctx, needs_lock);
-
return kbuf;
}
static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_buffer *kbuf;
u16 bgid;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+ kbuf = io_buffer_select(req, len, bgid, issue_flags);
if (IS_ERR(kbuf))
return kbuf;
- req->rw.addr = (u64) (unsigned long) kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
return u64_to_user_ptr(kbuf->addr);
}
#ifdef CONFIG_COMPAT
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct compat_iovec __user *uiov;
compat_ssize_t clen;
@@ -3117,7 +3401,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
return -EINVAL;
len = clen;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3127,7 +3411,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
#endif
static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
void __user *buf;
@@ -3139,7 +3423,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
- buf = io_rw_buffer_select(req, &len, needs_lock);
+ buf = io_rw_buffer_select(req, &len, issue_flags);
if (IS_ERR(buf))
return PTR_ERR(buf);
iov[0].iov_base = buf;
@@ -3148,12 +3432,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- bool needs_lock)
+ unsigned int issue_flags)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf;
+ struct io_buffer *kbuf = req->kbuf;
- kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
iov[0].iov_len = kbuf->len;
return 0;
@@ -3163,52 +3446,77 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
- return io_compat_import(req, iov, needs_lock);
+ return io_compat_import(req, iov, issue_flags);
#endif
- return __io_iov_buffer_select(req, iov, needs_lock);
+ return __io_iov_buffer_select(req, iov, issue_flags);
}
-static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
- struct iov_iter *iter, bool needs_lock)
+static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
+ struct io_rw_state *s,
+ unsigned int issue_flags)
{
- void __user *buf = u64_to_user_ptr(req->rw.addr);
- size_t sqe_len = req->rw.len;
+ struct iov_iter *iter = &s->iter;
u8 opcode = req->opcode;
+ struct iovec *iovec;
+ void __user *buf;
+ size_t sqe_len;
ssize_t ret;
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- *iovec = NULL;
- return io_import_fixed(req, rw, iter);
+ ret = io_import_fixed(req, rw, iter, issue_flags);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
}
/* buffer index only valid with fixed read/write, or buffer select */
- if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
+ if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
+ return ERR_PTR(-EINVAL);
+
+ buf = u64_to_user_ptr(req->rw.addr);
+ sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+ buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
if (IS_ERR(buf))
- return PTR_ERR(buf);
+ return ERR_CAST(buf);
req->rw.len = sqe_len;
}
- ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
- *iovec = NULL;
- return ret;
+ ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
}
+ iovec = s->fast_iov;
if (req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_iov_buffer_select(req, *iovec, needs_lock);
- if (!ret)
- iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
- *iovec = NULL;
- return ret;
+ ret = io_iov_buffer_select(req, iovec, issue_flags);
+ if (ret)
+ return ERR_PTR(ret);
+ iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
+ return NULL;
}
- return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+ ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
req->ctx->compat);
+ if (unlikely(ret < 0))
+ return ERR_PTR(ret);
+ return iovec;
+}
+
+static inline int io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct io_rw_state *s,
+ unsigned int issue_flags)
+{
+ *iovec = __io_import_iovec(rw, req, s, issue_flags);
+ if (unlikely(IS_ERR(*iovec)))
+ return PTR_ERR(*iovec);
+
+ iov_iter_save_state(&s->iter, &s->iter_state);
+ return 0;
}
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
@@ -3225,6 +3533,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
struct kiocb *kiocb = &req->rw.kiocb;
struct file *file = req->file;
ssize_t ret = 0;
+ loff_t *ppos;
/*
* Don't support polled IO through this interface, and we can't
@@ -3233,9 +3542,12 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
*/
if (kiocb->ki_flags & IOCB_HIPRI)
return -EOPNOTSUPP;
- if (kiocb->ki_flags & IOCB_NOWAIT)
+ if ((kiocb->ki_flags & IOCB_NOWAIT) &&
+ !(kiocb->ki_filp->f_flags & O_NONBLOCK))
return -EAGAIN;
+ ppos = io_kiocb_ppos(kiocb);
+
while (iov_iter_count(iter)) {
struct iovec iovec;
ssize_t nr;
@@ -3249,10 +3561,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
} else {
nr = file->f_op->write(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
}
if (nr < 0) {
@@ -3260,13 +3572,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
ret = nr;
break;
}
+ ret += nr;
if (!iov_iter_is_bvec(iter)) {
iov_iter_advance(iter, nr);
} else {
- req->rw.len -= nr;
req->rw.addr += nr;
+ req->rw.len -= nr;
+ if (!req->rw.len)
+ break;
}
- ret += nr;
if (nr != iovec.iov_len)
break;
}
@@ -3279,7 +3593,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
{
struct io_async_rw *rw = req->async_data;
- memcpy(&rw->iter, iter, sizeof(*iter));
+ memcpy(&rw->s.iter, iter, sizeof(*iter));
rw->free_iovec = iovec;
rw->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
@@ -3288,33 +3602,36 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
if (!iovec) {
unsigned iov_off = 0;
- rw->iter.iov = rw->fast_iov;
+ rw->s.iter.iov = rw->s.fast_iov;
if (iter->iov != fast_iov) {
iov_off = iter->iov - fast_iov;
- rw->iter.iov += iov_off;
+ rw->s.iter.iov += iov_off;
}
- if (rw->fast_iov != fast_iov)
- memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
+ if (rw->s.fast_iov != fast_iov)
+ memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
sizeof(struct iovec) * iter->nr_segs);
} else {
req->flags |= REQ_F_NEED_CLEANUP;
}
}
-static inline int io_alloc_async_data(struct io_kiocb *req)
+static inline bool io_alloc_async_data(struct io_kiocb *req)
{
WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
- return req->async_data == NULL;
+ if (req->async_data) {
+ req->flags |= REQ_F_ASYNC_DATA;
+ return false;
+ }
+ return true;
}
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
- const struct iovec *fast_iov,
- struct iov_iter *iter, bool force)
+ struct io_rw_state *s, bool force)
{
if (!force && !io_op_defs[req->opcode].needs_async_setup)
return 0;
- if (!req->async_data) {
+ if (!req_has_async_data(req)) {
struct io_async_rw *iorw;
if (io_alloc_async_data(req)) {
@@ -3322,10 +3639,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
return -ENOMEM;
}
- io_req_map_rw(req, iovec, fast_iov, iter);
+ io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
iorw = req->async_data;
/* we've copied and mapped the iter, ensure state is saved */
- iov_iter_save_state(&iorw->iter, &iorw->iter_state);
+ iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
}
return 0;
}
@@ -3333,10 +3650,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
struct io_async_rw *iorw = req->async_data;
- struct iovec *iov = iorw->fast_iov;
+ struct iovec *iov;
int ret;
- ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ /* submission path, ->uring_lock should already be taken */
+ ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
if (unlikely(ret < 0))
return ret;
@@ -3344,19 +3662,11 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
iorw->free_iovec = iov;
if (iov)
req->flags |= REQ_F_NEED_CLEANUP;
- iov_iter_save_state(&iorw->iter, &iorw->iter_state);
return 0;
}
-static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(!(req->file->f_mode & FMODE_READ)))
- return -EBADF;
- return io_prep_rw(req, sqe, READ);
-}
-
/*
- * This is our waitqueue callback handler, registered through lock_page_async()
+ * This is our waitqueue callback handler, registered through __folio_lock_async()
* when we initially tried to do the IO with the iocb armed our waitqueue.
* This gets called when the page is unlocked, and we generally expect that to
* happen when the page IO is completed and the page is now uptodate. This will
@@ -3428,7 +3738,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
- if (req->file->f_op->read_iter)
+ if (likely(req->file->f_op->read_iter))
return call_read_iter(req->file, &req->rw.kiocb, iter);
else if (req->file->f_op->read)
return loop_rw_iter(READ, req, iter);
@@ -3442,57 +3752,119 @@ static bool need_read_all(struct io_kiocb *req)
S_ISBLK(file_inode(req->file)->i_mode);
}
+static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct file *file = req->file;
+ int ret;
+
+ if (unlikely(!file || !(file->f_mode & mode)))
+ return -EBADF;
+
+ if (!io_req_ffs_set(req))
+ req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
+
+ kiocb->ki_flags = iocb_flags(file);
+ ret = kiocb_set_rw_flags(kiocb, req->rw.flags);
+ if (unlikely(ret))
+ return ret;
+
+ /*
+ * If the file is marked O_NONBLOCK, still allow retry for it if it
+ * supports async. Otherwise it's impossible to use O_NONBLOCK files
+ * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
+ */
+ if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+ ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
+ req->flags |= REQ_F_NOWAIT;
+
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
+ return -EOPNOTSUPP;
+
+ kiocb->private = NULL;
+ kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
+ kiocb->ki_complete = io_complete_rw_iopoll;
+ req->iopoll_completed = 0;
+ } else {
+ if (kiocb->ki_flags & IOCB_HIPRI)
+ return -EINVAL;
+ kiocb->ki_complete = io_complete_rw;
+ }
+
+ return 0;
+}
+
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct iov_iter_state __state, *state;
+ struct io_async_rw *rw;
ssize_t ret, ret2;
+ loff_t *ppos;
+
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+ } else {
+ /*
+ * Safe and required to re-import if we're using provided
+ * buffers, as we dropped the selected one before retry.
+ */
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+ }
- if (rw) {
- iter = &rw->iter;
- state = &rw->iter_state;
+ rw = req->async_data;
+ s = &rw->s;
/*
* We come here from an earlier attempt, restore our state to
* match in case it doesn't. It's cheap enough that we don't
* need to make this conditional.
*/
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
iovec = NULL;
- } else {
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- state = &__state;
- iov_iter_save_state(iter, state);
}
- req->result = iov_iter_count(iter);
+ ret = io_rw_init_file(req, FMODE_READ);
+ if (unlikely(ret)) {
+ kfree(iovec);
+ return ret;
+ }
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req))) {
+ ret = io_setup_async_rw(req, iovec, s, true);
+ return ret ?: -EAGAIN;
+ }
kiocb->ki_flags |= IOCB_NOWAIT;
-
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, READ)) {
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
- return ret ?: -EAGAIN;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
}
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
+ ppos = io_kiocb_update_pos(req);
+
+ ret = rw_verify_area(READ, req->file, ppos, req->result);
if (unlikely(ret)) {
kfree(iovec);
return ret;
}
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
+ /* if we can poll, just do that */
+ if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+ return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
goto done;
@@ -3502,7 +3874,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
- } else if (ret <= 0 || ret == req->result || !force_nonblock ||
+ } else if (ret == req->result || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
@@ -3513,22 +3885,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* untouched in case of error. Restore it and we'll advance it
* manually if we need to.
*/
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
- ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+ ret2 = io_setup_async_rw(req, iovec, s, true);
if (ret2)
return ret2;
iovec = NULL;
rw = req->async_data;
+ s = &rw->s;
/*
* Now use our persistent iterator and state, if we aren't already.
* We've restored and mapped the iter to match.
*/
- if (iter != &rw->iter) {
- iter = &rw->iter;
- state = &rw->iter_state;
- }
do {
/*
@@ -3536,11 +3905,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* above or inside this loop. Advance the iter by the bytes
* that were consumed.
*/
- iov_iter_advance(iter, ret);
- if (!iov_iter_count(iter))
+ iov_iter_advance(&s->iter, ret);
+ if (!iov_iter_count(&s->iter))
break;
rw->bytes_done += ret;
- iov_iter_save_state(iter, state);
+ iov_iter_save_state(&s->iter, &s->iter_state);
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
@@ -3554,15 +3923,15 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
* desired page gets unlocked. We can also get a partial read
* here, and if we do, then just retry at the new offset.
*/
- ret = io_iter_do_read(req, iter);
+ ret = io_iter_do_read(req, &s->iter);
if (ret == -EIOCBQUEUED)
return 0;
/* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
- iov_iter_restore(iter, state);
+ iov_iter_restore(&s->iter, &s->iter_state);
} while (ret > 0);
done:
- kiocb_done(kiocb, ret, issue_flags);
+ kiocb_done(req, ret, issue_flags);
out_free:
/* it's faster to check here then delegate to kfree */
if (iovec)
@@ -3570,53 +3939,52 @@ out_free:
return 0;
}
-static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
- return -EBADF;
- return io_prep_rw(req, sqe, WRITE);
-}
-
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct io_rw_state __s, *s = &__s;
+ struct iovec *iovec;
struct kiocb *kiocb = &req->rw.kiocb;
- struct iov_iter __iter, *iter = &__iter;
- struct io_async_rw *rw = req->async_data;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct iov_iter_state __state, *state;
ssize_t ret, ret2;
+ loff_t *ppos;
- if (rw) {
- iter = &rw->iter;
- state = &rw->iter_state;
- iov_iter_restore(iter, state);
- iovec = NULL;
- } else {
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
+ if (!req_has_async_data(req)) {
+ ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
return ret;
- state = &__state;
- iov_iter_save_state(iter, state);
+ } else {
+ struct io_async_rw *rw = req->async_data;
+
+ s = &rw->s;
+ iov_iter_restore(&s->iter, &s->iter_state);
+ iovec = NULL;
}
- req->result = iov_iter_count(iter);
+ ret = io_rw_init_file(req, FMODE_WRITE);
+ if (unlikely(ret)) {
+ kfree(iovec);
+ return ret;
+ }
+ req->result = iov_iter_count(&s->iter);
- /* Ensure we clear previously set non-block flag */
- if (!force_nonblock)
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- else
- kiocb->ki_flags |= IOCB_NOWAIT;
+ if (force_nonblock) {
+ /* If the file doesn't support async, just async punt */
+ if (unlikely(!io_file_supports_nowait(req)))
+ goto copy_iov;
- /* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_nowait(req, WRITE))
- goto copy_iov;
+ /* file path doesn't support NOWAIT for non-direct_IO */
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
+ (req->flags & REQ_F_ISREG))
+ goto copy_iov;
- /* file path doesn't support NOWAIT for non-direct_IO */
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
- (req->flags & REQ_F_ISREG))
- goto copy_iov;
+ kiocb->ki_flags |= IOCB_NOWAIT;
+ } else {
+ /* Ensure we clear previously set non-block flag */
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
+ }
+
+ ppos = io_kiocb_update_pos(req);
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
+ ret = rw_verify_area(WRITE, req->file, ppos, req->result);
if (unlikely(ret))
goto out_free;
@@ -3634,10 +4002,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
}
kiocb->ki_flags |= IOCB_WRITE;
- if (req->file->f_op->write_iter)
- ret2 = call_write_iter(req->file, kiocb, iter);
+ if (likely(req->file->f_op->write_iter))
+ ret2 = call_write_iter(req->file, kiocb, &s->iter);
else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, req, iter);
+ ret2 = loop_rw_iter(WRITE, req, &s->iter);
else
ret2 = -EINVAL;
@@ -3657,14 +4025,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto copy_iov;
done:
- kiocb_done(kiocb, ret2, issue_flags);
+ kiocb_done(req, ret2, issue_flags);
} else {
copy_iov:
- iov_iter_restore(iter, state);
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
+ iov_iter_restore(&s->iter, &s->iter_state);
+ ret = io_setup_async_rw(req, iovec, s, false);
return ret ?: -EAGAIN;
}
out_free:
@@ -3800,7 +4168,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
return 0;
}
-static int io_mkdirat(struct io_kiocb *req, int issue_flags)
+static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_mkdir *mkd = &req->mkdir;
int ret;
@@ -3849,7 +4217,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_symlinkat(struct io_kiocb *req, int issue_flags)
+static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_symlink *sl = &req->symlink;
int ret;
@@ -3899,7 +4267,7 @@ static int io_linkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_linkat(struct io_kiocb *req, int issue_flags)
+static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_hardlink *lnk = &req->hardlink;
int ret;
@@ -3966,18 +4334,11 @@ static int __io_splice_prep(struct io_kiocb *req,
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- sp->file_in = NULL;
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
-
if (unlikely(sp->flags & ~valid_flags))
return -EINVAL;
-
- sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
- (sp->flags & SPLICE_F_FD_IN_FIXED));
- if (!sp->file_in)
- return -EBADF;
- req->flags |= REQ_F_NEED_CLEANUP;
+ sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
return 0;
}
@@ -3992,20 +4353,29 @@ static int io_tee_prep(struct io_kiocb *req,
static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = &req->splice;
- struct file *in = sp->file_in;
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
+ struct file *in;
long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
+
+ if (sp->flags & SPLICE_F_FD_IN_FIXED)
+ in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
+ else
+ in = io_file_get_normal(req, sp->splice_fd_in);
+ if (!in) {
+ ret = -EBADF;
+ goto done;
+ }
+
if (sp->len)
ret = do_tee(in, out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in);
- req->flags &= ~REQ_F_NEED_CLEANUP;
-
+done:
if (ret != sp->len)
req_set_fail(req);
io_req_complete(req, ret);
@@ -4024,15 +4394,24 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = &req->splice;
- struct file *in = sp->file_in;
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
loff_t *poff_in, *poff_out;
+ struct file *in;
long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
+ if (sp->flags & SPLICE_F_FD_IN_FIXED)
+ in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
+ else
+ in = io_file_get_normal(req, sp->splice_fd_in);
+ if (!in) {
+ ret = -EBADF;
+ goto done;
+ }
+
poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
@@ -4041,8 +4420,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in);
- req->flags &= ~REQ_F_NEED_CLEANUP;
-
+done:
if (ret != sp->len)
req_set_fail(req);
io_req_complete(req, ret);
@@ -4063,13 +4441,56 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
+static int io_msg_ring_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
+ sqe->splice_fd_in || sqe->buf_index || sqe->personality))
+ return -EINVAL;
+
+ req->msg.user_data = READ_ONCE(sqe->off);
+ req->msg.len = READ_ONCE(sqe->len);
+ return 0;
+}
+
+static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *target_ctx;
+ struct io_msg *msg = &req->msg;
+ bool filled;
+ int ret;
+
+ ret = -EBADFD;
+ if (req->file->f_op != &io_uring_fops)
+ goto done;
+
+ ret = -EOVERFLOW;
+ target_ctx = req->file->private_data;
+
+ spin_lock(&target_ctx->completion_lock);
+ filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
+ io_commit_cqring(target_ctx);
+ spin_unlock(&target_ctx->completion_lock);
+
+ if (filled) {
+ io_cqring_ev_posted(target_ctx);
+ ret = 0;
+ }
+
+done:
+ if (ret < 0)
+ req_set_fail(req);
+ __io_req_complete(req, issue_flags, ret, 0);
+ /* put file to avoid an attempt to IOPOLL the req */
+ io_put_file(req->file);
+ req->file = NULL;
+ return 0;
+}
+
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!req->file)
- return -EBADF;
-
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
@@ -4129,6 +4550,8 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
req->sync.len);
if (ret < 0)
req_set_fail(req);
+ else
+ fsnotify_modify(req->file);
io_req_complete(req, ret);
return 0;
}
@@ -4286,8 +4709,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
return 0;
}
-static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
- int bgid, unsigned nbufs)
+static int __io_remove_buffers(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;
@@ -4296,18 +4719,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
return 0;
/* the head kbuf is the list itself */
- while (!list_empty(&buf->list)) {
+ while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
- nxt = list_first_entry(&buf->list, struct io_buffer, list);
+ nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
- kfree(nxt);
if (++i == nbufs)
return i;
+ cond_resched();
}
i++;
- kfree(buf);
- xa_erase(&ctx->io_buffers, bgid);
return i;
}
@@ -4316,24 +4737,24 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer *head;
+ struct io_buffer_list *bl;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
ret = -ENOENT;
- head = xa_load(&ctx->io_buffers, p->bgid);
- if (head)
- ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ if (bl)
+ ret = __io_remove_buffers(ctx, bl, p->nbufs);
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4372,58 +4793,104 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
return 0;
}
-static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
+static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
+{
+ struct io_buffer *buf;
+ struct page *page;
+ int bufs_in_page;
+
+ /*
+ * Completions that don't happen inline (eg not under uring_lock) will
+ * add to ->io_buffers_comp. If we don't have any free buffers, check
+ * the completion list and splice those entries first.
+ */
+ if (!list_empty_careful(&ctx->io_buffers_comp)) {
+ spin_lock(&ctx->completion_lock);
+ if (!list_empty(&ctx->io_buffers_comp)) {
+ list_splice_init(&ctx->io_buffers_comp,
+ &ctx->io_buffers_cache);
+ spin_unlock(&ctx->completion_lock);
+ return 0;
+ }
+ spin_unlock(&ctx->completion_lock);
+ }
+
+ /*
+ * No free buffers and no completion entries either. Allocate a new
+ * page worth of buffer entries and add those to our freelist.
+ */
+ page = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!page)
+ return -ENOMEM;
+
+ list_add(&page->lru, &ctx->io_buffers_pages);
+
+ buf = page_address(page);
+ bufs_in_page = PAGE_SIZE / sizeof(*buf);
+ while (bufs_in_page) {
+ list_add_tail(&buf->list, &ctx->io_buffers_cache);
+ buf++;
+ bufs_in_page--;
+ }
+
+ return 0;
+}
+
+static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
+ struct io_buffer_list *bl)
{
struct io_buffer *buf;
u64 addr = pbuf->addr;
int i, bid = pbuf->bid;
for (i = 0; i < pbuf->nbufs; i++) {
- buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
- if (!buf)
+ if (list_empty(&ctx->io_buffers_cache) &&
+ io_refill_buffer_cache(ctx))
break;
-
+ buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
+ list);
+ list_move_tail(&buf->list, &bl->buf_list);
buf->addr = addr;
buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
buf->bid = bid;
+ buf->bgid = pbuf->bgid;
addr += pbuf->len;
bid++;
- if (!*head) {
- INIT_LIST_HEAD(&buf->list);
- *head = buf;
- } else {
- list_add_tail(&buf->list, &(*head)->list);
- }
+ cond_resched();
}
- return i ? i : -ENOMEM;
+ return i ? 0 : -ENOMEM;
}
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer *head, *list;
+ struct io_buffer_list *bl;
int ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
lockdep_assert_held(&ctx->uring_lock);
- list = head = xa_load(&ctx->io_buffers, p->bgid);
-
- ret = io_add_buffers(p, &head);
- if (ret >= 0 && !list) {
- ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
- if (ret < 0)
- __io_remove_buffers(ctx, head, p->bgid, -1U);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ if (unlikely(!bl)) {
+ bl = kmalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ io_buffer_add_list(ctx, bl, p->bgid);
}
+
+ ret = io_add_buffers(ctx, p, bl);
+err:
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
return 0;
}
@@ -4548,6 +5015,8 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ const char __user *path;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
@@ -4557,10 +5026,22 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->statx.dfd = READ_ONCE(sqe->fd);
req->statx.mask = READ_ONCE(sqe->len);
- req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ path = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
req->statx.flags = READ_ONCE(sqe->statx_flags);
+ req->statx.filename = getname_flags(path,
+ getname_statx_lookup_flags(req->statx.flags),
+ NULL);
+
+ if (IS_ERR(req->statx.filename)) {
+ int ret = PTR_ERR(req->statx.filename);
+
+ req->statx.filename = NULL;
+ return ret;
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -4730,6 +5211,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->file_index))
+ return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -4756,8 +5239,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_sendmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4771,17 +5255,18 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
- if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ req_set_fail(req);
+ }
/* fast path, check for non-NULL to avoid function call */
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < min_ret)
- req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4817,13 +5302,13 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_flags = flags;
ret = sock_sendmsg(sock, &msg);
- if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
-
- if (ret < min_ret)
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
req_set_fail(req);
+ }
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4916,23 +5401,11 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
}
static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- bool needs_lock)
+ unsigned int issue_flags)
{
struct io_sr_msg *sr = &req->sr_msg;
- struct io_buffer *kbuf;
- kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
- if (IS_ERR(kbuf))
- return kbuf;
-
- sr->kbuf = kbuf;
- req->flags |= REQ_F_BUFFER_SELECTED;
- return kbuf;
-}
-
-static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
-{
- return io_put_kbuf(req, req->sr_msg.kbuf);
+ return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
}
static int io_recvmsg_prep_async(struct io_kiocb *req)
@@ -4951,6 +5424,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->file_index))
+ return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -4963,25 +5438,34 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
+static bool io_net_retry(struct socket *sock, int flags)
+{
+ if (!(flags & MSG_WAITALL))
+ return false;
+ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
struct io_buffer *kbuf;
unsigned flags;
- int min_ret = 0;
- int ret, cflags = 0;
+ int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
- kmsg = req->async_data;
- if (!kmsg) {
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
ret = io_recvmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4989,7 +5473,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
}
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
@@ -5006,20 +5490,30 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
kmsg->uaddr, flags);
- if (force_nonblock && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && force_nonblock)
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return io_setup_async_msg(req, kmsg);
+ }
+ req_set_fail(req);
+ } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+ req_set_fail(req);
+ }
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_recv_kbuf(req);
/* fast path, check for non-NULL to avoid function call */
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, cflags);
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
+ __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5032,8 +5526,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
struct socket *sock;
struct iovec iov;
unsigned flags;
- int min_ret = 0;
- int ret, cflags = 0;
+ int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
sock = sock_from_file(req->file);
@@ -5041,7 +5534,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, !force_nonblock);
+ kbuf = io_recv_buffer_select(req, issue_flags);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
buf = u64_to_user_ptr(kbuf->addr);
@@ -5065,16 +5558,29 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
min_ret = iov_iter_count(&msg.msg_iter);
ret = sock_recvmsg(sock, &msg, flags);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && force_nonblock)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return -EAGAIN;
+ }
+ req_set_fail(req);
+ } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_recv_kbuf(req);
- if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, cflags);
+ }
+
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
+ __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5093,8 +5599,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->nofile = rlimit(RLIMIT_NOFILE);
accept->file_slot = READ_ONCE(sqe->file_index);
- if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
- (accept->flags & SOCK_CLOEXEC)))
+ if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
return -EINVAL;
if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
@@ -5112,9 +5617,6 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
- if (req->file->f_flags & O_NONBLOCK)
- req->flags |= REQ_F_NOWAIT;
-
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
if (unlikely(fd < 0))
@@ -5172,7 +5674,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- if (req->async_data) {
+ if (req_has_async_data(req)) {
io = req->async_data;
} else {
ret = move_addr_to_kernel(req->connect.addr,
@@ -5188,7 +5690,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_connect_file(req->file, &io->address,
req->connect.addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req->async_data)
+ if (req_has_async_data(req))
return -EAGAIN;
if (io_alloc_async_data(req)) {
ret = -ENOMEM;
@@ -5241,52 +5743,23 @@ struct io_poll_table {
int error;
};
-static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
- __poll_t mask, io_req_tw_func_t func)
-{
- /* for instances that support it check for an event match first: */
- if (mask && !(mask & poll->events))
- return 0;
-
- trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
-
- list_del_init(&poll->wait.entry);
+#define IO_POLL_CANCEL_FLAG BIT(31)
+#define IO_POLL_REF_MASK GENMASK(30, 0)
- req->result = mask;
- req->io_task_work.func = func;
-
- /*
- * If this fails, then the task is exiting. When a task exits, the
- * work gets canceled, so just cancel this request as well instead
- * of executing it. We can't safely execute it anyway, as we may not
- * have the needed state needed for it anyway.
- */
- io_req_task_work_add(req);
- return 1;
+/*
+ * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
+ * bump it and acquire ownership. It's disallowed to modify requests while not
+ * owning it, that prevents from races for enqueueing task_work's and b/w
+ * arming poll and wakeups.
+ */
+static inline bool io_poll_get_ownership(struct io_kiocb *req)
+{
+ return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
-static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
- __acquires(&req->ctx->completion_lock)
+static void io_poll_mark_cancelled(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- /* req->task == current here, checking PF_EXITING is safe */
- if (unlikely(req->task->flags & PF_EXITING))
- WRITE_ONCE(poll->canceled, true);
-
- if (!req->result && !READ_ONCE(poll->canceled)) {
- struct poll_table_struct pt = { ._key = poll->events };
-
- req->result = vfs_poll(req->file, &pt) & poll->events;
- }
-
- spin_lock(&ctx->completion_lock);
- if (!req->result && !READ_ONCE(poll->canceled)) {
- add_wait_queue(poll->head, &poll->wait);
- return true;
- }
-
- return false;
+ atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
}
static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
@@ -5304,148 +5777,268 @@ static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
return &req->apoll->poll;
}
-static void io_poll_remove_double(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
+static void io_poll_req_insert(struct io_kiocb *req)
{
- struct io_poll_iocb *poll = io_poll_get_double(req);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct hlist_head *list;
- lockdep_assert_held(&req->ctx->completion_lock);
+ list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
+ hlist_add_head(&req->hash_node, list);
+}
+
+static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
+ wait_queue_func_t wake_func)
+{
+ poll->head = NULL;
+#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
+ /* mask in events that we always want/need */
+ poll->events = events | IO_POLL_UNMASK;
+ INIT_LIST_HEAD(&poll->wait.entry);
+ init_waitqueue_func_entry(&poll->wait, wake_func);
+}
- if (poll && poll->head) {
- struct wait_queue_head *head = poll->head;
+static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
+{
+ struct wait_queue_head *head = smp_load_acquire(&poll->head);
+ if (head) {
spin_lock_irq(&head->lock);
list_del_init(&poll->wait.entry);
- if (poll->wait.private)
- req_ref_put(req);
poll->head = NULL;
spin_unlock_irq(&head->lock);
}
}
-static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
- __must_hold(&req->ctx->completion_lock)
+static void io_poll_remove_entries(struct io_kiocb *req)
+{
+ /*
+ * Nothing to do if neither of those flags are set. Avoid dipping
+ * into the poll/apoll/double cachelines if we can.
+ */
+ if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
+ return;
+
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ if (req->flags & REQ_F_SINGLE_POLL)
+ io_poll_remove_entry(io_poll_get_single(req));
+ if (req->flags & REQ_F_DOUBLE_POLL)
+ io_poll_remove_entry(io_poll_get_double(req));
+ rcu_read_unlock();
+}
+
+/*
+ * All poll tw should go through this. Checks for poll events, manages
+ * references, does rewait, etc.
+ *
+ * Returns a negative error on failure. >0 when no action require, which is
+ * either spurious wakeup or multishot CQE is served. 0 when it's done with
+ * the request, then the mask is stored in req->result.
+ */
+static int io_poll_check_events(struct io_kiocb *req, bool locked)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned flags = IORING_CQE_F_MORE;
- int error;
+ int v;
+
+ /* req->task == current here, checking PF_EXITING is safe */
+ if (unlikely(req->task->flags & PF_EXITING))
+ io_poll_mark_cancelled(req);
+
+ do {
+ v = atomic_read(&req->poll_refs);
- if (READ_ONCE(req->poll.canceled)) {
- error = -ECANCELED;
- req->poll.events |= EPOLLONESHOT;
+ /* tw handler should be the owner, and so have some references */
+ if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
+ return 0;
+ if (v & IO_POLL_CANCEL_FLAG)
+ return -ECANCELED;
+
+ if (!req->result) {
+ struct poll_table_struct pt = { ._key = req->apoll_events };
+ unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED;
+
+ if (unlikely(!io_assign_file(req, flags)))
+ return -EBADF;
+ req->result = vfs_poll(req->file, &pt) & req->apoll_events;
+ }
+
+ /* multishot, just fill an CQE and proceed */
+ if (req->result && !(req->apoll_events & EPOLLONESHOT)) {
+ __poll_t mask = mangle_poll(req->result & req->apoll_events);
+ bool filled;
+
+ spin_lock(&ctx->completion_lock);
+ filled = io_fill_cqe_aux(ctx, req->user_data, mask,
+ IORING_CQE_F_MORE);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (unlikely(!filled))
+ return -ECANCELED;
+ io_cqring_ev_posted(ctx);
+ } else if (req->result) {
+ return 0;
+ }
+
+ /*
+ * Release all references, retry if someone tried to restart
+ * task_work while we were executing it.
+ */
+ } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
+
+ return 1;
+}
+
+static void io_poll_task_func(struct io_kiocb *req, bool *locked)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ ret = io_poll_check_events(req, *locked);
+ if (ret > 0)
+ return;
+
+ if (!ret) {
+ req->result = mangle_poll(req->result & req->poll.events);
} else {
- error = mangle_poll(mask);
- }
- if (req->poll.events & EPOLLONESHOT)
- flags = 0;
- if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
- req->poll.events |= EPOLLONESHOT;
- flags = 0;
+ req->result = ret;
+ req_set_fail(req);
}
- if (flags & IORING_CQE_F_MORE)
- ctx->cq_extra++;
- return !(flags & IORING_CQE_F_MORE);
+ io_poll_remove_entries(req);
+ spin_lock(&ctx->completion_lock);
+ hash_del(&req->hash_node);
+ __io_req_complete_post(req, req->result, 0);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
}
-static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
- __must_hold(&req->ctx->completion_lock)
+static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
{
- bool done;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ ret = io_poll_check_events(req, *locked);
+ if (ret > 0)
+ return;
+
+ io_poll_remove_entries(req);
+ spin_lock(&ctx->completion_lock);
+ hash_del(&req->hash_node);
+ spin_unlock(&ctx->completion_lock);
- done = __io_poll_complete(req, mask);
- io_commit_cqring(req->ctx);
- return done;
+ if (!ret)
+ io_req_task_submit(req, locked);
+ else
+ io_req_complete_failed(req, ret);
}
-static void io_poll_task_func(struct io_kiocb *req, bool *locked)
+static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *nxt;
+ req->result = mask;
+ /*
+ * This is useful for poll that is armed on behalf of another
+ * request, and where the wakeup path could be on a different
+ * CPU. We want to avoid pulling in req->apoll->events for that
+ * case.
+ */
+ req->apoll_events = events;
+ if (req->opcode == IORING_OP_POLL_ADD)
+ req->io_task_work.func = io_poll_task_func;
+ else
+ req->io_task_work.func = io_apoll_task_func;
- if (io_poll_rewait(req, &req->poll)) {
- spin_unlock(&ctx->completion_lock);
- } else {
- bool done;
+ trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask);
+ io_req_task_work_add(req, false);
+}
- if (req->poll.done) {
- spin_unlock(&ctx->completion_lock);
- return;
- }
- done = __io_poll_complete(req, req->result);
- if (done) {
- io_poll_remove_double(req);
- hash_del(&req->hash_node);
- req->poll.done = true;
- } else {
- req->result = 0;
- add_wait_queue(req->poll.head, &req->poll.wait);
- }
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
+static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
+{
+ if (io_poll_get_ownership(req))
+ __io_poll_execute(req, res, events);
+}
- if (done) {
- nxt = io_put_req_find_next(req);
- if (nxt)
- io_req_task_submit(nxt, locked);
- }
- }
+static void io_poll_cancel_req(struct io_kiocb *req)
+{
+ io_poll_mark_cancelled(req);
+ /* kick tw, which should complete the request */
+ io_poll_execute(req, 0, 0);
}
-static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
- int sync, void *key)
+#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
+#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
+
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
{
- struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = io_poll_get_single(req);
+ struct io_kiocb *req = wqe_to_req(wait);
+ struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
+ wait);
__poll_t mask = key_to_poll(key);
- unsigned long flags;
- /* for instances that support it check for an event match first: */
- if (mask && !(mask & poll->events))
- return 0;
- if (!(poll->events & EPOLLONESHOT))
- return poll->wait.func(&poll->wait, mode, sync, key);
+ if (unlikely(mask & POLLFREE)) {
+ io_poll_mark_cancelled(req);
+ /* we have to kick tw in case it's not already */
+ io_poll_execute(req, 0, poll->events);
- list_del_init(&wait->entry);
+ /*
+ * If the waitqueue is being freed early but someone is already
+ * holds ownership over it, we have to tear down the request as
+ * best we can. That means immediately removing the request from
+ * its waitqueue and preventing all further accesses to the
+ * waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&poll->head, NULL);
+ return 1;
+ }
- if (poll->head) {
- bool done;
+ /* for instances that support it check for an event match first */
+ if (mask && !(mask & poll->events))
+ return 0;
- spin_lock_irqsave(&poll->head->lock, flags);
- done = list_empty(&poll->wait.entry);
- if (!done)
+ if (io_poll_get_ownership(req)) {
+ /* optional, saves extra locking for removal in tw handler */
+ if (mask && poll->events & EPOLLONESHOT) {
list_del_init(&poll->wait.entry);
- /* make sure double remove sees this as being gone */
- wait->private = NULL;
- spin_unlock_irqrestore(&poll->head->lock, flags);
- if (!done) {
- /* use wait func handler, so it matches the rq type */
- poll->wait.func(&poll->wait, mode, sync, key);
+ poll->head = NULL;
+ if (wqe_is_double(wait))
+ req->flags &= ~REQ_F_DOUBLE_POLL;
+ else
+ req->flags &= ~REQ_F_SINGLE_POLL;
}
+ __io_poll_execute(req, mask, poll->events);
}
- req_ref_put(req);
return 1;
}
-static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
- wait_queue_func_t wake_func)
-{
- poll->head = NULL;
- poll->done = false;
- poll->canceled = false;
-#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
- /* mask in events that we always want/need */
- poll->events = events | IO_POLL_UNMASK;
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, wake_func);
-}
-
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
struct wait_queue_head *head,
struct io_poll_iocb **poll_ptr)
{
struct io_kiocb *req = pt->req;
+ unsigned long wqe_private = (unsigned long) req;
/*
* The file being polled uses multiple waitqueues for poll handling
@@ -5453,10 +6046,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
* if this happens.
*/
if (unlikely(pt->nr_entries)) {
- struct io_poll_iocb *poll_one = poll;
+ struct io_poll_iocb *first = poll;
/* double add on the same waitqueue head, ignore */
- if (poll_one->head == head)
+ if (first->head == head)
return;
/* already have a 2nd entry, fail a third attempt */
if (*poll_ptr) {
@@ -5465,25 +6058,25 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
pt->error = -EINVAL;
return;
}
- /*
- * Can't handle multishot for double wait for now, turn it
- * into one-shot mode.
- */
- if (!(poll_one->events & EPOLLONESHOT))
- poll_one->events |= EPOLLONESHOT;
+
poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
if (!poll) {
pt->error = -ENOMEM;
return;
}
- io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
- req_ref_get(req);
- poll->wait.private = req;
+ /* mark as double wq entry */
+ wqe_private |= 1;
+ req->flags |= REQ_F_DOUBLE_POLL;
+ io_init_poll_iocb(poll, first->events, first->wait.func);
*poll_ptr = poll;
+ if (req->opcode == IORING_OP_POLL_ADD)
+ req->flags |= REQ_F_ASYNC_DATA;
}
+ req->flags |= REQ_F_SINGLE_POLL;
pt->nr_entries++;
poll->head = head;
+ poll->wait.private = (void *) wqe_private;
if (poll->events & EPOLLEXCLUSIVE)
add_wait_queue_exclusive(head, &poll->wait);
@@ -5491,103 +6084,79 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
add_wait_queue(head, &poll->wait);
}
-static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
+static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- struct async_poll *apoll = pt->req->apoll;
- __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
+ __io_queue_proc(&pt->req->poll, pt, head,
+ (struct io_poll_iocb **) &pt->req->async_data);
}
-static void io_async_task_func(struct io_kiocb *req, bool *locked)
+static int __io_arm_poll_handler(struct io_kiocb *req,
+ struct io_poll_iocb *poll,
+ struct io_poll_table *ipt, __poll_t mask)
{
- struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
-
- trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
-
- if (io_poll_rewait(req, &apoll->poll)) {
- spin_unlock(&ctx->completion_lock);
- return;
- }
-
- hash_del(&req->hash_node);
- io_poll_remove_double(req);
- apoll->poll.done = true;
- spin_unlock(&ctx->completion_lock);
-
- if (!READ_ONCE(apoll->poll.canceled))
- io_req_task_submit(req, locked);
- else
- io_req_complete_failed(req, -ECANCELED);
-}
-
-static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
- void *key)
-{
- struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = &req->apoll->poll;
-
- trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
- key_to_poll(key));
-
- return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
-}
-
-static void io_poll_req_insert(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
- hlist_add_head(&req->hash_node, list);
-}
-
-static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
- struct io_poll_iocb *poll,
- struct io_poll_table *ipt, __poll_t mask,
- wait_queue_func_t wake_func)
- __acquires(&ctx->completion_lock)
-{
- struct io_ring_ctx *ctx = req->ctx;
- bool cancel = false;
+ int v;
INIT_HLIST_NODE(&req->hash_node);
- io_init_poll_iocb(poll, mask, wake_func);
+ io_init_poll_iocb(poll, mask, io_poll_wake);
poll->file = req->file;
- poll->wait.private = req;
ipt->pt._key = mask;
ipt->req = req;
ipt->error = 0;
ipt->nr_entries = 0;
+ /*
+ * Take the ownership to delay any tw execution up until we're done
+ * with poll arming. see io_poll_get_ownership().
+ */
+ atomic_set(&req->poll_refs, 1);
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
- if (unlikely(!ipt->nr_entries) && !ipt->error)
- ipt->error = -EINVAL;
+
+ if (mask && (poll->events & EPOLLONESHOT)) {
+ io_poll_remove_entries(req);
+ /* no one else has access to the req, forget about the ref */
+ return mask;
+ }
+ if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
+ io_poll_remove_entries(req);
+ if (!ipt->error)
+ ipt->error = -EINVAL;
+ return 0;
+ }
spin_lock(&ctx->completion_lock);
- if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
- io_poll_remove_double(req);
- if (likely(poll->head)) {
- spin_lock_irq(&poll->head->lock);
- if (unlikely(list_empty(&poll->wait.entry))) {
- if (ipt->error)
- cancel = true;
- ipt->error = 0;
- mask = 0;
- }
- if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
- list_del_init(&poll->wait.entry);
- else if (cancel)
- WRITE_ONCE(poll->canceled, true);
- else if (!poll->done) /* actually waiting for an event */
- io_poll_req_insert(req);
- spin_unlock_irq(&poll->head->lock);
+ io_poll_req_insert(req);
+ spin_unlock(&ctx->completion_lock);
+
+ if (mask) {
+ /* can't multishot if failed, just queue the event we've got */
+ if (unlikely(ipt->error || !ipt->nr_entries))
+ poll->events |= EPOLLONESHOT;
+ __io_poll_execute(req, mask, poll->events);
+ return 0;
}
- return mask;
+ /*
+ * Release ownership. If someone tried to queue a tw while it was
+ * locked, kick it off for them.
+ */
+ v = atomic_dec_return(&req->poll_refs);
+ if (unlikely(v & IO_POLL_REF_MASK))
+ __io_poll_execute(req, 0, poll->events);
+ return 0;
+}
+
+static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+ struct async_poll *apoll = pt->req->apoll;
+
+ __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
enum {
@@ -5596,24 +6165,21 @@ enum {
IO_APOLL_READY
};
-static int io_arm_poll_handler(struct io_kiocb *req)
+static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
struct async_poll *apoll;
struct io_poll_table ipt;
- __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
- int rw;
+ __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
+ int ret;
- if (!req->file || !file_can_poll(req->file))
- return IO_APOLL_ABORTED;
- if (req->flags & REQ_F_POLLED)
- return IO_APOLL_ABORTED;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
+ if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
+ return IO_APOLL_ABORTED;
if (def->pollin) {
- rw = READ;
mask |= POLLIN | POLLRDNORM;
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
@@ -5621,80 +6187,46 @@ static int io_arm_poll_handler(struct io_kiocb *req)
(req->sr_msg.msg_flags & MSG_ERRQUEUE))
mask &= ~POLLIN;
} else {
- rw = WRITE;
mask |= POLLOUT | POLLWRNORM;
}
-
- /* if we can't nonblock try, then no point in arming a poll handler */
- if (!io_file_supports_nowait(req, rw))
- return IO_APOLL_ABORTED;
-
- apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
- if (unlikely(!apoll))
- return IO_APOLL_ABORTED;
+ if (def->poll_exclusive)
+ mask |= EPOLLEXCLUSIVE;
+ if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+ !list_empty(&ctx->apoll_cache)) {
+ apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+ poll.wait.entry);
+ list_del_init(&apoll->poll.wait.entry);
+ } else {
+ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+ if (unlikely(!apoll))
+ return IO_APOLL_ABORTED;
+ }
apoll->double_poll = NULL;
req->apoll = apoll;
req->flags |= REQ_F_POLLED;
ipt.pt._qproc = io_async_queue_proc;
- io_req_set_refcount(req);
- ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
- io_async_wake);
- spin_unlock(&ctx->completion_lock);
+ io_kbuf_recycle(req, issue_flags);
+
+ ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
if (ret || ipt.error)
return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
- trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
+ trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,
mask, apoll->poll.events);
return IO_APOLL_OK;
}
-static bool __io_poll_remove_one(struct io_kiocb *req,
- struct io_poll_iocb *poll, bool do_cancel)
- __must_hold(&req->ctx->completion_lock)
-{
- bool do_complete = false;
-
- if (!poll->head)
- return false;
- spin_lock_irq(&poll->head->lock);
- if (do_cancel)
- WRITE_ONCE(poll->canceled, true);
- if (!list_empty(&poll->wait.entry)) {
- list_del_init(&poll->wait.entry);
- do_complete = true;
- }
- spin_unlock_irq(&poll->head->lock);
- hash_del(&req->hash_node);
- return do_complete;
-}
-
-static bool io_poll_remove_one(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
-{
- bool do_complete;
-
- io_poll_remove_double(req);
- do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
-
- if (do_complete) {
- io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
- io_commit_cqring(req->ctx);
- req_set_fail(req);
- io_put_req_deferred(req);
- }
- return do_complete;
-}
-
/*
* Returns true if we found and killed one or more poll requests
*/
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
- int posted = 0, i;
+ bool found = false;
+ int i;
spin_lock(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
@@ -5702,16 +6234,15 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node) {
- if (io_match_task(req, tsk, cancel_all))
- posted += io_poll_remove_one(req);
+ if (io_match_task_safe(req, tsk, cancel_all)) {
+ hlist_del_init(&req->hash_node);
+ io_poll_cancel_req(req);
+ found = true;
+ }
}
}
spin_unlock(&ctx->completion_lock);
-
- if (posted)
- io_cqring_ev_posted(ctx);
-
- return posted != 0;
+ return found;
}
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
@@ -5732,19 +6263,26 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
return NULL;
}
+static bool io_poll_disarm(struct io_kiocb *req)
+ __must_hold(&ctx->completion_lock)
+{
+ if (!io_poll_get_ownership(req))
+ return false;
+ io_poll_remove_entries(req);
+ hash_del(&req->hash_node);
+ return true;
+}
+
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
bool poll_only)
__must_hold(&ctx->completion_lock)
{
- struct io_kiocb *req;
+ struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
- req = io_poll_find(ctx, sqe_addr, poll_only);
if (!req)
return -ENOENT;
- if (io_poll_remove_one(req))
- return 0;
-
- return -EALREADY;
+ io_poll_cancel_req(req);
+ return 0;
}
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
@@ -5794,23 +6332,6 @@ static int io_poll_update_prep(struct io_kiocb *req,
return 0;
}
-static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
- void *key)
-{
- struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = &req->poll;
-
- return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
-}
-
-static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
- struct poll_table_struct *p)
-{
- struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
-
- __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
-}
-
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_poll_iocb *poll = &req->poll;
@@ -5823,105 +6344,71 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
flags = READ_ONCE(sqe->len);
if (flags & ~IORING_POLL_ADD_MULTI)
return -EINVAL;
+ if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
+ return -EINVAL;
io_req_set_refcount(req);
- poll->events = io_poll_parse_events(sqe, flags);
+ req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_poll_iocb *poll = &req->poll;
- struct io_ring_ctx *ctx = req->ctx;
struct io_poll_table ipt;
- __poll_t mask;
- bool done;
+ int ret;
ipt.pt._qproc = io_poll_queue_proc;
- mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
- io_poll_wake);
-
- if (mask) { /* no async, we'd stolen it */
- ipt.error = 0;
- done = io_poll_complete(req, mask);
- }
- spin_unlock(&ctx->completion_lock);
-
- if (mask) {
- io_cqring_ev_posted(ctx);
- if (done)
- io_put_req(req);
- }
- return ipt.error;
+ ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
+ ret = ret ?: ipt.error;
+ if (ret)
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
}
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *preq;
- bool completing;
- int ret;
+ int ret2, ret = 0;
+ bool locked;
spin_lock(&ctx->completion_lock);
preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
- if (!preq) {
- ret = -ENOENT;
- goto err;
- }
-
- if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
- completing = true;
- ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
- goto err;
- }
-
- /*
- * Don't allow racy completion with singleshot, as we cannot safely
- * update those. For multishot, if we're racing with completion, just
- * let completion re-add it.
- */
- completing = !__io_poll_remove_one(preq, &preq->poll, false);
- if (completing && (preq->poll.events & EPOLLONESHOT)) {
- ret = -EALREADY;
- goto err;
- }
- /* we now have a detached poll request. reissue. */
- ret = 0;
-err:
- if (ret < 0) {
+ if (!preq || !io_poll_disarm(preq)) {
spin_unlock(&ctx->completion_lock);
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
- }
- /* only mask one event flags, keep behavior flags */
- if (req->poll_update.update_events) {
- preq->poll.events &= ~0xffff;
- preq->poll.events |= req->poll_update.events & 0xffff;
- preq->poll.events |= IO_POLL_UNMASK;
+ ret = preq ? -EALREADY : -ENOENT;
+ goto out;
}
- if (req->poll_update.update_user_data)
- preq->user_data = req->poll_update.new_user_data;
spin_unlock(&ctx->completion_lock);
- /* complete update request, we're done with it */
- io_req_complete(req, ret);
-
- if (!completing) {
- ret = io_poll_add(preq, issue_flags);
- if (ret < 0) {
- req_set_fail(preq);
- io_req_complete(preq, ret);
+ if (req->poll_update.update_events || req->poll_update.update_user_data) {
+ /* only mask one event flags, keep behavior flags */
+ if (req->poll_update.update_events) {
+ preq->poll.events &= ~0xffff;
+ preq->poll.events |= req->poll_update.events & 0xffff;
+ preq->poll.events |= IO_POLL_UNMASK;
}
+ if (req->poll_update.update_user_data)
+ preq->user_data = req->poll_update.new_user_data;
+
+ ret2 = io_poll_add(preq, issue_flags);
+ /* successfully updated, don't complete poll request */
+ if (!ret2)
+ goto out;
}
- return 0;
-}
-static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
-{
- req_set_fail(req);
- io_req_complete_post(req, -ETIME, 0);
+ req_set_fail(preq);
+ preq->result = -ECANCELED;
+ locked = !(issue_flags & IO_URING_F_UNLOCKED);
+ io_req_task_complete(preq, &locked);
+out:
+ if (ret < 0)
+ req_set_fail(req);
+ /* complete update request, we're done with it */
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
}
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
@@ -5938,8 +6425,12 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
atomic_read(&req->ctx->cq_timeouts) + 1);
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
- req->io_task_work.func = io_req_task_timeout;
- io_req_task_work_add(req);
+ if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
+ req_set_fail(req);
+
+ req->result = -ETIME;
+ req->io_task_work.func = io_req_task_complete;
+ io_req_task_work_add(req, false);
return HRTIMER_NORESTART;
}
@@ -5974,10 +6465,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (IS_ERR(req))
return PTR_ERR(req);
-
- req_set_fail(req);
- io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
- io_put_req_deferred(req);
+ io_req_task_queue_fail(req, -ECANCELED);
return 0;
}
@@ -6065,6 +6553,8 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
return -EINVAL;
if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
return -EFAULT;
+ if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
+ return -EINVAL;
} else if (tr->flags) {
/* timeout removal doesn't support flags */
return -EINVAL;
@@ -6126,7 +6616,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+ if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
+ IORING_TIMEOUT_ETIME_SUCCESS))
return -EINVAL;
/* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
@@ -6137,7 +6628,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
- if (!req->async_data && io_alloc_async_data(req))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
+ return -EFAULT;
+ if (io_alloc_async_data(req))
return -ENOMEM;
data = req->async_data;
@@ -6147,6 +6640,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
+ if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&req->timeout.list);
data->mode = io_translate_timeout_mode(flags);
hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
@@ -6260,16 +6757,21 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
- if (ret != -ENOENT)
- return ret;
+ /*
+ * Fall-through even for -EALREADY, as we may have poll armed
+ * that need unarming.
+ */
+ if (!ret)
+ return 0;
spin_lock(&ctx->completion_lock);
+ ret = io_poll_cancel(ctx, sqe_addr, false);
+ if (ret != -ENOENT)
+ goto out;
+
spin_lock_irq(&ctx->timeout_lock);
ret = io_timeout_cancel(ctx, sqe_addr);
spin_unlock_irq(&ctx->timeout_lock);
- if (ret != -ENOENT)
- goto out;
- ret = io_poll_cancel(ctx, sqe_addr, false);
out:
spin_unlock(&ctx->completion_lock);
return ret;
@@ -6294,6 +6796,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
u64 sqe_addr = req->cancel.addr;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_tctx_node *node;
int ret;
@@ -6302,7 +6805,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
goto done;
/* slow path, try all io-wq's */
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
@@ -6311,7 +6814,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
if (ret != -ENOENT)
break;
}
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
done:
if (ret < 0)
req_set_fail(req);
@@ -6338,6 +6841,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_uring_rsrc_update2 up;
int ret;
@@ -6346,11 +6850,12 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
up.nr = 0;
up.tags = 0;
up.resv = 0;
+ up.resv2 = 0;
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
&up, req->rsrc_update.nr_args);
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret < 0)
req_set_fail(req);
@@ -6366,11 +6871,10 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
- return io_read_prep(req, sqe);
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
- return io_write_prep(req, sqe);
+ return io_prep_rw(req, sqe);
case IORING_OP_POLL_ADD:
return io_poll_add_prep(req, sqe);
case IORING_OP_POLL_REMOVE:
@@ -6435,6 +6939,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_symlinkat_prep(req, sqe);
case IORING_OP_LINKAT:
return io_linkat_prep(req, sqe);
+ case IORING_OP_MSG_RING:
+ return io_msg_ring_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6444,9 +6950,14 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_req_prep_async(struct io_kiocb *req)
{
- if (!io_op_defs[req->opcode].needs_async_setup)
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+
+ /* assign early for deferred execution for non-fixed file */
+ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
+ req->file = io_file_get_normal(req, req->fd);
+ if (!def->needs_async_setup)
return 0;
- if (WARN_ON_ONCE(req->async_data))
+ if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
if (io_alloc_async_data(req))
return -EAGAIN;
@@ -6478,92 +6989,57 @@ static u32 io_get_sequence(struct io_kiocb *req)
return seq;
}
-static bool io_drain_req(struct io_kiocb *req)
+static __cold void io_drain_req(struct io_kiocb *req)
{
- struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
int ret;
- u32 seq;
-
- if (req->flags & REQ_F_FAIL) {
- io_req_complete_fail_submit(req);
- return true;
- }
-
- /*
- * If we need to drain a request in the middle of a link, drain the
- * head request and the next request/link after the current link.
- * Considering sequential execution of links, IOSQE_IO_DRAIN will be
- * maintained for every request of our link.
- */
- if (ctx->drain_next) {
- req->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = false;
- }
- /* not interested in head, start from the first linked */
- io_for_each_link(pos, req->link) {
- if (pos->flags & REQ_F_IO_DRAIN) {
- ctx->drain_next = true;
- req->flags |= REQ_F_IO_DRAIN;
- break;
- }
- }
+ u32 seq = io_get_sequence(req);
/* Still need defer if there is pending req in defer list. */
- if (likely(list_empty_careful(&ctx->defer_list) &&
- !(req->flags & REQ_F_IO_DRAIN))) {
+ spin_lock(&ctx->completion_lock);
+ if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
+ spin_unlock(&ctx->completion_lock);
+queue:
ctx->drain_active = false;
- return false;
+ io_req_task_queue(req);
+ return;
}
-
- seq = io_get_sequence(req);
- /* Still a chance to pass the sequence check */
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
- return false;
+ spin_unlock(&ctx->completion_lock);
ret = io_req_prep_async(req);
- if (ret)
- goto fail;
+ if (ret) {
+fail:
+ io_req_complete_failed(req, ret);
+ return;
+ }
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de) {
ret = -ENOMEM;
-fail:
- io_req_complete_failed(req, ret);
- return true;
+ goto fail;
}
spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
kfree(de);
- io_queue_async_work(req, NULL);
- return true;
+ goto queue;
}
- trace_io_uring_defer(ctx, req, req->user_data);
+ trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
de->req = req;
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock(&ctx->completion_lock);
- return true;
}
static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- kfree((void *)(unsigned long)req->rw.addr);
- break;
- case IORING_OP_RECVMSG:
- case IORING_OP_RECV:
- kfree(req->sr_msg.kbuf);
- break;
- }
+ spin_lock(&req->ctx->completion_lock);
+ io_put_kbuf_comp(req);
+ spin_unlock(&req->ctx->completion_lock);
}
if (req->flags & REQ_F_NEED_CLEANUP) {
@@ -6586,11 +7062,6 @@ static void io_clean_op(struct io_kiocb *req)
kfree(io->free_iov);
break;
}
- case IORING_OP_SPLICE:
- case IORING_OP_TEE:
- if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
- io_put_file(req->splice.file_in);
- break;
case IORING_OP_OPENAT:
case IORING_OP_OPENAT2:
if (req->open.filename)
@@ -6614,6 +7085,10 @@ static void io_clean_op(struct io_kiocb *req)
putname(req->hardlink.oldpath);
putname(req->hardlink.newpath);
break;
+ case IORING_OP_STATX:
+ if (req->statx.filename)
+ putname(req->statx.filename);
+ break;
}
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
@@ -6621,26 +7096,46 @@ static void io_clean_op(struct io_kiocb *req)
kfree(req->apoll);
req->apoll = NULL;
}
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_uring_task *tctx = req->task->io_uring;
-
- atomic_dec(&tctx->inflight_tracked);
- }
if (req->flags & REQ_F_CREDS)
put_cred(req->creds);
-
+ if (req->flags & REQ_F_ASYNC_DATA) {
+ kfree(req->async_data);
+ req->async_data = NULL;
+ }
req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
+static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
+{
+ if (req->file || !io_op_defs[req->opcode].needs_file)
+ return true;
+
+ if (req->flags & REQ_F_FIXED_FILE)
+ req->file = io_file_get_fixed(req, req->fd, issue_flags);
+ else
+ req->file = io_file_get_normal(req, req->fd);
+ if (req->file)
+ return true;
+
+ req_set_fail(req);
+ req->result = -EBADF;
+ return false;
+}
+
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
const struct cred *creds = NULL;
int ret;
- if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+ if (unlikely(!io_assign_file(req, issue_flags)))
+ return -EBADF;
+
+ if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);
+ if (!io_op_defs[req->opcode].audit_skip)
+ audit_uring_entry(req->opcode);
+
switch (req->opcode) {
case IORING_OP_NOP:
ret = io_nop(req, issue_flags);
@@ -6751,18 +7246,24 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_LINKAT:
ret = io_linkat(req, issue_flags);
break;
+ case IORING_OP_MSG_RING:
+ ret = io_msg_ring(req, issue_flags);
+ break;
default:
ret = -EINVAL;
break;
}
+ if (!io_op_defs[req->opcode].audit_skip)
+ audit_uring_exit(!ret, ret);
+
if (creds)
revert_creds(creds);
if (ret)
return ret;
/* If the op doesn't have a file, we're not polling for it */
- if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
- io_iopoll_req_issued(req);
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+ io_iopoll_req_issued(req, issue_flags);
return 0;
}
@@ -6778,8 +7279,11 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
static void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ unsigned int issue_flags = IO_URING_F_UNLOCKED;
+ bool needs_poll = false;
struct io_kiocb *timeout;
- int ret = 0;
+ int ret = 0, err = -ECANCELED;
/* one will be dropped by ->io_free_work() after returning to io-wq */
if (!(req->flags & REQ_F_REFCOUNT))
@@ -6791,24 +7295,49 @@ static void io_wq_submit_work(struct io_wq_work *work)
if (timeout)
io_queue_linked_timeout(timeout);
+
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
- if (work->flags & IO_WQ_WORK_CANCEL)
- ret = -ECANCELED;
+ if (work->flags & IO_WQ_WORK_CANCEL) {
+fail:
+ io_req_task_queue_fail(req, err);
+ return;
+ }
+ if (!io_assign_file(req, issue_flags)) {
+ err = -EBADF;
+ work->flags |= IO_WQ_WORK_CANCEL;
+ goto fail;
+ }
- if (!ret) {
- do {
- ret = io_issue_sqe(req, 0);
- /*
- * We can get EAGAIN for polled IO even though we're
- * forcing a sync submission from here, since we can't
- * wait for request slots on the block side.
- */
- if (ret != -EAGAIN)
- break;
- cond_resched();
- } while (1);
+ if (req->flags & REQ_F_FORCE_ASYNC) {
+ bool opcode_poll = def->pollin || def->pollout;
+
+ if (opcode_poll && file_can_poll(req->file)) {
+ needs_poll = true;
+ issue_flags |= IO_URING_F_NONBLOCK;
+ }
}
+ do {
+ ret = io_issue_sqe(req, issue_flags);
+ if (ret != -EAGAIN)
+ break;
+ /*
+ * We can get EAGAIN for iopolled IO even though we're
+ * forcing a sync submission from here, since we can't
+ * wait for request slots on the block side.
+ */
+ if (!needs_poll) {
+ cond_resched();
+ continue;
+ }
+
+ if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
+ return;
+ /* aborted or ready, in either case retry blocking */
+ needs_poll = false;
+ issue_flags &= ~IO_URING_F_NONBLOCK;
+ } while (1);
+
/* avoid locking problems by failing it from a clean context */
if (ret)
io_req_task_queue_fail(req, ret);
@@ -6832,62 +7361,68 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
{
unsigned long file_ptr = (unsigned long) file;
- if (__io_file_supports_nowait(file, READ))
- file_ptr |= FFS_ASYNC_READ;
- if (__io_file_supports_nowait(file, WRITE))
- file_ptr |= FFS_ASYNC_WRITE;
- if (S_ISREG(file_inode(file)->i_mode))
- file_ptr |= FFS_ISREG;
+ file_ptr |= io_file_get_flags(file);
file_slot->file_ptr = file_ptr;
}
-static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd)
+static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
+ unsigned int issue_flags)
{
- struct file *file;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct file *file = NULL;
unsigned long file_ptr;
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_lock(&ctx->uring_lock);
+
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
- return NULL;
+ goto out;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
- req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
- io_req_set_rsrc_node(req);
+ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
+ io_req_set_rsrc_node(req, ctx, 0);
+out:
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
return file;
}
-static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd)
+/*
+ * Drop the file for requeue operations. Only used of req->file is the
+ * io_uring descriptor itself.
+ */
+static void io_drop_inflight_file(struct io_kiocb *req)
+{
+ if (unlikely(req->flags & REQ_F_INFLIGHT)) {
+ fput(req->file);
+ req->file = NULL;
+ req->flags &= ~REQ_F_INFLIGHT;
+ }
+}
+
+static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
struct file *file = fget(fd);
- trace_io_uring_file_get(ctx, fd);
+ trace_io_uring_file_get(req->ctx, req, req->user_data, fd);
/* we don't allow fixed io_uring files */
- if (file && unlikely(file->f_op == &io_uring_fops))
- io_req_track_inflight(req);
+ if (file && file->f_op == &io_uring_fops)
+ req->flags |= REQ_F_INFLIGHT;
return file;
}
-static inline struct file *io_file_get(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd, bool fixed)
-{
- if (fixed)
- return io_file_get_fixed(ctx, req, fd);
- else
- return io_file_get_normal(ctx, req, fd);
-}
-
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
{
struct io_kiocb *prev = req->timeout.prev;
- int ret;
+ int ret = -ENOENT;
if (prev) {
- ret = io_try_cancel_userdata(req, prev->user_data);
+ if (!(req->task->flags & PF_EXITING))
+ ret = io_try_cancel_userdata(req, prev->user_data);
io_req_complete_post(req, ret ?: -ETIME, 0);
io_put_req(prev);
} else {
@@ -6921,7 +7456,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
req->io_task_work.func = io_req_task_link_timeout;
- io_req_task_work_add(req);
+ io_req_task_work_add(req, false);
return HRTIMER_NORESTART;
}
@@ -6947,67 +7482,64 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
io_put_req(req);
}
-static void __io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+
+ switch (io_arm_poll_handler(req, 0)) {
+ case IO_APOLL_READY:
+ io_req_task_queue(req);
+ break;
+ case IO_APOLL_ABORTED:
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req, NULL);
+ break;
+ case IO_APOLL_OK:
+ break;
+ }
+
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
+}
+
+static inline void __io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
struct io_kiocb *linked_timeout;
int ret;
-issue_sqe:
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ if (req->flags & REQ_F_COMPLETE_INLINE) {
+ io_req_add_compl_list(req);
+ return;
+ }
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
if (likely(!ret)) {
- if (req->flags & REQ_F_COMPLETE_INLINE) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
- state->compl_reqs[state->compl_nr++] = req;
- if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
- io_submit_flush_completions(ctx);
- return;
- }
-
linked_timeout = io_prep_linked_timeout(req);
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- linked_timeout = io_prep_linked_timeout(req);
-
- switch (io_arm_poll_handler(req)) {
- case IO_APOLL_READY:
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
- goto issue_sqe;
- case IO_APOLL_ABORTED:
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req, NULL);
- break;
- }
-
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
+ io_queue_sqe_arm_apoll(req);
} else {
io_req_complete_failed(req, ret);
}
}
-static inline void io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe_fallback(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
- if (unlikely(req->ctx->drain_active) && io_drain_req(req))
- return;
-
- if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
- __io_queue_sqe(req);
- } else if (req->flags & REQ_F_FAIL) {
+ if (req->flags & REQ_F_FAIL) {
io_req_complete_fail_submit(req);
+ } else if (unlikely(req->ctx->drain_active)) {
+ io_drain_req(req);
} else {
int ret = io_req_prep_async(req);
@@ -7018,6 +7550,15 @@ static inline void io_queue_sqe(struct io_kiocb *req)
}
}
+static inline void io_queue_sqe(struct io_kiocb *req)
+ __must_hold(&req->ctx->uring_lock)
+{
+ if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
+ __io_queue_sqe(req);
+ else
+ io_queue_sqe_fallback(req);
+}
+
/*
* Check SQE restrictions (opcode and flags).
*
@@ -7027,9 +7568,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{
- if (likely(!ctx->restricted))
- return true;
-
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
return false;
@@ -7044,16 +7582,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
return true;
}
+static void io_init_req_drain(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *head = ctx->submit_state.link.head;
+
+ ctx->drain_active = true;
+ if (head) {
+ /*
+ * If we need to drain a request in the middle of a link, drain
+ * the head request and the next request/link after the current
+ * link. Considering sequential execution of links,
+ * REQ_F_IO_DRAIN will be maintained for every request of our
+ * link.
+ */
+ head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ ctx->drain_next = true;
+ }
+}
+
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
- struct io_submit_state *state;
unsigned int sqe_flags;
- int personality, ret = 0;
+ int personality;
+ u8 opcode;
/* req is partially pre-initialised, see io_preinit_req() */
- req->opcode = READ_ONCE(sqe->opcode);
+ req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
@@ -7061,49 +7618,72 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->fixed_rsrc_refs = NULL;
req->task = current;
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
- return -EINVAL;
- if (unlikely(req->opcode >= IORING_OP_LAST))
+ if (unlikely(opcode >= IORING_OP_LAST)) {
+ req->opcode = 0;
return -EINVAL;
- if (!io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
+ }
+ if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
+ /* enforce forwards compatibility on users */
+ if (sqe_flags & ~SQE_VALID_FLAGS)
+ return -EINVAL;
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[opcode].buffer_select)
+ return -EOPNOTSUPP;
+ if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
+ ctx->drain_disabled = true;
+ if (sqe_flags & IOSQE_IO_DRAIN) {
+ if (ctx->drain_disabled)
+ return -EOPNOTSUPP;
+ io_init_req_drain(req);
+ }
+ }
+ if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
+ if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
+ return -EACCES;
+ /* knock it to the slow queue path, will be drained there */
+ if (ctx->drain_active)
+ req->flags |= REQ_F_FORCE_ASYNC;
+ /* if there is no link, we're at "next" request and need to drain */
+ if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
+ ctx->drain_next = false;
+ ctx->drain_active = true;
+ req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
+ }
+ }
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select)
- return -EOPNOTSUPP;
- if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
- ctx->drain_active = true;
+ if (io_op_defs[opcode].needs_file) {
+ struct io_submit_state *state = &ctx->submit_state;
+
+ req->fd = READ_ONCE(sqe->fd);
+
+ /*
+ * Plug now if we have more than 2 IO left after this, and the
+ * target is potentially a read/write to block based storage.
+ */
+ if (state->need_plug && io_op_defs[opcode].plug) {
+ state->plug_started = true;
+ state->need_plug = false;
+ blk_start_plug_nr_ios(&state->plug, state->submit_nr);
+ }
+ }
personality = READ_ONCE(sqe->personality);
if (personality) {
+ int ret;
+
req->creds = xa_load(&ctx->personalities, personality);
if (!req->creds)
return -EINVAL;
get_cred(req->creds);
+ ret = security_uring_override_creds(req->creds);
+ if (ret) {
+ put_cred(req->creds);
+ return ret;
+ }
req->flags |= REQ_F_CREDS;
}
- state = &ctx->submit_state;
-
- /*
- * Plug now if we have more than 1 IO left after this, and the target
- * is potentially a read/write to block based storage.
- */
- if (!state->plug_started && state->ios_left > 1 &&
- io_op_defs[req->opcode].plug) {
- blk_start_plug(&state->plug);
- state->plug_started = true;
- }
-
- if (io_op_defs[req->opcode].needs_file) {
- req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
- (sqe_flags & IOSQE_FIXED_FILE));
- if (unlikely(!req->file))
- ret = -EBADF;
- }
- state->ios_left--;
- return ret;
+ return io_req_prep(req, sqe);
}
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
@@ -7115,7 +7695,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
-fail_req:
+ trace_io_uring_req_failed(sqe, ctx, req, ret);
+
/* fail even hard links since we don't submit */
if (link->head) {
/*
@@ -7138,14 +7719,10 @@ fail_req:
return ret;
}
req_fail_link_node(req, ret);
- } else {
- ret = io_req_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
}
/* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
+ trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
req->flags, true,
ctx->flags & IORING_SETUP_SQPOLL);
@@ -7171,33 +7748,32 @@ fail_req:
link->last->link = req;
link->last = req;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+ return 0;
/* last request of a link, enqueue the link */
- if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- link->head = NULL;
- io_queue_sqe(head);
- }
- } else {
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- link->head = req;
- link->last = req;
- } else {
- io_queue_sqe(req);
- }
+ link->head = NULL;
+ req = head;
+ } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ link->head = req;
+ link->last = req;
+ return 0;
}
+ io_queue_sqe(req);
return 0;
}
/*
* Batched submission is done, ensure local IO is flushed out.
*/
-static void io_submit_state_end(struct io_submit_state *state,
- struct io_ring_ctx *ctx)
+static void io_submit_state_end(struct io_ring_ctx *ctx)
{
+ struct io_submit_state *state = &ctx->submit_state;
+
if (state->link.head)
io_queue_sqe(state->link.head);
- if (state->compl_nr)
- io_submit_flush_completions(ctx);
+ /* flush only after queuing links as they can generate completions */
+ io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
}
@@ -7209,7 +7785,8 @@ static void io_submit_state_start(struct io_submit_state *state,
unsigned int max_ios)
{
state->plug_started = false;
- state->ios_left = max_ios;
+ state->need_plug = max_ios > 2;
+ state->submit_nr = max_ios;
/* set only head, no need to init link_last in advance */
state->link.head = NULL;
}
@@ -7261,45 +7838,51 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
+ unsigned int entries = io_sqring_entries(ctx);
int submitted = 0;
+ if (unlikely(!entries))
+ return 0;
/* make sure SQ entry isn't read before tail */
- nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
- if (!percpu_ref_tryget_many(&ctx->refs, nr))
- return -EAGAIN;
+ nr = min3(nr, ctx->sq_entries, entries);
io_get_task_refs(nr);
io_submit_state_start(&ctx->submit_state, nr);
- while (submitted < nr) {
+ do {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- req = io_alloc_req(ctx);
- if (unlikely(!req)) {
+ if (unlikely(!io_alloc_req_refill(ctx))) {
if (!submitted)
submitted = -EAGAIN;
break;
}
+ req = io_alloc_req(ctx);
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
- list_add(&req->inflight_entry, &ctx->submit_state.free_list);
+ wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
break;
}
/* will complete beyond this point, count as submitted */
submitted++;
- if (io_submit_sqe(ctx, req, sqe))
- break;
- }
+ if (io_submit_sqe(ctx, req, sqe)) {
+ /*
+ * Continue submitting even for sqe failure if the
+ * ring was setup with IORING_SETUP_SUBMIT_ALL
+ */
+ if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
+ break;
+ }
+ } while (submitted < nr);
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
int unused = nr - ref_used;
current->io_uring->cached_refs += unused;
- percpu_ref_put_many(&ctx->refs, unused);
}
- io_submit_state_end(&ctx->submit_state, ctx);
+ io_submit_state_end(ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
@@ -7338,16 +7921,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
- if (!list_empty(&ctx->iopoll_list) || to_submit) {
- unsigned nr_events = 0;
+ if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
creds = override_creds(ctx->sq_creds);
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, &nr_events, 0);
+ if (!wq_list_empty(&ctx->iopoll_list))
+ io_do_iopoll(ctx, true);
/*
* Don't submit if refs are dying, good for io_uring_register(),
@@ -7367,7 +7949,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
return ret;
}
-static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
struct io_ring_ctx *ctx;
unsigned sq_thread_idle = 0;
@@ -7410,6 +7992,8 @@ static int io_sq_thread(void *data)
set_cpus_allowed_ptr(current, cpu_online_mask);
current->flags |= PF_NO_SETAFFINITY;
+ audit_alloc_kernel(current);
+
mutex_lock(&sqd->lock);
while (1) {
bool cap_entries, sqt_spin = false;
@@ -7424,7 +8008,7 @@ static int io_sq_thread(void *data)
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
- if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+ if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
if (io_run_task_work())
@@ -7438,17 +8022,24 @@ static int io_sq_thread(void *data)
}
prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
- if (!io_sqd_events_pending(sqd) && !current->task_works) {
+ if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
bool needs_sched = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
io_ring_set_wakeup_flag(ctx);
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
+ !wq_list_empty(&ctx->iopoll_list)) {
needs_sched = false;
break;
}
+
+ /*
+ * Ensure the store of the wakeup flag is not
+ * reordered with the load of the SQ tail
+ */
+ smp_mb();
+
if (io_sqring_entries(ctx)) {
needs_sched = false;
break;
@@ -7475,6 +8066,8 @@ static int io_sq_thread(void *data)
io_run_task_work();
mutex_unlock(&sqd->lock);
+ audit_free(current);
+
complete(&sqd->exited);
do_exit(0);
}
@@ -7518,17 +8111,17 @@ static int io_run_task_work_sig(void)
{
if (io_run_task_work())
return 1;
- if (!signal_pending(current))
- return 0;
if (test_thread_flag(TIF_NOTIFY_SIGNAL))
return -ERESTARTSYS;
- return -EINTR;
+ if (task_sigpending(current))
+ return -EINTR;
+ return 0;
}
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
- signed long *timeout)
+ ktime_t timeout)
{
int ret;
@@ -7540,8 +8133,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (test_bit(0, &ctx->check_cq_overflow))
return 1;
- *timeout = schedule_timeout(*timeout);
- return !*timeout ? -ETIME : 1;
+ if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
+ return -ETIME;
+ return 1;
}
/*
@@ -7554,7 +8148,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
{
struct io_wait_queue iowq;
struct io_rings *rings = ctx->rings;
- signed long timeout = MAX_SCHEDULE_TIMEOUT;
+ ktime_t timeout = KTIME_MAX;
int ret;
do {
@@ -7565,14 +8159,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
} while (1);
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- timeout = timespec64_to_jiffies(&ts);
- }
-
if (sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
@@ -7586,6 +8172,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
+ if (uts) {
+ struct timespec64 ts;
+
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+ }
+
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
@@ -7602,7 +8196,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
}
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+ ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
@@ -7621,7 +8215,7 @@ static void io_free_page_table(void **table, size_t size)
kfree(table);
}
-static void **io_alloc_page_table(size_t size)
+static __cold void **io_alloc_page_table(size_t size)
{
unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
size_t init_size = size;
@@ -7650,16 +8244,21 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
kfree(ref_node);
}
-static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
+static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
unsigned long flags;
bool first_add = false;
+ unsigned long delay = HZ;
spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
node->done = true;
+ /* if we are mid-quiesce then do not delay */
+ if (node->rsrc_data->quiesce)
+ delay = 0;
+
while (!list_empty(&ctx->rsrc_ref_list)) {
node = list_first_entry(&ctx->rsrc_ref_list,
struct io_rsrc_node, node);
@@ -7672,10 +8271,10 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
if (first_add)
- mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
+ mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}
-static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
+static struct io_rsrc_node *io_rsrc_node_alloc(void)
{
struct io_rsrc_node *ref_node;
@@ -7696,10 +8295,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill)
+ __must_hold(&ctx->uring_lock)
{
WARN_ON_ONCE(!ctx->rsrc_backup_node);
WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
+ io_rsrc_refs_drop(ctx);
+
if (data_to_kill) {
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
@@ -7723,11 +8325,12 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
if (ctx->rsrc_backup_node)
return 0;
- ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
+ ctx->rsrc_backup_node = io_rsrc_node_alloc();
return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}
-static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
+static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
+ struct io_ring_ctx *ctx)
{
int ret;
@@ -7750,7 +8353,15 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct
ret = wait_for_completion_interruptible(&data->done);
if (!ret) {
mutex_lock(&ctx->uring_lock);
- break;
+ if (atomic_read(&data->refs) > 0) {
+ /*
+ * it has been revived by another thread while
+ * we were unlocked
+ */
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ break;
+ }
}
atomic_inc(&data->refs);
@@ -7783,9 +8394,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data)
kfree(data);
}
-static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
- u64 __user *utags, unsigned nr,
- struct io_rsrc_data **pdata)
+static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+ u64 __user *utags, unsigned nr,
+ struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
int ret = -ENOMEM;
@@ -8045,10 +8656,15 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_queue_head(&sk->sk_receive_queue, skb);
- for (i = 0; i < nr_files; i++)
- fput(fpl->fp[i]);
+ for (i = 0; i < nr; i++) {
+ struct file *file = io_file_from_index(ctx, i + offset);
+
+ if (file)
+ fput(file);
+ }
} else {
kfree_skb(skb);
+ free_uid(fpl->user);
kfree(fpl);
}
@@ -8174,8 +8790,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
io_ring_submit_lock(ctx, lock_ring);
spin_lock(&ctx->completion_lock);
- io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
- ctx->cq_extra++;
+ io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
@@ -8337,13 +8952,15 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc)
{
+ u64 *tag_slot = io_get_tag_slot(data, idx);
struct io_rsrc_put *prsrc;
prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
if (!prsrc)
return -ENOMEM;
- prsrc->tag = *io_get_tag_slot(data, idx);
+ prsrc->tag = *tag_slot;
+ *tag_slot = 0;
prsrc->rsrc = rsrc;
list_add(&prsrc->list, &node->rsrc_list);
return 0;
@@ -8353,12 +8970,12 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
unsigned int issue_flags, u32 slot_index)
{
struct io_ring_ctx *ctx = req->ctx;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret = -EBADF;
- io_ring_submit_lock(ctx, !force_nonblock);
+ io_ring_submit_lock(ctx, needs_lock);
if (file->f_op == &io_uring_fops)
goto err;
ret = -ENXIO;
@@ -8399,7 +9016,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
- io_ring_submit_unlock(ctx, !force_nonblock);
+ io_ring_submit_unlock(ctx, needs_lock);
if (ret)
fput(file);
return ret;
@@ -8409,11 +9026,12 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
unsigned int offset = req->close.file_slot - 1;
struct io_ring_ctx *ctx = req->ctx;
+ bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
struct io_fixed_file *file_slot;
struct file *file;
- int ret, i;
+ int ret;
- io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_lock(ctx, needs_lock);
ret = -ENXIO;
if (unlikely(!ctx->file_data))
goto out;
@@ -8424,8 +9042,8 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
if (ret)
goto out;
- i = array_index_nospec(offset, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, i);
+ offset = array_index_nospec(offset, ctx->nr_user_files);
+ file_slot = io_fixed_file_slot(&ctx->file_table, offset);
ret = -EBADF;
if (!file_slot->file_ptr)
goto out;
@@ -8439,7 +9057,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
io_rsrc_node_switch(ctx, ctx->file_data);
ret = 0;
out:
- io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+ io_ring_submit_unlock(ctx, needs_lock);
return ret;
}
@@ -8481,8 +9099,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (file_slot->file_ptr) {
file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- err = io_queue_rsrc_removal(data, up->offset + done,
- ctx->rsrc_node, file);
+ err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
if (err)
break;
file_slot->file_ptr = 0;
@@ -8507,7 +9124,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- *io_get_tag_slot(data, up->offset + done) = tag;
+ *io_get_tag_slot(data, i) = tag;
io_fixed_file_set(file_slot, file);
err = io_sqe_file_register(ctx, file, i);
if (err) {
@@ -8555,8 +9172,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
return io_wq_create(concurrency, &data);
}
-static int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx)
+static __cold int io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
@@ -8565,8 +9182,16 @@ static int io_uring_alloc_task_context(struct task_struct *task,
if (unlikely(!tctx))
return -ENOMEM;
+ tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+ sizeof(struct file *), GFP_KERNEL);
+ if (unlikely(!tctx->registered_rings)) {
+ kfree(tctx);
+ return -ENOMEM;
+ }
+
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
if (unlikely(ret)) {
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8575,6 +9200,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
if (IS_ERR(tctx->io_wq)) {
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8582,10 +9208,10 @@ static int io_uring_alloc_task_context(struct task_struct *task,
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_idle, 0);
- atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
+ INIT_WQ_LIST(&tctx->prior_task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
@@ -8598,13 +9224,14 @@ void __io_uring_free(struct task_struct *tsk)
WARN_ON_ONCE(tctx->io_wq);
WARN_ON_ONCE(tctx->cached_refs);
+ kfree(tctx->registered_rings);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
}
-static int io_sq_offload_create(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
int ret;
@@ -8627,6 +9254,10 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_sq_data *sqd;
bool attached;
+ ret = security_uring_sqpoll();
+ if (ret)
+ return ret;
+
sqd = io_get_sq_data(p, &attached);
if (IS_ERR(sqd)) {
ret = PTR_ERR(sqd);
@@ -8753,10 +9384,9 @@ static void io_mem_free(void *ptr)
static void *io_mem_alloc(size_t size)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
- __GFP_NORETRY | __GFP_ACCOUNT;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
- return (void *) __get_free_pages(gfp_flags, get_order(size));
+ return (void *) __get_free_pages(gfp, get_order(size));
}
static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
@@ -9152,7 +9782,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
i = array_index_nospec(offset, ctx->nr_user_bufs);
if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
- err = io_queue_rsrc_removal(ctx->buf_data, offset,
+ err = io_queue_rsrc_removal(ctx->buf_data, i,
ctx->rsrc_node, ctx->user_bufs[i]);
if (unlikely(err)) {
io_buffer_unmap(ctx, &imu);
@@ -9171,33 +9801,55 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
return done ? done : err;
}
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
+static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int eventfd_async)
{
+ struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
- if (ctx->cq_ev_fd)
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
- ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ctx->cq_ev_fd)) {
- int ret = PTR_ERR(ctx->cq_ev_fd);
+ ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+ if (!ev_fd)
+ return -ENOMEM;
- ctx->cq_ev_fd = NULL;
+ ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(ev_fd->cq_ev_fd)) {
+ int ret = PTR_ERR(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
return ret;
}
-
+ ev_fd->eventfd_async = eventfd_async;
+ ctx->has_evfd = true;
+ rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0;
}
+static void io_eventfd_put(struct rcu_head *rcu)
+{
+ struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+
+ eventfd_ctx_put(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+}
+
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
- if (ctx->cq_ev_fd) {
- eventfd_ctx_put(ctx->cq_ev_fd);
- ctx->cq_ev_fd = NULL;
+ struct io_ev_fd *ev_fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd) {
+ ctx->has_evfd = false;
+ rcu_assign_pointer(ctx->io_ev_fd, NULL);
+ call_rcu(&ev_fd->rcu, io_eventfd_put);
return 0;
}
@@ -9206,38 +9858,49 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
- struct io_buffer *buf;
- unsigned long index;
+ int i;
- xa_for_each(&ctx->io_buffers, index, buf) {
- __io_remove_buffers(ctx, buf, index, -1U);
- cond_resched();
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
+ struct list_head *list = &ctx->io_buffers[i];
+
+ while (!list_empty(list)) {
+ struct io_buffer_list *bl;
+
+ bl = list_first_entry(list, struct io_buffer_list, list);
+ __io_remove_buffers(ctx, bl, -1U);
+ list_del(&bl->list);
+ kfree(bl);
+ }
}
-}
-static void io_req_cache_free(struct list_head *list)
-{
- struct io_kiocb *req, *nxt;
+ while (!list_empty(&ctx->io_buffers_pages)) {
+ struct page *page;
- list_for_each_entry_safe(req, nxt, list, inflight_entry) {
- list_del(&req->inflight_entry);
- kmem_cache_free(req_cachep, req);
+ page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
}
}
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
+ int nr = 0;
mutex_lock(&ctx->uring_lock);
+ io_flush_cached_locked_reqs(ctx, state);
- if (state->free_reqs) {
- kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
- state->free_reqs = 0;
- }
+ while (state->free_list.next) {
+ struct io_wq_work_node *node;
+ struct io_kiocb *req;
- io_flush_cached_locked_reqs(ctx, state);
- io_req_cache_free(&state->free_list);
+ node = wq_stack_extract(&state->free_list);
+ req = container_of(node, struct io_kiocb, comp_list);
+ kmem_cache_free(req_cachep, req);
+ nr++;
+ }
+ if (nr)
+ percpu_ref_put_many(&ctx->refs, nr);
mutex_unlock(&ctx->uring_lock);
}
@@ -9247,7 +9910,19 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
wait_for_completion(&data->done);
}
-static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
+{
+ struct async_poll *apoll;
+
+ while (!list_empty(&ctx->apoll_cache)) {
+ apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+ poll.wait.entry);
+ list_del(&apoll->poll.wait.entry);
+ kfree(apoll);
+ }
+}
+
+static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -9256,6 +9931,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
+ io_rsrc_refs_drop(ctx);
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
io_wait_rsrc_data(ctx->buf_data);
io_wait_rsrc_data(ctx->file_data);
@@ -9267,8 +9943,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
__io_sqe_files_unregister(ctx);
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
- mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
+ io_flush_apoll_cache(ctx);
+ mutex_unlock(&ctx->uring_lock);
io_destroy_buffers(ctx);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
@@ -9279,6 +9956,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
if (ctx->rsrc_backup_node)
io_rsrc_node_destroy(ctx->rsrc_backup_node);
flush_delayed_work(&ctx->rsrc_put_work);
+ flush_delayed_work(&ctx->fallback_work);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
@@ -9301,6 +9979,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_hash);
kfree(ctx->dummy_ubuf);
+ kfree(ctx->io_buffers);
kfree(ctx);
}
@@ -9309,7 +9988,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- poll_wait(file, &ctx->poll_wait, wait);
+ poll_wait(file, &ctx->cq_wait, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
@@ -9356,7 +10035,7 @@ struct io_tctx_exit {
struct io_ring_ctx *ctx;
};
-static void io_tctx_exit_cb(struct callback_head *cb)
+static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_exit *work;
@@ -9371,14 +10050,14 @@ static void io_tctx_exit_cb(struct callback_head *cb)
complete(&work->completion);
}
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
return req->ctx == data;
}
-static void io_ring_exit_work(struct work_struct *work)
+static __cold void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
unsigned long timeout = jiffies + HZ * 60 * 5;
@@ -9407,6 +10086,8 @@ static void io_ring_exit_work(struct work_struct *work)
io_sq_thread_unpark(sqd);
}
+ io_req_caches_free(ctx);
+
if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
/* there is little hope left, don't run it too often */
interval = HZ * 60;
@@ -9433,7 +10114,6 @@ static void io_ring_exit_work(struct work_struct *work)
ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
if (WARN_ON_ONCE(ret))
continue;
- wake_up_process(node->task);
mutex_unlock(&ctx->uring_lock);
wait_for_completion(&exit.completion);
@@ -9447,8 +10127,8 @@ static void io_ring_exit_work(struct work_struct *work)
}
/* Returns true if we found and killed one or more timeouts */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
+static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
+ struct task_struct *tsk, bool cancel_all)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
@@ -9470,7 +10150,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
return canceled != 0;
}
-static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
+static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
unsigned long index;
struct creds *creds;
@@ -9517,30 +10197,20 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_task_cancel *cancel = data;
- bool ret;
-
- if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
- struct io_ring_ctx *ctx = req->ctx;
- /* protect against races with linked timeouts */
- spin_lock(&ctx->completion_lock);
- ret = io_match_task(req, cancel->task, cancel->all);
- spin_unlock(&ctx->completion_lock);
- } else {
- ret = io_match_task(req, cancel->task, cancel->all);
- }
- return ret;
+ return io_match_task_safe(req, cancel->task, cancel->all);
}
-static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
- struct task_struct *task, bool cancel_all)
+static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_defer_entry *de;
LIST_HEAD(list);
spin_lock(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_match_task(de->req, task, cancel_all)) {
+ if (io_match_task_safe(de->req, task, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
@@ -9558,7 +10228,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
return true;
}
-static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
+static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
struct io_tctx_node *node;
enum io_wq_cancel cret;
@@ -9582,9 +10252,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
-static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all)
+static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ bool cancel_all)
{
struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
struct io_uring_task *tctx = task ? task->io_uring : NULL;
@@ -9608,7 +10278,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
/* SQPOLL thread does its own polling */
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
(ctx->sq_data && ctx->sq_data->thread == current)) {
- while (!list_empty_careful(&ctx->iopoll_list)) {
+ while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
}
@@ -9683,7 +10353,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
/*
* Remove this io_uring_file -> task mapping.
*/
-static void io_uring_del_tctx_node(unsigned long index)
+static __cold void io_uring_del_tctx_node(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -9706,7 +10376,7 @@ static void io_uring_del_tctx_node(unsigned long index)
kfree(node);
}
-static void io_uring_clean_tctx(struct io_uring_task *tctx)
+static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
struct io_wq *wq = tctx->io_wq;
struct io_tctx_node *node;
@@ -9718,7 +10388,7 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
}
if (wq) {
/*
- * Must be after io_uring_del_task_file() (removes nodes under
+ * Must be after io_uring_del_tctx_node() (removes nodes under
* uring_lock) to avoid race with io_uring_try_cancel_iowq().
*/
io_wq_put_and_exit(wq);
@@ -9729,27 +10399,16 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
if (tracked)
- return atomic_read(&tctx->inflight_tracked);
+ return 0;
return percpu_counter_sum(&tctx->inflight);
}
-static void io_uring_drop_tctx_refs(struct task_struct *task)
-{
- struct io_uring_task *tctx = task->io_uring;
- unsigned int refs = tctx->cached_refs;
-
- if (refs) {
- tctx->cached_refs = 0;
- percpu_counter_sub(&tctx->inflight, refs);
- put_task_struct_many(task, refs);
- }
-}
-
/*
* Find any io_uring ctx that this task has registered or done IO on, and cancel
- * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
+ * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
*/
-static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
+static __cold void io_uring_cancel_generic(bool cancel_all,
+ struct io_sq_data *sqd)
{
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx;
@@ -9788,8 +10447,10 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
cancel_all);
}
- prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
+ io_run_task_work();
io_uring_drop_tctx_refs(current);
+
/*
* If we've seen completions, retry without waiting. This
* avoids a race where a completion comes in before we did
@@ -9799,10 +10460,14 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
schedule();
finish_wait(&tctx->wait, &wait);
} while (1);
- atomic_dec(&tctx->in_idle);
io_uring_clean_tctx(tctx);
if (cancel_all) {
+ /*
+ * We shouldn't run task_works after cancel, so just leave
+ * ->in_idle set for normal exit.
+ */
+ atomic_dec(&tctx->in_idle);
/* for exec all current's requests should be gone, kill tctx */
__io_uring_free(current);
}
@@ -9813,6 +10478,144 @@ void __io_uring_cancel(bool cancel_all)
io_uring_cancel_generic(cancel_all, NULL);
}
+void io_uring_unreg_ringfd(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ int i;
+
+ for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+ if (tctx->registered_rings[i]) {
+ fput(tctx->registered_rings[i]);
+ tctx->registered_rings[i] = NULL;
+ }
+ }
+}
+
+static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
+ int start, int end)
+{
+ struct file *file;
+ int offset;
+
+ for (offset = start; offset < end; offset++) {
+ offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
+ if (tctx->registered_rings[offset])
+ continue;
+
+ file = fget(fd);
+ if (!file) {
+ return -EBADF;
+ } else if (file->f_op != &io_uring_fops) {
+ fput(file);
+ return -EOPNOTSUPP;
+ }
+ tctx->registered_rings[offset] = file;
+ return offset;
+ }
+
+ return -EBUSY;
+}
+
+/*
+ * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
+ * invocation. User passes in an array of struct io_uring_rsrc_update
+ * with ->data set to the ring_fd, and ->offset given for the desired
+ * index. If no index is desired, application may set ->offset == -1U
+ * and we'll find an available index. Returns number of entries
+ * successfully processed, or < 0 on error if none were processed.
+ */
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update __user *arg = __arg;
+ struct io_uring_rsrc_update reg;
+ struct io_uring_task *tctx;
+ int ret, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+
+ mutex_unlock(&ctx->uring_lock);
+ ret = io_uring_add_tctx_node(ctx);
+ mutex_lock(&ctx->uring_lock);
+ if (ret)
+ return ret;
+
+ tctx = current->io_uring;
+ for (i = 0; i < nr_args; i++) {
+ int start, end;
+
+ if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (reg.resv) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (reg.offset == -1U) {
+ start = 0;
+ end = IO_RINGFD_REG_MAX;
+ } else {
+ if (reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+ start = reg.offset;
+ end = start + 1;
+ }
+
+ ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
+ if (ret < 0)
+ break;
+
+ reg.offset = ret;
+ if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ return i ? i : ret;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update __user *arg = __arg;
+ struct io_uring_task *tctx = current->io_uring;
+ struct io_uring_rsrc_update reg;
+ int ret = 0, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ if (!tctx)
+ return 0;
+
+ for (i = 0; i < nr_args; i++) {
+ if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
+ if (tctx->registered_rings[reg.offset]) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ }
+ }
+
+ return i ? i : ret;
+}
+
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
@@ -9842,7 +10645,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
#ifdef CONFIG_MMU
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
@@ -9925,6 +10728,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz
return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
+ if (arg.pad)
+ return -EINVAL;
*sig = u64_to_user_ptr(arg.sigmask);
*argsz = arg.sigmask_sz;
*ts = u64_to_user_ptr(arg.ts);
@@ -9943,12 +10748,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+ IORING_ENTER_REGISTERED_RING)))
return -EINVAL;
- f = fdget(fd);
- if (unlikely(!f.file))
- return -EBADF;
+ /*
+ * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
+ * need only dereference our task private array to find it.
+ */
+ if (flags & IORING_ENTER_REGISTERED_RING) {
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (!tctx || fd >= IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
+ f.file = tctx->registered_rings[fd];
+ if (unlikely(!f.file))
+ return -EBADF;
+ } else {
+ f = fdget(fd);
+ if (unlikely(!f.file))
+ return -EBADF;
+ }
ret = -EOPNOTSUPP;
if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10022,12 +10843,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
out:
percpu_ref_put(&ctx->refs);
out_fput:
- fdput(f);
+ if (!(flags & IORING_ENTER_REGISTERED_RING))
+ fdput(f);
return submitted ? submitted : ret;
}
#ifdef CONFIG_PROC_FS
-static int io_uring_show_cred(struct seq_file *m, unsigned int id,
+static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
const struct cred *cred)
{
struct user_namespace *uns = seq_user_ns(m);
@@ -10059,11 +10881,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0;
}
-static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
+ struct seq_file *m)
{
struct io_sq_data *sq = NULL;
+ struct io_overflow_cqe *ocqe;
+ struct io_rings *r = ctx->rings;
+ unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
+ unsigned int sq_head = READ_ONCE(r->sq.head);
+ unsigned int sq_tail = READ_ONCE(r->sq.tail);
+ unsigned int cq_head = READ_ONCE(r->cq.head);
+ unsigned int cq_tail = READ_ONCE(r->cq.tail);
+ unsigned int sq_entries, cq_entries;
bool has_lock;
- int i;
+ unsigned int i;
+
+ /*
+ * we may get imprecise sqe and cqe info if uring is actively running
+ * since we get cached_sq_head and cached_cq_tail without uring_lock
+ * and sq_tail and cq_head are changed by userspace. But it's ok since
+ * we usually use these info when it is stuck.
+ */
+ seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
+ seq_printf(m, "SqHead:\t%u\n", sq_head);
+ seq_printf(m, "SqTail:\t%u\n", sq_tail);
+ seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
+ seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
+ seq_printf(m, "CqHead:\t%u\n", cq_head);
+ seq_printf(m, "CqTail:\t%u\n", cq_tail);
+ seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
+ seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
+ sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
+ for (i = 0; i < sq_entries; i++) {
+ unsigned int entry = i + sq_head;
+ unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
+ struct io_uring_sqe *sqe;
+
+ if (sq_idx > sq_mask)
+ continue;
+ sqe = &ctx->sq_sqes[sq_idx];
+ seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
+ sq_idx, sqe->opcode, sqe->fd, sqe->flags,
+ sqe->user_data);
+ }
+ seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
+ cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
+ for (i = 0; i < cq_entries; i++) {
+ unsigned int entry = i + cq_head;
+ struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
+
+ seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
+ entry & cq_mask, cqe->user_data, cqe->res,
+ cqe->flags);
+ }
/*
* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
@@ -10105,7 +10975,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
xa_for_each(&ctx->personalities, index, cred)
io_uring_show_cred(m, index, cred);
}
- seq_printf(m, "PollList:\n");
+ if (has_lock)
+ mutex_unlock(&ctx->uring_lock);
+
+ seq_puts(m, "PollList:\n");
spin_lock(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
struct hlist_head *list = &ctx->cancel_hash[i];
@@ -10113,14 +10986,22 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
hlist_for_each_entry(req, list, hash_node)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
- req->task->task_works != NULL);
+ task_work_pending(req->task));
+ }
+
+ seq_puts(m, "CqOverflowList:\n");
+ list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
+ struct io_uring_cqe *cqe = &ocqe->cqe;
+
+ seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
+ cqe->user_data, cqe->res, cqe->flags);
+
}
+
spin_unlock(&ctx->completion_lock);
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
}
-static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
+static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
@@ -10144,8 +11025,8 @@ static const struct file_operations io_uring_fops = {
#endif
};
-static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
{
struct io_rings *rings;
size_t size, sq_array_offset;
@@ -10221,8 +11102,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return ERR_PTR(ret);
#endif
- file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
- O_RDWR | O_CLOEXEC);
+ file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
+ O_RDWR | O_CLOEXEC, NULL);
#if defined(CONFIG_UNIX)
if (IS_ERR(file)) {
sock_release(ctx->ring_sock);
@@ -10234,8 +11115,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
return file;
}
-static int io_uring_create(unsigned entries, struct io_uring_params *p,
- struct io_uring_params __user *params)
+static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
+ struct io_uring_params __user *params)
{
struct io_ring_ctx *ctx;
struct file *file;
@@ -10330,7 +11211,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
- IORING_FEAT_RSRC_TAGS;
+ IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
+ IORING_FEAT_LINKED_FILE;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -10381,7 +11263,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED))
+ IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -10393,7 +11275,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
-static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
@@ -10449,8 +11332,8 @@ static int io_register_personality(struct io_ring_ctx *ctx)
return id;
}
-static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args)
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned int nr_args)
{
struct io_uring_restriction *res;
size_t size;
@@ -10540,8 +11423,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
__u32 tmp;
int err;
- if (up->resv)
- return -EINVAL;
if (check_add_overflow(up->offset, nr_args, &tmp))
return -EOVERFLOW;
err = io_rsrc_node_switch_start(ctx);
@@ -10567,6 +11448,8 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
memset(&up, 0, sizeof(up));
if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
return -EFAULT;
+ if (up.resv || up.resv2)
+ return -EINVAL;
return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}
@@ -10579,12 +11462,12 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
if (copy_from_user(&up, arg, sizeof(up)))
return -EFAULT;
- if (!up.nr || up.resv)
+ if (!up.nr || up.resv || up.resv2)
return -EINVAL;
return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
-static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type)
{
struct io_uring_rsrc_register rr;
@@ -10610,8 +11493,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
}
-static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
- unsigned len)
+static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned len)
{
struct io_uring_task *tctx = current->io_uring;
cpumask_var_t new_mask;
@@ -10627,7 +11510,15 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
if (len > cpumask_size())
len = cpumask_size();
- if (copy_from_user(new_mask, arg, len)) {
+ if (in_compat_syscall()) {
+ ret = compat_get_bitmap(cpumask_bits(new_mask),
+ (const compat_ulong_t __user *)arg,
+ len * 8 /* CHAR_BIT */);
+ } else {
+ ret = copy_from_user(new_mask, arg, len);
+ }
+
+ if (ret) {
free_cpumask_var(new_mask);
return -EFAULT;
}
@@ -10637,7 +11528,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
@@ -10647,8 +11538,8 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
return io_wq_cpu_affinity(tctx->io_wq, NULL);
}
-static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
+static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ void __user *arg)
__must_hold(&ctx->uring_lock)
{
struct io_tctx_node *node;
@@ -10684,10 +11575,11 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
- memcpy(ctx->iowq_limits, new_count, sizeof(new_count));
+ for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ if (new_count[i])
+ ctx->iowq_limits[i] = new_count[i];
ctx->iowq_limits_set = true;
- ret = -EINVAL;
if (tctx && tctx->io_wq) {
ret = io_wq_max_workers(tctx->io_wq, new_count);
if (ret)
@@ -10729,57 +11621,6 @@ err:
return ret;
}
-static bool io_register_op_must_quiesce(int op)
-{
- switch (op) {
- case IORING_REGISTER_BUFFERS:
- case IORING_UNREGISTER_BUFFERS:
- case IORING_REGISTER_FILES:
- case IORING_UNREGISTER_FILES:
- case IORING_REGISTER_FILES_UPDATE:
- case IORING_REGISTER_PROBE:
- case IORING_REGISTER_PERSONALITY:
- case IORING_UNREGISTER_PERSONALITY:
- case IORING_REGISTER_FILES2:
- case IORING_REGISTER_FILES_UPDATE2:
- case IORING_REGISTER_BUFFERS2:
- case IORING_REGISTER_BUFFERS_UPDATE:
- case IORING_REGISTER_IOWQ_AFF:
- case IORING_UNREGISTER_IOWQ_AFF:
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- return false;
- default:
- return true;
- }
-}
-
-static int io_ctx_quiesce(struct io_ring_ctx *ctx)
-{
- long ret;
-
- percpu_ref_kill(&ctx->refs);
-
- /*
- * Drop uring mutex before waiting for references to exit. If another
- * thread is currently inside io_uring_enter() it might need to grab the
- * uring_lock to make progress. If we hold it here across the drain
- * wait, then we can deadlock. It's safe to drop the mutex here, since
- * no new references will come in after we've killed the percpu ref.
- */
- mutex_unlock(&ctx->uring_lock);
- do {
- ret = wait_for_completion_interruptible(&ctx->ref_comp);
- if (!ret)
- break;
- ret = io_run_task_work_sig();
- } while (ret >= 0);
- mutex_lock(&ctx->uring_lock);
-
- if (ret)
- io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
- return ret;
-}
-
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -10803,12 +11644,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return -EACCES;
}
- if (io_register_op_must_quiesce(opcode)) {
- ret = io_ctx_quiesce(ctx);
- if (ret)
- return ret;
- }
-
switch (opcode) {
case IORING_REGISTER_BUFFERS:
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
@@ -10832,17 +11667,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_register_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
- case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL;
if (nr_args != 1)
break;
- ret = io_eventfd_register(ctx, arg);
- if (ret)
+ ret = io_eventfd_register(ctx, arg, 0);
+ break;
+ case IORING_REGISTER_EVENTFD_ASYNC:
+ ret = -EINVAL;
+ if (nr_args != 1)
break;
- if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
- ctx->eventfd_async = 1;
- else
- ctx->eventfd_async = 0;
+ ret = io_eventfd_register(ctx, arg, 1);
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
@@ -10909,16 +11743,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_iowq_max_workers(ctx, arg);
break;
+ case IORING_REGISTER_RING_FDS:
+ ret = io_ringfd_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_RING_FDS:
+ ret = io_ringfd_unregister(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
}
- if (io_register_op_must_quiesce(opcode)) {
- /* bring the ctx back to life */
- percpu_ref_reinit(&ctx->refs);
- reinit_completion(&ctx->ref_comp);
- }
return ret;
}
@@ -10944,8 +11779,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
- ctx->cq_ev_fd != NULL, ret);
+ trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
fdput(f);
return ret;
@@ -11002,6 +11836,8 @@ static int __init io_uring_init(void)
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+ BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
+ BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));