Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--	fs/io_uring.c	301
1 file changed, 162 insertions, 139 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5190bfb6a665..c687f57fb651 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -357,7 +357,6 @@ struct io_timeout_data {
 	struct hrtimer			timer;
 	struct timespec64		ts;
 	enum hrtimer_mode		mode;
-	u32				seq_offset;
 };
 
 struct io_accept {
@@ -385,7 +384,7 @@ struct io_timeout {
 	struct file			*file;
 	u64				addr;
 	int				flags;
-	unsigned			count;
+	u32				count;
 };
 
 struct io_rw {
@@ -508,6 +507,7 @@ enum {
 	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
 
+	REQ_F_LINK_HEAD_BIT,
 	REQ_F_LINK_NEXT_BIT,
 	REQ_F_FAIL_LINK_BIT,
 	REQ_F_INFLIGHT_BIT,
@@ -543,6 +543,8 @@ enum {
 	/* IOSQE_BUFFER_SELECT */
 	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
 
+	/* head of a link */
+	REQ_F_LINK_HEAD		= BIT(REQ_F_LINK_HEAD_BIT),
 	/* already grabbed next link */
 	REQ_F_LINK_NEXT		= BIT(REQ_F_LINK_NEXT_BIT),
 	/* fail rest of links */
@@ -955,8 +957,8 @@ static inline bool __req_need_defer(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
-					+ atomic_read(&ctx->cached_cq_overflow);
+	return req->sequence != ctx->cached_cq_tail
+				+ atomic_read(&ctx->cached_cq_overflow);
 }
 
 static inline bool req_need_defer(struct io_kiocb *req)
@@ -1437,7 +1439,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
 	if (ret != -1) {
 		io_cqring_fill_event(req, -ECANCELED);
 		io_commit_cqring(ctx);
-		req->flags &= ~REQ_F_LINK;
+		req->flags &= ~REQ_F_LINK_HEAD;
 		io_put_req(req);
 		return true;
 	}
@@ -1473,7 +1475,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 
 		list_del_init(&req->link_list);
 		if (!list_empty(&nxt->link_list))
-			nxt->flags |= REQ_F_LINK;
+			nxt->flags |= REQ_F_LINK_HEAD;
 		*nxtptr = nxt;
 		break;
 	}
@@ -1484,7 +1486,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 }
 
 /*
- * Called if REQ_F_LINK is set, and we fail the head request
+ * Called if REQ_F_LINK_HEAD is set, and we fail the head request
  */
 static void io_fail_links(struct io_kiocb *req)
 {
@@ -1517,7 +1519,7 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 {
-	if (likely(!(req->flags & REQ_F_LINK)))
+	if (likely(!(req->flags & REQ_F_LINK_HEAD)))
 		return;
 
 	/*
@@ -1669,7 +1671,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 
 static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
 {
-	if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
+	if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
 		return false;
 
 	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
@@ -2562,7 +2564,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 
 	req->result = 0;
 	io_size = ret;
-	if (req->flags & REQ_F_LINK)
+	if (req->flags & REQ_F_LINK_HEAD)
 		req->result = io_size;
 
 	/*
@@ -2653,7 +2655,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 
 	req->result = 0;
 	io_size = ret;
-	if (req->flags & REQ_F_LINK)
+	if (req->flags & REQ_F_LINK_HEAD)
 		req->result = io_size;
 
 	/*
@@ -2760,7 +2762,7 @@ static bool io_splice_punt(struct file *file)
 		return false;
 	if (!io_file_supports_async(file))
 		return true;
-	return !(file->f_mode & O_NONBLOCK);
+	return !(file->f_flags & O_NONBLOCK);
 }
 
 static int io_splice(struct io_kiocb *req, bool force_nonblock)
@@ -4153,25 +4155,62 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	return 1;
 }
 
+static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
+	__acquires(&req->ctx->completion_lock)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!req->result && !READ_ONCE(poll->canceled)) {
+		struct poll_table_struct pt = { ._key = poll->events };
+
+		req->result = vfs_poll(req->file, &pt) & poll->events;
+	}
+
+	spin_lock_irq(&ctx->completion_lock);
+	if (!req->result && !READ_ONCE(poll->canceled)) {
+		add_wait_queue(poll->head, &poll->wait);
+		return true;
+	}
+
+	return false;
+}
+
 static void io_async_task_func(struct callback_head *cb)
 {
 	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
 	struct async_poll *apoll = req->apoll;
 	struct io_ring_ctx *ctx = req->ctx;
+	bool canceled;
 
 	trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
 
-	WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
+	if (io_poll_rewait(req, &apoll->poll)) {
+		spin_unlock_irq(&ctx->completion_lock);
+		return;
+	}
 
-	if (hash_hashed(&req->hash_node)) {
-		spin_lock_irq(&ctx->completion_lock);
+	if (hash_hashed(&req->hash_node))
 		hash_del(&req->hash_node);
-		spin_unlock_irq(&ctx->completion_lock);
+
+	canceled = READ_ONCE(apoll->poll.canceled);
+	if (canceled) {
+		io_cqring_fill_event(req, -ECANCELED);
+		io_commit_cqring(ctx);
 	}
 
+	spin_unlock_irq(&ctx->completion_lock);
+
 	/* restore ->work in case we need to retry again */
 	memcpy(&req->work, &apoll->work, sizeof(req->work));
 
+	if (canceled) {
+		kfree(apoll);
+		io_cqring_ev_posted(ctx);
+		req_set_fail_links(req);
+		io_double_put_req(req);
+		return;
+	}
+
 	__set_current_state(TASK_RUNNING);
 	mutex_lock(&ctx->uring_lock);
 	__io_queue_sqe(req, NULL);
@@ -4315,11 +4354,13 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
 
 static bool io_poll_remove_one(struct io_kiocb *req)
 {
+	struct async_poll *apoll = NULL;
 	bool do_complete;
 
 	if (req->opcode == IORING_OP_POLL_ADD) {
 		do_complete = __io_poll_remove_one(req, &req->poll);
 	} else {
+		apoll = req->apoll;
 		/* non-poll requests have submit ref still */
 		do_complete = __io_poll_remove_one(req, &req->apoll->poll);
 		if (do_complete)
@@ -4328,6 +4369,14 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 
 	hash_del(&req->hash_node);
 
+	if (do_complete && apoll) {
+		/*
+		 * restore ->work because we need to call io_req_work_drop_env.
+		 */
+		memcpy(&req->work, &apoll->work, sizeof(req->work));
+		kfree(apoll);
+	}
+
 	if (do_complete) {
 		io_cqring_fill_event(req, -ECANCELED);
 		io_commit_cqring(req->ctx);
@@ -4342,7 +4391,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
 {
 	struct hlist_node *tmp;
 	struct io_kiocb *req;
-	int i;
+	int posted = 0, i;
 
 	spin_lock_irq(&ctx->completion_lock);
 	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
@@ -4350,11 +4399,12 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
 
 		list = &ctx->cancel_hash[i];
 		hlist_for_each_entry_safe(req, tmp, list, hash_node)
-			io_poll_remove_one(req);
+			posted += io_poll_remove_one(req);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_cqring_ev_posted(ctx);
+	if (posted)
+		io_cqring_ev_posted(ctx);
 }
 
 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -4423,18 +4473,11 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_iocb *poll = &req->poll;
 
-	if (!req->result && !READ_ONCE(poll->canceled)) {
-		struct poll_table_struct pt = { ._key = poll->events };
-
-		req->result = vfs_poll(req->file, &pt) & poll->events;
-	}
-
-	spin_lock_irq(&ctx->completion_lock);
-	if (!req->result && !READ_ONCE(poll->canceled)) {
-		add_wait_queue(poll->head, &poll->wait);
+	if (io_poll_rewait(req, poll)) {
 		spin_unlock_irq(&ctx->completion_lock);
 		return;
 	}
+
 	hash_del(&req->hash_node);
 	io_poll_complete(req, req->result, 0);
 	req->flags |= REQ_F_COMP_LOCKED;
@@ -4665,11 +4708,12 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 static int io_timeout(struct io_kiocb *req)
 {
-	unsigned count;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_timeout_data *data;
 	struct list_head *entry;
 	unsigned span = 0;
+	u32 count = req->timeout.count;
+	u32 seq = req->sequence;
 
 	data = &req->io->timeout;
 
@@ -4678,7 +4722,6 @@ static int io_timeout(struct io_kiocb *req)
 	 * timeout event to be satisfied. If it isn't set, then this is
 	 * a pure timeout request, sequence isn't used.
	 */
-	count = req->timeout.count;
 	if (!count) {
 		req->flags |= REQ_F_TIMEOUT_NOSEQ;
 		spin_lock_irq(&ctx->completion_lock);
@@ -4686,8 +4729,7 @@ static int io_timeout(struct io_kiocb *req)
 		goto add;
 	}
 
-	req->sequence = ctx->cached_sq_head + count - 1;
-	data->seq_offset = count;
+	req->sequence = seq + count;
 
 	/*
 	 * Insertion sort, ensuring the first entry in the list is always
@@ -4696,26 +4738,26 @@ static int io_timeout(struct io_kiocb *req)
 	spin_lock_irq(&ctx->completion_lock);
 	list_for_each_prev(entry, &ctx->timeout_list) {
 		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
-		unsigned nxt_sq_head;
+		unsigned nxt_seq;
 		long long tmp, tmp_nxt;
-		u32 nxt_offset = nxt->io->timeout.seq_offset;
+		u32 nxt_offset = nxt->timeout.count;
 
 		if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
 			continue;
 
 		/*
-		 * Since cached_sq_head + count - 1 can overflow, use type long
+		 * Since seq + count can overflow, use type long
 		 * long to store it.
 		 */
-		tmp = (long long)ctx->cached_sq_head + count - 1;
-		nxt_sq_head = nxt->sequence - nxt_offset + 1;
-		tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
+		tmp = (long long)seq + count;
+		nxt_seq = nxt->sequence - nxt_offset;
+		tmp_nxt = (long long)nxt_seq + nxt_offset;
 		/*
 		 * cached_sq_head may overflow, and it will never overflow twice
 		 * once there is some timeout req still be valid.
		 */
-		if (ctx->cached_sq_head < nxt_sq_head)
+		if (seq < nxt_seq)
 			tmp += UINT_MAX;
 
 		if (tmp > tmp_nxt)
 			break;
@@ -5476,7 +5518,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt;
 
-	if (!(req->flags & REQ_F_LINK))
+	if (!(req->flags & REQ_F_LINK_HEAD))
 		return NULL;
 	/* for polled retry, if flag is set, we already went through here */
 	if (req->flags & REQ_F_POLLED)
@@ -5604,54 +5646,11 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 		io_queue_sqe(req, NULL);
 }
 
-#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
-				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
-				IOSQE_BUFFER_SELECT)
-
-static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			  struct io_submit_state *state, struct io_kiocb **link)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	unsigned int sqe_flags;
-	int ret, id, fd;
-
-	sqe_flags = READ_ONCE(sqe->flags);
-
-	/* enforce forwards compatibility on users */
-	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
-		ret = -EINVAL;
-		goto err_req;
-	}
-
-	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
-	    !io_op_defs[req->opcode].buffer_select) {
-		ret = -EOPNOTSUPP;
-		goto err_req;
-	}
-
-	id = READ_ONCE(sqe->personality);
-	if (id) {
-		req->work.creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!req->work.creds)) {
-			ret = -EINVAL;
-			goto err_req;
-		}
-		get_cred(req->work.creds);
-	}
-
-	/* same numerical values with corresponding REQ_F_*, safe to copy */
-	req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
-					IOSQE_ASYNC | IOSQE_FIXED_FILE |
-					IOSQE_BUFFER_SELECT);
-
-	fd = READ_ONCE(sqe->fd);
-	ret = io_req_set_file(state, req, fd, sqe_flags);
-	if (unlikely(ret)) {
-err_req:
-		io_cqring_add_event(req, ret);
-		io_double_put_req(req);
-		return false;
-	}
+	int ret;
 
 	/*
 	 * If we already have a head request, queue this one for async
@@ -5670,42 +5669,39 @@ err_req:
 		 * next after the link request. The last one is done via
 		 * drain_next flag to persist the effect across calls.
		 */
-		if (sqe_flags & IOSQE_IO_DRAIN) {
+		if (req->flags & REQ_F_IO_DRAIN) {
 			head->flags |= REQ_F_IO_DRAIN;
 			ctx->drain_next = 1;
 		}
 
-		if (io_alloc_async_ctx(req)) {
-			ret = -EAGAIN;
-			goto err_req;
-		}
+		if (io_alloc_async_ctx(req))
+			return -EAGAIN;
 
 		ret = io_req_defer_prep(req, sqe);
 		if (ret) {
 			/* fail even hard links since we don't submit */
 			head->flags |= REQ_F_FAIL_LINK;
-			goto err_req;
+			return ret;
 		}
 		trace_io_uring_link(ctx, req, head);
 		list_add_tail(&req->link_list, &head->link_list);
 
 		/* last request of a link, enqueue the link */
-		if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
+		if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
 			io_queue_link_head(head);
 			*link = NULL;
 		}
 	} else {
 		if (unlikely(ctx->drain_next)) {
 			req->flags |= REQ_F_IO_DRAIN;
-			req->ctx->drain_next = 0;
+			ctx->drain_next = 0;
 		}
-		if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
-			req->flags |= REQ_F_LINK;
+		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+			req->flags |= REQ_F_LINK_HEAD;
 			INIT_LIST_HEAD(&req->link_list);
-			if (io_alloc_async_ctx(req)) {
-				ret = -EAGAIN;
-				goto err_req;
-			}
+			if (io_alloc_async_ctx(req))
+				return -EAGAIN;
+
 			ret = io_req_defer_prep(req, sqe);
 			if (ret)
 				req->flags |= REQ_F_FAIL_LINK;
@@ -5715,7 +5711,7 @@ err_req:
 		}
 	}
 
-	return true;
+	return 0;
 }
 
 /*
@@ -5789,15 +5785,23 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx)
 	ctx->cached_sq_head++;
 }
 
-static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			const struct io_uring_sqe *sqe)
+#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
+				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+				IOSQE_BUFFER_SELECT)
+
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+		       const struct io_uring_sqe *sqe,
+		       struct io_submit_state *state, bool async)
 {
+	unsigned int sqe_flags;
+	int id, fd;
+
 	/*
 	 * All io need record the previous position, if LINK vs DARIN,
 	 * it can be used to mark the position of the first IO in the
 	 * link list.
	 */
-	req->sequence = ctx->cached_sq_head;
+	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
 	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
@@ -5808,17 +5812,50 @@ static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
 	req->result = 0;
+	req->needs_fixed_file = async;
 	INIT_IO_WORK(&req->work, io_wq_submit_work);
+
+	if (unlikely(req->opcode >= IORING_OP_LAST))
+		return -EINVAL;
+
+	if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+			return -EFAULT;
+		use_mm(ctx->sqo_mm);
+	}
+
+	sqe_flags = READ_ONCE(sqe->flags);
+	/* enforce forwards compatibility on users */
+	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+		return -EINVAL;
+
+	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+	    !io_op_defs[req->opcode].buffer_select)
+		return -EOPNOTSUPP;
+
+	id = READ_ONCE(sqe->personality);
+	if (id) {
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds))
+			return -EINVAL;
+		get_cred(req->work.creds);
+	}
+
+	/* same numerical values with corresponding REQ_F_*, safe to copy */
+	req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
+					IOSQE_ASYNC | IOSQE_FIXED_FILE |
+					IOSQE_BUFFER_SELECT | IOSQE_IO_LINK);
+
+	fd = READ_ONCE(sqe->fd);
+	return io_req_set_file(state, req, fd, sqe_flags);
 }
 
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
-			  struct file *ring_file, int ring_fd,
-			  struct mm_struct **mm, bool async)
+			  struct file *ring_file, int ring_fd, bool async)
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
 	int i, submitted = 0;
-	bool mm_fault = false;
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
 	if (test_bit(0, &ctx->sq_check_overflow)) {
@@ -5858,34 +5895,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			break;
 		}
 
-		io_init_req(ctx, req, sqe);
+		err = io_init_req(ctx, req, sqe, statep, async);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */
 		submitted++;
 
-		if (unlikely(req->opcode >= IORING_OP_LAST)) {
-			err = -EINVAL;
+		if (unlikely(err)) {
 fail_req:
 			io_cqring_add_event(req, err);
 			io_double_put_req(req);
 			break;
 		}
 
-		if (io_op_defs[req->opcode].needs_mm && !*mm) {
-			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
-			if (unlikely(mm_fault)) {
-				err = -EFAULT;
-				goto fail_req;
-			}
-			use_mm(ctx->sqo_mm);
-			*mm = ctx->sqo_mm;
-		}
-
-		req->needs_fixed_file = async;
 		trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
 						true, async);
-		if (!io_submit_sqe(req, sqe, statep, &link))
-			break;
+		err = io_submit_sqe(req, sqe, statep, &link);
+		if (err)
+			goto fail_req;
 	}
 
 	if (unlikely(submitted != nr)) {
@@ -5904,10 +5930,19 @@ fail_req:
 	return submitted;
 }
 
+static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (mm) {
+		unuse_mm(mm);
+		mmput(mm);
+	}
+}
+
 static int io_sq_thread(void *data)
 {
 	struct io_ring_ctx *ctx = data;
-	struct mm_struct *cur_mm = NULL;
 	const struct cred *old_cred;
 	mm_segment_t old_fs;
 	DEFINE_WAIT(wait);
@@ -5948,11 +5983,7 @@ static int io_sq_thread(void *data)
 			 * adding ourselves to the waitqueue, as the unuse/drop
 			 * may sleep.
			 */
-			if (cur_mm) {
-				unuse_mm(cur_mm);
-				mmput(cur_mm);
-				cur_mm = NULL;
-			}
+			io_sq_thread_drop_mm(ctx);
 
 			/*
 			 * We're polling. If we're within the defined idle
@@ -6016,7 +6047,7 @@ static int io_sq_thread(void *data)
 		}
 
 		mutex_lock(&ctx->uring_lock);
-		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+		ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
 		mutex_unlock(&ctx->uring_lock);
 		timeout = jiffies + ctx->sq_thread_idle;
 	}
@@ -6025,10 +6056,7 @@ static int io_sq_thread(void *data)
 		task_work_run();
 
 	set_fs(old_fs);
-	if (cur_mm) {
-		unuse_mm(cur_mm);
-		mmput(cur_mm);
-	}
+	io_sq_thread_drop_mm(ctx);
 	revert_creds(old_cred);
 
 	kthread_parkme();
@@ -7509,13 +7537,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			wake_up(&ctx->sqo_wait);
 		submitted = to_submit;
 	} else if (to_submit) {
-		struct mm_struct *cur_mm;
-
 		mutex_lock(&ctx->uring_lock);
-		/* already have mm, so io_submit_sqes() won't try to grab it */
-		cur_mm = ctx->sqo_mm;
-		submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
-					   &cur_mm, false);
+		submitted = io_submit_sqes(ctx, to_submit, f.file, fd, false);
 		mutex_unlock(&ctx->uring_lock);
 
 		if (submitted != to_submit)
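
The reworked insertion sort in io_timeout() above orders pending timeouts by their absolute target sequence (seq + count), even though the 32-bit sequence counter can wrap between submissions. Below is a minimal userspace sketch of that wrap-safe comparison; the function and variable names are hypothetical, and only the arithmetic mirrors the hunk above.

	/*
	 * Minimal userspace sketch (not kernel code) of the wrap-safe ordering
	 * io_timeout() uses when insertion-sorting requests by "seq + count".
	 * Function and variable names here are hypothetical.
	 */
	#include <stdint.h>
	#include <stdio.h>

	/* Return 1 if the newly submitted timeout (base new_seq, offset new_cnt)
	 * expires after an already queued one, accounting for u32 wrap-around. */
	static int new_expires_after(uint32_t new_seq, uint32_t new_cnt,
				     uint32_t old_seq, uint32_t old_cnt)
	{
		long long tmp = (long long)new_seq + new_cnt;
		long long tmp_old = (long long)old_seq + old_cnt;

		/* The new request was submitted later, so a numerically smaller
		 * base sequence means the counter wrapped in between; lift it by
		 * one full period before the 64-bit comparison. */
		if (new_seq < old_seq)
			tmp += UINT32_MAX;

		return tmp > tmp_old;
	}

	int main(void)
	{
		/* Queued just after the wrap (seq 5), this timeout still sorts
		 * after one queued just before it (seq UINT32_MAX - 2). */
		printf("%d\n", new_expires_after(5, 10, UINT32_MAX - 2, 10));
		return 0;
	}

In the kernel hunk the same comparison runs against each entry from the tail of ctx->timeout_list, and the new request is linked right after the first entry it expires after, which keeps the list sorted by expiration order.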
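
The "same numerical values with corresponding REQ_F_*, safe to copy" comment in io_init_req() relies on the REQ_F_*_BIT enum (see the hunks near the top of the diff) declaring its user-visible bits directly from IOSQE_*_BIT, so one masked OR carries sqe->flags into req->flags and later code can test req->flags alone, as io_submit_sqe() now does with REQ_F_IO_DRAIN and REQ_F_LINK | REQ_F_HARDLINK. A standalone sketch of the idea follows; the bit names and positions below are illustrative stand-ins, not the uapi definitions.

	/*
	 * Standalone sketch: internal request flags declared at the same bit
	 * positions as the submission flags, so a masked copy is enough.
	 * The SQE_ and REQ_ names here are stand-ins, not real uapi constants.
	 */
	#include <stdio.h>

	#define BIT(n)		(1U << (n))

	enum {
		SQE_FIXED_FILE_BIT,	/* plays the role of IOSQE_FIXED_FILE_BIT */
		SQE_IO_DRAIN_BIT,	/* ...of IOSQE_IO_DRAIN_BIT */
		SQE_IO_LINK_BIT,	/* ...of IOSQE_IO_LINK_BIT */
	};

	#define SQE_FIXED_FILE	BIT(SQE_FIXED_FILE_BIT)
	#define SQE_IO_DRAIN	BIT(SQE_IO_DRAIN_BIT)
	#define SQE_IO_LINK	BIT(SQE_IO_LINK_BIT)

	/* internal flags reuse the same bit positions on purpose */
	#define REQ_IO_DRAIN	BIT(SQE_IO_DRAIN_BIT)
	#define REQ_IO_LINK	BIT(SQE_IO_LINK_BIT)

	int main(void)
	{
		unsigned int sqe_flags = SQE_IO_DRAIN | SQE_IO_LINK;
		unsigned int req_flags = 0;

		/* one masked OR transfers every user flag to its internal twin */
		req_flags |= sqe_flags & (SQE_FIXED_FILE | SQE_IO_DRAIN | SQE_IO_LINK);

		printf("drain=%d link=%d\n",
		       !!(req_flags & REQ_IO_DRAIN), !!(req_flags & REQ_IO_LINK));
		return 0;
	}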
