Diffstat (limited to 'fs')
-rw-r--r--   fs/Kconfig.binfmt     |   3
-rw-r--r--   fs/afs/write.c        |   9
-rw-r--r--   fs/cachefiles/xattr.c |  23
-rw-r--r--   fs/cifs/connect.c     |   3
-rw-r--r--   fs/fuse/dev.c         |  12
-rw-r--r--   fs/fuse/file.c        |   1
-rw-r--r--   fs/fuse/fuse_i.h      |   1
-rw-r--r--   fs/fuse/inode.c       |   3
-rw-r--r--   fs/fuse/ioctl.c       |   9
-rw-r--r--   fs/io_uring.c         | 407
-rw-r--r--   fs/ocfs2/super.c      |  22
-rw-r--r--   fs/pipe.c             |  11
12 files changed, 344 insertions, 160 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4d5ae61580aa..68e586283764 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -36,6 +36,9 @@ config COMPAT_BINFMT_ELF
 config ARCH_BINFMT_ELF_STATE
         bool
 
+config ARCH_BINFMT_ELF_EXTRA_PHDRS
+        bool
+
 config ARCH_HAVE_ELF_PROT
         bool
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 5e9157d0da29..f447c902318d 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -703,7 +703,7 @@ static int afs_writepages_region(struct address_space *mapping,
         struct folio *folio;
         struct page *head_page;
         ssize_t ret;
-        int n;
+        int n, skips = 0;
 
         _enter("%llx,%llx,", start, end);
 
@@ -754,8 +754,15 @@ static int afs_writepages_region(struct address_space *mapping,
 #ifdef CONFIG_AFS_FSCACHE
                                 folio_wait_fscache(folio);
 #endif
+                        } else {
+                                start += folio_size(folio);
                         }
                         folio_put(folio);
+                        if (wbc->sync_mode == WB_SYNC_NONE) {
+                                if (skips >= 5 || need_resched())
+                                        break;
+                                skips++;
+                        }
                         continue;
                 }
 
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 83f41bd0c3a9..35465109d9c4 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -28,6 +28,11 @@ struct cachefiles_xattr {
 static const char cachefiles_xattr_cache[] =
         XATTR_USER_PREFIX "CacheFiles.cache";
 
+struct cachefiles_vol_xattr {
+        __be32  reserved;       /* Reserved, should be 0 */
+        __u8    data[];         /* netfs volume coherency data */
+} __packed;
+
 /*
  * set the state xattr on a cache file
  */
@@ -185,6 +190,7 @@ void cachefiles_prepare_to_write(struct fscache_cookie *cookie)
  */
 bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
 {
+        struct cachefiles_vol_xattr *buf;
         unsigned int len = volume->vcookie->coherency_len;
         const void *p = volume->vcookie->coherency;
         struct dentry *dentry = volume->dentry;
@@ -192,10 +198,17 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
 
         _enter("%x,#%d", volume->vcookie->debug_id, len);
 
+        len += sizeof(*buf);
+        buf = kmalloc(len, GFP_KERNEL);
+        if (!buf)
+                return false;
+        buf->reserved = cpu_to_be32(0);
+        memcpy(buf->data, p, len);
+
         ret = cachefiles_inject_write_error();
         if (ret == 0)
                 ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
-                                   p, len, 0);
+                                   buf, len, 0);
         if (ret < 0) {
                 trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret,
                                            cachefiles_trace_setxattr_error);
@@ -209,6 +222,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
                                            cachefiles_coherency_vol_set_ok);
         }
 
+        kfree(buf);
         _leave(" = %d", ret);
         return ret == 0;
 }
@@ -218,7 +232,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
  */
 int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
 {
-        struct cachefiles_xattr *buf;
+        struct cachefiles_vol_xattr *buf;
         struct dentry *dentry = volume->dentry;
         unsigned int len = volume->vcookie->coherency_len;
         const void *p = volume->vcookie->coherency;
@@ -228,6 +242,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
 
         _enter("");
 
+        len += sizeof(*buf);
         buf = kmalloc(len, GFP_KERNEL);
         if (!buf)
                 return -ENOMEM;
@@ -245,7 +260,9 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
                                 "Failed to read xattr with error %zd", xlen);
                 }
                 why = cachefiles_coherency_vol_check_xattr;
-        } else if (memcmp(buf->data, p, len) != 0) {
+        } else if (buf->reserved != cpu_to_be32(0)) {
+                why = cachefiles_coherency_vol_check_resv;
+        } else if (memcmp(buf->data, p, len - sizeof(*buf)) != 0) {
                 why = cachefiles_coherency_vol_check_cmp;
         } else {
                 why = cachefiles_coherency_vol_check_ok;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 053cb449eb16..d3020abfe404 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3924,7 +3924,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
 
         /* only send once per connect */
         spin_lock(&cifs_tcp_ses_lock);
-        if (server->tcpStatus != CifsNeedSessSetup) {
+        if ((server->tcpStatus != CifsNeedSessSetup) &&
+            (ses->status == CifsGood)) {
                 spin_unlock(&cifs_tcp_ses_lock);
                 return 0;
         }
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cd54a529460d..592730fd6e42 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -941,7 +941,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
 
         while (count) {
                 if (cs->write && cs->pipebufs && page) {
-                        return fuse_ref_page(cs, page, offset, count);
+                        /*
+                         * Can't control lifetime of pipe buffers, so always
+                         * copy user pages.
+                         */
+                        if (cs->req->args->user_pages) {
+                                err = fuse_copy_fill(cs);
+                                if (err)
+                                        return err;
+                        } else {
+                                return fuse_ref_page(cs, page, offset, count);
+                        }
                 } else if (!cs->len) {
                         if (cs->move_pages && page && offset == 0 &&
                             count == PAGE_SIZE) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 829094451774..0fc150c1c50b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1413,6 +1413,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
                         (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
         }
 
+        ap->args.user_pages = true;
         if (write)
                 ap->args.in_pages = true;
         else
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e8e59fbdefeb..eac4984cc753 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -256,6 +256,7 @@ struct fuse_args {
         bool nocreds:1;
         bool in_pages:1;
         bool out_pages:1;
+        bool user_pages:1;
         bool out_argvar:1;
         bool page_zeroing:1;
         bool page_replace:1;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ee846ce371d8..9ee36aa73251 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -23,6 +23,7 @@
 #include <linux/exportfs.h>
 #include <linux/posix_acl.h>
 #include <linux/pid_namespace.h>
+#include <uapi/linux/magic.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh,
                  "Global limit for the maximum congestion threshold an "
                  "unprivileged user can set");
 
-#define FUSE_SUPER_MAGIC 0x65735546
-
 #define FUSE_DEFAULT_BLKSIZE 512
 
 /** Maximum number of outstanding background requests */
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index fbc09dab1f85..df58966bc874 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -394,9 +394,12 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
         args.out_args[1].value = ptr;
 
         err = fuse_simple_request(fm, &args);
-        if (!err && outarg.flags & FUSE_IOCTL_RETRY)
-                err = -EIO;
-
+        if (!err) {
+                if (outarg.result < 0)
+                        err = outarg.result;
+                else if (outarg.flags & FUSE_IOCTL_RETRY)
+                        err = -EIO;
+        }
         return err;
 }
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 257d659fc25f..496a2af7d12c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -264,11 +264,18 @@ struct io_rsrc_data {
         bool                            quiesce;
 };
 
+struct io_buffer_list {
+        struct list_head list;
+        struct list_head buf_list;
+        __u16 bgid;
+};
+
 struct io_buffer {
         struct list_head list;
         __u64 addr;
         __u32 len;
         __u16 bid;
+        __u16 bgid;
 };
 
 struct io_restriction {
@@ -333,6 +340,8 @@ struct io_ev_fd {
         struct rcu_head         rcu;
 };
 
+#define IO_BUFFERS_HASH_BITS    5
+
 struct io_ring_ctx {
         /* const or read-mostly hot data */
         struct {
@@ -346,6 +355,7 @@ struct io_ring_ctx {
                 unsigned int            off_timeout_used: 1;
                 unsigned int            drain_active: 1;
                 unsigned int            drain_disabled: 1;
+                unsigned int            has_evfd: 1;
         } ____cacheline_aligned_in_smp;
 
         /* submission data */
@@ -384,8 +394,9 @@ struct io_ring_ctx {
                 struct list_head        timeout_list;
                 struct list_head        ltimeout_list;
                 struct list_head        cq_overflow_list;
-                struct xarray           io_buffers;
+                struct list_head        *io_buffers;
                 struct list_head        io_buffers_cache;
+                struct list_head        apoll_cache;
                 struct xarray           personalities;
                 u32                     pers_next;
                 unsigned                sq_thread_idle;
@@ -769,6 +780,8 @@ enum {
         REQ_F_ARM_LTIMEOUT_BIT,
         REQ_F_ASYNC_DATA_BIT,
         REQ_F_SKIP_LINK_CQES_BIT,
+        REQ_F_SINGLE_POLL_BIT,
+        REQ_F_DOUBLE_POLL_BIT,
         /* keep async read/write and isreg together and in order */
         REQ_F_SUPPORT_NOWAIT_BIT,
         REQ_F_ISREG_BIT,
@@ -827,6 +840,10 @@ enum {
         REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
         /* don't post CQEs while failing linked requests */
         REQ_F_SKIP_LINK_CQES    = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+        /* single poll may be active */
+        REQ_F_SINGLE_POLL       = BIT(REQ_F_SINGLE_POLL_BIT),
+        /* double poll may active */
+        REQ_F_DOUBLE_POLL       = BIT(REQ_F_DOUBLE_POLL_BIT),
 };
 
 struct async_poll {
@@ -906,6 +923,7 @@ struct io_kiocb {
         /* used by request caches, completion batching and iopoll */
         struct io_wq_work_node          comp_list;
         atomic_t                        refs;
+        atomic_t                        poll_refs;
         struct io_kiocb                 *link;
         struct io_task_work             io_task_work;
         /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
@@ -914,12 +932,11 @@ struct io_kiocb {
         struct async_poll               *apoll;
         /* opcode allocated if it needs to store data for async defer */
         void                            *async_data;
-        struct io_wq_work               work;
         /* custom credentials, valid IFF REQ_F_CREDS is set */
-        const struct cred               *creds;
         /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
         struct io_buffer                *kbuf;
-        atomic_t                        poll_refs;
+        const struct cred               *creds;
+        struct io_wq_work               work;
 };
 
 struct io_tctx_node {
@@ -1173,6 +1190,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
 
 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
+static void io_eventfd_signal(struct io_ring_ctx *ctx);
 
 static struct kmem_cache *req_cachep;
 
@@ -1351,6 +1369,38 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
         return cflags;
 }
 
+static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
+                                                 unsigned int bgid)
+{
+        struct list_head *hash_list;
+        struct io_buffer_list *bl;
+
+        hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+        list_for_each_entry(bl, hash_list, list)
+                if (bl->bgid == bgid || bgid == -1U)
+                        return bl;
+
+        return NULL;
+}
+
+static void io_kbuf_recycle(struct io_kiocb *req)
+{
+        struct io_ring_ctx *ctx = req->ctx;
+        struct io_buffer_list *bl;
+        struct io_buffer *buf;
+
+        if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+                return;
+
+        lockdep_assert_held(&ctx->uring_lock);
+
+        buf = req->kbuf;
+        bl = io_buffer_get_list(ctx, buf->bgid);
+        list_add(&buf->list, &bl->buf_list);
+        req->flags &= ~REQ_F_BUFFER_SELECTED;
+        req->kbuf = NULL;
+}
+
 static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
                           bool cancel_all)
         __must_hold(&req->ctx->timeout_lock)
@@ -1461,7 +1511,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 {
         struct io_ring_ctx *ctx;
-        int hash_bits;
+        int i, hash_bits;
 
         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
         if (!ctx)
@@ -1488,6 +1538,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
         /* set invalid range, so io_import_fixed() fails meeting it */
         ctx->dummy_ubuf->ubuf = -1UL;
 
+        ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
+                                  sizeof(struct list_head), GFP_KERNEL);
+        if (!ctx->io_buffers)
+                goto err;
+        for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
+                INIT_LIST_HEAD(&ctx->io_buffers[i]);
+
         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
                 goto err;
@@ -1497,8 +1554,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
         INIT_LIST_HEAD(&ctx->sqd_list);
         INIT_LIST_HEAD(&ctx->cq_overflow_list);
         INIT_LIST_HEAD(&ctx->io_buffers_cache);
+        INIT_LIST_HEAD(&ctx->apoll_cache);
         init_completion(&ctx->ref_comp);
-        xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
         xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
         mutex_init(&ctx->uring_lock);
         init_waitqueue_head(&ctx->cq_wait);
@@ -1527,6 +1584,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 err:
         kfree(ctx->dummy_ubuf);
         kfree(ctx->cancel_hash);
+        kfree(ctx->io_buffers);
         kfree(ctx);
         return NULL;
 }
@@ -1740,22 +1798,27 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
         spin_unlock_irq(&ctx->timeout_lock);
 }
 
-static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
-        if (ctx->off_timeout_used)
-                io_flush_timeouts(ctx);
-        if (ctx->drain_active)
-                io_queue_deferred(ctx);
-}
-
 static inline void io_commit_cqring(struct io_ring_ctx *ctx)
 {
-        if (unlikely(ctx->off_timeout_used || ctx->drain_active))
-                __io_commit_cqring_flush(ctx);
         /* order cqe stores with ring update */
         smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
 }
 
+static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+{
+        if (ctx->off_timeout_used || ctx->drain_active) {
+                spin_lock(&ctx->completion_lock);
+                if (ctx->off_timeout_used)
+                        io_flush_timeouts(ctx);
+                if (ctx->drain_active)
+                        io_queue_deferred(ctx);
+                io_commit_cqring(ctx);
+                spin_unlock(&ctx->completion_lock);
+        }
+        if (ctx->has_evfd)
+                io_eventfd_signal(ctx);
+}
+
 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 {
         struct io_rings *r = ctx->rings;
@@ -1789,10 +1852,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
         struct io_ev_fd *ev_fd;
 
-        /* Return quickly if ctx->io_ev_fd doesn't exist */
-        if (likely(!rcu_dereference_raw(ctx->io_ev_fd)))
-                return;
-
         rcu_read_lock();
         /*
          * rcu_dereference ctx->io_ev_fd once and use it for both for checking
@@ -1812,19 +1871,11 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
 
         if (!ev_fd->eventfd_async || io_wq_current_is_worker())
                 eventfd_signal(ev_fd->cq_ev_fd, 1);
-
 out:
         rcu_read_unlock();
 }
 
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static inline void io_cqring_wake(struct io_ring_ctx *ctx)
 {
         /*
          * wake_up_all() may seem excessive, but io_wake_function() and
@@ -1833,19 +1884,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
          */
         if (wq_has_sleeper(&ctx->cq_wait))
                 wake_up_all(&ctx->cq_wait);
-        io_eventfd_signal(ctx);
+}
+
+/*
+ * This should only get called when at least one event has been posted.
+ * Some applications rely on the eventfd notification count only changing
+ * IFF a new CQE has been added to the CQ ring. There's no depedency on
+ * 1:1 relationship between how many times this function is called (and
+ * hence the eventfd count) and number of CQEs posted to the CQ ring.
+ */
+static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+        if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+                     ctx->has_evfd))
+                __io_commit_cqring_flush(ctx);
+
+        io_cqring_wake(ctx);
 }
 
 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
 {
-        /* see waitqueue_active() comment */
-        smp_mb();
+        if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+                     ctx->has_evfd))
+                __io_commit_cqring_flush(ctx);
 
-        if (ctx->flags & IORING_SETUP_SQPOLL) {
-                if (waitqueue_active(&ctx->cq_wait))
-                        wake_up_all(&ctx->cq_wait);
-        }
-        io_eventfd_signal(ctx);
+        if (ctx->flags & IORING_SETUP_SQPOLL)
+                io_cqring_wake(ctx);
 }
 
 /* Returns true if there are no backlogged entries after the flush */
@@ -1980,7 +2044,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
         return true;
 }
 
-static inline bool __fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
+static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
                                  s32 res, u32 cflags)
 {
         struct io_uring_cqe *cqe;
@@ -2000,16 +2064,16 @@ static inline bool __fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
         return io_cqring_event_overflow(ctx, user_data, res, cflags);
 }
 
-static inline bool __io_fill_cqe(struct io_kiocb *req, s32 res, u32 cflags)
+static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
 {
         trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
-        return __fill_cqe(req->ctx, req->user_data, res, cflags);
+        return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
 }
 
 static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
 {
         if (!(req->flags & REQ_F_CQE_SKIP))
-                __io_fill_cqe(req, res, cflags);
+                __io_fill_cqe_req(req, res, cflags);
 }
 
 static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
@@ -2017,7 +2081,7 @@ static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
 {
         ctx->cq_extra++;
         trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
-        return __fill_cqe(ctx, user_data, res, cflags);
+        return __io_fill_cqe(ctx, user_data, res, cflags);
 }
 
 static void __io_req_complete_post(struct io_kiocb *req, s32 res,
@@ -2026,7 +2090,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
         struct io_ring_ctx *ctx = req->ctx;
 
         if (!(req->flags & REQ_F_CQE_SKIP))
-                __io_fill_cqe(req, res, cflags);
+                __io_fill_cqe_req(req, res, cflags);
         /*
          * If we're the last reference to this request, add to our locked
          * free_list cache.
@@ -2085,7 +2149,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res)
 static void io_req_complete_failed(struct io_kiocb *req, s32 res)
 {
         req_set_fail(req);
-        io_req_complete_post(req, res, 0);
+        io_req_complete_post(req, res, io_put_kbuf(req, 0));
 }
 
 static void io_req_complete_fail_submit(struct io_kiocb *req)
@@ -2618,7 +2682,16 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
                                                     comp_list);
 
                 if (!(req->flags & REQ_F_CQE_SKIP))
-                        __io_fill_cqe(req, req->result, req->cflags);
+                        __io_fill_cqe_req(req, req->result, req->cflags);
+                if ((req->flags & REQ_F_POLLED) && req->apoll) {
+                        struct async_poll *apoll = req->apoll;
+
+                        if (apoll->double_poll)
+                                kfree(apoll->double_poll);
+                        list_add(&apoll->poll.wait.entry,
+                                 &ctx->apoll_cache);
+                        req->flags &= ~REQ_F_POLLED;
+                }
         }
 
         io_commit_cqring(ctx);
@@ -2740,7 +2813,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
                 if (unlikely(req->flags & REQ_F_CQE_SKIP))
                         continue;
 
-                __io_fill_cqe(req, req->result, io_put_kbuf(req, 0));
+                __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
                 nr_events++;
         }
 
@@ -3194,14 +3267,10 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret,
 
         if (req->flags & REQ_F_REISSUE) {
                 req->flags &= ~REQ_F_REISSUE;
-                if (io_resubmit_prep(req)) {
+                if (io_resubmit_prep(req))
                         io_req_task_queue_reissue(req);
-                } else {
-                        req_set_fail(req);
-                        req->result = ret;
-                        req->io_task_work.func = io_req_task_complete;
-                        io_req_task_work_add(req, false);
-                }
+                else
+                        io_req_task_queue_fail(req, ret);
         }
 }
 
@@ -3299,30 +3368,36 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
                 mutex_lock(&ctx->uring_lock);
 }
 
+static void io_buffer_add_list(struct io_ring_ctx *ctx,
+                               struct io_buffer_list *bl, unsigned int bgid)
+{
+        struct list_head *list;
+
+        list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+        INIT_LIST_HEAD(&bl->buf_list);
+        bl->bgid = bgid;
+        list_add(&bl->list, list);
+}
+
 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
                                           int bgid, unsigned int issue_flags)
 {
         struct io_buffer *kbuf = req->kbuf;
-        struct io_buffer *head;
         bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
+        struct io_ring_ctx *ctx = req->ctx;
+        struct io_buffer_list *bl;
 
         if (req->flags & REQ_F_BUFFER_SELECTED)
                 return kbuf;
 
-        io_ring_submit_lock(req->ctx, needs_lock);
+        io_ring_submit_lock(ctx, needs_lock);
 
-        lockdep_assert_held(&req->ctx->uring_lock);
+        lockdep_assert_held(&ctx->uring_lock);
 
-        head = xa_load(&req->ctx->io_buffers, bgid);
-        if (head) {
-                if (!list_empty(&head->list)) {
-                        kbuf = list_last_entry(&head->list, struct io_buffer,
-                                                        list);
-                        list_del(&kbuf->list);
-                } else {
-                        kbuf = head;
-                        xa_erase(&req->ctx->io_buffers, bgid);
-                }
+        bl = io_buffer_get_list(ctx, bgid);
+        if (bl && !list_empty(&bl->buf_list)) {
+                kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
+                list_del(&kbuf->list);
                 if (*len > kbuf->len)
                         *len = kbuf->len;
                 req->flags |= REQ_F_BUFFER_SELECTED;
@@ -3537,13 +3612,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
                                 ret = nr;
                         break;
                 }
+                ret += nr;
                 if (!iov_iter_is_bvec(iter)) {
                         iov_iter_advance(iter, nr);
                 } else {
-                        req->rw.len -= nr;
                         req->rw.addr += nr;
+                        req->rw.len -= nr;
+                        if (!req->rw.len)
+                                break;
                 }
-                ret += nr;
                 if (nr != iovec.iov_len)
                         break;
         }
@@ -3737,6 +3814,16 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                 if (unlikely(ret < 0))
                         return ret;
         } else {
+                /*
+                 * Safe and required to re-import if we're using provided
+                 * buffers, as we dropped the selected one before retry.
+                 */
+                if (req->flags & REQ_F_BUFFER_SELECT) {
+                        ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+                        if (unlikely(ret < 0))
+                                return ret;
+                }
+
                 rw = req->async_data;
                 s = &rw->s;
                 /*
@@ -3773,6 +3860,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 
         if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
                 req->flags &= ~REQ_F_REISSUE;
+                /* if we can poll, just do that */
+                if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+                        return -EAGAIN;
                 /* IOPOLL retry should happen for io-wq threads */
                 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
                         goto done;
@@ -4345,9 +4435,8 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 static int io_msg_ring_prep(struct io_kiocb *req,
                             const struct io_uring_sqe *sqe)
 {
-        if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
-                     sqe->rw_flags || sqe->splice_fd_in || sqe->buf_index ||
-                     sqe->personality))
+        if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
+                     sqe->splice_fd_in || sqe->buf_index || sqe->personality))
                 return -EINVAL;
 
         if (req->file->f_op != &io_uring_fops)
@@ -4605,8 +4694,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
         return 0;
 }
 
-static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
-                               int bgid, unsigned nbufs)
+static int __io_remove_buffers(struct io_ring_ctx *ctx,
+                               struct io_buffer_list *bl, unsigned nbufs)
 {
         unsigned i = 0;
 
@@ -4615,17 +4704,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
                 return 0;
 
         /* the head kbuf is the list itself */
-        while (!list_empty(&buf->list)) {
+        while (!list_empty(&bl->buf_list)) {
                 struct io_buffer *nxt;
 
-                nxt = list_first_entry(&buf->list, struct io_buffer, list);
+                nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
                 list_del(&nxt->list);
                 if (++i == nbufs)
                         return i;
                 cond_resched();
         }
         i++;
-        xa_erase(&ctx->io_buffers, bgid);
 
         return i;
 }
@@ -4634,7 +4722,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 {
         struct io_provide_buf *p = &req->pbuf;
         struct io_ring_ctx *ctx = req->ctx;
-        struct io_buffer *head;
+        struct io_buffer_list *bl;
         int ret = 0;
         bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 
@@ -4643,9 +4731,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
         lockdep_assert_held(&ctx->uring_lock);
 
         ret = -ENOENT;
-        head = xa_load(&ctx->io_buffers, p->bgid);
-        if (head)
-                ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
+        bl = io_buffer_get_list(ctx, p->bgid);
+        if (bl)
+                ret = __io_remove_buffers(ctx, bl, p->nbufs);
         if (ret < 0)
                 req_set_fail(req);
 
@@ -4734,7 +4822,7 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
 }
 
 static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
-                          struct io_buffer **head)
+                          struct io_buffer_list *bl)
 {
         struct io_buffer *buf;
         u64 addr = pbuf->addr;
@@ -4746,29 +4834,24 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
                         break;
 
                 buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
                                        list);
-                list_del(&buf->list);
+                list_move_tail(&buf->list, &bl->buf_list);
                 buf->addr = addr;
                 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
                 buf->bid = bid;
+                buf->bgid = pbuf->bgid;
                 addr += pbuf->len;
                 bid++;
-                if (!*head) {
-                        INIT_LIST_HEAD(&buf->list);
-                        *head = buf;
-                } else {
-                        list_add_tail(&buf->list, &(*head)->list);
-                }
                 cond_resched();
         }
 
-        return i ? i : -ENOMEM;
+        return i ? 0 : -ENOMEM;
 }
 
 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 {
         struct io_provide_buf *p = &req->pbuf;
         struct io_ring_ctx *ctx = req->ctx;
-        struct io_buffer *head, *list;
+        struct io_buffer_list *bl;
         int ret = 0;
         bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 
@@ -4776,14 +4859,18 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 
         lockdep_assert_held(&ctx->uring_lock);
 
-        list = head = xa_load(&ctx->io_buffers, p->bgid);
-
-        ret = io_add_buffers(ctx, p, &head);
-        if (ret >= 0 && !list) {
-                ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
-                if (ret < 0)
-                        __io_remove_buffers(ctx, head, p->bgid, -1U);
+        bl = io_buffer_get_list(ctx, p->bgid);
+        if (unlikely(!bl)) {
+                bl = kmalloc(sizeof(*bl), GFP_KERNEL);
+                if (!bl) {
+                        ret = -ENOMEM;
+                        goto err;
+                }
+                io_buffer_add_list(ctx, bl, p->bgid);
         }
+
+        ret = io_add_buffers(ctx, p, bl);
+err:
         if (ret < 0)
                 req_set_fail(req);
         /* complete before unlock, IOPOLL may need the lock */
@@ -5464,8 +5551,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         accept->nofile = rlimit(RLIMIT_NOFILE);
 
         accept->file_slot = READ_ONCE(sqe->file_index);
-        if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
-                                  (accept->flags & SOCK_CLOEXEC)))
+        if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
                 return -EINVAL;
         if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                 return -EINVAL;
@@ -5782,8 +5868,12 @@ static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
 
 static void io_poll_remove_entries(struct io_kiocb *req)
 {
-        struct io_poll_iocb *poll = io_poll_get_single(req);
-        struct io_poll_iocb *poll_double = io_poll_get_double(req);
+        /*
+         * Nothing to do if neither of those flags are set. Avoid dipping
+         * into the poll/apoll/double cachelines if we can.
+         */
+        if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
+                return;
 
         /*
          * While we hold the waitqueue lock and the waitqueue is nonempty,
@@ -5801,9 +5891,10 @@ static void io_poll_remove_entries(struct io_kiocb *req)
          * In that case, only RCU prevents the queue memory from being freed.
          */
         rcu_read_lock();
-        io_poll_remove_entry(poll);
-        if (poll_double)
-                io_poll_remove_entry(poll_double);
+        if (req->flags & REQ_F_SINGLE_POLL)
+                io_poll_remove_entry(io_poll_get_single(req));
+        if (req->flags & REQ_F_DOUBLE_POLL)
+                io_poll_remove_entry(io_poll_get_double(req));
         rcu_read_unlock();
 }
 
@@ -5835,13 +5926,13 @@ static int io_poll_check_events(struct io_kiocb *req)
                         return -ECANCELED;
 
                 if (!req->result) {
-                        struct poll_table_struct pt = { ._key = poll->events };
+                        struct poll_table_struct pt = { ._key = req->cflags };
 
-                        req->result = vfs_poll(req->file, &pt) & poll->events;
+                        req->result = vfs_poll(req->file, &pt) & req->cflags;
                 }
 
                 /* multishot, just fill an CQE and proceed */
-                if (req->result && !(poll->events & EPOLLONESHOT)) {
+                if (req->result && !(req->cflags & EPOLLONESHOT)) {
                         __poll_t mask = mangle_poll(req->result & poll->events);
                         bool filled;
 
@@ -5912,9 +6003,16 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
                 io_req_complete_failed(req, ret);
 }
 
-static void __io_poll_execute(struct io_kiocb *req, int mask)
+static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
 {
         req->result = mask;
+        /*
+         * This is useful for poll that is armed on behalf of another
+         * request, and where the wakeup path could be on a different
+         * CPU. We want to avoid pulling in req->apoll->events for that
+         * case.
+         */
+        req->cflags = events;
         if (req->opcode == IORING_OP_POLL_ADD)
                 req->io_task_work.func = io_poll_task_func;
         else
@@ -5924,17 +6022,17 @@ static void __io_poll_execute(struct io_kiocb *req, int mask)
         io_req_task_work_add(req, false);
 }
 
-static inline void io_poll_execute(struct io_kiocb *req, int res)
+static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
 {
         if (io_poll_get_ownership(req))
-                __io_poll_execute(req, res);
+                __io_poll_execute(req, res, events);
 }
 
 static void io_poll_cancel_req(struct io_kiocb *req)
 {
         io_poll_mark_cancelled(req);
         /* kick tw, which should complete the request */
-        io_poll_execute(req, 0);
+        io_poll_execute(req, 0, 0);
 }
 
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -5948,7 +6046,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
         if (unlikely(mask & POLLFREE)) {
                 io_poll_mark_cancelled(req);
                 /* we have to kick tw in case it's not already */
-                io_poll_execute(req, 0);
+                io_poll_execute(req, 0, poll->events);
 
                 /*
                  * If the waitqueue is being freed early but someone is already
@@ -5978,8 +6076,9 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
                 if (mask && poll->events & EPOLLONESHOT) {
                         list_del_init(&poll->wait.entry);
                         poll->head = NULL;
+                        req->flags &= ~REQ_F_SINGLE_POLL;
                 }
-                __io_poll_execute(req, mask);
+                __io_poll_execute(req, mask, poll->events);
         }
         return 1;
 }
@@ -6014,12 +6113,14 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                         pt->error = -ENOMEM;
                         return;
                 }
+                req->flags |= REQ_F_DOUBLE_POLL;
                 io_init_poll_iocb(poll, first->events, first->wait.func);
                 *poll_ptr = poll;
                 if (req->opcode == IORING_OP_POLL_ADD)
                         req->flags |= REQ_F_ASYNC_DATA;
         }
 
+        req->flags |= REQ_F_SINGLE_POLL;
         pt->nr_entries++;
         poll->head = head;
         poll->wait.private = req;
@@ -6083,7 +6184,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
                 /* can't multishot if failed, just queue the event we've got */
                 if (unlikely(ipt->error || !ipt->nr_entries))
                         poll->events |= EPOLLONESHOT;
-                __io_poll_execute(req, mask);
+                __io_poll_execute(req, mask, poll->events);
                 return 0;
         }
         io_add_napi(req->file, req->ctx);
@@ -6094,7 +6195,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
          */
         v = atomic_dec_return(&req->poll_refs);
         if (unlikely(v & IO_POLL_REF_MASK))
-                __io_poll_execute(req, 0);
+                __io_poll_execute(req, 0, poll->events);
         return 0;
 }
 
@@ -6113,7 +6214,7 @@ enum {
         IO_APOLL_READY
 };
 
-static int io_arm_poll_handler(struct io_kiocb *req)
+static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 {
         const struct io_op_def *def = &io_op_defs[req->opcode];
         struct io_ring_ctx *ctx = req->ctx;
@@ -6138,9 +6239,16 @@ static int io_arm_poll_handler(struct io_kiocb *req)
                 mask |= POLLOUT | POLLWRNORM;
         }
 
-        apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
-        if (unlikely(!apoll))
-                return IO_APOLL_ABORTED;
+        if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+            !list_empty(&ctx->apoll_cache)) {
+                apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+                                         poll.wait.entry);
+                list_del_init(&apoll->poll.wait.entry);
+        } else {
+                apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+                if (unlikely(!apoll))
+                        return IO_APOLL_ABORTED;
+        }
         apoll->double_poll = NULL;
         req->apoll = apoll;
         req->flags |= REQ_F_POLLED;
@@ -6285,7 +6393,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
                 return -EINVAL;
 
         io_req_set_refcount(req);
-        poll->events = io_poll_parse_events(sqe, flags);
+        req->cflags = poll->events = io_poll_parse_events(sqe, flags);
         return 0;
 }
 
@@ -6402,10 +6510,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 
         if (IS_ERR(req))
                 return PTR_ERR(req);
-
-        req_set_fail(req);
-        io_fill_cqe_req(req, -ECANCELED, 0);
-        io_put_req_deferred(req);
+        io_req_task_queue_fail(req, -ECANCELED);
         return 0;
 }
 
@@ -7245,7 +7350,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
                         continue;
                 }
 
-                if (io_arm_poll_handler(req) == IO_APOLL_OK)
+                if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
                         return;
                 /* aborted or ready, in either case retry blocking */
                 needs_poll = false;
@@ -7391,7 +7496,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
 {
         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
 
-        switch (io_arm_poll_handler(req)) {
+        switch (io_arm_poll_handler(req, 0)) {
         case IO_APOLL_READY:
                 io_req_task_queue(req);
                 break;
@@ -7400,8 +7505,12 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
                  * Queued up for async execution, worker will release
                  * submit reference when the iocb is actually submitted.
                  */
+                io_kbuf_recycle(req);
                 io_queue_async_work(req, NULL);
                 break;
+        case IO_APOLL_OK:
+                io_kbuf_recycle(req);
+                break;
         }
 
         if (linked_timeout)
@@ -7770,8 +7879,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
                 }
                 /* will complete beyond this point, count as submitted */
                 submitted++;
-                if (io_submit_sqe(ctx, req, sqe))
-                        break;
+                if (io_submit_sqe(ctx, req, sqe)) {
+                        /*
+                         * Continue submitting even for sqe failure if the
+                         * ring was setup with IORING_SETUP_SUBMIT_ALL
+                         */
+                        if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
+                                break;
+                }
         } while (submitted < nr);
 
         if (unlikely(submitted != nr)) {
@@ -9829,7 +9944,7 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
                 return ret;
         }
         ev_fd->eventfd_async = eventfd_async;
-
+        ctx->has_evfd = true;
         rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
         return 0;
 }
@@ -9849,6 +9964,7 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
                                         lockdep_is_held(&ctx->uring_lock));
         if (ev_fd) {
+                ctx->has_evfd = false;
                 rcu_assign_pointer(ctx->io_ev_fd, NULL);
                 call_rcu(&ev_fd->rcu, io_eventfd_put);
                 return 0;
@@ -9859,11 +9975,20 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 
 static void io_destroy_buffers(struct io_ring_ctx *ctx)
 {
-        struct io_buffer *buf;
-        unsigned long index;
+        int i;
 
-        xa_for_each(&ctx->io_buffers, index, buf)
-                __io_remove_buffers(ctx, buf, index, -1U);
+        for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
+                struct list_head *list = &ctx->io_buffers[i];
+
+                while (!list_empty(list)) {
+                        struct io_buffer_list *bl;
+
+                        bl = list_first_entry(list, struct io_buffer_list, list);
+                        __io_remove_buffers(ctx, bl, -1U);
+                        list_del(&bl->list);
+                        kfree(bl);
+                }
+        }
 
         while (!list_empty(&ctx->io_buffers_pages)) {
                 struct page *page;
@@ -9902,6 +10027,18 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
                 wait_for_completion(&data->done);
 }
 
+static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
+{
+        struct async_poll *apoll;
+
+        while (!list_empty(&ctx->apoll_cache)) {
+                apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+                                         poll.wait.entry);
+                list_del(&apoll->poll.wait.entry);
+                kfree(apoll);
+        }
+}
+
 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
         io_sq_thread_finish(ctx);
@@ -9924,6 +10061,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
         if (ctx->rings)
                 __io_cqring_overflow_flush(ctx, true);
         io_eventfd_unregister(ctx);
+        io_flush_apoll_cache(ctx);
         mutex_unlock(&ctx->uring_lock);
         io_destroy_buffers(ctx);
         if (ctx->sq_creds)
@@ -9959,6 +10097,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
         io_free_napi_list(ctx);
         kfree(ctx->cancel_hash);
         kfree(ctx->dummy_ubuf);
+        kfree(ctx->io_buffers);
         kfree(ctx);
 }
 
@@ -11234,7 +11373,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
-                        IORING_SETUP_R_DISABLED))
+                        IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
                 return -EINVAL;
 
         return io_uring_create(entries, &p, params);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2772dec9dcea..8bde30fa5387 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1105,17 +1105,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
                 goto read_super_error;
         }
 
-        root = d_make_root(inode);
-        if (!root) {
-                status = -ENOMEM;
-                mlog_errno(status);
-                goto read_super_error;
-        }
-
-        sb->s_root = root;
-
-        ocfs2_complete_mount_recovery(osb);
-
         osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
                                                 &ocfs2_kset->kobj);
         if (!osb->osb_dev_kset) {
@@ -1133,6 +1122,17 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
                 goto read_super_error;
         }
 
+        root = d_make_root(inode);
+        if (!root) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto read_super_error;
+        }
+
+        sb->s_root = root;
+
+        ocfs2_complete_mount_recovery(osb);
+
         if (ocfs2_mount_local(osb))
                 snprintf(nodestr, sizeof(nodestr), "local");
         else
diff --git a/fs/pipe.c b/fs/pipe.c
index cc28623a67b6..2667db9506e2 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -253,7 +253,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
          */
         was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
         for (;;) {
-                unsigned int head = pipe->head;
+                /* Read ->head with a barrier vs post_one_notification() */
+                unsigned int head = smp_load_acquire(&pipe->head);
                 unsigned int tail = pipe->tail;
                 unsigned int mask = pipe->ring_size - 1;
 
@@ -831,10 +832,8 @@ void free_pipe_info(struct pipe_inode_info *pipe)
         int i;
 
 #ifdef CONFIG_WATCH_QUEUE
-        if (pipe->watch_queue) {
+        if (pipe->watch_queue)
                 watch_queue_clear(pipe->watch_queue);
-                put_watch_queue(pipe->watch_queue);
-        }
 #endif
 
         (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
@@ -844,6 +843,10 @@ void free_pipe_info(struct pipe_inode_info *pipe)
                 if (buf->ops)
                         pipe_buf_release(pipe, buf);
         }
+#ifdef CONFIG_WATCH_QUEUE
+        if (pipe->watch_queue)
+                put_watch_queue(pipe->watch_queue);
+#endif
         if (pipe->tmp_page)
                 __free_page(pipe->tmp_page);
         kfree(pipe->bufs);