diff options
Diffstat (limited to 'fs')
48 files changed, 601 insertions, 260 deletions
diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 1c7955f5cdaf..128f2dbe256a 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -203,8 +203,7 @@ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) */ void afs_init_callback_state(struct afs_server *server) { - if (!test_and_clear_bit(AFS_SERVER_FL_NEW, &server->flags)) - server->cb_s_break++; + server->cb_s_break++; } /* diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 8ee5972893ed..2f8acb4c556d 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -34,7 +34,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *); static int afs_deliver_yfs_cb_callback(struct afs_call *); #define CM_NAME(name) \ - const char afs_SRXCB##name##_name[] __tracepoint_string = \ + char afs_SRXCB##name##_name[] __tracepoint_string = \ "CB." #name /* diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1a4ce07fb406..9cedc3fc1b77 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -216,9 +216,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) set_nlink(inode, 2); inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; - inode->i_ctime.tv_sec = get_seconds(); - inode->i_ctime.tv_nsec = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime; + inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode); inode->i_blocks = 0; inode_set_iversion_raw(inode, 0); inode->i_generation = 0; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index bb1f244b2b3a..3904ab0b9563 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -474,7 +474,6 @@ struct afs_server { time64_t put_time; /* Time at which last put */ time64_t update_at; /* Time at which to next update the record */ unsigned long flags; -#define AFS_SERVER_FL_NEW 0 /* New server, don't inc cb_s_break */ #define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */ #define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */ #define AFS_SERVER_FL_VL_FAIL 3 /* Failed to access VL server */ @@ -827,7 +826,7 @@ static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) { - return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break; + return vnode->cb_break + vnode->cb_v_break; } static inline bool afs_cb_is_broken(unsigned int cb_break, @@ -835,7 +834,6 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, const struct afs_cb_interest *cbi) { return !cbi || cb_break != (vnode->cb_break + - cbi->server->cb_s_break + vnode->volume->cb_v_break); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 2c588f9bbbda..15c7e82d80cb 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -572,13 +572,17 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENODATA: case -EBADMSG: case -EMSGSIZE: - default: abort_code = RXGEN_CC_UNMARSHAL; if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KUM"); goto local_abort; + default: + abort_code = RX_USER_ABORT; + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + abort_code, ret, "KER"); + goto local_abort; } } @@ -610,6 +614,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, bool stalled = false; u64 rtt; u32 life, last_life; + bool rxrpc_complete = false; DECLARE_WAITQUEUE(myself, current); @@ -621,7 +626,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, rtt2 = 2; timeout = rtt2; - last_life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); + rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life); add_wait_queue(&call->waitq, &myself); for (;;) { @@ -639,7 +644,12 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, if (afs_check_call_state(call, AFS_CALL_COMPLETE)) break; - life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); + if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) { + /* rxrpc terminated the call. */ + rxrpc_complete = true; + break; + } + if (timeout == 0 && life == last_life && signal_pending(current)) { if (stalled) @@ -663,12 +673,16 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); - /* Kill off the call if it's still live. */ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { - _debug("call interrupted"); - if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - RX_USER_ABORT, -EINTR, "KWI")) - afs_set_call_complete(call, -EINTR, 0); + if (rxrpc_complete) { + afs_set_call_complete(call, call->error, call->abort_code); + } else { + /* Kill off the call if it's still live. */ + _debug("call interrupted"); + if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + RX_USER_ABORT, -EINTR, "KWI")) + afs_set_call_complete(call, -EINTR, 0); + } } spin_lock_bh(&call->state_lock); diff --git a/fs/afs/server.c b/fs/afs/server.c index 642afa2e9783..65b33b6da48b 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -226,7 +226,6 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, RCU_INIT_POINTER(server->addresses, alist); server->addr_version = alist->version; server->uuid = *uuid; - server->flags = (1UL << AFS_SERVER_FL_NEW); server->update_at = ktime_get_real_seconds() + afs_server_update_delay; rwlock_init(&server->fs_lock); INIT_HLIST_HEAD(&server->cb_volumes); diff --git a/fs/afs/write.c b/fs/afs/write.c index 72efcfcf9f95..0122d7445fba 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -264,6 +264,7 @@ static void afs_kill_pages(struct address_space *mapping, first = page->index + 1; lock_page(page); generic_error_remove_page(mapping, page); + unlock_page(page); } __pagevec_release(&pv); diff --git a/fs/block_dev.c b/fs/block_dev.c index 78d3257435c0..bb28e2ead679 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -264,7 +264,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio_for_each_segment_all(bvec, &bio, i, iter_all) { if (should_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); + if (!bio_flagged(&bio, BIO_NO_PAGE_REF)) + put_page(bvec->bv_page); } if (unlikely(bio.bi_status)) @@ -307,10 +308,10 @@ static void blkdev_bio_end_io(struct bio *bio) struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->should_dirty; - if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_status && !dio->bio.bi_status) - dio->bio.bi_status = bio->bi_status; - } else { + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; + + if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 920bf3b4b0ef..cccc75d15970 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/highmem.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -427,9 +428,13 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, unsigned long this_sum_bytes = 0; int i; u64 offset; + unsigned nofs_flag; + + nofs_flag = memalloc_nofs_save(); + sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), + GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); - sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), - GFP_NOFS); if (!sums) return BLK_STS_RESOURCE; @@ -472,8 +477,10 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, bytes_left = bio->bi_iter.bi_size - total_bytes; - sums = kzalloc(btrfs_ordered_sum_size(fs_info, bytes_left), - GFP_NOFS); + nofs_flag = memalloc_nofs_save(); + sums = kvzalloc(btrfs_ordered_sum_size(fs_info, + bytes_left), GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); BUG_ON(!sums); /* -ENOMEM */ sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 82fdda8ff5ab..2973608824ec 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6783,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); - u8 extent_type; + int extent_type = -1; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ec2d8919e7fb..cd4e693406a0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -501,6 +501,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* + * If the fs is mounted with nologreplay, which requires it to be + * mounted in RO mode as well, we can not allow discard on free space + * inside block groups, because log trees refer to extents that are not + * pinned in a block group's free space cache (pinning the extents is + * precisely the first phase of replaying a log tree). + */ + if (btrfs_test_opt(fs_info, NOLOGREPLAY)) + return -EROFS; + rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6fde2b2741ef..45e3cfd1198b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/writeback.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "transaction.h" #include "btrfs_inode.h" @@ -442,7 +443,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); list_del(&sum->list); - kfree(sum); + kvfree(sum); } kmem_cache_free(btrfs_ordered_extent_cache, entry); } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dc6140013ae8..61d22a56c0ba 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -366,11 +366,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, static int prop_compression_validate(const char *value, size_t len) { - if (!strncmp("lzo", value, len)) + if (!strncmp("lzo", value, 3)) return 0; - else if (!strncmp("zlib", value, len)) + else if (!strncmp("zlib", value, 4)) return 0; - else if (!strncmp("zstd", value, len)) + else if (!strncmp("zstd", value, 4)) return 0; return -EINVAL; @@ -396,7 +396,7 @@ static int prop_compression_apply(struct inode *inode, btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (!strncmp("zlib", value, 4)) { type = BTRFS_COMPRESS_ZLIB; - } else if (!strncmp("zstd", value, len)) { + } else if (!strncmp("zstd", value, 4)) { type = BTRFS_COMPRESS_ZSTD; btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } else { diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d09b6cdb785a..b283d3a6e837 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -205,28 +205,17 @@ static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) #ifdef CONFIG_STACKTRACE static void __save_stack_trace(struct ref_action *ra) { - struct stack_trace stack_trace; - - stack_trace.max_entries = MAX_TRACE; - stack_trace.nr_entries = 0; - stack_trace.entries = ra->trace; - stack_trace.skip = 2; - save_stack_trace(&stack_trace); - ra->trace_len = stack_trace.nr_entries; + ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2); } static void __print_stack_trace(struct btrfs_fs_info *fs_info, struct ref_action *ra) { - struct stack_trace trace; - if (ra->trace_len == 0) { btrfs_err(fs_info, " ref-verify: no stacktrace"); return; } - trace.nr_entries = ra->trace_len; - trace.entries = ra->trace; - print_stack_trace(&trace, 2); + stack_trace_print(ra->trace, ra->trace_len, 2); } #else static void inline __save_stack_trace(struct ref_action *ra) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a8f429882249..0637149fb9f9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1766,6 +1766,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { struct ceph_inode_info *dci = ceph_inode(dir); + unsigned hash; switch (dci->i_dir_layout.dl_dir_hash) { case 0: /* for backward compat */ @@ -1773,8 +1774,11 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) return dn->d_name.hash; default: - return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + spin_lock(&dn->d_lock); + hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash, dn->d_name.name, dn->d_name.len); + spin_unlock(&dn->d_lock); + return hash; } } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2d61ddda9bf5..c2feb310ac1e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1163,6 +1163,19 @@ static int splice_dentry(struct dentry **pdn, struct inode *in) return 0; } +static int d_name_cmp(struct dentry *dentry, const char *name, size_t len) +{ + int ret; + + /* take d_lock to ensure dentry->d_name stability */ + spin_lock(&dentry->d_lock); + ret = dentry->d_name.len - len; + if (!ret) + ret = memcmp(dentry->d_name.name, name, len); + spin_unlock(&dentry->d_lock); + return ret; +} + /* * Incorporate results into the local cache. This is either just * one inode, or a directory, dentry, and possibly linked-to inode (e.g., @@ -1412,7 +1425,8 @@ retry_lookup: err = splice_dentry(&req->r_dentry, in); if (err < 0) goto done; - } else if (rinfo->head->is_dentry) { + } else if (rinfo->head->is_dentry && + !d_name_cmp(req->r_dentry, rinfo->dname, rinfo->dname_len)) { struct ceph_vino *ptvino = NULL; if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) || diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 21c33ed048ed..9049c2a3e972 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1414,6 +1414,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } + + if (drop && + ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -2161,10 +2170,39 @@ retry: return path; } +/* Duplicate the dentry->d_name.name safely */ +static int clone_dentry_name(struct dentry *dentry, const char **ppath, + int *ppathlen) +{ + u32 len; + char *name; + +retry: + len = READ_ONCE(dentry->d_name.len); + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + return -ENOMEM; + + spin_lock(&dentry->d_lock); + if (dentry->d_name.len != len) { + spin_unlock(&dentry->d_lock); + kfree(name); + goto retry; + } + memcpy(name, dentry->d_name.name, len); + spin_unlock(&dentry->d_lock); + + name[len] = '\0'; + *ppath = name; + *ppathlen = len; + return 0; +} + static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath, bool parent_locked) { + int ret; char *path; rcu_read_lock(); @@ -2173,8 +2211,15 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (dir && ceph_snap(dir) == CEPH_NOSNAP) { *pino = ceph_ino(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + if (parent_locked) { + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + } else { + ret = clone_dentry_name(dentry, ppath, ppathlen); + if (ret) + return ret; + *pfreepath = true; + } return 0; } rcu_read_unlock(); @@ -2182,13 +2227,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } static int build_inode_path(struct inode *inode, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath) { struct dentry *dentry; char *path; @@ -2204,7 +2249,7 @@ static int build_inode_path(struct inode *inode, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } @@ -2215,7 +2260,7 @@ static int build_inode_path(struct inode *inode, static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, const char *rpath, u64 rino, const char **ppath, int *pathlen, - u64 *ino, int *freepath) + u64 *ino, bool *freepath, bool parent_locked) { int r = 0; @@ -2225,7 +2270,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, ceph_snap(rinode)); } else if (rdentry) { r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, - freepath); + freepath, parent_locked); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { @@ -2251,7 +2296,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; - int freepath1 = 0, freepath2 = 0; + bool freepath1 = false, freepath2 = false; int len; u16 releases; void *p, *end; @@ -2259,16 +2304,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1); + &path1, &pathlen1, &ino1, &freepath1, + test_bit(CEPH_MDS_R_PARENT_LOCKED, + &req->r_req_flags)); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* If r_old_dentry is set, then assume that its parent is locked */ ret = set_request_path_attr(NULL, req->r_old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2); + &path2, &pathlen2, &ino2, &freepath2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 89aa37fa0f84..b26e12cd8ec3 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -572,7 +572,12 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) old_snapc = NULL; update_snapc: - if (ci->i_head_snapc) { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); dout(" new snapc is %p\n", new_snapc); } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 5b18d4585740..585ad3207cb1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1333,6 +1333,7 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) } struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_READ_FLG 1 @@ -1855,6 +1856,7 @@ GLOBAL_EXTERN spinlock_t gidsidlock; #endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); +void cifs_queue_oplock_break(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 89006e044973..7037a137fa53 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -360,13 +360,31 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) return cifs_file; } -/* - * Release a reference on the file private data. This may involve closing - * the filehandle out on the server. Must be called without holding - * tcon->open_file_lock and cifs_file->file_info_lock. +/** + * cifsFileInfo_put - release a reference of file priv data + * + * Always potentially wait for oplock handler. See _cifsFileInfo_put(). */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { + _cifsFileInfo_put(cifs_file, true); +} + +/** + * _cifsFileInfo_put - release a reference of file priv data + * + * This may involve closing the filehandle @cifs_file out on the + * server. Must be called without holding tcon->open_file_lock and + * cifs_file->file_info_lock. + * + * If @wait_for_oplock_handler is true and we are releasing the last + * reference, wait for any running oplock break handler of the file + * and cancel any pending one. If calling this function from the + * oplock break handler, you need to pass false. + * + */ +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) +{ struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -414,7 +432,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) spin_unlock(&tcon->open_file_lock); - oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); + oplock_break_cancelled = wait_oplock_handler ? + cancel_work_sync(&cifs_file->oplock_break) : false; if (!tcon->need_reconnect && !cifs_file->invalidHandle) { struct TCP_Server_Info *server = tcon->ses->server; @@ -2858,7 +2877,6 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct dentry *dentry = ctx->cfile->dentry; - unsigned int i; int rc; tcon = tlink_tcon(ctx->cfile->tlink); @@ -2922,10 +2940,6 @@ restart_loop: kref_put(&wdata->refcount, cifs_uncached_writedata_release); } - if (!ctx->direct_io) - for (i = 0; i < ctx->npages; i++) - put_page(ctx->bv[i].bv_page); - cifs_stats_bytes_written(tcon, ctx->total_len); set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags); @@ -3563,7 +3577,6 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx) struct iov_iter *to = &ctx->iter; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - unsigned int i; int rc; tcon = tlink_tcon(ctx->cfile->tlink); @@ -3647,15 +3660,8 @@ again: kref_put(&rdata->refcount, cifs_uncached_readdata_release); } - if (!ctx->direct_io) { - for (i = 0; i < ctx->npages; i++) { - if (ctx->should_dirty) - set_page_dirty(ctx->bv[i].bv_page); - put_page(ctx->bv[i].bv_page); - } - + if (!ctx->direct_io) ctx->total_len = ctx->len - iov_iter_count(to); - } /* mask nodata case */ if (rc == -ENODATA) @@ -4603,6 +4609,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + _cifsFileInfo_put(cfile, false /* do not wait for ourself */); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 53fdb5df0d2e..538fd7d807e4 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1735,6 +1735,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (rc == 0 || rc != -EBUSY) goto do_rename_exit; + /* Don't fall back to using SMB on SMB 2+ mount */ + if (server->vals->protocol_id != 0) + goto do_rename_exit; + /* open-file renames don't work across directories */ if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index bee203055b30..0dc6f08020ac 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -501,8 +501,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &pCifsInode->flags); - queue_work(cifsoplockd_wq, - &netfile->oplock_break); + cifs_queue_oplock_break(netfile); netfile->oplock_break_cancelled = false; spin_unlock(&tcon->open_file_lock); @@ -607,6 +606,28 @@ void cifs_put_writer(struct cifsInodeInfo *cinode) spin_unlock(&cinode->writers_lock); } +/** + * cifs_queue_oplock_break - queue the oplock break handler for cfile + * + * This function is called from the demultiplex thread when it + * receives an oplock break for @cfile. + * + * Assumes the tcon->open_file_lock is held. + * Assumes cfile->file_info_lock is NOT held. + */ +void cifs_queue_oplock_break(struct cifsFileInfo *cfile) +{ + /* + * Bump the handle refcount now while we hold the + * open_file_lock to enforce the validity of it for the oplock + * break handler. The matching put is done at the end of the + * handler. + */ + cifsFileInfo_get(cfile); + + queue_work(cifsoplockd_wq, &cfile->oplock_break); +} + void cifs_done_oplock_break(struct cifsInodeInfo *cinode) { clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); @@ -768,6 +789,11 @@ cifs_aio_ctx_alloc(void) { struct cifs_aio_ctx *ctx; + /* + * Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io + * to false so that we know when we have to unreference pages within + * cifs_aio_ctx_release() + */ ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL); if (!ctx) return NULL; @@ -786,7 +812,23 @@ cifs_aio_ctx_release(struct kref *refcount) struct cifs_aio_ctx, refcount); cifsFileInfo_put(ctx->cfile); - kvfree(ctx->bv); + + /* + * ctx->bv is only set if setup_aio_ctx_iter() was call successfuly + * which means that iov_iter_get_pages() was a success and thus that + * we have taken reference on pages. + */ + if (ctx->bv) { + unsigned i; + + for (i = 0; i < ctx->npages; i++) { + if (ctx->should_dirty) + set_page_dirty(ctx->bv[i].bv_page); + put_page(ctx->bv[i].bv_page); + } + kvfree(ctx->bv); + } + kfree(ctx); } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 0e3570e40ff8..e311f58dc1c8 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -555,7 +555,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); - queue_work(cifsoplockd_wq, &cfile->oplock_break); + cifs_queue_oplock_break(cfile); kfree(lw); return true; } @@ -712,8 +712,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); spin_unlock(&cfile->file_info_lock); - queue_work(cifsoplockd_wq, - &cfile->oplock_break); + + cifs_queue_oplock_break(cfile); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 00225e699d03..c36ff0d1fe2a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2389,6 +2389,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov, &resp_buftype); + if (!rc) + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); if (!rc || !err_iov.iov_base) { rc = -ENOENT; goto free_path; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 21ad01d55ab2..a37774a55f3a 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -832,8 +832,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { /* ops set to 3.0 by default for default so update */ ses->server->ops = &smb21_operations; - } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) + ses->server->vals = &smb21_values; + } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { ses->server->ops = &smb311_operations; + ses->server->vals = &smb311_values; + } } else if (le16_to_cpu(rsp->DialectRevision) != ses->server->vals->protocol_id) { /* if requested single dialect ensure returned dialect matched */ @@ -3448,8 +3451,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_nvec = 1; rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); - rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; if (rc) { @@ -3465,12 +3466,15 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->tcon->tid, ses->Suid, io_parms->offset, 0); free_rsp_buf(resp_buftype, rsp_iov.iov_base); + cifs_small_buf_release(req); return rc == -ENODATA ? 0 : rc; } else trace_smb3_read_done(xid, req->PersistentFileId, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length); + cifs_small_buf_release(req); + *nbytes = le32_to_cpu(rsp->DataLength); if ((*nbytes > CIFS_MAX_MSGSIZE) || (*nbytes > io_parms->length)) { @@ -3769,7 +3773,6 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { @@ -3787,6 +3790,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->offset, *nbytes); } + cifs_small_buf_release(req); free_rsp_buf(resp_buftype, rsp); return rc; } @@ -33,6 +33,7 @@ #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> +#include <asm/pgalloc.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -1407,7 +1408,9 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; + struct vm_area_struct *vma = vmf->vma; struct inode *inode = mapping->host; + pgtable_t pgtable = NULL; struct page *zero_page; spinlock_t *ptl; pmd_t pmd_entry; @@ -1422,12 +1425,22 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, DAX_PMD | DAX_ZERO_PAGE, false); + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { spin_unlock(ptl); goto fallback; } + if (pgtable) { + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + mm_inc_nr_ptes(vma->vm_mm); + } pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); @@ -1436,6 +1449,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, return VM_FAULT_NOPAGE; fallback: + if (pgtable) + pte_free(vma->vm_mm, pgtable); trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); return VM_FAULT_FALLBACK; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8a63e52785e9..9971a35cf1ef 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; ret = -EINVAL; - if (rem < len) { - pipe_unlock(pipe); - goto out; - } + if (rem < len) + goto out_free; rem = len; while (rem) { @@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; } else { - pipe_buf_get(pipe, ibuf); + if (!pipe_buf_get(pipe, ibuf)) + goto out_free; + *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; @@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); pipe_lock(pipe); +out_free: for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); pipe_unlock(pipe); -out: kvfree(bufs); return ret; } diff --git a/fs/inode.c b/fs/inode.c index e9d97add2b36..9a453f3637f8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1817,8 +1817,13 @@ int file_remove_privs(struct file *file) int kill; int error = 0; - /* Fast path for nothing security related */ - if (IS_NOSEC(inode)) + /* + * Fast path for nothing security related. + * As well for non-regular files, e.g. blkdev inodes. + * For example, blkdev_write_iter() might get here + * trying to remove privs which it is not allowed to. + */ + if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(dentry); diff --git a/fs/io_uring.c b/fs/io_uring.c index 07d6ef195d05..84efb8956734 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4,15 +4,28 @@ * supporting fast/efficient IO. * * A note on the read/write ordering memory barriers that are matched between - * the application and kernel side. When the application reads the CQ ring - * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() - * the kernel uses after writing the tail. Failure to do so could cause a - * delay in when the application notices that completion events available. - * This isn't a fatal condition. Likewise, the application must use an - * appropriate smp_wmb() both before writing the SQ tail, and after writing - * the SQ tail. The first one orders the sqe writes with the tail write, and - * the latter is paired with the smp_rmb() the kernel will issue before - * reading the SQ tail on submission. + * the application and kernel side. + * + * After the application reads the CQ ring tail, it must use an + * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses + * before writing the tail (using smp_load_acquire to read the tail will + * do). It also needs a smp_mb() before updating CQ head (ordering the + * entry load(s) with the head store), pairing with an implicit barrier + * through a control-dependency in io_get_cqring (smp_store_release to + * store head will do). Failure to do so could lead to reading invalid + * CQ entries. + * + * Likewise, the application must use an appropriate smp_wmb() before + * writing the SQ tail (ordering SQ entry stores with the tail store), + * which pairs with smp_load_acquire in io_get_sqring (smp_store_release + * to store the tail will do). And it needs a barrier ordering the SQ + * head load before writing new SQ entries (smp_load_acquire to read + * head will do). + * + * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application + * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* + * updating the SQ tail; a full memory barrier smp_mb() is needed + * between. * * Also see the examples in the liburing library: * @@ -70,20 +83,108 @@ struct io_uring { u32 tail ____cacheline_aligned_in_smp; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_SQ_RING. + * + * The offsets to the member fields are published through struct + * io_sqring_offsets when calling io_uring_setup. + */ struct io_sq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The kernel controls head and the application controls tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of invalid entries dropped by the kernel due to + * invalid index stored in array + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * After a new SQ head value was read by the application this + * counter includes all submissions that were dropped reaching + * the new SQ head (and possibly more). + */ u32 dropped; + /* + * Runtime flags + * + * Written by the kernel, shouldn't be modified by the + * application. + * + * The application needs a full memory barrier before checking + * for IORING_SQ_NEED_WAKEUP after updating the sq tail. + */ u32 flags; + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. + */ u32 array[]; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_CQ_RING. + * + * The offsets to the member fields are published through struct + * io_cqring_offsets when calling io_uring_setup. + */ struct io_cq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The application controls head and the kernel tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of completion events lost because the queue was full; + * this should be avoided by the application by making sure + * there are not more requests pending thatn there is space in + * the completion queue. + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * As completion events come in out of order this counter is not + * ordered with any other data. + */ u32 overflow; + /* + * Ring buffer of completion events. + * + * The kernel writes completion events fresh every time they are + * produced, so the application is allowed to modify pending + * entries. + */ struct io_uring_cqe cqes[]; }; @@ -221,7 +322,7 @@ struct io_kiocb { struct list_head list; unsigned int flags; refcount_t refs; -#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ +#define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_SEQ_PREV 8 /* sequential with previous */ @@ -317,12 +418,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) /* order cqe stores with ring update */ smp_store_release(&ring->r.tail, ctx->cached_cq_tail); - /* - * Write sider barrier of tail update, app has read side. See - * comment at the top of this file. - */ - smp_wmb(); - if (wq_has_sleeper(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); @@ -336,9 +431,12 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) unsigned tail; tail = ctx->cached_cq_tail; - /* See comment at the top of the file */ - smp_rmb(); - if (tail + 1 == READ_ONCE(ring->r.head)) + /* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ + if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) return NULL; ctx->cached_cq_tail++; @@ -682,11 +780,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_add_tail(&req->list, &ctx->poll_list); } -static void io_file_put(struct io_submit_state *state, struct file *file) +static void io_file_put(struct io_submit_state *state) { - if (!state) { - fput(file); - } else if (state->file) { + if (state->file) { int diff = state->has_refs - state->used_refs; if (diff) @@ -711,7 +807,7 @@ static struct file *io_file_get(struct io_submit_state *state, int fd) state->ios_left--; return state->file; } - io_file_put(state, NULL); + io_file_put(state); } state->file = fget_many(fd, state->ios_left); if (!state->file) @@ -742,7 +838,7 @@ static bool io_file_supports_async(struct file *file) } static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { const struct io_uring_sqe *sqe = s->sqe; struct io_ring_ctx *ctx = req->ctx; @@ -776,10 +872,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) return ret; - if (force_nonblock) { + + /* don't allow async punt if RWF_NOWAIT was requested */ + if (kiocb->ki_flags & IOCB_NOWAIT) + req->flags |= REQ_F_NOWAIT; + + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; - req->flags |= REQ_F_FORCE_NONBLOCK; - } + if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) @@ -940,7 +1040,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) } static int io_read(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -949,7 +1049,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, size_t iov_count; int ret; - ret = io_prep_rw(req, s, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock); if (ret) return ret; file = kiocb->ki_filp; @@ -987,7 +1087,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, } static int io_write(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -996,7 +1096,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, size_t iov_count; int ret; - ret = io_prep_rw(req, s, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock); if (ret) return ret; @@ -1338,8 +1438,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) } static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct sqe_submit *s, bool force_nonblock, - struct io_submit_state *state) + const struct sqe_submit *s, bool force_nonblock) { int ret, opcode; @@ -1355,18 +1454,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_READV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_read(req, s, force_nonblock, state); + ret = io_read(req, s, force_nonblock); break; case IORING_OP_WRITEV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_write(req, s, force_nonblock, state); + ret = io_write(req, s, force_nonblock); break; case IORING_OP_READ_FIXED: - ret = io_read(req, s, force_nonblock, state); + ret = io_read(req, s, force_nonblock); break; case IORING_OP_WRITE_FIXED: - ret = io_write(req, s, force_nonblock, state); + ret = io_write(req, s, force_nonblock); break; case IORING_OP_FSYNC: ret = io_fsync(req, s->sqe, force_nonblock); @@ -1439,8 +1538,7 @@ restart: struct sqe_submit *s = &req->submit; const struct io_uring_sqe *sqe = s->sqe; - /* Ensure we clear previously set forced non-block flag */ - req->flags &= ~REQ_F_FORCE_NONBLOCK; + /* Ensure we clear previously set non-block flag */ req->rw.ki_flags &= ~IOCB_NOWAIT; ret = 0; @@ -1459,7 +1557,7 @@ restart: s->has_user = cur_mm != NULL; s->needs_lock = true; do { - ret = __io_submit_sqe(ctx, req, s, false, NULL); + ret = __io_submit_sqe(ctx, req, s, false); /* * We can get EAGAIN for polled IO even though * we're forcing a sync submission from here, @@ -1470,10 +1568,11 @@ restart: break; cond_resched(); } while (1); - - /* drop submission reference */ - io_put_req(req); } + + /* drop submission reference */ + io_put_req(req); + if (ret) { io_cqring_add_event(ctx, sqe->user_data, ret, 0); io_put_req(req); @@ -1625,8 +1724,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, if (unlikely(ret)) goto out; - ret = __io_submit_sqe(ctx, req, s, true, state); - if (ret == -EAGAIN) { + ret = __io_submit_sqe(ctx, req, s, true); + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); @@ -1671,7 +1770,7 @@ out: static void io_submit_state_end(struct io_submit_state *state) { blk_finish_plug(&state->plug); - io_file_put(state, NULL); + io_file_put(state); if (state->free_reqs) kmem_cache_free_bulk(req_cachep, state->free_reqs, &state->reqs[state->cur_req]); @@ -1700,24 +1799,10 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * write new data to them. */ smp_store_release(&ring->r.head, ctx->cached_sq_head); - - /* - * write side barrier of head update, app has read side. See - * comment at the top of this file - */ - smp_wmb(); } } /* - * Undo last io_get_sqring() - */ -static void io_drop_sqring(struct io_ring_ctx *ctx) -{ - ctx->cached_sq_head--; -} - -/* * Fetch an sqe, if one is available. Note that s->sqe will point to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always @@ -1739,9 +1824,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) * though the application is the one updating it. */ head = ctx->cached_sq_head; - /* See comment at the top of this file */ - smp_rmb(); - if (head == READ_ONCE(ring->r.tail)) + /* make sure SQ entry isn't read before tail */ + if (head == smp_load_acquire(&ring->r.tail)) return false; head = READ_ONCE(ring->array[head & ctx->sq_mask]); @@ -1755,8 +1839,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) /* drop invalid entries */ ctx->cached_sq_head++; ring->dropped++; - /* See comment at the top of this file */ - smp_wmb(); return false; } @@ -1866,7 +1948,8 @@ static int io_sq_thread(void *data) /* Tell userspace we may need a wakeup call */ ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; - smp_wmb(); + /* make sure to read SQ tail after writing flags */ + smp_mb(); if (!io_get_sqring(ctx, &sqes[0])) { if (kthread_should_stop()) { @@ -1879,13 +1962,11 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); continue; } finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); } i = 0; @@ -1920,13 +2001,17 @@ static int io_sq_thread(void *data) unuse_mm(cur_mm); mmput(cur_mm); } + + if (kthread_should_park()) + kthread_parkme(); + return 0; } static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; - int i, ret = 0, submit = 0; + int i, submit = 0; if (to_submit > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, to_submit); @@ -1935,6 +2020,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) for (i = 0; i < to_submit; i++) { struct sqe_submit s; + int ret; if (!io_get_sqring(ctx, &s)) break; @@ -1942,21 +2028,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; + submit++; ret = io_submit_sqe(ctx, &s, statep); - if (ret) { - io_drop_sqring(ctx); - break; - } - - submit++; + if (ret) + io_cqring_add_event(ctx, s.sqe->user_data, ret, 0); } io_commit_sqring(ctx); if (statep) io_submit_state_end(statep); - return submit ? submit : ret; + return submit; } static unsigned io_cqring_events(struct io_cq_ring *ring) @@ -2054,6 +2137,7 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) if (ctx->sqo_thread) { ctx->sqo_stop = 1; mb(); + kthread_park(ctx->sqo_thread); kthread_stop(ctx->sqo_thread); ctx->sqo_thread = NULL; } @@ -2236,19 +2320,23 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, mmgrab(current->mm); ctx->sqo_mm = current->mm; - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); - if (!ctx->sq_thread_idle) - ctx->sq_thread_idle = HZ; + if (ctx->flags & IORING_SETUP_SQPOLL) { + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto err; - ret = -EINVAL; - if (!cpu_possible(p->sq_thread_cpu)) - goto err; + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; - if (ctx->flags & IORING_SETUP_SQPOLL) { if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu; + int cpu = array_index_nospec(p->sq_thread_cpu, + nr_cpu_ids); + + ret = -EINVAL; + if (!cpu_possible(cpu)) + goto err; - cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, ctx, cpu, "io_uring-sq"); @@ -2309,8 +2397,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) static void io_mem_free(void *ptr) { - struct page *page = virt_to_head_page(ptr); + struct page *page; + + if (!ptr) + return; + page = virt_to_head_page(ptr); if (put_page_testzero(page)) free_compound_page(page); } @@ -2351,7 +2443,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); - kfree(imu->bvec); + kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -2443,9 +2535,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, if (!pages || nr_pages > got_pages) { kfree(vmas); kfree(pages); - pages = kmalloc_array(nr_pages, sizeof(struct page *), + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - vmas = kmalloc_array(nr_pages, + vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), GFP_KERNEL); if (!pages || !vmas) { @@ -2457,7 +2549,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, got_pages = nr_pages; } - imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), + imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { @@ -2496,6 +2588,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, } if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); + kvfree(imu->bvec); goto err; } @@ -2518,12 +2611,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ctx->nr_user_bufs++; } - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); return 0; err: - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); io_sqe_buffer_unregister(ctx); return ret; } @@ -2561,9 +2654,13 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) __poll_t mask = 0; poll_wait(file, &ctx->cq_wait, wait); - /* See comment at the top of this file */ + /* + * synchronizes with barrier from wq_has_sleeper call in + * io_commit_cqring + */ smp_rmb(); - if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) + if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != + ctx->sq_ring->ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; @@ -2674,24 +2771,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_lock(&ctx->uring_lock); submitted = io_ring_submit(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - - if (submitted < 0) - goto out_ctx; } if (flags & IORING_ENTER_GETEVENTS) { unsigned nr_events = 0; min_complete = min(min_complete, ctx->cq_entries); - /* - * The application could have included the 'to_submit' count - * in how many events it wanted to wait for. If we failed to - * submit the desired count, we may need to adjust the number - * of events to poll/wait for. - */ - if (submitted < to_submit) - min_complete = min_t(unsigned, submitted, min_complete); - if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); ret = io_iopoll_check(ctx, &nr_events, min_complete); @@ -2737,17 +2822,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->sq_ring); + if (!ctx->sq_sqes) return -ENOMEM; - } cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); - if (!cq_ring) { - io_mem_free(ctx->sq_ring); - io_mem_free(ctx->sq_sqes); + if (!cq_ring) return -ENOMEM; - } ctx->cq_ring = cq_ring; cq_ring->ring_mask = p->cq_entries - 1; @@ -2918,11 +2998,31 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) + __releases(ctx->uring_lock) + __acquires(ctx->uring_lock) { int ret; + /* + * We're inside the ring mutex, if the ref is already dying, then + * someone else killed the ctx or is already going through + * io_uring_register(). + */ + if (percpu_ref_is_dying(&ctx->refs)) + return -ENXIO; + percpu_ref_kill(&ctx->refs); + + /* + * Drop uring mutex before waiting for references to exit. If another + * thread is currently inside io_uring_enter() it might need to grab + * the uring_lock to make progress. If we hold it here across the drain + * wait, then we can deadlock. It's safe to drop the mutex here, since + * no new references will come in after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); wait_for_completion(&ctx->ctx_done); + mutex_lock(&ctx->uring_lock); switch (opcode) { case IORING_REGISTER_BUFFERS: diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index ff6f85fb676b..5196bfa7894d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -329,9 +329,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, }; ssize_t err, err2; - if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) - return -EOPNOTSUPP; - src_lock = nfs_get_lock_context(nfs_file_open_context(src)); if (IS_ERR(src_lock)) return PTR_ERR(src_lock); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 45b2322e092d..00d17198ee12 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,8 +133,10 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { + if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY)) + return -EOPNOTSUPP; if (file_inode(file_in) == file_inode(file_out)) - return -EINVAL; + return -EOPNOTSUPP; return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index cfcabc33e24d..602446158bfb 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2589,7 +2589,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, ARRAY_SIZE(nfs4_acl_bitmap), &hdr); rpc_prepare_reply_pages(req, args->acl_pages, 0, - args->acl_len, replen); + args->acl_len, replen + 1); encode_nops(&hdr); } @@ -2811,7 +2811,7 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, } rpc_prepare_reply_pages(req, (struct page **)&args->page, 0, - PAGE_SIZE, replen); + PAGE_SIZE, replen + 1); encode_nops(&hdr); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 23790c7b2289..c27ac96a95bd 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2041,7 +2041,8 @@ static int nfs23_validate_mount_data(void *options, memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); args->nfs_server.port = ntohs(data->addr.sin_port); - if (!nfs_verify_server_address(sap)) + if (sap->sa_family != AF_INET || + !nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 8f933e84cec1..9bc32af4e2da 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -442,7 +442,9 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; __be32 nfserr; - int count; + int count = 0; + struct page **p; + caddr_t page_addr = NULL; dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -462,7 +464,18 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie, &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); - resp->count = resp->buffer - argp->buffer; + count = 0; + for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { + page_addr = page_address(*p); + + if (((caddr_t)resp->buffer >= page_addr) && + ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { + count += (caddr_t)resp->buffer - page_addr; + break; + } + count += PAGE_SIZE; + } + resp->count = count >> 2; if (resp->offset) { loff_t offset = argp->cookie; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 93fea246f676..8d789124ed3c 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -573,6 +573,7 @@ int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readdirargs *args = rqstp->rq_argp; + int len; u32 max_blocksize = svc_max_payload(rqstp); p = decode_fh(p, &args->fh); @@ -582,8 +583,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); - args->count = min_t(u32, args->count, max_blocksize); - args->buffer = page_address(*(rqstp->rq_next_page++)); + len = args->count = min_t(u32, args->count, max_blocksize); + + while (len > 0) { + struct page *p = *(rqstp->rq_next_page++); + if (!args->buffer) + args->buffer = page_address(p); + len -= PAGE_SIZE; + } return xdr_argsize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index d219159b98af..7caa3801ce72 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1010,8 +1010,9 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) cb->cb_seq_status = 1; cb->cb_status = 0; if (minorversion) { - if (!nfsd41_cb_get_slot(clp, task)) + if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task)) return; + cb->cb_holds_slot = true; } rpc_call_start(task); } @@ -1038,6 +1039,9 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback return true; } + if (!cb->cb_holds_slot) + goto need_restart; + switch (cb->cb_seq_status) { case 0: /* @@ -1076,6 +1080,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback cb->cb_seq_status); } + cb->cb_holds_slot = false; clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, @@ -1283,6 +1288,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_seq_status = 1; cb->cb_status = 0; cb->cb_need_restart = false; + cb->cb_holds_slot = false; } void nfsd4_run_cb(struct nfsd4_callback *cb) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6a45fb00c5fc..f056b1d3fecd 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -265,6 +265,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { + locks_delete_block(&nbl->nbl_lock); locks_release_private(&nbl->nbl_lock); kfree(nbl); } @@ -293,11 +294,18 @@ remove_blocked_locks(struct nfs4_lockowner *lo) nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); - locks_delete_block(&nbl->nbl_lock); free_blocked_lock(nbl); } } +static void +nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb) +{ + struct nfsd4_blocked_lock *nbl = container_of(cb, + struct nfsd4_blocked_lock, nbl_cb); + locks_delete_block(&nbl->nbl_lock); +} + static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { @@ -325,6 +333,7 @@ nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) } static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { + .prepare = nfsd4_cb_notify_lock_prepare, .done = nfsd4_cb_notify_lock_done, .release = nfsd4_cb_notify_lock_release, }; @@ -4863,7 +4872,6 @@ nfs4_laundromat(struct nfsd_net *nn) nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); - locks_delete_block(&nbl->nbl_lock); free_blocked_lock(nbl); } out: diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 396c76755b03..9d6cb246c6c5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -70,6 +70,7 @@ struct nfsd4_callback { int cb_seq_status; int cb_status; bool cb_need_restart; + bool cb_holds_slot; }; struct nfsd4_callback_ops { diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 6b9c27548997..63c6bb1f8c4d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -346,10 +346,16 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) __kernel_fsid_t fsid = {}; fsnotify_foreach_obj_type(type) { + struct fsnotify_mark_connector *conn; + if (!fsnotify_iter_should_report_type(iter_info, type)) continue; - fsid = iter_info->marks[type]->connector->fsid; + conn = READ_ONCE(iter_info->marks[type]->connector); + /* Mark is just getting destroyed or created? */ + if (!conn) + continue; + fsid = conn->fsid; if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) continue; return fsid; @@ -408,8 +414,12 @@ static int fanotify_handle_event(struct fsnotify_group *group, return 0; } - if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { fsid = fanotify_get_fsid(iter_info); + /* Racing with mark destruction or creation? */ + if (!fsid.val[0] && !fsid.val[1]) + return 0; + } event = fanotify_alloc_event(group, inode, mask, data, data_type, &fsid); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d593d4269561..22acb0a79b53 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -239,13 +239,13 @@ static void fsnotify_drop_object(unsigned int type, void *objp) void fsnotify_put_mark(struct fsnotify_mark *mark) { - struct fsnotify_mark_connector *conn; + struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ - if (!mark->connector) { + if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; @@ -255,10 +255,9 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ - if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; - conn = mark->connector; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); @@ -266,7 +265,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) } else { __fsnotify_recalc_mask(conn); } - mark->connector = NULL; + WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); @@ -620,7 +619,7 @@ restart: /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: - mark->connector = conn; + WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); @@ -808,6 +807,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; + WRITE_ONCE(mark->connector, NULL); } /* diff --git a/fs/pipe.c b/fs/pipe.c index 070aad543382..41065901106b 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal); * in the tee() system call, when we duplicate the buffers in one * pipe into another. */ -void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - get_page(buf->page); + return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); diff --git a/fs/proc/base.c b/fs/proc/base.c index 6a803a0b75df..f179568b4c76 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -407,7 +407,6 @@ static void unlock_trace(struct task_struct *task) static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - struct stack_trace trace; unsigned long *entries; int err; @@ -430,20 +429,17 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, if (!entries) return -ENOMEM; - trace.nr_entries = 0; - trace.max_entries = MAX_STACK_TRACE_DEPTH; - trace.entries = entries; - trace.skip = 0; - err = lock_trace(task); if (!err) { - unsigned int i; + unsigned int i, nr_entries; - save_stack_trace_tsk(task, &trace); + nr_entries = stack_trace_save_tsk(task, entries, + MAX_STACK_TRACE_DEPTH, 0); - for (i = 0; i < trace.nr_entries; i++) { + for (i = 0; i < nr_entries; i++) { seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } + unlock_trace(task); } kfree(entries); @@ -489,10 +485,9 @@ static int lstats_show_proc(struct seq_file *m, void *v) lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (bt == ULONG_MAX) - break; seq_printf(m, " %ps", (void *)bt); } seq_putc(m, '\n'); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d65390727541..7325baa8f9d4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1626,9 +1626,11 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; - if (parent) + if (parent) { put_links(header); - start_unregistering(header); + start_unregistering(header); + } + if (!--header->count) kfree_rcu(header, rcu); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 92a91e7816d8..95ca1fe7283c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1143,6 +1143,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } + /* + * Avoid to modify vma->vm_flags + * without locked ops while the + * coredump reads the vm_flags. + */ + if (!mmget_still_valid(mm)) { + /* + * Silently return "count" + * like if get_task_mm() + * failed. FIXME: should this + * function have returned + * -ESRCH if get_task_mm() + * failed like if + * get_proc_task() fails? + */ + up_write(&mm->mmap_sem); + goto out_mm; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/splice.c b/fs/splice.c index 3ee7e82df48f..25212dcca2df 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -330,8 +330,8 @@ const struct pipe_buf_operations default_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return 1; } @@ -1593,7 +1593,11 @@ retry: * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } *obuf = *ibuf; /* @@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } obuf = opipe->bufs + nbuf; *obuf = *ibuf; diff --git a/fs/super.c b/fs/super.c index 583a0124bc39..2739f57515f8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1467,11 +1467,6 @@ int vfs_get_tree(struct fs_context *fc) struct super_block *sb; int error; - if (fc->fs_type->fs_flags & FS_REQUIRES_DEV && !fc->source) { - errorf(fc, "Filesystem requires source device"); - return -ENOENT; - } - if (fc->root) return -EBUSY; diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 1fd3011ea623..7fd4802222b8 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -229,7 +229,7 @@ ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) case UFS_UID_44BSD: return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid); case UFS_UID_EFT: - if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + if (inode->ui_u1.oldids.ui_sgid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); /* Fall through */ default: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 89800fc7dc9d..f5de1e726356 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -629,6 +629,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ down_write(&mm->mmap_sem); + /* no task can run (and in turn coredump) yet */ + VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -883,6 +885,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_sem for writing. */ down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto skip_mm; prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -905,6 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } +skip_mm: up_write(&mm->mmap_sem); mmput(mm); wakeup: @@ -1333,6 +1338,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1520,6 +1527,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; |