diff options
Diffstat (limited to 'fs')
153 files changed, 1310 insertions, 762 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index b740017634ef..b6ba22975781 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/stat.h> diff --git a/fs/affs/file.c b/fs/affs/file.c index cefa222f7881..8daeed31e1af 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -880,7 +880,7 @@ affs_truncate(struct inode *inode) if (inode->i_size > AFFS_I(inode)->mmu_private) { struct address_space *mapping = inode->i_mapping; struct page *page; - void *fsdata; + void *fsdata = NULL; loff_t isize = inode->i_size; int res; diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 7dcd59693a0c..d4ddb20d6732 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -13,6 +13,8 @@ #include "internal.h" #include "afs_cm.h" #include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include <trace/events/rxrpc.h> static int afs_deliver_cb_init_call_back_state(struct afs_call *); static int afs_deliver_cb_init_call_back_state3(struct afs_call *); @@ -191,7 +193,7 @@ static void afs_cm_destructor(struct afs_call *call) * Abort a service call from within an action function. */ static void afs_abort_service_call(struct afs_call *call, u32 abort_code, int error, - const char *why) + enum rxrpc_abort_reason why) { rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, error, why); @@ -469,7 +471,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0) afs_send_empty_reply(call); else - afs_abort_service_call(call, 1, 1, "K-1"); + afs_abort_service_call(call, 1, 1, afs_abort_probeuuid_negative); afs_put_call(call); _leave(""); diff --git a/fs/afs/flock.c b/fs/afs/flock.c index bbcc5afd1576..9c6dea3139f5 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -451,7 +451,7 @@ static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key, */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct inode *inode = locks_inode(file); + struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); enum afs_flock_mode mode = AFS_FS_S(inode->i_sb)->flock_mode; afs_lock_type_t type; @@ -701,7 +701,7 @@ error: */ static int afs_do_unlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); int ret; _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); @@ -721,7 +721,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) */ static int afs_do_getlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct key *key = afs_file_key(file); int ret, lock_count; @@ -763,7 +763,7 @@ error: */ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); enum afs_flock_operation op; int ret; @@ -798,7 +798,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) */ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); enum afs_flock_operation op; int ret; @@ -843,7 +843,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) */ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file)); _enter(""); @@ -861,7 +861,7 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) */ static void afs_fl_release_private(struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file)); _enter(""); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index e3375b2a0ff3..ad8523d0d038 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/rxrpc.h> #include <linux/key.h> diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index c62939e5ea1f..7817e2b860e5 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -13,6 +13,8 @@ #include "internal.h" #include "afs_cm.h" #include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include <trace/events/rxrpc.h> struct workqueue_struct *afs_async_calls; @@ -397,7 +399,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) error_do_abort: if (ret != -ECONNABORTED) { rxrpc_kernel_abort_call(call->net->socket, rxcall, - RX_USER_ABORT, ret, "KSD"); + RX_USER_ABORT, ret, + afs_abort_send_data_error); } else { len = 0; iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); @@ -527,7 +530,8 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - abort_code, ret, "KIV"); + abort_code, ret, + afs_abort_op_not_supported); goto local_abort; case -EIO: pr_err("kAFS: Call %u in bad state %u\n", @@ -542,12 +546,14 @@ static void afs_deliver_to_call(struct afs_call *call) if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - abort_code, ret, "KUM"); + abort_code, ret, + afs_abort_unmarshal_error); goto local_abort; default: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - abort_code, ret, "KER"); + abort_code, ret, + afs_abort_general_error); goto local_abort; } } @@ -619,7 +625,8 @@ long afs_wait_for_call_to_complete(struct afs_call *call, /* Kill off the call if it's still live. */ _debug("call interrupted"); if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - RX_USER_ABORT, -EINTR, "KWI")) + RX_USER_ABORT, -EINTR, + afs_abort_interrupted)) afs_set_call_complete(call, -EINTR, 0); } } @@ -836,7 +843,8 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, - RXGEN_SS_MARSHAL, -ENOMEM, "KOO"); + RXGEN_SS_MARSHAL, -ENOMEM, + afs_abort_oom); fallthrough; default: _leave(" [error]"); @@ -878,7 +886,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, - RXGEN_SS_MARSHAL, -ENOMEM, "KOO"); + RXGEN_SS_MARSHAL, -ENOMEM, + afs_abort_oom); } _leave(" [error]"); } @@ -900,6 +909,7 @@ int afs_extract_data(struct afs_call *call, bool want_more) ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter, &call->iov_len, want_more, &remote_abort, &call->service_id); + trace_afs_receive_data(call, call->iter, want_more, ret); if (ret == 0 || ret == -EAGAIN) return ret; @@ -361,6 +361,9 @@ static int aio_ring_mremap(struct vm_area_struct *vma) spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); + if (!table) + goto out_unlock; + for (i = 0; i < table->nr; i++) { struct kioctx *ctx; @@ -374,6 +377,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma) } } +out_unlock: rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); return res; diff --git a/fs/attr.c b/fs/attr.c index 1a100d392dd9..aca9ff7aed33 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -14,6 +14,7 @@ #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> +#include <linux/filelock.h> #include <linux/security.h> #include <linux/evm.h> #include <linux/ima.h> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index de63572a9404..9a780fafc539 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2034,7 +2034,7 @@ static int elf_core_dump(struct coredump_params *cprm) * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = cprm->vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(cprm); /* for notes section */ segs++; @@ -2074,7 +2074,7 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); offset += cprm->vma_data_size; - offset += elf_core_extra_data_size(); + offset += elf_core_extra_data_size(cprm); e_shoff = offset; if (e_phnum == PN_XNUM) { diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 096e3520a0b1..a05eafcacfb2 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1509,7 +1509,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = cprm->vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(cprm); /* for notes section */ segs++; @@ -1555,7 +1555,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); offset += cprm->vma_data_size; - offset += elf_core_extra_data_size(); + offset += elf_core_extra_data_size(cprm); e_shoff = offset; if (e_phnum == PN_XNUM) { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 21c92c74bf71..46851511b661 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -484,6 +484,7 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, u64 wanted_disk_byte = ref->wanted_disk_byte; u64 count = 0; u64 data_offset; + u8 type; if (level != 0) { eb = path->nodes[level]; @@ -538,6 +539,9 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, continue; } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(eb, fi); + if (type == BTRFS_FILE_EXTENT_INLINE) + goto next; disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); data_offset = btrfs_file_extent_offset(eb, fi); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index b8fb7ef6b520..8affc88b0e0a 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -329,7 +329,16 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, &map_length, &bioc, mirror_num); if (ret) goto out_counter_dec; - BUG_ON(mirror_num != bioc->mirror_num); + /* + * This happens when dev-replace is also running, and the + * mirror_num indicates the dev-replace target. + * + * In this case, we don't need to do anything, as the read + * error just means the replace progress hasn't reached our + * read range, and later replace routine would handle it well. + */ + if (mirror_num != bioc->mirror_num) + goto out_counter_dec; } sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 0a3c261b69c9..d81b764a7644 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -358,8 +358,10 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } level = btrfs_header_level(root->node); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0888d484df80..3aa04224315e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -367,7 +367,14 @@ error: btrfs_print_tree(eb, 0); btrfs_err(fs_info, "block=%llu write time tree block corruption detected", eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + /* + * Be noisy if this is an extent buffer from a log tree. We don't abort + * a transaction in case there's a bad log tree extent buffer, we just + * fallback to a transaction commit. Still we want to know when there is + * a bad log tree extent buffer, as that may signal a bug somewhere. + */ + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || + btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); return ret; } @@ -530,6 +537,9 @@ static int validate_extent_buffer(struct extent_buffer *eb, } if (found_level != check->level) { + btrfs_err(fs_info, + "level verify failed on logical %llu mirror %u wanted %u found %u", + eb->start, eb->read_mirror, check->level, found_level); ret = -EIO; goto out; } @@ -3381,6 +3391,8 @@ out: /* * Do various sanity and dependency checks of different features. * + * @is_rw_mount: If the mount is read-write. + * * This is the place for less strict checks (like for subpage or artificial * feature dependencies). * @@ -3391,7 +3403,7 @@ out: * (space cache related) can modify on-disk format like free space tree and * screw up certain feature dependencies. */ -int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb) +int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) { struct btrfs_super_block *disk_super = fs_info->super_copy; u64 incompat = btrfs_super_incompat_flags(disk_super); @@ -3430,7 +3442,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb) if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - if (compat_ro_unsupp && !sb_rdonly(sb)) { + if (compat_ro_unsupp && is_rw_mount) { btrfs_err(fs_info, "cannot mount read-write because of unknown compat_ro features (0x%llx)", compat_ro); @@ -3633,7 +3645,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } - ret = btrfs_check_features(fs_info, sb); + ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); if (ret < 0) { err = ret; goto fail_alloc; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 363935cfc084..f2f295eb6103 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -50,7 +50,7 @@ int __cold open_ctree(struct super_block *sb, void __cold close_ctree(struct btrfs_fs_info *fs_info); int btrfs_validate_super(struct btrfs_fs_info *fs_info, struct btrfs_super_block *sb, int mirror_num); -int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb); +int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 9ae9cd1e7035..3c7766dfaa69 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1551,7 +1551,7 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 last = 0; int found = 0; - if (WARN_ON(search_end <= cur_start)) + if (WARN_ON(search_end < cur_start)) return 0; spin_lock(&tree->lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 892d78c1853c..72ba13b027a9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1713,6 +1713,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, BUG(); if (ret && insert_reserved) btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + if (ret < 0) + btrfs_err(trans->fs_info, +"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", + node->bytenr, node->num_bytes, node->type, + node->action, node->ref_mod, ret); return ret; } @@ -1954,8 +1959,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, if (ret) { unselect_delayed_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); - btrfs_debug(fs_info, "run_one_delayed_ref returned %d", - ret); return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 83dd3aa59663..3bbf8703db2a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -104,6 +104,15 @@ struct btrfs_bio_ctrl { btrfs_bio_end_io_t end_io_func; /* + * This is for metadata read, to provide the extra needed verification + * info. This has to be provided for submit_one_bio(), as + * submit_one_bio() can submit a bio if it ends at stripe boundary. If + * no such parent_check is provided, the metadata can hit false alert at + * endio time. + */ + struct btrfs_tree_parent_check *parent_check; + + /* * Tell writepage not to lock the state bits for this range, it still * does the unlocking. */ @@ -133,13 +142,24 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - if (!is_data_inode(&inode->vfs_inode)) + if (!is_data_inode(&inode->vfs_inode)) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { + /* + * For metadata read, we should have the parent_check, + * and copy it to bbio for metadata verification. + */ + ASSERT(bio_ctrl->parent_check); + memcpy(&btrfs_bio(bio)->parent_check, + bio_ctrl->parent_check, + sizeof(struct btrfs_tree_parent_check)); + } btrfs_submit_metadata_bio(inode, bio, mirror_num); - else if (btrfs_op(bio) == BTRFS_MAP_WRITE) + } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) { btrfs_submit_data_write_bio(inode, bio, mirror_num); - else + } else { btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); + } /* The bio is owned by the end_io handler now */ bio_ctrl->bio = NULL; @@ -3806,6 +3826,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, lockend = round_up(start + len, inode->root->fs_info->sectorsize); prev_extent_end = lockstart; + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); @@ -3999,6 +4020,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); @@ -4829,6 +4851,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .mirror_num = mirror_num, + .parent_check = check, }; int ret = 0; @@ -4878,7 +4901,6 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) { free_extent_state(cached_state); @@ -4905,6 +4927,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, unsigned long num_reads = 0; struct btrfs_bio_ctrl bio_ctrl = { .mirror_num = mirror_num, + .parent_check = check, }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) @@ -4996,7 +5019,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, } } - memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 91b00eb2440e..af046d22300e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3354,7 +3354,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, bool search_io_tree = true; bool ret = false; - while (cur_offset < end) { + while (cur_offset <= end) { u64 delalloc_start; u64 delalloc_end; bool delalloc; @@ -3541,6 +3541,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *extent; u64 extent_end; + u8 type; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -3596,10 +3597,16 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, extent); - if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 || - btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_PREALLOC) { + /* + * Can't access the extent's disk_bytenr field if this is an + * inline extent, since at that offset, it's where the extent + * data starts. + */ + if (type == BTRFS_FILE_EXTENT_PREALLOC || + (type == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) { /* * Explicit hole or prealloc extent, search for delalloc. * A prealloc extent is treated like a hole. diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a749367e5ae2..37b86acfcbcf 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -119,6 +119,12 @@ enum { /* Indicate that we want to commit the transaction. */ BTRFS_FS_NEED_TRANS_COMMIT, + /* + * Indicate metadata over-commit is disabled. This is set when active + * zone tracking is needed. + */ + BTRFS_FS_NO_OVERCOMMIT, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8d74d042c626..efee6d35af52 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7092,7 +7092,7 @@ next: * Other members are not utilized for inline extents. */ ASSERT(em->block_start == EXTENT_MAP_INLINE); - ASSERT(em->len = fs_info->sectorsize); + ASSERT(em->len == fs_info->sectorsize); ret = read_inline_extent(inode, path, page); if (ret < 0) @@ -9377,8 +9377,10 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (flags & RENAME_WHITEOUT) { whiteout_args.inode = new_whiteout_inode(idmap, old_dir); - if (!whiteout_args.inode) - return -ENOMEM; + if (!whiteout_args.inode) { + ret = -ENOMEM; + goto out_fscrypt_names; + } ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); if (ret) goto out_whiteout_inode; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5c636e00d77d..af97413abcf4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2765,9 +2765,19 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) /* * Old roots should be searched when inserting qgroup - * extent record + * extent record. + * + * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, + * we may have some record inserted during + * NO_ACCOUNTING (thus no old_roots populated), but + * later we start rescan, which clears NO_ACCOUNTING, + * leaving some inserted records without old_roots + * populated. + * + * Those cases are rare and should not cause too much + * time spent during commit_transaction(). */ - if (WARN_ON(!record->old_roots)) { + if (!record->old_roots) { /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) @@ -2787,6 +2797,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) * current root. It's safe inside commit_transaction(). */ ctx.trans = trans; + ctx.time_seq = BTRFS_SEQ_LAST; ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; @@ -3356,6 +3367,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) int err = -ENOMEM; int ret = 0; bool stopped = false; + bool did_leaf_rescans = false; path = btrfs_alloc_path(); if (!path) @@ -3376,6 +3388,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) } err = qgroup_rescan_leaf(trans, path); + did_leaf_rescans = true; if (err > 0) btrfs_commit_transaction(trans); @@ -3396,16 +3409,23 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); /* - * only update status, since the previous part has already updated the - * qgroup info. + * Only update status, since the previous part has already updated the + * qgroup info, and only if we did any actual work. This also prevents + * race with a concurrent quota disable, which has already set + * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at + * btrfs_quota_disable(). */ - trans = btrfs_start_transaction(fs_info->quota_root, 1); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); + if (did_leaf_rescans) { + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + btrfs_err(fs_info, + "fail to start transaction for status update: %d", + err); + } + } else { trans = NULL; - btrfs_err(fs_info, - "fail to start transaction for status update: %d", - err); } mutex_lock(&fs_info->qgroup_rescan_lock); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2d90a6b5eb00..ff4b1d583788 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1426,12 +1426,20 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi u32 bio_size = 0; struct bio_vec *bvec; struct bvec_iter_all iter_all; + int i; bio_for_each_segment_all(bvec, bio, iter_all) bio_size += bvec->bv_len; - bitmap_set(rbio->error_bitmap, total_sector_nr, - bio_size >> rbio->bioc->fs_info->sectorsize_bits); + /* + * Since we can have multiple bios touching the error_bitmap, we cannot + * call bitmap_set() without protection. + * + * Instead use set_bit() for each bit, as set_bit() itself is atomic. + */ + for (i = total_sector_nr; i < total_sector_nr + + (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) + set_bit(i, rbio->error_bitmap); } /* Verify the data sectors at read time. */ @@ -1886,7 +1894,7 @@ pstripe: sector->uptodate = 1; } if (failb >= 0) { - ret = verify_one_sector(rbio, faila, sector_nr); + ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) goto cleanup; @@ -2646,7 +2654,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) void **pointers = NULL; void **unmap_array = NULL; int sector_nr; - int ret; + int ret = 0; /* * @pointers array stores the pointer for each sector. diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e65e6b6600a7..d50182b6deec 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -8073,10 +8073,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside - * access_ok. + * access_ok. Also set an upper limit for allocation size so this can't + * easily exhaust memory. Max number of clone sources is about 200K. */ - if (arg->clone_sources_count > - ULONG_MAX / sizeof(struct clone_root) - 1) { + if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) { ret = -EINVAL; goto out; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d28ee4e36f3d..69c09508afb5 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -407,7 +407,8 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) && + (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) avail = 0; else avail = calc_available_free_space(fs_info, space_info, flush); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 93f52ee85f6f..433ce221dc5c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1705,7 +1705,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; - ret = btrfs_check_features(fs_info, sb); + ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY)); if (ret < 0) goto restore; @@ -2514,6 +2514,7 @@ static __always_inline void btrfs_exit_btrfs_fs(void) static void __exit exit_btrfs_fs(void) { btrfs_exit_btrfs_fs(); + btrfs_cleanup_fs_uuids(); } static int __init init_btrfs_fs(void) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a3c43f0b1c95..58599189bd18 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2980,7 +2980,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = 0; if (ret) { blk_finish_plug(&plug); - btrfs_abort_transaction(trans, ret); btrfs_set_log_full_commit(trans); mutex_unlock(&root->log_mutex); goto out; @@ -3045,15 +3044,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_set_log_full_commit(trans); - - if (ret != -ENOSPC) { - btrfs_abort_transaction(trans, ret); - mutex_unlock(&log_root_tree->log_mutex); - goto out; - } + if (ret != -ENOSPC) + btrfs_err(fs_info, + "failed to update log for root %llu ret %d", + root->root_key.objectid, ret); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -3112,7 +3108,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } else if (ret) { btrfs_set_log_full_commit(trans); - btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } @@ -3581,17 +3576,19 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, } static int flush_dir_items_batch(struct btrfs_trans_handle *trans, - struct btrfs_root *log, + struct btrfs_inode *inode, struct extent_buffer *src, struct btrfs_path *dst_path, int start_slot, int count) { + struct btrfs_root *log = inode->root->log_root; char *ins_data = NULL; struct btrfs_item_batch batch; struct extent_buffer *dst; unsigned long src_offset; unsigned long dst_offset; + u64 last_index; struct btrfs_key key; u32 item_size; int ret; @@ -3649,6 +3646,19 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); btrfs_release_path(dst_path); + + last_index = batch.keys[count - 1].offset; + ASSERT(last_index > inode->last_dir_index_offset); + + /* + * If for some unexpected reason the last item's index is not greater + * than the last index we logged, warn and return an error to fallback + * to a transaction commit. + */ + if (WARN_ON(last_index <= inode->last_dir_index_offset)) + ret = -EUCLEAN; + else + inode->last_dir_index_offset = last_index; out: kfree(ins_data); @@ -3698,7 +3708,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, } di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - ctx->last_dir_item_offset = key.offset; /* * Skip ranges of items that consist only of dir item keys created @@ -3761,7 +3770,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, if (batch_size > 0) { int ret; - ret = flush_dir_items_batch(trans, log, src, dst_path, + ret = flush_dir_items_batch(trans, inode, src, dst_path, batch_start, batch_size); if (ret < 0) return ret; @@ -3826,7 +3835,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->slots[0]); if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; + } else if (ret < 0) { + err = ret; } + goto done; } @@ -3846,19 +3858,34 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, */ if (tmp.type == BTRFS_DIR_INDEX_KEY) last_old_dentry_offset = tmp.offset; + } else if (ret < 0) { + err = ret; + goto done; } + btrfs_release_path(path); /* - * Find the first key from this transaction again. See the note for - * log_new_dir_dentries, if we're logging a directory recursively we - * won't be holding its i_mutex, which means we can modify the directory - * while we're logging it. If we remove an entry between our first - * search and this search we'll not find the key again and can just - * bail. + * Find the first key from this transaction again or the one we were at + * in the loop below in case we had to reschedule. We may be logging the + * directory without holding its VFS lock, which happen when logging new + * dentries (through log_new_dir_dentries()) or in some cases when we + * need to log the parent directory of an inode. This means a dir index + * key might be deleted from the inode's root, and therefore we may not + * find it anymore. If we can't find it, just move to the next key. We + * can not bail out and ignore, because if we do that we will simply + * not log dir index keys that come after the one that was just deleted + * and we can end up logging a dir index range that ends at (u64)-1 + * (@last_offset is initialized to that), resulting in removing dir + * entries we should not remove at log replay time. */ search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret > 0) + ret = btrfs_next_item(root, path); + if (ret < 0) + err = ret; + /* If ret is 1, there are no more keys in the inode's root. */ if (ret != 0) goto done; @@ -4031,7 +4058,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = BTRFS_DIR_START_INDEX; max_key = 0; - ctx->last_dir_item_offset = inode->last_dir_index_offset; while (1) { ret = log_dir_items(trans, inode, path, dst_path, @@ -4043,8 +4069,6 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, min_key = max_key + 1; } - inode->last_dir_index_offset = ctx->last_dir_item_offset; - return 0; } @@ -5580,8 +5604,10 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction * commits. */ - if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { + btrfs_set_log_full_commit(trans); return BTRFS_LOG_FORCE_COMMIT; + } inode = btrfs_iget(root->fs_info->sb, ino, root); /* @@ -7459,8 +7485,11 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * not fail, but if it does, it's not serious, just bail out and * mark the log for a full commit. */ - if (WARN_ON_ONCE(ret < 0)) + if (WARN_ON_ONCE(ret < 0)) { + fscrypt_free_filename(&fname); goto out; + } + log_pinned = true; path = btrfs_alloc_path(); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 85b43075ac58..85cd24cb0540 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -24,8 +24,6 @@ struct btrfs_log_ctx { bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; - /* Tracks the last logged dir item/index key offset. */ - u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index aa25fa335d3e..df43093b7a46 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -403,6 +403,7 @@ void btrfs_free_device(struct btrfs_device *device) static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; + WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { device = list_entry(fs_devices->devices.next, @@ -768,8 +769,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, BTRFS_SUPER_FLAG_CHANGING_FSID_V2); error = lookup_bdev(path, &path_devt); - if (error) + if (error) { + btrfs_err(NULL, "failed to lookup block device for path %s: %d", + path, error); return ERR_PTR(error); + } if (fsid_change_in_progress) { if (!has_metadata_uuid) @@ -836,6 +840,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, unsigned int nofs_flag; if (fs_devices->opened) { + btrfs_err(NULL, + "device %s belongs to fsid %pU, and the fs is already mounted", + path, fs_devices->fsid); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } @@ -905,6 +912,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, * generation are equal. */ mutex_unlock(&fs_devices->device_list_mutex); + btrfs_err(NULL, +"device %s already registered with a higher generation, found %llu expect %llu", + path, found_transid, device->generation); return ERR_PTR(-EEXIST); } @@ -1172,9 +1182,22 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); - if (!fs_devices->opened) + if (!fs_devices->opened) { list_splice_init(&fs_devices->seed_list, &list); + /* + * If the struct btrfs_fs_devices is not assembled with any + * other device, it can be re-initialized during the next mount + * without the needing device-scan step. Therefore, it can be + * fully freed. + */ + if (fs_devices->num_devices == 1) { + list_del(&fs_devices->fs_list); + free_fs_devices(fs_devices); + } + } + + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); list_del(&fs_devices->seed_list); @@ -1591,7 +1614,7 @@ again: if (ret < 0) goto out; - while (1) { + while (search_start < search_end) { l = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(l)) { @@ -1614,6 +1637,9 @@ again: if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; + if (key.offset > search_end) + break; + if (key.offset > search_start) { hole_size = key.offset - search_start; dev_extent_hole_check(device, &search_start, &hole_size, @@ -1674,6 +1700,7 @@ next: else ret = 0; + ASSERT(max_hole_start + max_hole_size <= search_end); out: btrfs_free_path(path); *start = max_hole_start; @@ -2005,42 +2032,42 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) return num_devices; } +static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, + struct block_device *bdev, int copy_num) +{ + struct btrfs_super_block *disk_super; + const size_t len = sizeof(disk_super->magic); + const u64 bytenr = btrfs_sb_offset(copy_num); + int ret; + + disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); + if (IS_ERR(disk_super)) + return; + + memset(&disk_super->magic, 0, len); + folio_mark_dirty(virt_to_folio(disk_super)); + btrfs_release_disk_super(disk_super); + + ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1); + if (ret) + btrfs_warn(fs_info, "error clearing superblock number %d (%d)", + copy_num, ret); +} + void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct block_device *bdev, const char *device_path) { - struct btrfs_super_block *disk_super; int copy_num; if (!bdev) return; for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { - struct page *page; - int ret; - - disk_super = btrfs_read_dev_one_super(bdev, copy_num, false); - if (IS_ERR(disk_super)) - continue; - - if (bdev_is_zoned(bdev)) { + if (bdev_is_zoned(bdev)) btrfs_reset_sb_log_zones(bdev, copy_num); - continue; - } - - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - - page = virt_to_page(disk_super); - set_page_dirty(page); - lock_page(page); - /* write_on_page() unlocks the page */ - ret = write_one_page(page); - if (ret) - btrfs_warn(fs_info, - "error clearing superblock number %d (%d)", - copy_num, ret); - btrfs_release_disk_super(disk_super); - + else + btrfs_scratch_superblock(fs_info, bdev, copy_num); } /* Notify udev that device has changed */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 01a13de11832..da7bb9187b68 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -63,7 +63,7 @@ struct list_head *zlib_alloc_workspace(unsigned int level) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); + workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL); workspace->level = level; workspace->buf = NULL; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index a759668477bb..1f503e8e42d4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -539,6 +539,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } atomic_set(&zone_info->active_zones_left, max_active_zones - nactive); + /* Overcommit does not work well with active zone tacking. */ + set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags); } /* Validate superblock log */ diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c74871e37c9..cac4083e387a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -305,7 +305,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_request *req; + struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); struct iov_iter iter; struct page **pages; @@ -313,6 +313,11 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) int err = 0; u64 len = subreq->len; + if (ceph_inode_is_shutdown(inode)) { + err = -EIO; + goto out; + } + if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; @@ -563,6 +568,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p idx %lu\n", page, page->index); + if (ceph_inode_is_shutdown(inode)) + return -EIO; + /* verify this is a writeable snap context */ snapc = page_snap_context(page); if (!snapc) { @@ -1643,7 +1651,7 @@ int ceph_uninline_data(struct file *file) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req = NULL; - struct ceph_cap_flush *prealloc_cf; + struct ceph_cap_flush *prealloc_cf = NULL; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; struct page *pages[1]; @@ -1657,6 +1665,11 @@ int ceph_uninline_data(struct file *file) dout("uninline_data %p %llx.%llx inline_version %llu\n", inode, ceph_vinop(inode), inline_version); + if (ceph_inode_is_shutdown(inode)) { + err = -EIO; + goto out; + } + if (inline_version == CEPH_INLINE_NONE) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 4b159f97fe7b..7cc20772eac9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -9,6 +9,7 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/iversion.h> +#include <linux/filelock.h> #include "super.h" #include "mds_client.h" @@ -2913,7 +2914,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got while (true) { flags &= CEPH_FILE_MODE_MASK; - if (atomic_read(&fi->num_locks)) + if (vfs_inode_has_locks(inode)) flags |= CHECK_FILELOCK; _got = 0; ret = try_get_cap_refs(inode, need, want, endoff, @@ -4078,6 +4079,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *p, *end; struct cap_extra_info extra_info = {}; bool queue_trunc; + bool close_sessions = false; dout("handle_caps from mds%d\n", session->s_mds); @@ -4215,9 +4217,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, realm = NULL; if (snaptrace_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, - snaptrace + snaptrace_len, - false, &realm); + if (ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm)) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + goto done; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -4277,6 +4283,11 @@ done_unlocked: iput(inode); out: ceph_put_string(extra_info.pool_ns); + + /* Defer closing the sessions after s_mutex lock being released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); + return; flush_cap_releases: diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 764598e1efd9..b5cff85925a1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2011,6 +2011,9 @@ static int ceph_zero_partial_object(struct inode *inode, loff_t zero = 0; int op; + if (ceph_inode_is_shutdown(inode)) + return -EIO; + if (!length) { op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; length = &zero; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a93e6f65a756..8e5f41d45283 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2417,10 +2417,10 @@ static int statx_to_caps(u32 want, umode_t mode) { int mask = 0; - if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_AUTH_SHARED; - if (want & (STATX_NLINK|STATX_CTIME)) { + if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) { /* * The link count for directories depends on inode->i_subdirs, * and that is only updated when Fs caps are held. @@ -2431,11 +2431,10 @@ static int statx_to_caps(u32 want, umode_t mode) mask |= CEPH_CAP_LINK_SHARED; } - if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| - STATX_BLOCKS)) + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_FILE_SHARED; - if (want & (STATX_CTIME)) + if (want & (STATX_CTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_XATTR_SHARED; return mask; @@ -2478,6 +2477,11 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, valid_mask |= STATX_BTIME; } + if (request_mask & STATX_CHANGE_COOKIE) { + stat->change_cookie = inode_peek_iversion_raw(inode); + valid_mask |= STATX_CHANGE_COOKIE; + } + if (ceph_snap(inode) == CEPH_NOSNAP) stat->dev = sb->s_dev; else @@ -2519,6 +2523,8 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, stat->nlink = 1 + 1 + ci->i_subdirs; } + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; stat->result_mask = request_mask & valid_mask; return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index f3b461c708a8..cb51c7e9c8e2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -7,6 +7,7 @@ #include "super.h" #include "mds_client.h" +#include <linux/filelock.h> #include <linux/ceph/pagelist.h> static u64 lock_secret; @@ -32,24 +33,36 @@ void __init ceph_flock_init(void) static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { - struct ceph_file_info *fi = dst->fl_file->private_data; struct inode *inode = file_inode(dst->fl_file); atomic_inc(&ceph_inode(inode)->i_filelock_ref); - atomic_inc(&fi->num_locks); + dst->fl_u.ceph.inode = igrab(inode); } +/* + * Do not use the 'fl->fl_file' in release function, which + * is possibly already released by another thread. + */ static void ceph_fl_release_lock(struct file_lock *fl) { - struct ceph_file_info *fi = fl->fl_file->private_data; - struct inode *inode = file_inode(fl->fl_file); - struct ceph_inode_info *ci = ceph_inode(inode); - atomic_dec(&fi->num_locks); + struct inode *inode = fl->fl_u.ceph.inode; + struct ceph_inode_info *ci; + + /* + * If inode is NULL it should be a request file_lock, + * nothing we can do. + */ + if (!inode) + return; + + ci = ceph_inode(inode); if (atomic_dec_and_test(&ci->i_filelock_ref)) { /* clear error when all locks are released */ spin_lock(&ci->i_ceph_lock); ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK; spin_unlock(&ci->i_ceph_lock); } + fl->fl_u.ceph.inode = NULL; + iput(inode); } static const struct file_lock_operations ceph_fl_lock_ops = { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 26a0a8b9975e..27a245d959c0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -806,6 +806,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return ERR_PTR(-EIO); + if (mds >= mdsc->mdsmap->possible_max_rank) return ERR_PTR(-EINVAL); @@ -1478,6 +1481,9 @@ static int __open_session(struct ceph_mds_client *mdsc, int mstate; int mds = session->s_mds; + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return -EIO; + /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); dout("open_session to mds%d (%s)\n", mds, @@ -2860,6 +2866,11 @@ static void __do_request(struct ceph_mds_client *mdsc, return; } + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) { + dout("do_request metadata corrupted\n"); + err = -EIO; + goto finish; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { dout("do_request timed out\n"); @@ -3245,6 +3256,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) u64 tid; int err, result; int mds = session->s_mds; + bool close_sessions = false; if (msg->front.iov_len < sizeof(*head)) { pr_err("mdsc_handle_reply got corrupt (short) reply\n"); @@ -3351,10 +3363,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, rinfo->snapblob, + err = ceph_update_snap_trace(mdsc, rinfo->snapblob, rinfo->snapblob + rinfo->snapblob_len, le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, &realm); + if (err) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + if (err == -EIO) + ceph_msg_dump(msg); + goto out_err; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -3412,6 +3431,10 @@ out_err: req->r_end_latency, err); out: ceph_mdsc_put_request(req); + + /* Defer closing the sessions after s_mutex lock being released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; } @@ -3662,6 +3685,12 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_FLUSHMSG: + /* flush cap releases */ + spin_lock(&session->s_cap_lock); + if (session->s_num_cap_releases) + ceph_flush_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); + send_flushmsg_ack(mdsc, session, seq); break; @@ -5011,7 +5040,7 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) } /* - * called after sb is ro. + * called after sb is ro or when metadata corrupted. */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { @@ -5301,7 +5330,8 @@ static void mds_peer_reset(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc; pr_warn("mds%d closed our session\n", s->s_mds); - send_mds_reconnect(mdsc, s); + if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO) + send_mds_reconnect(mdsc, s); } static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e4151852184e..87007203f130 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/iversion.h> @@ -766,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; + struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; + int ret; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); @@ -884,6 +887,27 @@ fail: if (first_realm) ceph_put_snap_realm(mdsc, first_realm); pr_err("%s error %d\n", __func__, err); + + /* + * When receiving a corrupted snap trace we don't know what + * exactly has happened in MDS side. And we shouldn't continue + * writing to OSD, which may corrupt the snapshot contents. + * + * Just try to blocklist this kclient and then this kclient + * must be remounted to continue after the corrupted metadata + * fixed in the MDS side. + */ + WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO); + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); + if (ret) + pr_err("%s failed to blocklist %s: %d\n", __func__, + ceph_pr_addr(&client->msgr.inst.addr), ret); + + WARN(1, "%s: %s%sdo remount to continue%s", + __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr), + ret ? "" : " was blocklisted, ", + err == -EIO ? " after corrupted snaptrace is fixed" : ""); + return err; } @@ -984,6 +1008,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, __le64 *split_inos = NULL, *split_realms = NULL; int i; int locked_rwsem = 0; + bool close_sessions = false; /* decode */ if (msg->front.iov_len < sizeof(*h)) @@ -1092,8 +1117,12 @@ skip_inode: * update using the provided snap trace. if we are deleting a * snap, we can avoid queueing cap_snaps. */ - ceph_update_snap_trace(mdsc, p, e, - op == CEPH_SNAP_OP_DESTROY, NULL); + if (ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY, + NULL)) { + close_sessions = true; + goto bad; + } if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ @@ -1112,6 +1141,9 @@ bad: out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); + + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f5a936ccb3fc..6ecca2c6d137 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -100,6 +100,17 @@ struct ceph_mount_options { char *mon_addr; }; +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, + CEPH_MOUNT_RECOVER, + CEPH_MOUNT_FENCE_IO, +}; + #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 struct ceph_fs_client { @@ -790,7 +801,6 @@ struct ceph_file_info { struct list_head rw_contexts; u32 filp_gen; - atomic_t num_locks; }; struct ceph_dir_file_info { diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 5db73c0f792a..cbc18b4a9cb2 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -278,6 +278,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) * ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) + * unicode length of a netbios domain name */ + kfree_sensitive(ses->auth_key.response); ses->auth_key.len = size + 2 * dlen; ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); if (!ses->auth_key.response) { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2554c49a3d74..cb7c5460a80b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/mount.h> #include <linux/slab.h> #include <linux/init.h> diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cfdd5bf701a1..cd8171a1c9a0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -26,6 +26,7 @@ #include <uapi/linux/cifs/cifs_mount.h> #include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" +#include <linux/filelock.h> #define SMB_PATH_MAX 260 #define CIFS_PORT 445 diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 23f10e0d6e7e..60dd4e37030a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -15,6 +15,7 @@ /* want to reuse a stale file handle and only the caller knows the file info */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/kernel.h> #include <linux/vfs.h> #include <linux/slab.h> diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d371259d6808..b2a04b4e89a5 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2606,11 +2606,14 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) INIT_LIST_HEAD(&tcon->pending_opens); tcon->status = TID_GOOD; - /* schedule query interfaces poll */ INIT_DELAYED_WORK(&tcon->query_interfaces, smb2_query_server_interfaces); - queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, - (SMB_INTERFACE_POLL_INTERVAL * HZ)); + if (ses->server->dialect >= SMB30_PROT_ID && + (ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + /* schedule query interfaces poll */ + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + } spin_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &ses->tcon_list); diff --git a/fs/cifs/dfs.c b/fs/cifs/dfs.c index b541e68378f6..b64d20374b9c 100644 --- a/fs/cifs/dfs.c +++ b/fs/cifs/dfs.c @@ -327,8 +327,8 @@ static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb return rc; } -static int target_share_matches_server(struct TCP_Server_Info *server, const char *tcp_host, - size_t tcp_host_len, char *share, bool *target_match) +static int target_share_matches_server(struct TCP_Server_Info *server, char *share, + bool *target_match) { int rc = 0; const char *dfs_host; @@ -338,13 +338,16 @@ static int target_share_matches_server(struct TCP_Server_Info *server, const cha extract_unc_hostname(share, &dfs_host, &dfs_host_len); /* Check if hostnames or addresses match */ - if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) { - cifs_dbg(FYI, "%s: %.*s doesn't match %.*s\n", __func__, (int)dfs_host_len, - dfs_host, (int)tcp_host_len, tcp_host); + cifs_server_lock(server); + if (dfs_host_len != strlen(server->hostname) || + strncasecmp(dfs_host, server->hostname, dfs_host_len)) { + cifs_dbg(FYI, "%s: %.*s doesn't match %s\n", __func__, + (int)dfs_host_len, dfs_host, server->hostname); rc = match_target_ip(server, dfs_host, dfs_host_len, target_match); if (rc) cifs_dbg(VFS, "%s: failed to match target ip: %d\n", __func__, rc); } + cifs_server_unlock(server); return rc; } @@ -358,13 +361,9 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t struct cifs_ses *root_ses = CIFS_DFS_ROOT_SES(tcon->ses); struct cifs_tcon *ipc = root_ses->tcon_ipc; char *share = NULL, *prefix = NULL; - const char *tcp_host; - size_t tcp_host_len; struct dfs_cache_tgt_iterator *tit; bool target_match; - extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len); - tit = dfs_cache_get_tgt_iterator(tl); if (!tit) { rc = -ENOENT; @@ -387,8 +386,7 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t break; } - rc = target_share_matches_server(server, tcp_host, tcp_host_len, share, - &target_match); + rc = target_share_matches_server(server, share, &target_match); if (rc) break; if (!target_match) { @@ -401,8 +399,7 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t if (ipc->need_reconnect) { scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); rc = ops->tree_connect(xid, ipc->ses, tree, ipc, cifs_sb->local_nls); - if (rc) - break; + cifs_dbg(FYI, "%s: reconnect ipc: %d\n", __func__, rc); } scnprintf(tree, MAX_TREE_SIZE, "\\%s", share); @@ -498,7 +495,9 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru } if (tcon->ipc) { + cifs_server_lock(server); scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); + cifs_server_unlock(server); rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc); goto out; } diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 43ad1176dcb9..ac86bd0ebd63 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -269,7 +269,7 @@ static int dfscache_proc_show(struct seq_file *m, void *v) list_for_each_entry(t, &ce->tlist, list) { seq_printf(m, " %s%s\n", t->name, - ce->tgthint == t ? " (target hint)" : ""); + READ_ONCE(ce->tgthint) == t ? " (target hint)" : ""); } } } @@ -321,7 +321,7 @@ static inline void dump_tgts(const struct cache_entry *ce) cifs_dbg(FYI, "target list:\n"); list_for_each_entry(t, &ce->tlist, list) { cifs_dbg(FYI, " %s%s\n", t->name, - ce->tgthint == t ? " (target hint)" : ""); + READ_ONCE(ce->tgthint) == t ? " (target hint)" : ""); } } @@ -427,7 +427,7 @@ static int cache_entry_hash(const void *data, int size, unsigned int *hash) /* Return target hint of a DFS cache entry */ static inline char *get_tgt_name(const struct cache_entry *ce) { - struct cache_dfs_tgt *t = ce->tgthint; + struct cache_dfs_tgt *t = READ_ONCE(ce->tgthint); return t ? t->name : ERR_PTR(-ENOENT); } @@ -470,6 +470,7 @@ static struct cache_dfs_tgt *alloc_target(const char *name, int path_consumed) static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, struct cache_entry *ce, const char *tgthint) { + struct cache_dfs_tgt *target; int i; ce->ttl = max_t(int, refs[0].ttl, CACHE_MIN_TTL); @@ -496,8 +497,9 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, ce->numtgts++; } - ce->tgthint = list_first_entry_or_null(&ce->tlist, - struct cache_dfs_tgt, list); + target = list_first_entry_or_null(&ce->tlist, struct cache_dfs_tgt, + list); + WRITE_ONCE(ce->tgthint, target); return 0; } @@ -558,7 +560,8 @@ static void remove_oldest_entry_locked(void) } /* Add a new DFS cache entry */ -static int add_cache_entry_locked(struct dfs_info3_param *refs, int numrefs) +static struct cache_entry *add_cache_entry_locked(struct dfs_info3_param *refs, + int numrefs) { int rc; struct cache_entry *ce; @@ -573,11 +576,11 @@ static int add_cache_entry_locked(struct dfs_info3_param *refs, int numrefs) rc = cache_entry_hash(refs[0].path_name, strlen(refs[0].path_name), &hash); if (rc) - return rc; + return ERR_PTR(rc); ce = alloc_cache_entry(refs, numrefs); if (IS_ERR(ce)) - return PTR_ERR(ce); + return ce; spin_lock(&cache_ttl_lock); if (!cache_ttl) { @@ -594,7 +597,7 @@ static int add_cache_entry_locked(struct dfs_info3_param *refs, int numrefs) atomic_inc(&cache_count); - return 0; + return ce; } /* Check if two DFS paths are equal. @s1 and @s2 are expected to be in @cache_cp's charset */ @@ -641,7 +644,9 @@ static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int h * * Use whole path components in the match. Must be called with htable_rw_lock held. * + * Return cached entry if successful. * Return ERR_PTR(-ENOENT) if the entry is not found. + * Return error ptr otherwise. */ static struct cache_entry *lookup_cache_entry(const char *path) { @@ -711,14 +716,15 @@ void dfs_cache_destroy(void) static int update_cache_entry_locked(struct cache_entry *ce, const struct dfs_info3_param *refs, int numrefs) { + struct cache_dfs_tgt *target; + char *th = NULL; int rc; - char *s, *th = NULL; WARN_ON(!rwsem_is_locked(&htable_rw_lock)); - if (ce->tgthint) { - s = ce->tgthint->name; - th = kstrdup(s, GFP_ATOMIC); + target = READ_ONCE(ce->tgthint); + if (target) { + th = kstrdup(target->name, GFP_ATOMIC); if (!th) return -ENOMEM; } @@ -767,51 +773,75 @@ static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, const * * For interlinks, cifs_mount() and expand_dfs_referral() are supposed to * handle them properly. + * + * On success, return entry with acquired lock for reading, otherwise error ptr. */ -static int cache_refresh_path(const unsigned int xid, struct cifs_ses *ses, const char *path) +static struct cache_entry *cache_refresh_path(const unsigned int xid, + struct cifs_ses *ses, + const char *path, + bool force_refresh) { - int rc; - struct cache_entry *ce; struct dfs_info3_param *refs = NULL; + struct cache_entry *ce; int numrefs = 0; - bool newent = false; + int rc; cifs_dbg(FYI, "%s: search path: %s\n", __func__, path); - down_write(&htable_rw_lock); + down_read(&htable_rw_lock); ce = lookup_cache_entry(path); if (!IS_ERR(ce)) { - if (!cache_entry_expired(ce)) { - dump_ce(ce); - up_write(&htable_rw_lock); - return 0; - } - } else { - newent = true; + if (!force_refresh && !cache_entry_expired(ce)) + return ce; + } else if (PTR_ERR(ce) != -ENOENT) { + up_read(&htable_rw_lock); + return ce; } /* - * Either the entry was not found, or it is expired. + * Unlock shared access as we don't want to hold any locks while getting + * a new referral. The @ses used for performing the I/O could be + * reconnecting and it acquires @htable_rw_lock to look up the dfs cache + * in order to failover -- if necessary. + */ + up_read(&htable_rw_lock); + + /* + * Either the entry was not found, or it is expired, or it is a forced + * refresh. * Request a new DFS referral in order to create or update a cache entry. */ rc = get_dfs_referral(xid, ses, path, &refs, &numrefs); - if (rc) - goto out_unlock; + if (rc) { + ce = ERR_PTR(rc); + goto out; + } dump_refs(refs, numrefs); - if (!newent) { - rc = update_cache_entry_locked(ce, refs, numrefs); - goto out_unlock; + down_write(&htable_rw_lock); + /* Re-check as another task might have it added or refreshed already */ + ce = lookup_cache_entry(path); + if (!IS_ERR(ce)) { + if (force_refresh || cache_entry_expired(ce)) { + rc = update_cache_entry_locked(ce, refs, numrefs); + if (rc) + ce = ERR_PTR(rc); + } + } else if (PTR_ERR(ce) == -ENOENT) { + ce = add_cache_entry_locked(refs, numrefs); } - rc = add_cache_entry_locked(refs, numrefs); + if (IS_ERR(ce)) { + up_write(&htable_rw_lock); + goto out; + } -out_unlock: - up_write(&htable_rw_lock); + downgrade_write(&htable_rw_lock); +out: free_dfs_info_array(refs, numrefs); - return rc; + return ce; } /* @@ -878,7 +908,7 @@ static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl) } it->it_path_consumed = t->path_consumed; - if (ce->tgthint == t) + if (READ_ONCE(ce->tgthint) == t) list_add(&it->it_list, head); else list_add_tail(&it->it_list, head); @@ -931,15 +961,8 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nl if (IS_ERR(npath)) return PTR_ERR(npath); - rc = cache_refresh_path(xid, ses, npath); - if (rc) - goto out_free_path; - - down_read(&htable_rw_lock); - - ce = lookup_cache_entry(npath); + ce = cache_refresh_path(xid, ses, npath, false); if (IS_ERR(ce)) { - up_read(&htable_rw_lock); rc = PTR_ERR(ce); goto out_free_path; } @@ -1003,72 +1026,6 @@ out_unlock: } /** - * dfs_cache_update_tgthint - update target hint of a DFS cache entry - * - * If it doesn't find the cache entry, then it will get a DFS referral for @path - * and create a new entry. - * - * In case the cache entry exists but expired, it will get a DFS referral - * for @path and then update the respective cache entry. - * - * @xid: syscall id - * @ses: smb session - * @cp: codepage - * @remap: type of character remapping for paths - * @path: path to lookup in DFS referral cache - * @it: DFS target iterator - * - * Return zero if the target hint was updated successfully, otherwise non-zero. - */ -int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *cp, int remap, const char *path, - const struct dfs_cache_tgt_iterator *it) -{ - int rc; - const char *npath; - struct cache_entry *ce; - struct cache_dfs_tgt *t; - - npath = dfs_cache_canonical_path(path, cp, remap); - if (IS_ERR(npath)) - return PTR_ERR(npath); - - cifs_dbg(FYI, "%s: update target hint - path: %s\n", __func__, npath); - - rc = cache_refresh_path(xid, ses, npath); - if (rc) - goto out_free_path; - - down_write(&htable_rw_lock); - - ce = lookup_cache_entry(npath); - if (IS_ERR(ce)) { - rc = PTR_ERR(ce); - goto out_unlock; - } - - t = ce->tgthint; - - if (likely(!strcasecmp(it->it_name, t->name))) - goto out_unlock; - - list_for_each_entry(t, &ce->tlist, list) { - if (!strcasecmp(t->name, it->it_name)) { - ce->tgthint = t; - cifs_dbg(FYI, "%s: new target hint: %s\n", __func__, - it->it_name); - break; - } - } - -out_unlock: - up_write(&htable_rw_lock); -out_free_path: - kfree(npath); - return rc; -} - -/** * dfs_cache_noreq_update_tgthint - update target hint of a DFS cache entry * without sending any requests to the currently connected server. * @@ -1092,21 +1049,20 @@ void dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt cifs_dbg(FYI, "%s: path: %s\n", __func__, path); - if (!down_write_trylock(&htable_rw_lock)) - return; + down_read(&htable_rw_lock); ce = lookup_cache_entry(path); if (IS_ERR(ce)) goto out_unlock; - t = ce->tgthint; + t = READ_ONCE(ce->tgthint); if (unlikely(!strcasecmp(it->it_name, t->name))) goto out_unlock; list_for_each_entry(t, &ce->tlist, list) { if (!strcasecmp(t->name, it->it_name)) { - ce->tgthint = t; + WRITE_ONCE(ce->tgthint, t); cifs_dbg(FYI, "%s: new target hint: %s\n", __func__, it->it_name); break; @@ -1114,7 +1070,7 @@ void dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt } out_unlock: - up_write(&htable_rw_lock); + up_read(&htable_rw_lock); } /** @@ -1299,7 +1255,6 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c * Resolve share's hostname and check if server address matches. Otherwise just ignore it * as we could not have upcall to resolve hostname or failed to convert ip address. */ - match = true; extract_unc_hostname(s1, &host, &hostlen); scnprintf(unc, sizeof(unc), "\\\\%.*s", (int)hostlen, host); @@ -1321,35 +1276,37 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c * Mark dfs tcon for reconnecting when the currently connected tcon does not match any of the new * target shares in @refs. */ -static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cache_tgt_list *tl, - const struct dfs_info3_param *refs, int numrefs) +static void mark_for_reconnect_if_needed(struct TCP_Server_Info *server, + struct dfs_cache_tgt_list *old_tl, + struct dfs_cache_tgt_list *new_tl) { - struct dfs_cache_tgt_iterator *it; - int i; - - for (it = dfs_cache_get_tgt_iterator(tl); it; it = dfs_cache_get_next_tgt(tl, it)) { - for (i = 0; i < numrefs; i++) { - if (target_share_equal(tcon->ses->server, dfs_cache_get_tgt_name(it), - refs[i].node_name)) + struct dfs_cache_tgt_iterator *oit, *nit; + + for (oit = dfs_cache_get_tgt_iterator(old_tl); oit; + oit = dfs_cache_get_next_tgt(old_tl, oit)) { + for (nit = dfs_cache_get_tgt_iterator(new_tl); nit; + nit = dfs_cache_get_next_tgt(new_tl, nit)) { + if (target_share_equal(server, + dfs_cache_get_tgt_name(oit), + dfs_cache_get_tgt_name(nit))) return; } } cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__); - cifs_signal_cifsd_for_reconnect(tcon->ses->server, true); + cifs_signal_cifsd_for_reconnect(server, true); } /* Refresh dfs referral of tcon and mark it for reconnect if needed */ static int __refresh_tcon(const char *path, struct cifs_tcon *tcon, bool force_refresh) { - struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); + struct dfs_cache_tgt_list old_tl = DFS_CACHE_TGT_LIST_INIT(old_tl); + struct dfs_cache_tgt_list new_tl = DFS_CACHE_TGT_LIST_INIT(new_tl); struct cifs_ses *ses = CIFS_DFS_ROOT_SES(tcon->ses); struct cifs_tcon *ipc = ses->tcon_ipc; - struct dfs_info3_param *refs = NULL; bool needs_refresh = false; struct cache_entry *ce; unsigned int xid; - int numrefs = 0; int rc = 0; xid = get_xid(); @@ -1358,9 +1315,8 @@ static int __refresh_tcon(const char *path, struct cifs_tcon *tcon, bool force_r ce = lookup_cache_entry(path); needs_refresh = force_refresh || IS_ERR(ce) || cache_entry_expired(ce); if (!IS_ERR(ce)) { - rc = get_targets(ce, &tl); - if (rc) - cifs_dbg(FYI, "%s: could not get dfs targets: %d\n", __func__, rc); + rc = get_targets(ce, &old_tl); + cifs_dbg(FYI, "%s: get_targets: %d\n", __func__, rc); } up_read(&htable_rw_lock); @@ -1377,26 +1333,18 @@ static int __refresh_tcon(const char *path, struct cifs_tcon *tcon, bool force_r } spin_unlock(&ipc->tc_lock); - rc = get_dfs_referral(xid, ses, path, &refs, &numrefs); - if (!rc) { - /* Create or update a cache entry with the new referral */ - dump_refs(refs, numrefs); - - down_write(&htable_rw_lock); - ce = lookup_cache_entry(path); - if (IS_ERR(ce)) - add_cache_entry_locked(refs, numrefs); - else if (force_refresh || cache_entry_expired(ce)) - update_cache_entry_locked(ce, refs, numrefs); - up_write(&htable_rw_lock); - - mark_for_reconnect_if_needed(tcon, &tl, refs, numrefs); + ce = cache_refresh_path(xid, ses, path, true); + if (!IS_ERR(ce)) { + rc = get_targets(ce, &new_tl); + up_read(&htable_rw_lock); + cifs_dbg(FYI, "%s: get_targets: %d\n", __func__, rc); + mark_for_reconnect_if_needed(tcon->ses->server, &old_tl, &new_tl); } out: free_xid(xid); - dfs_cache_free_tgts(&tl); - free_dfs_info_array(refs, numrefs); + dfs_cache_free_tgts(&old_tl); + dfs_cache_free_tgts(&new_tl); return rc; } diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h index f7cff0be9327..be3b5a44cf82 100644 --- a/fs/cifs/dfs_cache.h +++ b/fs/cifs/dfs_cache.h @@ -35,9 +35,6 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nl struct dfs_cache_tgt_list *tgt_list); int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tgt_list); -int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *cp, int remap, const char *path, - const struct dfs_cache_tgt_iterator *it); void dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it); int dfs_cache_get_tgt_referral(const char *path, const struct dfs_cache_tgt_iterator *it, struct dfs_info3_param *ref); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 22dfc1f8b4f1..2870e3b6ffe8 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -9,6 +9,7 @@ * */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/backing-dev.h> #include <linux/stat.h> #include <linux/fcntl.h> @@ -3889,7 +3890,7 @@ uncached_fill_pages(struct TCP_Server_Info *server, rdata->got_bytes += result; } - return rdata->got_bytes > 0 && result != -ECONNABORTED ? + return result != -ECONNABORTED && rdata->got_bytes > 0 ? rdata->got_bytes : result; } @@ -4665,7 +4666,7 @@ readpages_fill_pages(struct TCP_Server_Info *server, rdata->got_bytes += result; } - return rdata->got_bytes > 0 && result != -ECONNABORTED ? + return result != -ECONNABORTED && rdata->got_bytes > 0 ? rdata->got_bytes : result; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 0ff9eab697a2..4510dea77be3 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -428,6 +428,7 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, oparms.disposition = FILE_CREATE; oparms.fid = &fid; oparms.reconnect = false; + oparms.mode = 0644; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, NULL); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 4d3c586785a5..2a19c7987c5b 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1277,7 +1277,9 @@ int match_target_ip(struct TCP_Server_Info *server, if (rc < 0) return rc; + spin_lock(&server->srv_lock); *result = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, (struct sockaddr *)&ss); + spin_unlock(&server->srv_lock); cifs_dbg(FYI, "%s: ip addresses match: %u\n", __func__, *result); return 0; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 9e7d9f0baa18..c47b254f0d1e 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -292,9 +292,10 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) continue; } kref_get(&iface->refcount); + break; } - if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) { + if (list_entry_is_head(iface, &ses->iface_list, iface_head)) { rc = 1; iface = NULL; cifs_dbg(FYI, "unable to find a suitable iface\n"); @@ -814,6 +815,7 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, return -EINVAL; } if (tilen) { + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen, GFP_KERNEL); if (!ses->auth_key.response) { @@ -1427,6 +1429,7 @@ sess_auth_kerberos(struct sess_data *sess_data) goto out_put_spnego_key; } + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, GFP_KERNEL); if (!ses->auth_key.response) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 50480751e521..4cb364454e13 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -562,17 +562,20 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls, cifs_remap(cifs_sb)); - if (!rc) - move_cifs_info_to_smb2(&data->fi, &fi); *adjustTZ = true; } - if (!rc && (le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) { + if (!rc) { int tmprc; int oplock = 0; struct cifs_fid fid; struct cifs_open_parms oparms; + move_cifs_info_to_smb2(&data->fi, &fi); + + if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) + return 0; + oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = FILE_READ_ATTRIBUTES; @@ -716,17 +719,25 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path, static int cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf) { - FILE_ALL_INFO *fi = buf; + struct cifs_open_info_data *data = buf; + FILE_ALL_INFO fi = {}; + int rc; if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS)) - return SMBLegacyOpen(xid, oparms->tcon, oparms->path, - oparms->disposition, - oparms->desired_access, - oparms->create_options, - &oparms->fid->netfid, oplock, fi, - oparms->cifs_sb->local_nls, - cifs_remap(oparms->cifs_sb)); - return CIFS_open(xid, oparms, oplock, fi); + rc = SMBLegacyOpen(xid, oparms->tcon, oparms->path, + oparms->disposition, + oparms->desired_access, + oparms->create_options, + &oparms->fid->netfid, oplock, &fi, + oparms->cifs_sb->local_nls, + cifs_remap(oparms->cifs_sb)); + else + rc = CIFS_open(xid, oparms, oplock, &fi); + + if (!rc && data) + move_cifs_info_to_smb2(&data->fi, &fi); + + return rc; } static void @@ -1050,7 +1061,7 @@ cifs_make_node(unsigned int xid, struct inode *inode, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct inode *newinode = NULL; int rc = -EPERM; - FILE_ALL_INFO *buf = NULL; + struct cifs_open_info_data buf = {}; struct cifs_io_parms io_parms; __u32 oplock = 0; struct cifs_fid fid; @@ -1082,14 +1093,14 @@ cifs_make_node(unsigned int xid, struct inode *inode, cifs_sb->local_nls, cifs_remap(cifs_sb)); if (rc) - goto out; + return rc; rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, xid); if (rc == 0) d_instantiate(dentry, newinode); - goto out; + return rc; } /* @@ -1097,19 +1108,13 @@ cifs_make_node(unsigned int xid, struct inode *inode, * support block and char device (no socket & fifo) */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - goto out; + return rc; if (!S_ISCHR(mode) && !S_ISBLK(mode)) - goto out; + return rc; cifs_dbg(FYI, "sfu compat create special file\n"); - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto out; - } - oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; @@ -1124,21 +1129,21 @@ cifs_make_node(unsigned int xid, struct inode *inode, oplock = REQ_OPLOCK; else oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf); if (rc) - goto out; + return rc; /* * BB Do not bother to decode buf since no local inode yet to put * timestamps in, but we can reuse it safely. */ - pdev = (struct win_dev *)buf; + pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = buf; + iov[1].iov_base = &buf.fi; iov[1].iov_len = sizeof(struct win_dev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); @@ -1157,8 +1162,8 @@ cifs_make_node(unsigned int xid, struct inode *inode, d_drop(dentry); /* FIXME: add code here to set EAs */ -out: - kfree(buf); + + cifs_free_open_info(&buf); return rc; } diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index ba6cc50af390..9f1dd04b555a 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -7,6 +7,7 @@ * */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/pagemap.h> diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index dc160de7a6de..e6bcd2baf446 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -530,7 +530,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, p = buf; spin_lock(&ses->iface_lock); - ses->iface_count = 0; /* * Go through iface_list and do kref_put to remove * any unused ifaces. ifaces in use will be removed @@ -540,6 +539,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, iface_head) { iface->is_active = 0; kref_put(&iface->refcount, release_iface); + ses->iface_count--; } spin_unlock(&ses->iface_lock); @@ -618,6 +618,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, /* just get a ref so that it doesn't get picked/freed */ iface->is_active = 1; kref_get(&iface->refcount); + ses->iface_count++; spin_unlock(&ses->iface_lock); goto next_iface; } else if (ret < 0) { @@ -4488,17 +4489,12 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, /* copy pages form the old */ for (j = 0; j < npages; j++) { - char *dst, *src; unsigned int offset, len; rqst_page_get_length(new, j, &len, &offset); - dst = kmap_local_page(new->rq_pages[j]) + offset; - src = kmap_local_page(old->rq_pages[j]) + offset; - - memcpy(dst, src, len); - kunmap(new->rq_pages[j]); - kunmap(old->rq_pages[j]); + memcpy_page(new->rq_pages[j], offset, + old->rq_pages[j], offset, len); } } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index a5695748a89b..2c9ffa921e6f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -541,9 +541,10 @@ static void assemble_neg_contexts(struct smb2_negotiate_req *req, struct TCP_Server_Info *server, unsigned int *total_len) { - char *pneg_ctxt; - char *hostname = NULL; unsigned int ctxt_len, neg_context_count; + struct TCP_Server_Info *pserver; + char *pneg_ctxt; + char *hostname; if (*total_len > 200) { /* In case length corrupted don't want to overrun smb buffer */ @@ -574,8 +575,9 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, * secondary channels don't have the hostname field populated * use the hostname field in the primary channel instead */ - hostname = CIFS_SERVER_IS_CHAN(server) ? - server->primary_server->hostname : server->hostname; + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + cifs_server_lock(pserver); + hostname = pserver->hostname; if (hostname && (hostname[0] != 0)) { ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt, hostname); @@ -584,6 +586,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, neg_context_count = 3; } else neg_context_count = 2; + cifs_server_unlock(pserver); build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); *total_len += sizeof(struct smb2_posix_neg_context); @@ -1450,6 +1453,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) /* keep session key if binding */ if (!is_binding) { + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, GFP_KERNEL); if (!ses->auth_key.response) { @@ -1479,8 +1483,11 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) out_put_spnego_key: key_invalidate(spnego_key); key_put(spnego_key); - if (rc) + if (rc) { kfree_sensitive(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + } out: sess_data->result = rc; sess_data->func = NULL; @@ -4156,12 +4163,15 @@ smb2_readv_callback(struct mid_q_entry *mid) (struct smb2_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], - .rq_nvec = 1, - .rq_pages = rdata->pages, - .rq_offset = rdata->page_offset, - .rq_npages = rdata->nr_pages, - .rq_pagesz = rdata->pagesz, - .rq_tailsz = rdata->tailsz }; + .rq_nvec = 1, }; + + if (rdata->got_bytes) { + rqst.rq_pages = rdata->pages; + rqst.rq_offset = rdata->page_offset; + rqst.rq_npages = rdata->nr_pages; + rqst.rq_pagesz = rdata->pagesz; + rqst.rq_tailsz = rdata->tailsz; + } WARN_ONCE(rdata->server != mid->server, "rdata server %p != mid server %p", diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 90789aaa6567..8c816b25ce7c 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1405,6 +1405,7 @@ void smbd_destroy(struct TCP_Server_Info *server) destroy_workqueue(info->workqueue); log_rdma_event(INFO, "rdma session destroyed\n"); kfree(info); + server->smbd_conn = NULL; } /* diff --git a/fs/coredump.c b/fs/coredump.c index b31ea0f87ccb..68619329ec65 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -838,6 +838,30 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) } } +int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +{ + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + return __dump_emit(cprm, addr, nr); +} +EXPORT_SYMBOL(dump_emit); + +void dump_skip_to(struct coredump_params *cprm, unsigned long pos) +{ + cprm->to_skip = pos - cprm->pos; +} +EXPORT_SYMBOL(dump_skip_to); + +void dump_skip(struct coredump_params *cprm, size_t nr) +{ + cprm->to_skip += nr; +} +EXPORT_SYMBOL(dump_skip); + +#ifdef CONFIG_ELF_CORE static int dump_emit_page(struct coredump_params *cprm, struct page *page) { struct bio_vec bvec = { @@ -871,30 +895,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) return 1; } -int dump_emit(struct coredump_params *cprm, const void *addr, int nr) -{ - if (cprm->to_skip) { - if (!__dump_skip(cprm, cprm->to_skip)) - return 0; - cprm->to_skip = 0; - } - return __dump_emit(cprm, addr, nr); -} -EXPORT_SYMBOL(dump_emit); - -void dump_skip_to(struct coredump_params *cprm, unsigned long pos) -{ - cprm->to_skip = pos - cprm->pos; -} -EXPORT_SYMBOL(dump_skip_to); - -void dump_skip(struct coredump_params *cprm, size_t nr) -{ - cprm->to_skip += nr; -} -EXPORT_SYMBOL(dump_skip); - -#ifdef CONFIG_ELF_CORE int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len) { @@ -1271,8 +1271,9 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - ret = copy_mc_to_kernel(daddr, saddr, length); - if (ret) + if (copy_mc_to_kernel(daddr, saddr, length) == 0) + ret = length; + else ret = -EIO; out_unlock: diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 737f185aad8d..ed4357e62f35 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -4,6 +4,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/miscdevice.h> #include <linux/poll.h> #include <linux/dlm.h> diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 481788c24a68..626a615dafc2 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -577,26 +577,25 @@ static int erofs_fc_parse_param(struct fs_context *fc, } ++ctx->devs->extra_devices; break; - case Opt_fsid: #ifdef CONFIG_EROFS_FS_ONDEMAND + case Opt_fsid: kfree(ctx->fsid); ctx->fsid = kstrdup(param->string, GFP_KERNEL); if (!ctx->fsid) return -ENOMEM; -#else - errorfc(fc, "fsid option not supported"); -#endif break; case Opt_domain_id: -#ifdef CONFIG_EROFS_FS_ONDEMAND kfree(ctx->domain_id); ctx->domain_id = kstrdup(param->string, GFP_KERNEL); if (!ctx->domain_id) return -ENOMEM; + break; #else - errorfc(fc, "domain_id option not supported"); -#endif + case Opt_fsid: + case Opt_domain_id: + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; +#endif default: return -ENOPARAM; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ccf7c55d477f..5200bb86e264 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1032,12 +1032,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (!be->decompressed_pages) be->decompressed_pages = - kvcalloc(be->nr_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_NOFAIL); + kcalloc(be->nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); if (!be->compressed_pages) be->compressed_pages = - kvcalloc(pclusterpages, sizeof(struct page *), - GFP_KERNEL | __GFP_NOFAIL); + kcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); z_erofs_parse_out_bvecs(be); err2 = z_erofs_parse_in_bvecs(be, &overlapped); @@ -1085,7 +1085,7 @@ out: } if (be->compressed_pages < be->onstack_pages || be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) - kvfree(be->compressed_pages); + kfree(be->compressed_pages); z_erofs_fill_other_copies(be, err); for (i = 0; i < be->nr_pages; ++i) { @@ -1104,7 +1104,7 @@ out: } if (be->decompressed_pages != be->onstack_pages) - kvfree(be->decompressed_pages); + kfree(be->decompressed_pages); pcl->length = 0; pcl->partial = true; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 0150570c33aa..98fb90b9af71 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -793,12 +793,16 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; /* - * No strict rule how to describe extents for post EOF, yet - * we need do like below. Otherwise, iomap itself will get + * No strict rule on how to describe extents for post EOF, yet + * we need to do like below. Otherwise, iomap itself will get * into an endless loop on post EOF. + * + * Calculate the effective offset by subtracting extent start + * (map.m_la) from the requested offset, and add it to length. + * (NB: offset >= map.m_la always) */ if (iomap->offset >= inode->i_size) - iomap->length = length + map.m_la - offset; + iomap->length = length + offset - map.m_la; } iomap->flags = 0; return 0; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7decaaf27e82..a2f04a3808db 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -81,6 +81,8 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count); +static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, + size_t value_count); static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { @@ -470,8 +472,22 @@ ext4_xattr_inode_verify_hashes(struct inode *ea_inode, tmp_data = cpu_to_le32(hash); e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, &tmp_data, 1); + /* All good? */ + if (e_hash == entry->e_hash) + return 0; + + /* + * Not good. Maybe the entry hash was calculated + * using the buggy signed char version? + */ + e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len, + &tmp_data, 1); + /* Still no match - bad */ if (e_hash != entry->e_hash) return -EFSCORRUPTED; + + /* Let people know about old hash */ + pr_warn_once("ext4: filesystem with signed xattr name hash"); } return 0; } @@ -3081,7 +3097,29 @@ static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ - *name++; + (unsigned char)*name++; + } + while (value_count--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + return cpu_to_le32(hash); +} + +/* + * ext4_xattr_hash_entry_signed() + * + * Compute the hash of an extended attribute incorrectly. + */ +static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count) +{ + __u32 hash = 0; + + while (name_len--) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + (signed char)*name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6e43e19c7d1c..97e816590cd9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2183,7 +2183,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct extent_info ei = {0, }; + struct extent_info ei = {}; bool from_dnode = true; int i; int ret = 0; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 1bd38a78ebba..342af24b2f8c 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -546,7 +546,8 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_node *en; bool ret = false; - f2fs_bug_on(sbi, !et); + if (!et) + return false; trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); @@ -881,12 +882,14 @@ static unsigned long long __calculate_block_age(unsigned long long new, } /* This returns a new age and allocated blocks in ei */ -static int __get_new_block_age(struct inode *inode, struct extent_info *ei) +static int __get_new_block_age(struct inode *inode, struct extent_info *ei, + block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t f_size = i_size_read(inode); unsigned long long cur_blocks = atomic64_read(&sbi->allocated_data_blocks); + struct extent_info tei = *ei; /* only fofs and len are valid */ /* * When I/O is not aligned to a PAGE_SIZE, update will happen to the last @@ -894,20 +897,20 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei) * block here. */ if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && - ei->blk == NEW_ADDR) + blkaddr == NEW_ADDR) return -EINVAL; - if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { + if (__lookup_extent_tree(inode, ei->fofs, &tei, EX_BLOCK_AGE)) { unsigned long long cur_age; - if (cur_blocks >= ei->last_blocks) - cur_age = cur_blocks - ei->last_blocks; + if (cur_blocks >= tei.last_blocks) + cur_age = cur_blocks - tei.last_blocks; else /* allocated_data_blocks overflow */ - cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks; + cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks; - if (ei->age) - ei->age = __calculate_block_age(cur_age, ei->age); + if (tei.age) + ei->age = __calculate_block_age(cur_age, tei.age); else ei->age = cur_age; ei->last_blocks = cur_blocks; @@ -915,14 +918,14 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei) return 0; } - f2fs_bug_on(sbi, ei->blk == NULL_ADDR); + f2fs_bug_on(sbi, blkaddr == NULL_ADDR); /* the data block was allocated for the first time */ - if (ei->blk == NEW_ADDR) + if (blkaddr == NEW_ADDR) goto out; - if (__is_valid_data_blkaddr(ei->blk) && - !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) { + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_bug_on(sbi, 1); return -EINVAL; } @@ -938,7 +941,7 @@ out: static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { - struct extent_info ei; + struct extent_info ei = {}; if (!__may_extent_tree(dn->inode, type)) return; @@ -953,8 +956,7 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ else ei.blk = dn->data_blkaddr; } else if (type == EX_BLOCK_AGE) { - ei.blk = dn->data_blkaddr; - if (__get_new_block_age(dn->inode, &ei)) + if (__get_new_block_age(dn->inode, &ei, dn->data_blkaddr)) return; } __update_extent_tree_range(dn->inode, &ei, type); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 48ed2d0c8543..b90617639743 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2559,7 +2559,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, }; + struct extent_info ei = {}; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 25ddea478fc1..ae3c4e5474ef 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -663,8 +663,7 @@ init_thread: if (IS_ERR(fcc->f2fs_issue_flush)) { int err = PTR_ERR(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->fcc_info = NULL; + fcc->f2fs_issue_flush = NULL; return err; } @@ -3161,7 +3160,7 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_info ei; + struct extent_info ei = {}; if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { if (!ei.age) @@ -5138,11 +5137,9 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) init_f2fs_rwsem(&sm_info->curseg_lock); - if (!f2fs_readonly(sbi->sb)) { - err = f2fs_create_flush_cmd_control(sbi); - if (err) - return err; - } + err = f2fs_create_flush_cmd_control(sbi); + if (err) + return err; err = create_discard_cmd_control(sbi); if (err) diff --git a/fs/fcntl.c b/fs/fcntl.c index 4c043b1b9b98..b622be119706 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/sched/task.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/capability.h> diff --git a/fs/file_table.c b/fs/file_table.c index dd88701e54a9..372653b92617 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index c05c71d57291..0e2fc08f7de4 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -8,7 +8,7 @@ config VXFS_FS of SCO UnixWare (and possibly others) and optionally available for Sunsoft Solaris, HP-UX and many other operating systems. However these particular OS implementations of vxfs may differ in on-disk - data endianess and/or superblock offset. The vxfs module has been + data endianness and/or superblock offset. The vxfs module has been tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.) Currently only readonly access is supported and VxFX versions 2, 3 and 4. Tests were performed with HP-UX VxFS version 3. diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index ab8ceddf9efa..cdf991bdd9de 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -141,13 +141,14 @@ static bool fscache_is_acquire_pending(struct fscache_volume *volume) static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, unsigned int collidee_debug_id) { - wait_var_event_timeout(&candidate->flags, - !fscache_is_acquire_pending(candidate), 20 * HZ); + wait_on_bit_timeout(&candidate->flags, FSCACHE_VOLUME_ACQUIRE_PENDING, + TASK_UNINTERRUPTIBLE, 20 * HZ); if (fscache_is_acquire_pending(candidate)) { pr_notice("Potential volume collision new=%08x old=%08x", candidate->debug_id, collidee_debug_id); fscache_stat(&fscache_n_volumes_collision); - wait_var_event(&candidate->flags, !fscache_is_acquire_pending(candidate)); + wait_on_bit(&candidate->flags, FSCACHE_VOLUME_ACQUIRE_PENDING, + TASK_UNINTERRUPTIBLE); } } @@ -279,8 +280,7 @@ static void fscache_create_volume_work(struct work_struct *work) fscache_end_cache_access(volume->cache, fscache_access_acquire_volume_end); - clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); - wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); + clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags); fscache_put_volume(volume, fscache_volume_put_create_work); } @@ -347,8 +347,8 @@ static void fscache_wake_pending_volume(struct fscache_volume *volume, hlist_bl_for_each_entry(cursor, p, h, hash_link) { if (fscache_volume_same(cursor, volume)) { fscache_see_volume(cursor, fscache_volume_see_hash_wake); - clear_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &cursor->flags); - wake_up_bit(&cursor->flags, FSCACHE_VOLUME_ACQUIRE_PENDING); + clear_and_wake_up_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, + &cursor->flags); return; } } diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 23d1c263891f..3d192b80a561 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -11,9 +11,10 @@ #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> -struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) +static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc, + struct mnt_idmap *idmap, + struct inode *inode, int type, bool rcu) { - struct fuse_conn *fc = get_fuse_conn(inode); int size; const char *name; void *value = NULL; @@ -25,7 +26,7 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) if (fuse_is_bad(inode)) return ERR_PTR(-EIO); - if (!fc->posix_acl || fc->no_getxattr) + if (fc->no_getxattr) return NULL; if (type == ACL_TYPE_ACCESS) @@ -53,6 +54,46 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) return acl; } +static inline bool fuse_no_acl(const struct fuse_conn *fc, + const struct inode *inode) +{ + /* + * Refuse interacting with POSIX ACLs for daemons that + * don't support FUSE_POSIX_ACL and are not mounted on + * the host to retain backwards compatibility. + */ + return !fc->posix_acl && (i_user_ns(inode) != &init_user_ns); +} + +struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type) +{ + struct inode *inode = d_inode(dentry); + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fuse_no_acl(fc, inode)) + return ERR_PTR(-EOPNOTSUPP); + + return __fuse_get_acl(fc, idmap, inode, type, false); +} + +struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * FUSE daemons before FUSE_POSIX_ACL was introduced could get and set + * POSIX ACLs without them being used for permission checking by the + * vfs. Retain that behavior for backwards compatibility as there are + * filesystems that do all permission checking for acls in the daemon + * and not in the kernel. + */ + if (!fc->posix_acl) + return NULL; + + return __fuse_get_acl(fc, &nop_mnt_idmap, inode, type, rcu); +} + int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { @@ -64,7 +105,7 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, if (fuse_is_bad(inode)) return -EIO; - if (!fc->posix_acl || fc->no_setxattr) + if (fc->no_setxattr || fuse_no_acl(fc, inode)) return -EOPNOTSUPP; if (type == ACL_TYPE_ACCESS) @@ -99,7 +140,13 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, return ret; } - if (!vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) && + /* + * Fuse daemons without FUSE_POSIX_ACL never changed the passed + * through POSIX ACLs. Such daemons don't expect setgid bits to + * be stripped. + */ + if (fc->posix_acl && + !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) && !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; @@ -108,8 +155,15 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, } else { ret = fuse_removexattr(inode, name); } - forget_all_cached_acls(inode); - fuse_invalidate_attr(inode); + + if (fc->posix_acl) { + /* + * Fuse daemons without FUSE_POSIX_ACL never cached POSIX ACLs + * and didn't invalidate attributes. Retain that behavior. + */ + forget_all_cached_acls(inode); + fuse_invalidate_attr(inode); + } return ret; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 6a4e1fb0a0ad..cd1eae61e84c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1942,7 +1942,8 @@ static const struct inode_operations fuse_dir_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_inode_acl = fuse_get_acl, + .get_inode_acl = fuse_get_inode_acl, + .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, @@ -1964,7 +1965,8 @@ static const struct inode_operations fuse_common_inode_operations = { .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, - .get_inode_acl = fuse_get_acl, + .get_inode_acl = fuse_get_inode_acl, + .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5cfd9fb06a5a..82710d103556 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -18,6 +18,7 @@ #include <linux/falloc.h> #include <linux/uio.h> #include <linux/fs.h> +#include <linux/filelock.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ee084cead402..9b5058cf5bc3 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1264,12 +1264,12 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); int fuse_removexattr(struct inode *inode, const char *name); extern const struct xattr_handler *fuse_xattr_handlers[]; -extern const struct xattr_handler *fuse_acl_xattr_handlers[]; -extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; -struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); -int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, +struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type); +int fuse_set_acl(struct mnt_idmap *, struct dentry *dentry, struct posix_acl *acl, int type); /* readdir.c */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6b3beda16c1b..de9b9ec5ce81 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -311,7 +311,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_dax_dontcache(inode, attr->flags); } -static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) +static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, + struct fuse_conn *fc) { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; @@ -333,6 +334,12 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) new_decode_dev(attr->rdev)); } else BUG(); + /* + * Ensure that we don't cache acls for daemons without FUSE_POSIX_ACL + * so they see the exact same behavior as before. + */ + if (!fc->posix_acl) + inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; } static int fuse_inode_eq(struct inode *inode, void *_nodeidp) @@ -372,7 +379,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if (!inode) return NULL; - fuse_init_inode(inode, attr); + fuse_init_inode(inode, attr, fc); get_fuse_inode(inode)->nodeid = nodeid; inode->i_flags |= S_AUTOMOUNT; goto done; @@ -388,7 +395,7 @@ retry: if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; - fuse_init_inode(inode, attr); + fuse_init_inode(inode, attr, fc); unlock_new_inode(inode); } else if (fuse_stale_inode(inode, generation, attr)) { /* nodeid was reused, any I/O on the old inode should fail */ @@ -1174,7 +1181,6 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, if ((flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; - fm->sb->s_xattr = fuse_acl_xattr_handlers; } if (flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; @@ -1420,13 +1426,6 @@ static void fuse_sb_defaults(struct super_block *sb) if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - - /* - * If we are not in the initial user namespace posix - * acls must be translated. - */ - if (sb->s_user_ns != &init_user_ns) - sb->s_xattr = fuse_no_acl_xattr_handlers; } static int fuse_fill_super_submount(struct super_block *sb, diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 30aaaa4b3bfb..49c01559580f 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -203,27 +203,6 @@ static int fuse_xattr_set(const struct xattr_handler *handler, return fuse_setxattr(inode, name, value, size, flags, 0); } -static bool no_xattr_list(struct dentry *dentry) -{ - return false; -} - -static int no_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size) -{ - return -EOPNOTSUPP; -} - -static int no_xattr_set(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *nodee, - const char *name, const void *value, - size_t size, int flags) -{ - return -EOPNOTSUPP; -} - static const struct xattr_handler fuse_xattr_handler = { .prefix = "", .get = fuse_xattr_get, @@ -234,33 +213,3 @@ const struct xattr_handler *fuse_xattr_handlers[] = { &fuse_xattr_handler, NULL }; - -const struct xattr_handler *fuse_acl_xattr_handlers[] = { - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, - &fuse_xattr_handler, - NULL -}; - -static const struct xattr_handler fuse_no_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = no_xattr_list, - .get = no_xattr_get, - .set = no_xattr_set, -}; - -static const struct xattr_handler fuse_no_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_ACCESS, - .list = no_xattr_list, - .get = no_xattr_get, - .set = no_xattr_set, -}; - -const struct xattr_handler *fuse_no_acl_xattr_handlers[] = { - &fuse_no_acl_access_xattr_handler, - &fuse_no_acl_default_xattr_handler, - &fuse_xattr_handler, - NULL -}; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index bec75ed59c72..300844f50dcd 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/mount.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/gfs2_ondisk.h> #include <linux/falloc.h> #include <linux/swap.h> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 723639376ae2..61323deb80bc 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -80,6 +80,15 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) brelse(bd->bd_bh); } +static int __gfs2_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + /** * gfs2_ail1_start_one - Start I/O on a transaction * @sdp: The superblock @@ -131,7 +140,7 @@ __acquires(&sdp->sd_ail_lock) if (!mapping) continue; spin_unlock(&sdp->sd_ail_lock); - ret = filemap_fdatawrite_wbc(mapping, wbc); + ret = write_cache_pages(mapping, wbc, __gfs2_writepage, mapping); if (need_resched()) { blk_finish_plug(plug); cond_resched(); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 7817872a85e7..1f7bd068acf0 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -458,15 +458,16 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) /* panic? */ return -EIO; + res = -EIO; if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN) - return -EIO; + goto out; fd.search_key->cat = HFS_I(main_inode)->cat_key; if (hfs_brec_find(&fd)) - /* panic? */ goto out; if (S_ISDIR(main_inode->i_mode)) { - WARN_ON(fd.entrylength < sizeof(struct hfs_cat_dir)); + if (fd.entrylength < sizeof(struct hfs_cat_dir)) + goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_dir)); if (rec.type != HFS_CDR_DIR || @@ -479,6 +480,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_dir)); } else if (HFS_IS_RSRC(inode)) { + if (fd.entrylength < sizeof(struct hfs_cat_file)) + goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); hfs_inode_write_fork(inode, rec.file.RExtRec, @@ -486,7 +489,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); } else { - WARN_ON(fd.entrylength < sizeof(struct hfs_cat_file)); + if (fd.entrylength < sizeof(struct hfs_cat_file)) + goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); if (rec.type != HFS_CDR_FIL || @@ -503,9 +507,10 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); } + res = 0; out: hfs_find_exit(&fd); - return 0; + return res; } static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, diff --git a/fs/inode.c b/fs/inode.c index 1b05e0e4b5c8..4558dc2f1355 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -5,6 +5,7 @@ */ #include <linux/export.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/hash.h> diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index 2a39ffb8423b..6e61b5bc7d86 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -322,7 +322,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, dn_off = le32_to_cpu(authblob->DomainName.BufferOffset); dn_len = le16_to_cpu(authblob->DomainName.Length); - if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len) + if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len || + nt_len < CIFS_ENCPWD_SIZE) return -EINVAL; /* TODO : use domain name that imported from configuration file */ diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 12be8386446a..56be077e5d8a 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -280,7 +280,7 @@ int ksmbd_conn_handler_loop(void *p) { struct ksmbd_conn *conn = (struct ksmbd_conn *)p; struct ksmbd_transport *t = conn->transport; - unsigned int pdu_size; + unsigned int pdu_size, max_allowed_pdu_size; char hdr_buf[4] = {0,}; int size; @@ -305,20 +305,36 @@ int ksmbd_conn_handler_loop(void *p) pdu_size = get_rfc1002_len(hdr_buf); ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size); + if (conn->status == KSMBD_SESS_GOOD) + max_allowed_pdu_size = + SMB3_MAX_MSGSIZE + conn->vals->max_write_size; + else + max_allowed_pdu_size = SMB3_MAX_MSGSIZE; + + if (pdu_size > max_allowed_pdu_size) { + pr_err_ratelimited("PDU length(%u) excceed maximum allowed pdu size(%u) on connection(%d)\n", + pdu_size, max_allowed_pdu_size, + conn->status); + break; + } + /* * Check if pdu size is valid (min : smb header size, * max : 0x00FFFFFF). */ if (pdu_size < __SMB2_HEADER_STRUCTURE_SIZE || pdu_size > MAX_STREAM_PROT_LEN) { - continue; + break; } /* 4 for rfc1002 length field */ size = pdu_size + 4; - conn->request_buf = kvmalloc(size, GFP_KERNEL); + conn->request_buf = kvmalloc(size, + GFP_KERNEL | + __GFP_NOWARN | + __GFP_NORETRY); if (!conn->request_buf) - continue; + break; memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf)); if (!ksmbd_smb_request(conn)) diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index b6bd8311e6b4..fb8b2d566efb 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -106,7 +106,8 @@ struct ksmbd_startup_request { __u32 sub_auth[3]; /* Subauth value for Security ID */ __u32 smb2_max_credits; /* MAX credits */ __u32 smbd_max_io_size; /* smbd read write size */ - __u32 reserved[127]; /* Reserved room */ + __u32 max_connections; /* Number of maximum simultaneous connections */ + __u32 reserved[126]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c index 0c6717fb4656..3507d8f89074 100644 --- a/fs/ksmbd/ndr.c +++ b/fs/ksmbd/ndr.c @@ -242,7 +242,7 @@ int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) return ret; if (da->version != 3 && da->version != 4) { - pr_err("v%d version is not supported\n", da->version); + ksmbd_debug(VFS, "v%d version is not supported\n", da->version); return -EINVAL; } @@ -251,7 +251,7 @@ int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) return ret; if (da->version != version2) { - pr_err("ndr version mismatched(version: %d, version2: %d)\n", + ksmbd_debug(VFS, "ndr version mismatched(version: %d, version2: %d)\n", da->version, version2); return -EINVAL; } @@ -457,7 +457,7 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) if (ret) return ret; if (acl->version != 4) { - pr_err("v%d version is not supported\n", acl->version); + ksmbd_debug(VFS, "v%d version is not supported\n", acl->version); return -EINVAL; } @@ -465,7 +465,7 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) if (ret) return ret; if (acl->version != version2) { - pr_err("ndr version mismatched(version: %d, version2: %d)\n", + ksmbd_debug(VFS, "ndr version mismatched(version: %d, version2: %d)\n", acl->version, version2); return -EINVAL; } diff --git a/fs/ksmbd/server.h b/fs/ksmbd/server.h index ac9d932f8c8a..db7278181760 100644 --- a/fs/ksmbd/server.h +++ b/fs/ksmbd/server.h @@ -41,6 +41,7 @@ struct ksmbd_server_config { unsigned int share_fake_fscaps; struct smb_sid domain_sid; unsigned int auth_mechs; + unsigned int max_connections; char *conf[SERVER_CONF_WORK_GROUP + 1]; }; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 795984333bcb..4ef6e1e59a40 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -12,6 +12,7 @@ #include <linux/ethtool.h> #include <linux/falloc.h> #include <linux/mount.h> +#include <linux/filelock.h> #include "glob.h" #include "smbfsctl.h" @@ -1928,13 +1929,13 @@ int smb2_tree_connect(struct ksmbd_work *work) if (conn->posix_ext_supported) status.tree_conn->posix_extensions = true; -out_err1: rsp->StructureSize = cpu_to_le16(16); + inc_rfc1001_len(work->response_buf, 16); +out_err1: rsp->Capabilities = 0; rsp->Reserved = 0; /* default manual caching */ rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING; - inc_rfc1001_len(work->response_buf, 16); if (!IS_ERR(treename)) kfree(treename); @@ -1967,6 +1968,9 @@ out_err1: rsp->hdr.Status = STATUS_ACCESS_DENIED; } + if (status.ret != KSMBD_TREE_CONN_STATUS_OK) + smb2_set_err_rsp(work); + return rc; } @@ -8660,6 +8664,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; struct smb2_hdr *rsp = smb2_get_msg(work->response_buf); if (conn->dialect < SMB30_PROT_ID) @@ -8669,6 +8674,7 @@ bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work) rsp = ksmbd_resp_buf_next(work); if (le16_to_cpu(rsp->Command) == SMB2_SESSION_SETUP_HE && + sess->user && !user_guest(sess->user) && rsp->Status == STATUS_SUCCESS) return true; return false; diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index aa5dbe54f5a1..0c8a770fe318 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -24,8 +24,9 @@ #define SMB21_DEFAULT_IOSIZE (1024 * 1024) #define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024) -#define SMB3_MIN_IOSIZE (64 * 1024) -#define SMB3_MAX_IOSIZE (8 * 1024 * 1024) +#define SMB3_MIN_IOSIZE (64 * 1024) +#define SMB3_MAX_IOSIZE (8 * 1024 * 1024) +#define SMB3_MAX_MSGSIZE (4 * 4096) /* * Definitions for SMB2 Protocol Data Units (network frames) diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index c9aca21637d5..40c721f9227e 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -308,6 +308,9 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) if (req->smbd_max_io_size) init_smbd_max_io_size(req->smbd_max_io_size); + if (req->max_connections) + server_conf.max_connections = req->max_connections; + ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); ret |= ksmbd_set_work_group(req->work_group); diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 63d55f543bd2..603893fd87f5 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -15,6 +15,8 @@ #define IFACE_STATE_DOWN BIT(0) #define IFACE_STATE_CONFIGURED BIT(1) +static atomic_t active_num_conn; + struct interface { struct task_struct *ksmbd_kthread; struct socket *ksmbd_socket; @@ -185,8 +187,10 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) struct tcp_transport *t; t = alloc_transport(client_sk); - if (!t) + if (!t) { + sock_release(client_sk); return -ENOMEM; + } csin = KSMBD_TCP_PEER_SOCKADDR(KSMBD_TRANS(t)->conn); if (kernel_getpeername(client_sk, csin) < 0) { @@ -239,6 +243,15 @@ static int ksmbd_kthread_fn(void *p) continue; } + if (server_conf.max_connections && + atomic_inc_return(&active_num_conn) >= server_conf.max_connections) { + pr_info_ratelimited("Limit the maximum number of connections(%u)\n", + atomic_read(&active_num_conn)); + atomic_dec(&active_num_conn); + sock_release(client_sk); + continue; + } + ksmbd_debug(CONN, "connect success: accepted new connection\n"); client_sk->sk->sk_rcvtimeo = KSMBD_TCP_RECV_TIMEOUT; client_sk->sk->sk_sndtimeo = KSMBD_TCP_SEND_TIMEOUT; @@ -295,6 +308,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig, struct msghdr ksmbd_msg; struct kvec *iov; struct ksmbd_conn *conn = KSMBD_TRANS(t)->conn; + int max_retry = 2; iov = get_conn_iovec(t, nr_segs); if (!iov) @@ -321,9 +335,11 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig, } else if (conn->status == KSMBD_SESS_NEED_RECONNECT) { total_read = -EAGAIN; break; - } else if (length == -ERESTARTSYS || length == -EAGAIN) { + } else if ((length == -ERESTARTSYS || length == -EAGAIN) && + max_retry) { usleep_range(1000, 2000); length = 0; + max_retry--; continue; } else if (length <= 0) { total_read = -EAGAIN; @@ -365,6 +381,8 @@ static int ksmbd_tcp_writev(struct ksmbd_transport *t, struct kvec *iov, static void ksmbd_tcp_disconnect(struct ksmbd_transport *t) { free_transport(TCP_TRANS(t)); + if (server_conf.max_connections) + atomic_dec(&active_num_conn); } static void tcp_destroy_socket(struct socket *ksmbd_socket) diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index fb699a62e68e..aa1300b7bfc2 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/uaccess.h> #include <linux/backing-dev.h> #include <linux/writeback.h> diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index 8ffc89e62002..1d8126443a7f 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -5,6 +5,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/slab.h> #include <linux/vmalloc.h> diff --git a/fs/libfs.c b/fs/libfs.c index 152405c00f89..4eda519c3002 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1582,3 +1582,39 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force) return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); + +/** + * inode_query_iversion - read i_version for later use + * @inode: inode from which i_version should be read + * + * Read the inode i_version counter. This should be used by callers that wish + * to store the returned i_version for later comparison. This will guarantee + * that a later query of the i_version will result in a different value if + * anything has changed. + * + * In this implementation, we fetch the current value, set the QUERIED flag and + * then try to swap it into place with a cmpxchg, if it wasn't already set. If + * that fails, we try again with the newly fetched value from the cmpxchg. + */ +u64 inode_query_iversion(struct inode *inode) +{ + u64 cur, new; + + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is already set, then no need to swap */ + if (cur & I_VERSION_QUERIED) { + /* + * This barrier (and the implicit barrier in the + * cmpxchg below) pairs with the barrier in + * inode_maybe_inc_iversion(). + */ + smp_mb(); + break; + } + + new = cur | I_VERSION_QUERIED; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return cur >> I_VERSION_QUERIED_SHIFT; +} +EXPORT_SYMBOL(inode_query_iversion); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index a5bb3f721a9d..82b19a30e0f0 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -188,7 +188,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) continue; if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; - if (nfs_compare_fh(NFS_FH(locks_inode(fl_blocked->fl_file)), fh) != 0) + if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)), fh) != 0) continue; /* Alright, we found a lock. Set the return status * and wake up the caller diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 99fffc9cb958..16b4de868cd2 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/nfs_fs.h> #include <linux/utsname.h> #include <linux/freezer.h> @@ -130,7 +131,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) char *nodename = req->a_host->h_rpcclnt->cl_nodename; nlmclnt_next_cookie(&argp->cookie); - memcpy(&lock->fh, NFS_FH(locks_inode(fl->fl_file)), sizeof(struct nfs_fh)); + memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh)); lock->caller = nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5bec78c8e431..17432c445fe6 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -3,6 +3,7 @@ #define __LOCKD_NETNS_H__ #include <linux/fs.h> +#include <linux/filelock.h> #include <net/netns/generic.h> struct lockd_net { diff --git a/fs/locks.c b/fs/locks.c index 8f01bee17715..624c6ac92ede 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -52,6 +52,7 @@ #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/filelock.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/security.h> @@ -233,7 +234,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type) { struct file_lock *fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); list_for_each_entry(fl, list, fl_list) if (fl->fl_file == filp) @@ -887,7 +888,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; struct file_lock_context *ctx; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); void *owner; void (*func)(void); @@ -1330,7 +1331,7 @@ retry: int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return posix_lock_inode(locks_inode(filp), fl, conflock); + return posix_lock_inode(file_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1629,7 +1630,7 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); @@ -1667,7 +1668,7 @@ int fcntl_getlease(struct file *filp) static int check_conflicting_open(struct file *filp, const long arg, int flags) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); int self_wcount = 0, self_rcount = 0; if (flags & FL_LAYOUT) @@ -1703,7 +1704,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) { struct file_lock *fl, *my_fl = NULL, *lease; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; @@ -1819,7 +1820,7 @@ static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); @@ -1861,7 +1862,7 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); int error; if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) @@ -2350,7 +2351,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file *f; int error; @@ -2554,7 +2555,7 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { int error; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock lock; struct file_lock_context *ctx; @@ -2591,7 +2592,7 @@ static void locks_remove_flock(struct file *filp, struct file_lock_context *flctx) { struct file_lock fl; - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); if (list_empty(&flctx->flc_flock)) return; @@ -2636,7 +2637,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = locks_inode_context(locks_inode(filp)); + ctx = locks_inode_context(file_inode(filp)); if (!ctx) return; @@ -2720,7 +2721,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, */ if (fl->fl_file != NULL) - inode = locks_inode(fl->fl_file); + inode = file_inode(fl->fl_file); seq_printf(f, "%lld: ", id); @@ -2861,7 +2862,7 @@ static void __show_fd_locks(struct seq_file *f, void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) { - struct inode *inode = locks_inode(filp); + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; int id = 0; diff --git a/fs/namei.c b/fs/namei.c index 1bf6256daffd..5855dc6edbd5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -20,6 +20,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 1ead5bd740c2..14a72224b657 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -209,8 +209,8 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default y + default n help - Choose Y here to enable the use of READ_PLUS over NFS v4.2. READ_PLUS - attempts to improve read performance by compressing out sparse holes - in the file contents. + This is intended for developers only. The READ_PLUS operation has + been shown to have issues under specific conditions and should not + be used in production. diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f8b8dae0df78..f8e420464b77 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2957,12 +2957,14 @@ static u64 nfs_access_login_time(const struct task_struct *task, const struct cred *cred) { const struct task_struct *parent; + const struct cred *pcred; u64 ret; rcu_read_lock(); for (;;) { parent = rcu_dereference(task->real_parent); - if (parent == task || cred_fscmp(parent->cred, cred) != 0) + pcred = rcu_dereference(parent->cred); + if (parent == task || cred_fscmp(pcred, cred) != 0) break; task = parent; } @@ -3023,6 +3025,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre * but do it without locking. */ struct nfs_inode *nfsi = NFS_I(inode); + u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; int err = -ECHILD; struct list_head *lh; @@ -3037,6 +3040,8 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre cache = NULL; if (cache == NULL) goto out; + if ((s64)(login_time - cache->timestamp) > 0) + goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; *mask = cache->mask; diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 01596f2d0a1e..1a9d5aa51dfb 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -145,17 +145,10 @@ out: return parent; } -static u64 nfs_fetch_iversion(struct inode *inode) -{ - nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); - return inode_peek_iversion_raw(inode); -} - const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, - .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| EXPORT_OP_NOATOMIC_ATTR, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d8ec889a4b3f..b0f3c9339e70 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -31,6 +31,7 @@ #include <linux/swap.h> #include <linux/uaccess.h> +#include <linux/filelock.h> #include "delegation.h" #include "internal.h" diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index ad34a33b0737..4974cd18ca46 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -783,6 +783,12 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, return &fl->generic_hdr; } +static bool +filelayout_lseg_is_striped(const struct nfs4_filelayout_segment *flseg) +{ + return flseg->num_fh > 1; +} + /* * filelayout_pg_test(). Called by nfs_can_coalesce_requests() * @@ -803,6 +809,8 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, size = pnfs_generic_pg_test(pgio, prev, req); if (!size) return 0; + else if (!filelayout_lseg_is_striped(FILELAYOUT_LSEG(pgio->pg_lseg))) + return size; /* see if req and prev are in the same stripe */ if (prev) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7000c161c900..222a28320e1c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -825,6 +825,8 @@ static u32 nfs_get_valid_attrmask(struct inode *inode) reply_mask |= STATX_UID | STATX_GID; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; + if (!(cache_validity & NFS_INO_INVALID_CHANGE)) + reply_mask |= STATX_CHANGE_COOKIE; return reply_mask; } @@ -843,7 +845,8 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | - STATX_INO | STATX_SIZE | STATX_BLOCKS; + STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | + STATX_CHANGE_COOKIE; if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { if (readdirplus_enabled) @@ -851,8 +854,8 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, goto out_no_revalidate; } - /* Flush out writes to the server in order to update c/mtime. */ - if ((request_mask & (STATX_CTIME | STATX_MTIME)) && + /* Flush out writes to the server in order to update c/mtime/version. */ + if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) && S_ISREG(inode->i_mode)) filemap_write_and_wait(inode->i_mapping); @@ -872,7 +875,8 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, /* Is the user requesting attributes that might need revalidation? */ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| STATX_MTIME|STATX_UID|STATX_GID| - STATX_SIZE|STATX_BLOCKS))) + STATX_SIZE|STATX_BLOCKS| + STATX_CHANGE_COOKIE))) goto out_no_revalidate; /* Check whether the cached attributes are stale */ @@ -910,6 +914,10 @@ out_no_revalidate: generic_fillattr(&nop_mnt_idmap, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + stat->change_cookie = inode_peek_iversion_raw(inode); + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED) + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; out: diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 5edd1704f735..4c9f8bd866ab 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -23,6 +23,7 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) #include <linux/seqlock.h> +#include <linux/filelock.h> struct idmap; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 16be6dae524f..779bfc37233c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -21,6 +21,7 @@ #include <linux/nfs_page.h> #include <linux/nfs_mount.h> #include <linux/export.h> +#include <linux/filelock.h> #include "internal.h" #include "pnfs.h" diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 80c240e50952..1a80d548253a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -25,6 +25,7 @@ #include <linux/freezer.h> #include <linux/wait.h> #include <linux/iversion.h> +#include <linux/filelock.h> #include <linux/uaccess.h> #include <linux/sched/mm.h> diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 0a9b72685f98..1479583fbb62 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -9,6 +9,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/fs.h> +#include <linux/filelock.h> static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 45b2c9e3f636..c0950edb26b0 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -662,6 +662,39 @@ static struct shrinker nfsd_file_shrinker = { }; /** + * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file + * @nf: nfsd_file to attempt to queue + * @dispose: private list to queue successfully-put objects + * + * Unhash an nfsd_file, try to get a reference to it, and then put that + * reference. If it's the last reference, queue it to the dispose list. + */ +static void +nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) + __must_hold(RCU) +{ + int decrement = 1; + + /* If we raced with someone else unhashing, ignore it */ + if (!nfsd_file_unhash(nf)) + return; + + /* If we can't get a reference, ignore it */ + if (!nfsd_file_get(nf)) + return; + + /* Extra decrement if we remove from the LRU */ + if (nfsd_file_lru_remove(nf)) + ++decrement; + + /* If refcount goes to 0, then put on the dispose list */ + if (refcount_sub_and_test(decrement, &nf->nf_ref)) { + list_add(&nf->nf_lru, dispose); + trace_nfsd_file_closing(nf); + } +} + +/** * nfsd_file_queue_for_close: try to close out any open nfsd_files for an inode * @inode: inode on which to close out nfsd_files * @dispose: list on which to gather nfsd_files to close out @@ -688,30 +721,11 @@ nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) rcu_read_lock(); do { - int decrement = 1; - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, nfsd_file_rhash_params); if (!nf) break; - - /* If we raced with someone else unhashing, ignore it */ - if (!nfsd_file_unhash(nf)) - continue; - - /* If we can't get a reference, ignore it */ - if (!nfsd_file_get(nf)) - continue; - - /* Extra decrement if we remove from the LRU */ - if (nfsd_file_lru_remove(nf)) - ++decrement; - - /* If refcount goes to 0, then put on the dispose list */ - if (refcount_sub_and_test(decrement, &nf->nf_ref)) { - list_add(&nf->nf_lru, dispose); - trace_nfsd_file_closing(nf); - } + nfsd_file_cond_queue(nf, dispose); } while (1); rcu_read_unlock(); } @@ -928,11 +942,8 @@ __nfsd_file_cache_purge(struct net *net) nf = rhashtable_walk_next(&iter); while (!IS_ERR_OR_NULL(nf)) { - if (!net || nf->nf_net == net) { - nfsd_file_unhash(nf); - nfsd_file_lru_remove(nf); - list_add(&nf->nf_lru, &dispose); - } + if (!net || nf->nf_net == net) + nfsd_file_cond_queue(nf, &dispose); nf = rhashtable_walk_next(&iter); } @@ -1071,8 +1082,8 @@ nfsd_file_is_cached(struct inode *inode) static __be32 nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, - unsigned int may_flags, struct nfsd_file **pnf, - bool open, bool want_gc) + unsigned int may_flags, struct file *file, + struct nfsd_file **pnf, bool want_gc) { struct nfsd_file_lookup_key key = { .type = NFSD_FILE_KEY_FULL, @@ -1147,8 +1158,7 @@ wait_for_construction: status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags)); out: if (status == nfs_ok) { - if (open) - this_cpu_inc(nfsd_file_acquisitions); + this_cpu_inc(nfsd_file_acquisitions); *pnf = nf; } else { if (refcount_dec_and_test(&nf->nf_ref)) @@ -1158,20 +1168,23 @@ out: out_status: put_cred(key.cred); - if (open) - trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); + trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); return status; open_file: trace_nfsd_file_alloc(nf); nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode); if (nf->nf_mark) { - if (open) { + if (file) { + get_file(file); + nf->nf_file = file; + status = nfs_ok; + trace_nfsd_file_opened(nf, status); + } else { status = nfsd_open_verified(rqstp, fhp, may_flags, &nf->nf_file); trace_nfsd_file_open(nf, status); - } else - status = nfs_ok; + } } else status = nfserr_jukebox; /* @@ -1207,7 +1220,7 @@ __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { - return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, true); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, NULL, pnf, true); } /** @@ -1228,28 +1241,30 @@ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { - return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, false); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, NULL, pnf, false); } /** - * nfsd_file_create - Get a struct nfsd_file, do not open + * nfsd_file_acquire_opened - Get a struct nfsd_file using existing open file * @rqstp: the RPC transaction being executed * @fhp: the NFS filehandle of the file just created * @may_flags: NFSD_MAY_ settings for the file + * @file: cached, already-open file (may be NULL) * @pnf: OUT: new or found "struct nfsd_file" object * - * The nfsd_file_object returned by this API is reference-counted - * but not garbage-collected. The object is released immediately - * one RCU grace period after the final nfsd_file_put(). + * Acquire a nfsd_file object that is not GC'ed. If one doesn't already exist, + * and @file is non-NULL, use it to instantiate a new nfsd_file instead of + * opening a new one. * * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in * network byte order is returned. */ __be32 -nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, - unsigned int may_flags, struct nfsd_file **pnf) +nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct file *file, + struct nfsd_file **pnf) { - return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false, false); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, file, pnf, false); } /* diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index b7efb2c3ddb1..41516a4263ea 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -60,7 +60,8 @@ __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); -__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, - unsigned int may_flags, struct nfsd_file **nfp); +__be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct file *file, + struct nfsd_file **nfp); int nfsd_file_cache_stats_show(struct seq_file *m, void *v); #endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 8c854ba3285b..ec49b200b797 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -10,6 +10,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/filelock.h> #include <linux/percpu_counter.h> #include <linux/siphash.h> @@ -195,7 +196,7 @@ struct nfsd_net { atomic_t nfsd_courtesy_clients; struct shrinker nfsd_client_shrinker; - struct delayed_work nfsd_shrinker_work; + struct work_struct nfsd_shrinker_work; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index bd880d55f565..f189ba7995f5 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -937,7 +937,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * the client wants us to do more in this compound: */ if (!nfsd4_last_compound_op(rqstp)) - __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, @@ -1318,6 +1318,7 @@ try_again: /* allow 20secs for mount/unmount for now - revisit */ if (signal_pending(current) || (schedule_timeout(20*HZ) == 0)) { + finish_wait(&nn->nfsd_ssc_waitq, &wait); kfree(work); return nfserr_eagain; } @@ -2607,12 +2608,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) cstate->minorversion = args->minorversion; fh_init(current_fh, NFS4_FHSIZE); fh_init(save_fh, NFS4_FHSIZE); - /* * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. */ - __clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); /* * According to RFC3010, this takes precedence over all other errors. @@ -2734,7 +2734,7 @@ encode_op: out: cstate->status = status; /* Reset deferral mechanism for RPC deferrals */ - __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); return rpc_success; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7b2ee535ade8..c1684da6c01f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4411,7 +4411,7 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) if (!count) count = atomic_long_read(&num_delegations); if (count) - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); + queue_work(laundry_wq, &nn->nfsd_shrinker_work); return (unsigned long)count; } @@ -4421,7 +4421,7 @@ nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) return SHRINK_STOP; } -int +void nfsd4_init_leases_net(struct nfsd_net *nn) { struct sysinfo si; @@ -4443,16 +4443,6 @@ nfsd4_init_leases_net(struct nfsd_net *nn) nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); atomic_set(&nn->nfsd_courtesy_clients, 0); - nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; - nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; - nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; - return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"); -} - -void -nfsd4_leases_net_shutdown(struct nfsd_net *nn) -{ - unregister_shrinker(&nn->nfsd_client_shrinker); } static void init_nfs4_replay(struct nfs4_replay *rp) @@ -5262,18 +5252,10 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); - if (!open->op_filp) { - status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); - if (status != nfs_ok) - goto out_put_access; - } else { - status = nfsd_file_create(rqstp, cur_fh, access, &nf); - if (status != nfs_ok) - goto out_put_access; - nf->nf_file = open->op_filp; - open->op_filp = NULL; - trace_nfsd_file_create(rqstp, access, nf); - } + status = nfsd_file_acquire_opened(rqstp, cur_fh, access, + open->op_filp, &nf); + if (status != nfs_ok) + goto out_put_access; spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { @@ -5374,7 +5356,7 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, { struct nfs4_ol_stateid *st; struct file *f = fp->fi_deleg_file->nf_file; - struct inode *ino = locks_inode(f); + struct inode *ino = file_inode(f); int writes; writes = atomic_read(&ino->i_writecount); @@ -6243,8 +6225,7 @@ deleg_reaper(struct nfsd_net *nn) static void nfsd4_state_shrinker_worker(struct work_struct *work) { - struct delayed_work *dwork = to_delayed_work(work); - struct nfsd_net *nn = container_of(dwork, struct nfsd_net, + struct nfsd_net *nn = container_of(work, struct nfsd_net, nfsd_shrinker_work); courtesy_client_reaper(nn); @@ -7828,7 +7809,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) return status; } - inode = locks_inode(nf->nf_file); + inode = file_inode(nf->nf_file); flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { @@ -8074,11 +8055,20 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->blocked_locks_lru); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); - INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); + INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); + nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; + nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; + + if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client")) + goto err_shrinker; return 0; +err_shrinker: + put_net(net); + kfree(nn->sessionid_hashtbl); err_sessionid: kfree(nn->unconf_id_hashtbl); err_unconf_id: @@ -8171,6 +8161,8 @@ nfs4_state_shutdown_net(struct net *net) struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unregister_shrinker(&nn->nfsd_client_shrinker); + cancel_work(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -8190,7 +8182,6 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); - rhltable_destroy(&nfs4_file_rhltable); #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_shutdown_umount(nn); #endif @@ -8200,6 +8191,7 @@ void nfs4_state_shutdown(void) { nfsd4_destroy_callback_queue(); + rhltable_destroy(&nfs4_file_rhltable); } static void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2b4ae858c89b..e12e5a4ad502 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2523,7 +2523,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) - __clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); + clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); return true; } @@ -2965,7 +2965,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out; } - err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + err = vfs_getattr(&path, &stat, + STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, + AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; if (!(stat.result_mask & STATX_BTIME)) @@ -3629,6 +3631,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, case nfserr_noent: xdr_truncate_encode(xdr, start_offset); goto skip_entry; + case nfserr_jukebox: + /* + * The pseudoroot should only display dentries that lead to + * exports. If we get EJUKEBOX here, then we can't tell whether + * this entry should be included. Just fail the whole READDIR + * with NFS4ERR_DELAY in that case, and hope that the situation + * will resolve itself by the client's next attempt. + */ + if (cd->rd_fhp->fh_export->ex_flags & NFSEXP_V4ROOT) + goto fail; + fallthrough; default: /* * If the client requested the RDATTR_ERROR attribute, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index d1e581a60480..c2577ee7ffb2 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1457,9 +1457,7 @@ static __net_init int nfsd_init_net(struct net *net) goto out_idmap_error; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; - retval = nfsd4_init_leases_net(nn); - if (retval) - goto out_drc_error; + nfsd4_init_leases_net(nn); retval = nfsd_reply_cache_init(nn); if (retval) goto out_cache_error; @@ -1469,8 +1467,6 @@ static __net_init int nfsd_init_net(struct net *net) return 0; out_cache_error: - nfsd4_leases_net_shutdown(nn); -out_drc_error: nfsd_idmap_shutdown(net); out_idmap_error: nfsd_export_shutdown(net); @@ -1486,7 +1482,6 @@ static __net_exit void nfsd_exit_net(struct net *net) nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); - nfsd4_leases_net_shutdown(nn); } static struct pernet_operations nfsd_net_ops = { diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 93b42ef9ed91..fa0144a74267 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -504,8 +504,7 @@ extern void unregister_cld_notifier(void); extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn); #endif -extern int nfsd4_init_leases_net(struct nfsd_net *nn); -extern void nfsd4_leases_net_shutdown(struct nfsd_net *nn); +extern void nfsd4_init_leases_net(struct nfsd_net *nn); #else /* CONFIG_NFSD_V4 */ static inline int nfsd4_is_junction(struct dentry *dentry) @@ -513,8 +512,7 @@ static inline int nfsd4_is_junction(struct dentry *dentry) return 0; } -static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; }; -static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) {}; +static inline void nfsd4_init_leases_net(struct nfsd_net *nn) { }; #define register_cld_notifier() 0 #define unregister_cld_notifier() do { } while(0) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 73c1bbdc99c3..ccd8485fee04 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -628,6 +628,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) stat.mtime = inode->i_mtime; stat.ctime = inode->i_ctime; stat.size = inode->i_size; + if (v4 && IS_I_VERSION(inode)) { + stat.change_cookie = inode_query_iversion(inode); + stat.result_mask |= STATX_CHANGE_COOKIE; + } } if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -659,6 +663,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp) if (err) { fhp->fh_post_saved = false; fhp->fh_post_attr.ctime = inode->i_ctime; + if (v4 && IS_I_VERSION(inode)) { + fhp->fh_post_attr.change_cookie = inode_query_iversion(inode); + fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE; + } } else fhp->fh_post_saved = true; if (v4) @@ -748,3 +756,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp) return FSIDSOURCE_UUID; return FSIDSOURCE_DEV; } + +/* + * We could use i_version alone as the change attribute. However, i_version + * can go backwards on a regular file after an unclean shutdown. On its own + * that doesn't necessarily cause a problem, but if i_version goes backwards + * and then is incremented again it could reuse a value that was previously + * used before boot, and a client who queried the two values might incorrectly + * assume nothing changed. + * + * By using both ctime and the i_version counter we guarantee that as long as + * time doesn't go backwards we never reuse an old value. If the filesystem + * advertises STATX_ATTR_CHANGE_MONOTONIC, then this mitigation is not + * needed. + * + * We only need to do this for regular files as well. For directories, we + * assume that the new change attr is always logged to stable storage in some + * fashion before the results can be seen. + */ +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) +{ + u64 chattr; + + if (stat->result_mask & STATX_CHANGE_COOKIE) { + chattr = stat->change_cookie; + if (S_ISREG(inode->i_mode) && + !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) { + chattr += (u64)stat->ctime.tv_sec << 30; + chattr += stat->ctime.tv_nsec; + } + } else { + chattr = time_to_chattr(&stat->ctime); + } + return chattr; +} diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 513e028b0bbe..4e0ecf0ae2cf 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -293,34 +293,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) fhp->fh_pre_saved = false; } -/* - * We could use i_version alone as the change attribute. However, - * i_version can go backwards after a reboot. On its own that doesn't - * necessarily cause a problem, but if i_version goes backwards and then - * is incremented again it could reuse a value that was previously used - * before boot, and a client who queried the two values might - * incorrectly assume nothing changed. - * - * By using both ctime and the i_version counter we guarantee that as - * long as time doesn't go backwards we never reuse an old value. - */ -static inline u64 nfsd4_change_attribute(struct kstat *stat, - struct inode *inode) -{ - if (inode->i_sb->s_export_op->fetch_iversion) - return inode->i_sb->s_export_op->fetch_iversion(inode); - else if (IS_I_VERSION(inode)) { - u64 chattr; - - chattr = stat->ctime.tv_sec; - chattr <<= 30; - chattr += stat->ctime.tv_nsec; - chattr += inode_query_iversion(inode); - return chattr; - } else - return time_to_chattr(&stat->ctime); -} - +u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); extern void fh_fill_pre_attrs(struct svc_fh *fhp); extern void fh_fill_post_attrs(struct svc_fh *fhp); extern void fh_fill_both_attrs(struct svc_fh *fhp); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 35cab9a65c17..a82d91afdc9c 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -211,7 +211,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - __set_bit(RQ_DROPME, &rqstp->rq_flags); + set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } @@ -246,7 +246,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) - __set_bit(RQ_DROPME, &rqstp->rq_flags); + set_bit(RQ_DROPME, &rqstp->rq_flags); return rpc_success; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 56fba1cba3af..325d3d3f1211 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -453,8 +453,8 @@ static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - nfsd_file_cache_shutdown_net(net); nfs4_state_shutdown_net(net); + nfsd_file_cache_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index c852ae8eaf37..8f9c82d9e075 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -981,43 +981,6 @@ TRACE_EVENT(nfsd_file_acquire, ) ); -TRACE_EVENT(nfsd_file_create, - TP_PROTO( - const struct svc_rqst *rqstp, - unsigned int may_flags, - const struct nfsd_file *nf - ), - - TP_ARGS(rqstp, may_flags, nf), - - TP_STRUCT__entry( - __field(const void *, nf_inode) - __field(const void *, nf_file) - __field(unsigned long, may_flags) - __field(unsigned long, nf_flags) - __field(unsigned long, nf_may) - __field(unsigned int, nf_ref) - __field(u32, xid) - ), - - TP_fast_assign( - __entry->nf_inode = nf->nf_inode; - __entry->nf_file = nf->nf_file; - __entry->may_flags = may_flags; - __entry->nf_flags = nf->nf_flags; - __entry->nf_may = nf->nf_may; - __entry->nf_ref = refcount_read(&nf->nf_ref); - __entry->xid = be32_to_cpu(rqstp->rq_xid); - ), - - TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p", - __entry->xid, __entry->nf_inode, - show_nfsd_may_flags(__entry->may_flags), - __entry->nf_ref, show_nf_flags(__entry->nf_flags), - show_nfsd_may_flags(__entry->nf_may), __entry->nf_file - ) -); - TRACE_EVENT(nfsd_file_insert_err, TP_PROTO( const struct svc_rqst *rqstp, @@ -1079,8 +1042,8 @@ TRACE_EVENT(nfsd_file_cons_err, ) ); -TRACE_EVENT(nfsd_file_open, - TP_PROTO(struct nfsd_file *nf, __be32 status), +DECLARE_EVENT_CLASS(nfsd_file_open_class, + TP_PROTO(const struct nfsd_file *nf, __be32 status), TP_ARGS(nf, status), TP_STRUCT__entry( __field(void *, nf_inode) /* cannot be dereferenced */ @@ -1104,6 +1067,17 @@ TRACE_EVENT(nfsd_file_open, __entry->nf_file) ) +#define DEFINE_NFSD_FILE_OPEN_EVENT(name) \ +DEFINE_EVENT(nfsd_file_open_class, name, \ + TP_PROTO( \ + const struct nfsd_file *nf, \ + __be32 status \ + ), \ + TP_ARGS(nf, status)) + +DEFINE_NFSD_FILE_OPEN_EVENT(nfsd_file_open); +DEFINE_NFSD_FILE_OPEN_EVENT(nfsd_file_opened); + TRACE_EVENT(nfsd_file_is_cached, TP_PROTO( const struct inode *inode, diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index dbdfef7ae85b..43fb57a301d3 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -170,9 +170,14 @@ static inline void fh_drop_write(struct svc_fh *fh) static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) { + u32 request_mask = STATX_BASIC_STATS; struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; - return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS, + + if (fh->fh_maxsize == NFS4_FHSIZE) + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); + + return nfserrno(vfs_getattr(&p, stat, request_mask, AT_STATX_SYNC_AS_STAT)); } diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b9d15c3df3cc..40ce92a332fe 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -480,9 +480,18 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, &bh, &submit_ptr); if (ret) { - if (ret != -EEXIST) - return ret; - goto out_check; + if (likely(ret == -EEXIST)) + goto out_check; + if (ret == -ENOENT) { + /* + * Block address translation failed due to invalid + * value of 'ptr'. In this case, return internal code + * -EINVAL (broken bmap) to notify bmap layer of fatal + * metadata corruption. + */ + ret = -EINVAL; + } + return ret; } if (ra) { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 91994b9955b5..5ccc638ae92f 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1114,7 +1114,14 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) minseg = range[0] + segbytes - 1; do_div(minseg, segbytes); + + if (range[1] < 4096) + goto out; + maxseg = NILFS_SB2_OFFSET_BYTES(range[1]); + if (maxseg < segbytes) + goto out; + do_div(maxseg, segbytes); maxseg--; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6edb6e0dd61f..1422b8ba24ed 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -409,6 +409,15 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) goto out; /* + * Prevent underflow in second superblock position calculation. + * The exact minimum size check is done in nilfs_sufile_resize(). + */ + if (newsize < 4096) { + ret = -ENOSPC; + goto out; + } + + /* * Write lock is required to protect some functions depending * on the number of segments, the number of reserved segments, * and so forth. diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2064e6473d30..3a4c9c150cbf 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -544,9 +544,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, { struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; - u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev)); + u64 sb2off, devsize = bdev_nr_bytes(nilfs->ns_bdev); int valid[2], swp = 0; + if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) { + nilfs_err(sb, "device size too small"); + return -EINVAL; + } + sb2off = NILFS_SB2_OFFSET_BYTES(devsize); + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, &sbh[0]); sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 181d5677ccd1..e9bdc1ff08c9 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -390,10 +390,10 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size) new_valid = ntfs_up_block(sb, min_t(u64, ni->i_valid, new_size)); - ni_lock(ni); - truncate_setsize(inode, new_size); + ni_lock(ni); + down_write(&ni->file.run_lock); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size, &new_valid, ni->mi.sbi->options->prealloc, NULL); diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 73a3854b2afb..f37174e79fad 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/fcntl.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 64e6ddcfe329..05d4414d0c33 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/slab.h> diff --git a/fs/open.c b/fs/open.c index f9d48da3c630..68051c1c0579 100644 --- a/fs/open.c +++ b/fs/open.c @@ -33,6 +33,7 @@ #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> +#include <linux/filelock.h> #include "internal.h" @@ -871,7 +872,7 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; - error = break_lease(locks_inode(f), f->f_flags); + error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 167fa43b24f9..4ecb91a9bbeb 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -14,6 +14,7 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/pagemap.h> static int flush_racache(struct inode *inode) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 6e4e65ee050d..c14e90764e35 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -792,7 +792,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (!c->metacopy && c->stat.size) { err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size); if (err) - return err; + goto out_fput; } err = ovl_copy_up_metadata(c, temp); @@ -1011,6 +1011,10 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, if (err) return err; + if (!kuid_has_mapping(current_user_ns(), ctx.stat.uid) || + !kgid_has_mapping(current_user_ns(), ctx.stat.gid)) + return -EOVERFLOW; + ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags); if (parent) { diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8cc52110f1f9..fda167069617 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -28,6 +28,7 @@ #include <linux/security.h> #include <linux/evm.h> #include <linux/fsnotify.h> +#include <linux/filelock.h> #include "internal.h" diff --git a/fs/proc/fd.c b/fs/proc/fd.c index f516c1a68094..b3140deebbbf 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -12,6 +12,7 @@ #include <linux/file.h> #include <linux/seq_file.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/proc_fs.h> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e35a0398db63..af1c49ae11b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -745,9 +745,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, page = pfn_swap_entry_to_page(swpent); } if (page) { - int mapcount = page_mapcount(page); - - if (mapcount >= 2) + if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b3fdc8212c5f..95f8e8901768 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -183,7 +183,7 @@ static inline int squashfs_block_size(__le32 raw) #define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ sizeof(u64)) /* xattr id lookup table defines */ -#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id)) +#define SQUASHFS_XATTR_BYTES(A) (((u64) (A)) * sizeof(struct squashfs_xattr_id)) #define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ SQUASHFS_METADATA_SIZE) diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 659082e9e51d..72f6f4b37863 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -63,7 +63,7 @@ struct squashfs_sb_info { long long bytes_used; unsigned int inodes; unsigned int fragments; - int xattr_ids; + unsigned int xattr_ids; unsigned int ids; bool panic_on_errors; const struct squashfs_decompressor_thread_ops *thread_ops; diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index d8a270d3ac4c..f1a463d8bfa0 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -10,12 +10,12 @@ #ifdef CONFIG_SQUASHFS_XATTR extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, - u64 *, int *); + u64 *, unsigned int *); extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, unsigned int *, unsigned long long *); #else static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, - u64 start, u64 *xattr_table_start, int *xattr_ids) + u64 start, u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_xattr_id_table *id_table; diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index 087cab8c78f4..c8469c656e0d 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -56,7 +56,7 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, * Read uncompressed xattr id lookup table indexes from disk into memory */ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, - u64 *xattr_table_start, int *xattr_ids) + u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_sb_info *msblk = sb->s_fs_info; unsigned int len, indexes; diff --git a/fs/stat.c b/fs/stat.c index f540047e1177..7c238da22ef0 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -18,6 +18,7 @@ #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/compat.h> +#include <linux/iversion.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -122,6 +123,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->result_mask |= STATX_CHANGE_COOKIE; + stat->change_cookie = inode_query_iversion(inode); + } + idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) return inode->i_op->getattr(idmap, path, stat, @@ -602,9 +608,11 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) memset(&tmp, 0, sizeof(tmp)); - tmp.stx_mask = stat->result_mask; + /* STATX_CHANGE_COOKIE is kernel-only for now */ + tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE; tmp.stx_blksize = stat->blksize; - tmp.stx_attributes = stat->attributes; + /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */ + tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC; tmp.stx_nlink = stat->nlink; tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid); tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid); @@ -643,6 +651,11 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; + /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests + * from userland. + */ + mask &= ~STATX_CHANGE_COOKIE; + error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1d7c2a812fc1..34e416327dd4 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -595,7 +595,7 @@ static void udf_do_extend_final_block(struct inode *inode, */ if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) return; - added_bytes = (last_ext->extLength & UDF_EXTENT_LENGTH_MASK) - new_elen; + added_bytes = new_elen - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; @@ -684,7 +684,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; - udf_pblk_t newblocknum, newblock; + udf_pblk_t newblocknum, newblock = 0; sector_t offset = 0; int8_t etype; struct udf_inode_info *iinfo = UDF_I(inode); @@ -787,7 +787,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len); if (ret < 0) { *err = ret; - newblock = 0; goto out_free; } c = 0; @@ -852,7 +851,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, goal, err); if (!newblocknum) { *err = -ENOSPC; - newblock = 0; goto out_free; } if (isBeyondEOF) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 98ac37e34e3d..cc694846617a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, + vm_flags_t flags) +{ + const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; + + vma->vm_flags = flags; + /* + * For shared mappings, we want to enable writenotify while + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply + * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. + */ + if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) + vma_set_page_prot(vma); +} + static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -618,7 +633,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~__VM_UFFD_FLAGS; + userfaultfd_set_vm_flags(vma, + vma->vm_flags & ~__VM_UFFD_FLAGS); } } mmap_write_unlock(mm); @@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~__VM_UFFD_FLAGS; + userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); return 0; } @@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~__VM_UFFD_FLAGS; + userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); } } @@ -895,7 +911,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; } - vma->vm_flags = new_flags; + userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } mmap_write_unlock(mm); @@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) @@ -1651,7 +1667,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; skip: diff --git a/fs/utimes.c b/fs/utimes.c index 5f71e2209e6f..3701b3946f88 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -7,6 +7,7 @@ #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> +#include <linux/filelock.h> static bool nsec_valid(long nsec) { diff --git a/fs/xattr.c b/fs/xattr.c index 80a6460b620c..14a7eb3c8fa8 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -9,6 +9,7 @@ Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 4c16c8c31fcb..35f574421670 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4666,7 +4666,12 @@ xfs_btree_space_to_height( const unsigned int *limits, unsigned long long leaf_blocks) { - unsigned long long node_blocks = limits[1]; + /* + * The root btree block can have fewer than minrecs pointers in it + * because the tree might not be big enough to require that amount of + * fanout. Hence it has a minimum size of 2 pointers, not limits[1]. + */ + unsigned long long node_blocks = 2; unsigned long long blocks_left = leaf_blocks - 1; unsigned int height = 1; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index ad22a003f959..f3d328e4a440 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -236,6 +236,7 @@ xfs_extent_busy_update_extent( * */ busyp->bno = fend; + busyp->length = bend - fend; } else if (bbno < fbno) { /* * Case 8: diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f35e2cee5265..ddeaccc04aec 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1853,12 +1853,20 @@ xfs_inodegc_worker( struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; + unsigned int nofs_flag; WRITE_ONCE(gc->items, 0); if (!node) return; + /* + * We can allocate memory here while doing writeback on behalf of + * memory reclaim. To avoid memory allocation deadlocks set the + * task-wide nofs context for the following operations. + */ + nofs_flag = memalloc_nofs_save(); + ip = llist_entry(node, struct xfs_inode, i_gclist); trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); @@ -1867,6 +1875,8 @@ xfs_inodegc_worker( xfs_iflags_set(ip, XFS_INACTIVATING); xfs_inodegc_inactivate(ip); } + + memalloc_nofs_restore(nofs_flag); } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index fbb6f5483687..55bb01173cde 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -754,7 +754,7 @@ xfs_bulkstat_fmt( static int xfs_bulk_ireq_setup( struct xfs_mount *mp, - struct xfs_bulk_ireq *hdr, + const struct xfs_bulk_ireq *hdr, struct xfs_ibulk *breq, void __user *ubuffer) { @@ -780,7 +780,7 @@ xfs_bulk_ireq_setup( switch (hdr->ino) { case XFS_BULK_IREQ_SPECIAL_ROOT: - hdr->ino = mp->m_sb.sb_rootino; + breq->startino = mp->m_sb.sb_rootino; break; default: return -EINVAL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 669c1bc5c3a7..fc1946f80a4a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -83,7 +83,7 @@ xfs_iomap_valid( return true; } -const struct iomap_page_ops xfs_iomap_page_ops = { +static const struct iomap_page_ops xfs_iomap_page_ops = { .iomap_valid = xfs_iomap_valid, }; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index f9878021e7d0..e88f18f85e4b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -34,6 +34,7 @@ typedef __u32 xfs_nlink_t; #include <linux/module.h> #include <linux/mutex.h> #include <linux/file.h> +#include <linux/filelock.h> #include <linux/swap.h> #include <linux/errno.h> #include <linux/sched/signal.h> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a7303a9aa405..7dc0db7f5a76 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -68,7 +68,7 @@ restart: while (1) { struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH]; - int error = 0; + int error; int i; mutex_lock(&qi->qi_tree_lock); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index fe46bce8cae6..5535778a98f9 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -416,8 +416,6 @@ xfs_reflink_fill_cow_hole( goto convert; } - ASSERT(cmap->br_startoff > imap->br_startoff); - /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index cb29234fab7e..72ef97320b99 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -442,6 +442,10 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, data_size = zonefs_check_zone_condition(inode, zone, false, false); } + } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && + data_size > isize) { + /* Do not expose garbage data */ + data_size = isize; } /* @@ -805,6 +809,24 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) ret = submit_bio_wait(bio); + /* + * If the file zone was written underneath the file system, the zone + * write pointer may not be where we expect it to be, but the zone + * append write can still succeed. So check manually that we wrote where + * we intended to, that is, at zi->i_wpoffset. + */ + if (!ret) { + sector_t wpsector = + zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); + + if (bio->bi_iter.bi_sector != wpsector) { + zonefs_warn(inode->i_sb, + "Corrupted write pointer %llu for zone at %llu\n", + wpsector, zi->i_zsector); + ret = -EIO; + } + } + zonefs_file_write_dio_end_io(iocb, size, ret, 0); trace_zonefs_file_dio_append(inode, size, ret); |