diff options
Diffstat (limited to 'fs')
109 files changed, 1656 insertions, 777 deletions
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e4e7c804625e..e9be8b5571a4 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -35,6 +35,8 @@ static const char * const bch2_btree_update_modes[] = { NULL }; +static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *); + static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); @@ -1782,11 +1784,24 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t int ret; lockdep_assert_held(&c->gc_lock); - BUG_ON(!btree_node_intent_locked(path, b->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); + if (!btree_node_intent_locked(path, b->c.level)) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "%s(): node not locked at level %u\n", + __func__, b->c.level); + bch2_btree_update_to_text(&buf, as); + bch2_btree_path_to_text(&buf, trans, path_idx); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + bch2_fs_emergency_read_only(c); + return -EIO; + } + ret = bch2_btree_node_lock_write(trans, path, &b->c); if (ret) return ret; diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 038da6a61f6b..6cbf4819e923 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -11,6 +11,14 @@ #define FSCK_ERR_RATELIMIT_NR 10 +void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) +{ +#ifdef BCACHEFS_LOG_PREFIX + prt_printf(out, bch2_log_msg(c, "")); +#endif + printbuf_indent_add(out, 2); +} + bool bch2_inconsistent_error(struct bch_fs *c) { set_bit(BCH_FS_error, &c->flags); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 7acf2a27ca28..5730eb6b2f38 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -18,6 +18,8 @@ struct work_struct; /* Error messages: */ +void bch2_log_msg_start(struct bch_fs *, struct printbuf *); + /* * Inconsistency errors: The on disk data is inconsistent. If these occur during * initial recovery, they don't indicate a bug in the running code - we walk all diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index c7916011ef34..67426e33d04e 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -13,7 +13,13 @@ struct bch_xattr { __u8 x_type; __u8 x_name_len; __le16 x_val_len; - __u8 x_name[] __counted_by(x_name_len); + /* + * x_name contains the name and value counted by + * x_name_len + x_val_len. The introduction of + * __counted_by(x_name_len) caused a false positive + * detection of an out of bounds write. + */ + __u8 x_name[]; } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8054f44d39cf..855ef7956106 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -831,6 +831,7 @@ static int load_elf_binary(struct linux_binprm *bprm) struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_brk; + bool brk_moved = false; int retval, i; unsigned long elf_entry; unsigned long e_entry; @@ -1098,15 +1099,19 @@ out_free_interp: /* Calculate any requested alignment. */ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - /* - * There are effectively two types of ET_DYN - * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP) - * and loaders (ET_DYN without PT_INTERP, since they - * _are_ the ELF interpreter). The loaders must - * be loaded away from programs since the program - * may otherwise collide with the loader (especially - * for ET_EXEC which does not have a randomized - * position). For example to handle invocations of + /** + * DOC: PIE handling + * + * There are effectively two types of ET_DYN ELF + * binaries: programs (i.e. PIE: ET_DYN with + * PT_INTERP) and loaders (i.e. static PIE: ET_DYN + * without PT_INTERP, usually the ELF interpreter + * itself). Loaders must be loaded away from programs + * since the program may otherwise collide with the + * loader (especially for ET_EXEC which does not have + * a randomized position). + * + * For example, to handle invocations of * "./ld.so someprog" to test out a new version of * the loader, the subsequent program that the * loader loads must avoid the loader itself, so @@ -1119,6 +1124,9 @@ out_free_interp: * ELF_ET_DYN_BASE and loaders are loaded into the * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). + * + * See below for "brk" handling details, which is + * also affected by program vs loader and ASLR. */ if (interpreter) { /* On ET_DYN with PT_INTERP, we do the ASLR. */ @@ -1235,8 +1243,6 @@ out_free_interp: start_data += load_bias; end_data += load_bias; - current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk); - if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, interpreter, @@ -1292,27 +1298,44 @@ out_free_interp: mm->end_data = end_data; mm->start_stack = bprm->p; - if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) { + /** + * DOC: "brk" handling + * + * For architectures with ELF randomization, when executing a + * loader directly (i.e. static PIE: ET_DYN without PT_INTERP), + * move the brk area out of the mmap region and into the unused + * ELF_ET_DYN_BASE region. Since "brk" grows up it may collide + * early with the stack growing down or other regions being put + * into the mmap region by the kernel (e.g. vdso). + * + * In the CONFIG_COMPAT_BRK case, though, everything is turned + * off because we're not allowed to move the brk at all. + */ + if (!IS_ENABLED(CONFIG_COMPAT_BRK) && + IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + elf_ex->e_type == ET_DYN && !interpreter) { + elf_brk = ELF_ET_DYN_BASE; + /* This counts as moving the brk, so let brk(2) know. */ + brk_moved = true; + } + mm->start_brk = mm->brk = ELF_PAGEALIGN(elf_brk); + + if ((current->flags & PF_RANDOMIZE) && snapshot_randomize_va_space > 1) { /* - * For architectures with ELF randomization, when executing - * a loader directly (i.e. no interpreter listed in ELF - * headers), move the brk area out of the mmap region - * (since it grows up, and may collide early with the stack - * growing down), and into the unused ELF_ET_DYN_BASE region. + * If we didn't move the brk to ELF_ET_DYN_BASE (above), + * leave a gap between .bss and brk. */ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && - elf_ex->e_type == ET_DYN && !interpreter) { - mm->brk = mm->start_brk = ELF_ET_DYN_BASE; - } else { - /* Otherwise leave a gap between .bss and brk. */ + if (!brk_moved) mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; - } mm->brk = mm->start_brk = arch_randomize_brk(mm); + brk_moved = true; + } + #ifdef compat_brk_randomized + if (brk_moved) current->brk_randomized = 1; #endif - } if (current->personality & MMAP_PAGE_ZERO) { /* Why this, you ask??? Well SVr4 maps page 0 as read-only, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b96b23594334..b14faa1d57b8 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1888,6 +1888,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + + /* + * Cache the zone_unusable value before turning the block group + * to read only. As soon as the block group is read only it's + * zone_unusable value gets moved to the block group's read-only + * bytes and isn't available for calculations anymore. We also + * cache it before unlocking the block group, to prevent races + * (reports from KCSAN and such tools) with tasks updating it. + */ + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); spin_unlock(&space_info->lock); @@ -1904,13 +1915,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the blog group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. - */ - zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b2fa33911c28..029fba82b81d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -516,6 +516,14 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) lockdep_assert_held(&inode->vfs_inode.i_rwsem); } +static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATASUM) + mapping_clear_stable_writes(inode->vfs_inode.i_mapping); + else + mapping_set_stable_writes(inode->vfs_inode.i_mapping); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0c4d486c3048..18d2210dc724 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -606,7 +606,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) free_extent_map(em); cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS); + cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); if (!cb->compressed_folios) { ret = BLK_STS_RESOURCE; goto out_free_bio; diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e815d165cccc..de23c4b3515e 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { lockdep_assert_held(&discard_ctl->lock); - if (!btrfs_run_discard_work(discard_ctl)) - return; if (list_empty(&block_group->discard_list) || block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { @@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, if (!btrfs_is_block_group_data_only(block_group)) return; + if (!btrfs_run_discard_work(discard_ctl)) + return; + spin_lock(&discard_ctl->lock); __add_to_discard_list(discard_ctl, block_group); spin_unlock(&discard_ctl->lock); @@ -167,13 +168,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); - /* - * If the block group is currently running in the discard workfn, we - * don't want to deref it, since it's still being used by the workfn. - * The workfn will notice this case and deref the block group when it is - * finished. - */ - if (queued && !running) + if (queued) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -250,6 +245,18 @@ again: block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); + /* + * The block group must have been moved to other + * discard list even if discard was disabled in + * the meantime or a transaction abort happened, + * otherwise we can end up in an infinite loop, + * always jumping into the 'again' label and + * keep getting this block group over and over + * in case there are no other block groups in + * the discard lists. + */ + ASSERT(block_group->discard_index != + BTRFS_DISCARD_INDEX_UNUSED); } else { list_del_init(&block_group->discard_list); btrfs_put_block_group(block_group); @@ -260,9 +267,10 @@ again: block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } - discard_ctl->block_group = block_group; } if (block_group) { + btrfs_get_block_group(block_group); + discard_ctl->block_group = block_group; *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } @@ -493,9 +501,20 @@ static void btrfs_discard_workfn(struct work_struct *work) block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); - if (!block_group || !btrfs_run_discard_work(discard_ctl)) + if (!block_group) + return; + if (!btrfs_run_discard_work(discard_ctl)) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); return; + } if (now < block_group->discard_eligible_time) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); btrfs_discard_schedule_work(discard_ctl, false); return; } @@ -547,15 +566,7 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; - /* - * If the block group was removed from the discard list while it was - * running in this workfn, then we didn't deref it, since this function - * still owned that reference. But we set the discard_ctl->block_group - * back to NULL, so we can use that condition to know that now we need - * to deref the block_group. - */ - if (discard_ctl->block_group == NULL) - btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ca821e5966bd..cc8d73693701 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4329,6 +4329,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + + /* * Wait for any fixup workers to complete. * If we don't wait for them here and they are still running by the time * we call kthread_stop() against the cleaner kthread further below, we @@ -4349,6 +4357,19 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->delalloc_workers); /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + + /* * When finishing a compressed write bio we schedule a work queue item * to finish an ordered extent - btrfs_finish_compressed_write_work() * calls btrfs_finish_ordered_extent() which in turns does a call to @@ -4418,9 +4439,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4543,10 +4561,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2fae67f8fa3..06922529f19d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1871,7 +1871,7 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&folio->mapping->i_private_lock); - bit_start++; + bit_start += sectors_per_node; continue; } @@ -2842,10 +2842,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -2881,8 +2881,11 @@ again: free_eb: btrfs_release_extent_buffer(eb); return exists; -} +#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, struct folio *folio) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6c5328bfabc2..2aefc64cdd29 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -297,6 +297,8 @@ static inline int num_extent_pages(const struct extent_buffer *eb) */ static inline int num_extent_folios(const struct extent_buffer *eb) { + if (!eb->folios[0]) + return 0; if (folio_order(eb->folios[0])) return 1; return num_extent_pages(eb); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0b568c8d24cb..cd4e40a71918 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -874,7 +874,6 @@ again: ret = PTR_ERR(folio); return ret; } - folio_wait_writeback(folio); /* Only support page sized folio yet. */ ASSERT(folio_order(folio) == 0); ret = set_folio_extent_mapped(folio); @@ -2104,15 +2103,20 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). + * + * And do not decrease page_lockend right now, as it can be 0. */ const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); - const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE); while (1) { truncate_pagecache_range(inode, lockstart, lockend); lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); + /* The same page or adjacent pages. */ + if (page_lockend <= page_lockstart) + break; /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2124,7 +2128,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * we do, unlock the range and retry. */ if (!filemap_range_has_page(inode->i_mapping, page_lockstart, - page_lockend)) + page_lockend - 1)) break; unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index b572d6b9730b..f46fba127caa 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -285,6 +285,7 @@ enum { #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) struct btrfs_dev_replace { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 38756f8cef46..a06fca7934d5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1116,6 +1116,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; + bool free_pages = false; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1136,7 +1137,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { + ASSERT(!async_extent->folios); + ASSERT(async_extent->nr_folios == 0); submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1152,6 +1156,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * fall back to uncompressed. */ submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1193,6 +1198,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); + if (free_pages) + free_async_extent_pages(async_extent); kfree(async_extent); return; @@ -2083,12 +2090,13 @@ next_slot: /* * If the found extent starts after requested offset, then - * adjust extent_end to be right before this extent begins + * adjust cur_offset to be right before this extent begins. */ if (found_key.offset > cur_offset) { - extent_end = found_key.offset; - extent_type = 0; - goto must_cow; + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = found_key.offset; + goto next_slot; } /* @@ -3845,12 +3853,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) +static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode *vfs_inode = &inode->vfs_inode; struct btrfs_key location; unsigned long ptr; int maybe_acls; @@ -3859,17 +3868,17 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + ret = btrfs_init_file_extent_tree(inode); if (ret) goto out; - ret = btrfs_fill_inode(inode, &rdev); + ret = btrfs_fill_inode(vfs_inode, &rdev); if (!ret) filled = true; ASSERT(path); - btrfs_get_inode_key(BTRFS_I(inode), &location); + btrfs_get_inode_key(inode, &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { @@ -3889,41 +3898,41 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); - i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - - inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + + inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); + inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); - inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); - BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item)); + inode->generation = btrfs_inode_generation(leaf, inode_item); + inode->last_trans = btrfs_inode_transid(leaf, inode_item); - inode_set_iversion_queried(inode, - btrfs_inode_sequence(leaf, inode_item)); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; + inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item)); + vfs_inode->i_generation = inode->generation; + vfs_inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); + btrfs_update_inode_mapping_flags(inode); cache_index: /* @@ -3935,9 +3944,8 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray. */ - if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + if (inode->last_trans == btrfs_get_fs_generation(fs_info)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation @@ -3966,7 +3974,7 @@ cache_index: * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ - BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + inode->last_unlink_trans = inode->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation @@ -3974,15 +3982,15 @@ cache_index: * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. */ - BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + inode->last_reflink_trans = inode->last_trans; path->slots[0]++; - if (inode->i_nlink != 1 || + if (vfs_inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); - if (location.objectid != btrfs_ino(BTRFS_I(inode))) + if (location.objectid != btrfs_ino(inode)) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3990,13 +3998,12 @@ cache_index: struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + inode->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, - extref); + inode->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* @@ -4004,50 +4011,49 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); + btrfs_ino(inode), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; - ret = btrfs_load_inode_props(inode, path); + ret = btrfs_load_inode_props(vfs_inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), - btrfs_root_id(root), ret); + btrfs_ino(inode), btrfs_root_id(root), ret); } if (!maybe_acls) - cache_no_acl(inode); + cache_no_acl(vfs_inode); - switch (inode->i_mode & S_IFMT) { + switch (vfs_inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; + vfs_inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_fop = &btrfs_file_operations; + vfs_inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - inode->i_op = &btrfs_dir_inode_operations; + vfs_inode->i_fop = &btrfs_dir_file_operations; + vfs_inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(vfs_inode); + vfs_inode->i_mapping->a_ops = &btrfs_aops; break; default: - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); + vfs_inode->i_op = &btrfs_special_inode_operations; + init_special_inode(vfs_inode, vfs_inode->i_mode, rdev); break; } - btrfs_sync_inode_flags_to_i_flags(inode); + btrfs_sync_inode_flags_to_i_flags(vfs_inode); - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + ret = btrfs_add_inode_to_root(inode, true); if (ret) goto out; return 0; out: - iget_failed(inode); + iget_failed(vfs_inode); return ret; } @@ -5602,7 +5608,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5614,7 +5620,9 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); - return inode; + if (!inode) + return NULL; + return BTRFS_I(inode); } /* @@ -5624,22 +5632,22 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path) { - struct inode *inode; + struct btrfs_inode *inode; int ret; inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; + if (!(inode->vfs_inode.i_state & I_NEW)) + return &inode->vfs_inode; ret = btrfs_read_locked_inode(inode, path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); - return inode; + unlock_new_inode(&inode->vfs_inode); + return &inode->vfs_inode; } /* @@ -5647,7 +5655,7 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, */ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_path *path; int ret; @@ -5655,20 +5663,22 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; + if (!(inode->vfs_inode.i_state & I_NEW)) + return &inode->vfs_inode; path = btrfs_alloc_path(); - if (!path) + if (!path) { + iget_failed(&inode->vfs_inode); return ERR_PTR(-ENOMEM); + } ret = btrfs_read_locked_inode(inode, path); btrfs_free_path(path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); - return inode; + unlock_new_inode(&inode->vfs_inode); + return &inode->vfs_inode; } static struct inode *new_simple_dir(struct inode *dir, @@ -6339,6 +6349,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; + btrfs_update_inode_mapping_flags(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e666c141cae0..10a97f0af8d4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -393,6 +393,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, update_flags: binode->flags = binode_flags; + btrfs_update_inode_mapping_flags(binode); btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode_set_ctime_current(inode); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 531312efee8d..5d0060eb8ff4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1541,8 +1541,8 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; - if (unlikely(!extent_root)) { - btrfs_err(fs_info, "no valid extent root for scrub"); + if (unlikely(!extent_root || !csum_root)) { + btrfs_err(fs_info, "no valid extent or csum root for scrub"); return -EUCLEAN; } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f437138fefbc..bb8a0945b0fd 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -487,10 +487,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - if (len > PATH_MAX) { - WARN_ON(1); - return -ENOMEM; - } + if (WARN_ON(len > PATH_MAX)) + return -ENAMETOOLONG; path_len = p->end - p->start; old_buf_len = p->buf_len; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a5f29ff3fbc2..8098f5cc9b7b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -569,6 +569,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_commit_interval: ctx->commit_interval = result.uint_32; + if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) { + btrfs_warn(NULL, "excessive commit interval %u, use with care", + ctx->commit_interval); + } if (ctx->commit_interval == 0) ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; break; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43979891f7c8..2b66a6130269 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -2235,7 +2235,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, check->level, found_level); - return -EIO; + return -EUCLEAN; } if (!check->has_first_key) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3f8afbd1ebb5..b80110fe30fe 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -733,82 +733,6 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } -/* - * We can have very weird soft links passed in. - * One example is "/proc/self/fd/<fd>", which can be a soft link to - * a block device. - * - * But it's never a good idea to use those weird names. - * Here we check if the path (not following symlinks) is a good one inside - * "/dev/". - */ -static bool is_good_dev_path(const char *dev_path) -{ - struct path path = { .mnt = NULL, .dentry = NULL }; - char *path_buf = NULL; - char *resolved_path; - bool is_good = false; - int ret; - - if (!dev_path) - goto out; - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) - goto out; - - /* - * Do not follow soft link, just check if the original path is inside - * "/dev/". - */ - ret = kern_path(dev_path, 0, &path); - if (ret) - goto out; - resolved_path = d_path(&path, path_buf, PATH_MAX); - if (IS_ERR(resolved_path)) - goto out; - if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) - goto out; - is_good = true; -out: - kfree(path_buf); - path_put(&path); - return is_good; -} - -static int get_canonical_dev_path(const char *dev_path, char *canonical) -{ - struct path path = { .mnt = NULL, .dentry = NULL }; - char *path_buf = NULL; - char *resolved_path; - int ret; - - if (!dev_path) { - ret = -EINVAL; - goto out; - } - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) { - ret = -ENOMEM; - goto out; - } - - ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); - if (ret) - goto out; - resolved_path = d_path(&path, path_buf, PATH_MAX); - if (IS_ERR(resolved_path)) { - ret = PTR_ERR(resolved_path); - goto out; - } - ret = strscpy(canonical, resolved_path, PATH_MAX); -out: - kfree(path_buf); - path_put(&path); - return ret; -} - static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; @@ -1513,23 +1437,12 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; - char *canonical_path = NULL; u64 bytenr; dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); - if (!is_good_dev_path(path)) { - canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (canonical_path) { - ret = get_canonical_dev_path(path, canonical_path); - if (ret < 0) { - kfree(canonical_path); - canonical_path = NULL; - } - } - } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1574,8 +1487,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(canonical_path ? : path, disk_super, - &new_device_added); + device = device_list_add(path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1584,7 +1496,6 @@ free_disk_super: error_bdev_put: fput(bdev_file); - kfree(canonical_path); return device; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index aaf925897fdd..f39656668967 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1277,7 +1277,7 @@ struct zone_info { static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, - struct btrfs_chunk_map *map) + struct btrfs_chunk_map *map, bool new) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device; @@ -1307,6 +1307,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, return 0; } + ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical)); + /* This zone will be used for allocation, so mark this zone non-empty. */ btrfs_dev_clear_zone_empty(device, info->physical); @@ -1319,6 +1321,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, * to determine the allocation offset within the zone. */ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); + + if (new) { + sector_t capacity; + + capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT); + up_read(&dev_replace->rwsem); + info->alloc_offset = 0; + info->capacity = capacity << SECTOR_SHIFT; + + return 0; + } + nofs_flag = memalloc_nofs_save(); ret = btrfs_get_dev_zone(device, info->physical, &zone); memalloc_nofs_restore(nofs_flag); @@ -1588,7 +1602,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } for (i = 0; i < map->num_stripes; i++) { - ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map); + ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new); if (ret) goto out; @@ -1659,7 +1673,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * stripe. */ cache->alloc_offset = cache->zone_capacity; - ret = 0; } out: diff --git a/fs/buffer.c b/fs/buffer.c index cc8452f60251..2494fe3a5e69 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) } EXPORT_SYMBOL(end_buffer_write_sync); -/* - * Various filesystems appear to want __find_get_block to be non-blocking. - * But it's the page lock which protects the buffers. To get around this, - * we get exclusion from try_to_free_buffers with the blockdev mapping's - * i_private_lock. - * - * Hack idea: for the blockdev mapping, i_private_lock contention - * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take i_private_lock. - */ static struct buffer_head * -__find_get_block_slow(struct block_device *bdev, sector_t block) +__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic) { struct address_space *bd_mapping = bdev->bd_mapping; const int blkbits = bd_mapping->host->i_blkbits; @@ -204,7 +194,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) if (IS_ERR(folio)) goto out; - spin_lock(&bd_mapping->i_private_lock); + /* + * Folio lock protects the buffers. Callers that cannot block + * will fallback to serializing vs try_to_free_buffers() via + * the i_private_lock. + */ + if (atomic) + spin_lock(&bd_mapping->i_private_lock); + else + folio_lock(folio); + head = folio_buffers(folio); if (!head) goto out_unlock; @@ -236,7 +235,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) 1 << blkbits); } out_unlock: - spin_unlock(&bd_mapping->i_private_lock); + if (atomic) + spin_unlock(&bd_mapping->i_private_lock); + else + folio_unlock(folio); folio_put(folio); out: return ret; @@ -656,7 +658,9 @@ EXPORT_SYMBOL(generic_buffers_fsync); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize) { - struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); + struct buffer_head *bh; + + bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) write_dirty_buffer(bh, 0); @@ -1388,14 +1392,15 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) * it in the LRU and mark it as accessed. If it is not present then return * NULL */ -struct buffer_head * -__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +static struct buffer_head * +find_get_block_common(struct block_device *bdev, sector_t block, + unsigned size, bool atomic) { struct buffer_head *bh = lookup_bh_lru(bdev, block, size); if (bh == NULL) { /* __find_get_block_slow will mark the page accessed */ - bh = __find_get_block_slow(bdev, block); + bh = __find_get_block_slow(bdev, block, atomic); if (bh) bh_lru_install(bh); } else @@ -1403,8 +1408,23 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) return bh; } + +struct buffer_head * +__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +{ + return find_get_block_common(bdev, block, size, true); +} EXPORT_SYMBOL(__find_get_block); +/* same as __find_get_block() but allows sleeping contexts */ +struct buffer_head * +__find_get_block_nonatomic(struct block_device *bdev, sector_t block, + unsigned size) +{ + return find_get_block_common(bdev, block, size, false); +} +EXPORT_SYMBOL(__find_get_block_nonatomic); + /** * bdev_getblk - Get a buffer_head in a block device's buffer cache. * @bdev: The block device. @@ -1422,7 +1442,12 @@ EXPORT_SYMBOL(__find_get_block); struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - struct buffer_head *bh = __find_get_block(bdev, block, size); + struct buffer_head *bh; + + if (gfpflags_allow_blocking(gfp)) + bh = __find_get_block_nonatomic(bdev, block, size); + else + bh = __find_get_block(bdev, block, size); might_alloc(gfp); if (bh) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7dd6c2275085..e3ab07797c85 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2362,7 +2362,7 @@ static int fill_fscrypt_truncate(struct inode *inode, /* Try to writeback the dirty pagecaches */ if (issued & (CEPH_CAP_FILE_BUFFER)) { - loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1; ret = filemap_write_and_wait_range(inode->i_mapping, orig_pos, lend); diff --git a/fs/coredump.c b/fs/coredump.c index 4ebec51fe4f2..d56f5421e62c 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -43,6 +43,8 @@ #include <linux/timekeeping.h> #include <linux/sysctl.h> #include <linux/elf.h> +#include <linux/pidfs.h> +#include <uapi/linux/pidfd.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -60,6 +62,12 @@ static void free_vma_snapshot(struct coredump_params *cprm); #define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024) /* Define a reasonable max cap */ #define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024) +/* + * File descriptor number for the pidfd for the thread-group leader of + * the coredumping task installed into the usermode helper's file + * descriptor table. + */ +#define COREDUMP_PIDFD_NUMBER 3 static int core_uses_pid; static unsigned int core_pipe_limit; @@ -339,6 +347,27 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, case 'C': err = cn_printf(cn, "%d", cprm->cpu); break; + /* pidfd number */ + case 'F': { + /* + * Installing a pidfd only makes sense if + * we actually spawn a usermode helper. + */ + if (!ispipe) + break; + + /* + * Note that we'll install a pidfd for the + * thread-group leader. We know that task + * linkage hasn't been removed yet and even if + * this @current isn't the actual thread-group + * leader we know that the thread-group leader + * cannot be reaped until @current has exited. + */ + cprm->pid = task_tgid(current); + err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER); + break; + } default: break; } @@ -493,7 +522,7 @@ static void wait_for_dump_helpers(struct file *file) } /* - * umh_pipe_setup + * umh_coredump_setup * helper function to customize the process used * to collect the core in userspace. Specifically * it sets up a pipe and installs it as fd 0 (stdin) @@ -503,11 +532,32 @@ static void wait_for_dump_helpers(struct file *file) * is a special value that we use to trap recursive * core dumps */ -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; struct coredump_params *cp = (struct coredump_params *)info->data; - int err = create_pipe_files(files, 0); + int err; + + if (cp->pid) { + struct file *pidfs_file __free(fput) = NULL; + + pidfs_file = pidfs_alloc_file(cp->pid, O_RDWR); + if (IS_ERR(pidfs_file)) + return PTR_ERR(pidfs_file); + + /* + * Usermode helpers are childen of either + * system_unbound_wq or of kthreadd. So we know that + * we're starting off with a clean file descriptor + * table. So we should always be able to use + * COREDUMP_PIDFD_NUMBER as our file descriptor value. + */ + err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0); + if (err < 0) + return err; + } + + err = create_pipe_files(files, 0); if (err) return err; @@ -515,10 +565,13 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) err = replace_fd(0, files[0], 0); fput(files[0]); + if (err < 0) + return err; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return err; + return 0; } void do_coredump(const kernel_siginfo_t *siginfo) @@ -593,7 +646,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) } if (cprm.limit == 1) { - /* See umh_pipe_setup() which sets RLIMIT_CORE = 1. + /* See umh_coredump_setup() which sets RLIMIT_CORE = 1. * * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use @@ -632,7 +685,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); + umh_coredump_setup, NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d28141829c05..70abd4da17a6 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1826,8 +1826,8 @@ static int dlm_tcp_listen_validate(void) { /* We don't support multi-homed hosts */ if (dlm_local_count > 1) { - log_print("TCP protocol can't handle multi-homed hosts, try SCTP"); - return -EINVAL; + log_print("Detect multi-homed hosts but use only the first IP address."); + log_print("Try SCTP, if you want to enable multi-link."); } return 0; diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index abb9c6d3b1aa..f4bf1b5e3f5b 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -150,10 +150,10 @@ io_retry: io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9; attached = 0; } - if (!attached++) - erofs_onlinefolio_split(folio); if (!bio_add_folio(&io->rq->bio, folio, len, cur)) goto io_retry; + if (!attached++) + erofs_onlinefolio_split(folio); io->dev.m_pa += len; } cur += len; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index efd25f3101f1..2b8d9a10f002 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -446,6 +446,7 @@ int __init erofs_init_shrinker(void); void erofs_exit_shrinker(void); int __init z_erofs_init_subsystem(void); void z_erofs_exit_subsystem(void); +int z_erofs_init_super(struct super_block *sb); unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr_shrink); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, @@ -455,7 +456,6 @@ void z_erofs_put_gbuf(void *ptr); int z_erofs_gbuf_growsize(unsigned int nrpages); int __init z_erofs_gbuf_init(void); void z_erofs_gbuf_exit(void); -int erofs_init_managed_cache(struct super_block *sb); int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} @@ -464,7 +464,7 @@ static inline int erofs_init_shrinker(void) { return 0; } static inline void erofs_exit_shrinker(void) {} static inline int z_erofs_init_subsystem(void) { return 0; } static inline void z_erofs_exit_subsystem(void) {} -static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } +static inline int z_erofs_init_super(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 9f2bce5af9c8..b30125a2a501 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -631,9 +631,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) else sb->s_flags &= ~SB_POSIXACL; -#ifdef CONFIG_EROFS_FS_ZIP - xa_init(&sbi->managed_pslots); -#endif + err = z_erofs_init_super(sb); + if (err) + return err; + + if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { + inode = erofs_iget(sb, sbi->packed_nid); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->packed_inode = inode; + } inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) @@ -645,24 +652,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) iput(inode); return -EINVAL; } - sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; erofs_shrinker_register(sb); - if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { - sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); - if (IS_ERR(sbi->packed_inode)) { - err = PTR_ERR(sbi->packed_inode); - sbi->packed_inode = NULL; - return err; - } - } - err = erofs_init_managed_cache(sb); - if (err) - return err; - err = erofs_xattr_prefixes_init(sb); if (err) return err; @@ -798,6 +792,16 @@ static int erofs_init_fs_context(struct fs_context *fc) return 0; } +static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi) +{ + iput(sbi->packed_inode); + sbi->packed_inode = NULL; +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif +} + static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -807,6 +811,7 @@ static void erofs_kill_sb(struct super_block *sb) kill_anon_super(sb); else kill_block_super(sb); + erofs_drop_internal_inodes(sbi); fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); @@ -817,17 +822,10 @@ static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); - DBG_BUGON(!sbi); - erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); erofs_xattr_prefixes_cleanup(sb); -#ifdef CONFIG_EROFS_FS_ZIP - iput(sbi->managed_cache); - sbi->managed_cache = NULL; -#endif - iput(sbi->packed_inode); - sbi->packed_inode = NULL; + erofs_drop_internal_inodes(sbi); erofs_free_dev_context(sbi->devs); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d771e06db738..87192635b2d2 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -76,9 +76,6 @@ struct z_erofs_pcluster { /* L: whether partial decompression or not */ bool partial; - /* L: indicate several pageofs_outs or not */ - bool multibases; - /* L: whether extra buffer allocations are best-effort */ bool besteffort; @@ -644,18 +641,18 @@ static const struct address_space_operations z_erofs_cache_aops = { .invalidate_folio = z_erofs_cache_invalidate_folio, }; -int erofs_init_managed_cache(struct super_block *sb) +int z_erofs_init_super(struct super_block *sb) { struct inode *const inode = new_inode(sb); if (!inode) return -ENOMEM; - set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; + xa_init(&EROFS_SB(sb)->managed_pslots); return 0; } @@ -1050,8 +1047,6 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f, break; erofs_onlinefolio_split(folio); - if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) - f->pcl->multibases = true; if (f->pcl->length < offset + end - map->m_la) { f->pcl->length = offset + end - map->m_la; f->pcl->pageofs_out = map->m_la & ~PAGE_MASK; @@ -1097,7 +1092,6 @@ struct z_erofs_backend { struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; struct super_block *sb; struct z_erofs_pcluster *pcl; - /* pages with the longest decompressed length for deduplication */ struct page **decompressed_pages; /* pages to keep the compressed data */ @@ -1106,6 +1100,8 @@ struct z_erofs_backend { struct list_head decompressed_secondary_bvecs; struct page **pagepool; unsigned int onstack_used, nr_pages; + /* indicate if temporary copies should be preserved for later use */ + bool keepxcpy; }; struct z_erofs_bvec_item { @@ -1116,18 +1112,20 @@ struct z_erofs_bvec_item { static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be, struct z_erofs_bvec *bvec) { + int poff = bvec->offset + be->pcl->pageofs_out; struct z_erofs_bvec_item *item; - unsigned int pgnr; - - if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) && - (bvec->end == PAGE_SIZE || - bvec->offset + bvec->end == be->pcl->length)) { - pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; - DBG_BUGON(pgnr >= be->nr_pages); - if (!be->decompressed_pages[pgnr]) { - be->decompressed_pages[pgnr] = bvec->page; + struct page **page; + + if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE || + bvec->offset + bvec->end == be->pcl->length)) { + DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages); + page = be->decompressed_pages + (poff >> PAGE_SHIFT); + if (!*page) { + *page = bvec->page; return; } + } else { + be->keepxcpy = true; } /* (cold path) one pcluster is requested multiple times */ @@ -1291,7 +1289,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) .alg = pcl->algorithmformat, .inplace_io = overlapped, .partial_decoding = pcl->partial, - .fillgaps = pcl->multibases, + .fillgaps = be->keepxcpy, .gfp = pcl->besteffort ? GFP_KERNEL : GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); @@ -1348,7 +1346,6 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) pcl->length = 0; pcl->partial = true; - pcl->multibases = false; pcl->besteffort = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c01234bbac49..9a6b924958e2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2111,9 +2111,10 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, write_unlock_irq(&ep->lock); - if (!eavail && ep_schedule_timeout(to)) - timed_out = !schedule_hrtimeout_range(to, slack, - HRTIMER_MODE_ABS); + if (!eavail) + timed_out = !ep_schedule_timeout(to) || + !schedule_hrtimeout_range(to, slack, + HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index a23677de4544..b22c02d6000f 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -274,9 +274,11 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, sector_t last_block; sector_t phys = 0; sector_t valid_blks; + loff_t i_size; mutex_lock(&sbi->s_lock); - last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb); + i_size = i_size_read(inode); + last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size, sb); if (iblock >= last_block && !create) goto done; @@ -305,102 +307,95 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, if (buffer_delay(bh_result)) clear_buffer_delay(bh_result); - if (create) { + /* + * In most cases, we just need to set bh_result to mapped, unmapped + * or new status as follows: + * 1. i_size == valid_size + * 2. write case (create == 1) + * 3. direct_read (!bh_result->b_folio) + * -> the unwritten part will be zeroed in exfat_direct_IO() + * + * Otherwise, in the case of buffered read, it is necessary to take + * care the last nested block if valid_size is not equal to i_size. + */ + if (i_size == ei->valid_size || create || !bh_result->b_folio) valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); + else + valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb); - if (iblock + max_blocks < valid_blks) { - /* The range has been written, map it */ - goto done; - } else if (iblock < valid_blks) { - /* - * The range has been partially written, - * map the written part. - */ - max_blocks = valid_blks - iblock; - goto done; - } + /* The range has been fully written, map it */ + if (iblock + max_blocks < valid_blks) + goto done; - /* The area has not been written, map and mark as new. */ - set_buffer_new(bh_result); + /* The range has been partially written, map the written part */ + if (iblock < valid_blks) { + max_blocks = valid_blks - iblock; + goto done; + } + /* The area has not been written, map and mark as new for create case */ + if (create) { + set_buffer_new(bh_result); ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb); mark_inode_dirty(inode); - } else { - valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb); + goto done; + } - if (iblock + max_blocks < valid_blks) { - /* The range has been written, map it */ - goto done; - } else if (iblock < valid_blks) { - /* - * The area has been partially written, - * map the written part. - */ - max_blocks = valid_blks - iblock; + /* + * The area has just one block partially written. + * In that case, we should read and fill the unwritten part of + * a block with zero. + */ + if (bh_result->b_folio && iblock == valid_blks && + (ei->valid_size & (sb->s_blocksize - 1))) { + loff_t size, pos; + void *addr; + + max_blocks = 1; + + /* + * No buffer_head is allocated. + * (1) bmap: It's enough to set blocknr without I/O. + * (2) read: The unwritten part should be filled with zero. + * If a folio does not have any buffers, + * let's returns -EAGAIN to fallback to + * block_read_full_folio() for per-bh IO. + */ + if (!folio_buffers(bh_result->b_folio)) { + err = -EAGAIN; goto done; - } else if (iblock == valid_blks && - (ei->valid_size & (sb->s_blocksize - 1))) { - /* - * The block has been partially written, - * zero the unwritten part and map the block. - */ - loff_t size, pos; - void *addr; - - max_blocks = 1; - - /* - * For direct read, the unwritten part will be zeroed in - * exfat_direct_IO() - */ - if (!bh_result->b_folio) - goto done; - - /* - * No buffer_head is allocated. - * (1) bmap: It's enough to fill bh_result without I/O. - * (2) read: The unwritten part should be filled with 0 - * If a folio does not have any buffers, - * let's returns -EAGAIN to fallback to - * per-bh IO like block_read_full_folio(). - */ - if (!folio_buffers(bh_result->b_folio)) { - err = -EAGAIN; - goto done; - } + } - pos = EXFAT_BLK_TO_B(iblock, sb); - size = ei->valid_size - pos; - addr = folio_address(bh_result->b_folio) + - offset_in_folio(bh_result->b_folio, pos); + pos = EXFAT_BLK_TO_B(iblock, sb); + size = ei->valid_size - pos; + addr = folio_address(bh_result->b_folio) + + offset_in_folio(bh_result->b_folio, pos); - /* Check if bh->b_data points to proper addr in folio */ - if (bh_result->b_data != addr) { - exfat_fs_error_ratelimit(sb, + /* Check if bh->b_data points to proper addr in folio */ + if (bh_result->b_data != addr) { + exfat_fs_error_ratelimit(sb, "b_data(%p) != folio_addr(%p)", bh_result->b_data, addr); - err = -EINVAL; - goto done; - } - - /* Read a block */ - err = bh_read(bh_result, 0); - if (err < 0) - goto done; + err = -EINVAL; + goto done; + } - /* Zero unwritten part of a block */ - memset(bh_result->b_data + size, 0, - bh_result->b_size - size); + /* Read a block */ + err = bh_read(bh_result, 0); + if (err < 0) + goto done; - err = 0; - } else { - /* - * The range has not been written, clear the mapped flag - * to only zero the cache and do not read from disk. - */ - clear_buffer_mapped(bh_result); - } + /* Zero unwritten part of a block */ + memset(bh_result->b_data + size, 0, bh_result->b_size - size); + err = 0; + goto done; } + + /* + * The area has not been written, clear mapped for read/bmap cases. + * If so, it will be filled with zero without reading from disk. + */ + clear_buffer_mapped(bh_result); done: bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); if (err < 0) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8042ad873808..c48fd36b2d74 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -649,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS) || + capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 87ee3a17bd29..e8c5525afc67 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line, { __le32 *bref = p; unsigned int blk; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + if (journal && inode == journal->j_inode) return 0; while (bref < p+max) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index df30d9f23512..4f81983b7918 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -278,7 +278,8 @@ struct ext4_system_blocks { /* * Flags for ext4_io_end->flags */ -#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_FAILED 0x0002 struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ @@ -3010,6 +3011,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); +extern int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a07a98a4b97a..8dc6b4271b15 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4667,22 +4667,13 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_mutex; } - /* - * For journalled data we need to write (and checkpoint) pages - * before discarding page cache to avoid inconsitent data on - * disk in case of crash before zeroing trans is committed. - */ - if (ext4_should_journal_data(inode)) { - ret = filemap_write_and_wait_range(mapping, start, - end - 1); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } + /* Now release the pages and zero block aligned part of pages */ + ret = ext4_truncate_page_cache_block_range(inode, start, end); + if (ret) { + filemap_invalidate_unlock(mapping); + goto out_mutex; } - /* Now release the pages and zero block aligned part of pages */ - truncate_pagecache_range(inode, start, end - 1); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4108b7d1696f..44780bb537f9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/rmap.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> @@ -383,10 +384,11 @@ static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) return 0; + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " @@ -3902,6 +3904,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } +static inline void ext4_truncate_folio(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + struct folio *folio; + + /* Nothing to be done if no complete block needs to be truncated. */ + if (round_up(start, blocksize) >= round_down(end, blocksize)) + return; + + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); +} + +int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + int ret; + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsitent data on disk + * in case of crash before freeing or unwritten converting trans + * is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, + end - 1); + if (ret) + return ret; + goto truncate_pagecache; + } + + /* + * If the block size is less than the page size, the file's mapped + * blocks within one page could be freed or converted to unwritten. + * So it's necessary to remove writable userspace mappings, and then + * ext4_page_mkwrite() can be called during subsequent write access + * to these partial folios. + */ + if (!IS_ALIGNED(start | end, PAGE_SIZE) && + blocksize < PAGE_SIZE && start < inode->i_size) { + loff_t page_boundary = round_up(start, PAGE_SIZE); + + ext4_truncate_folio(inode, start, min(page_boundary, end)); + if (end > page_boundary) + ext4_truncate_folio(inode, + round_down(end, PAGE_SIZE), end); + } + +truncate_pagecache: + truncate_pagecache_range(inode, start, end - 1); + return 0; +} + static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); @@ -3956,17 +4020,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - inode_lock(inode); /* No need to punch hole beyond i_size */ @@ -4028,8 +4081,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) goto out_dio; - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); + + ret = ext4_truncate_page_cache_block_range(inode, + first_block_offset, last_block_offset + 1); + if (ret) + goto out_dio; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b25a27c86696..d6f1e61c6dc8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6644,7 +6644,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) - bh = sb_find_get_block(inode->i_sb, block + i); + bh = sb_find_get_block_nonatomic(inode->i_sb, + block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 69b8a7221a2b..f041a5d93716 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -181,14 +181,25 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io_end->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + /* + * Do not convert the unwritten extents if data writeback fails, + * or stale data may be exposed. + */ + io_end->handle = NULL; /* Following call will use up the handle */ + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { + ret = -EIO; + if (handle) + jbd2_journal_free_reserved(handle); + } else { + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + } if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); } + ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); return ret; @@ -344,6 +355,7 @@ static void ext4_end_bio(struct bio *bio) bio->bi_status, inode->i_ino, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + io_end->flag |= EXT4_IO_END_FAILED; mapping_set_error(inode->i_mapping, blk_status_to_errno(bio->bi_status)); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 528979de0f7c..b956e1ee9829 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2785,6 +2785,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc, } if (is_remount) { + if (!sbi->s_journal && + ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { + ext4_msg(NULL, KERN_WARNING, + "Remounting fs w/o journal so ignoring data_err option"); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); + } + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " @@ -5428,6 +5435,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) "data=, fs mounted w/o journal"); goto failed_mount3a; } + if (test_opt(sb, DATA_ERR_ABORT)) { + ext4_msg(sb, KERN_ERR, + "can't mount with data_err=abort, fs mounted w/o journal"); + goto failed_mount3a; + } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); @@ -6776,6 +6788,7 @@ static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; + bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); @@ -6787,9 +6800,9 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", - &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", + &sb->s_uuid, + (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d15c68b28952..b419555e1ea7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -61,6 +61,12 @@ struct f2fs_attr { int id; }; +struct f2fs_base_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_base_attr *a, char *buf); + ssize_t (*store)(struct f2fs_base_attr *a, const char *buf, size_t len); +}; + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); @@ -862,6 +868,25 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +static ssize_t f2fs_base_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->show ? a->show(a, buf) : 0; +} + +static ssize_t f2fs_base_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->store ? a->store(a, buf, len) : 0; +} + /* * Note that there are three feature list entries: * 1) /sys/fs/f2fs/features @@ -880,14 +905,13 @@ static void f2fs_sb_release(struct kobject *kobj) * please add new on-disk feature in this list only. * - ref. F2FS_SB_FEATURE_RO_ATTR() */ -static ssize_t f2fs_feature_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) +static ssize_t f2fs_feature_show(struct f2fs_base_attr *a, char *buf) { return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ -static struct f2fs_attr f2fs_attr_##_name = { \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ .show = f2fs_feature_show, \ } @@ -1256,37 +1280,38 @@ static struct attribute *f2fs_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs); +#define BASE_ATTR_LIST(name) (&f2fs_base_attr_##name.attr) static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION - ATTR_LIST(encryption), - ATTR_LIST(test_dummy_encryption_v2), + BASE_ATTR_LIST(encryption), + BASE_ATTR_LIST(test_dummy_encryption_v2), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(encrypted_casefold), + BASE_ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED - ATTR_LIST(block_zoned), + BASE_ATTR_LIST(block_zoned), #endif - ATTR_LIST(atomic_write), - ATTR_LIST(extra_attr), - ATTR_LIST(project_quota), - ATTR_LIST(inode_checksum), - ATTR_LIST(flexible_inline_xattr), - ATTR_LIST(quota_ino), - ATTR_LIST(inode_crtime), - ATTR_LIST(lost_found), + BASE_ATTR_LIST(atomic_write), + BASE_ATTR_LIST(extra_attr), + BASE_ATTR_LIST(project_quota), + BASE_ATTR_LIST(inode_checksum), + BASE_ATTR_LIST(flexible_inline_xattr), + BASE_ATTR_LIST(quota_ino), + BASE_ATTR_LIST(inode_crtime), + BASE_ATTR_LIST(lost_found), #ifdef CONFIG_FS_VERITY - ATTR_LIST(verity), + BASE_ATTR_LIST(verity), #endif - ATTR_LIST(sb_checksum), + BASE_ATTR_LIST(sb_checksum), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(casefold), + BASE_ATTR_LIST(casefold), #endif - ATTR_LIST(readonly), + BASE_ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION - ATTR_LIST(compression), + BASE_ATTR_LIST(compression), #endif - ATTR_LIST(pin_file), + BASE_ATTR_LIST(pin_file), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -1362,9 +1387,14 @@ static struct kset f2fs_kset = { .kobj = {.ktype = &f2fs_ktype}, }; +static const struct sysfs_ops f2fs_feat_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + static const struct kobj_type f2fs_feat_ktype = { .default_groups = f2fs_feat_groups, - .sysfs_ops = &f2fs_attr_ops, + .sysfs_ops = &f2fs_feat_attr_ops, }; static struct kobject f2fs_feat = { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3b031d24d369..8f699c67561f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1137,6 +1137,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, else if (err == -EINTR) fuse_invalidate_attr(inode); + if (err == -ENOSYS) + err = -EPERM; return err; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 65c07aa95718..7f9cc7872214 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -843,12 +843,13 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { - struct gfs2_holder *gh = NULL; + struct gfs2_holder *gh; if (test_bit(GLF_LOCK, &gl->gl_flags)) return; set_bit(GLF_LOCK, &gl->gl_flags); + /* While a demote is in progress, the GLF_LOCK flag must be set. */ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && @@ -860,18 +861,22 @@ __acquires(&gl->gl_lockref.lock) set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; + do_xmote(gl, NULL, gl->gl_target); + return; } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); if (do_promote(gl)) goto out_unlock; gh = find_first_waiter(gl); + if (!gh) + goto out_unlock; gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ + do_xmote(gl, gh, gl->gl_target); + return; } - do_xmote(gl, gh, gl->gl_target); - return; out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..a47e3afd724c 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -263,7 +263,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, } /* truncate len if we find any trailing uptodate block(s) */ - for ( ; i <= last; i++) { + while (++i <= last) { if (ifs_block_is_uptodate(ifs, i)) { plen -= (last - i + 1) * block_size; last = i - 1; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 9192be7c19d8..a3c39a71c4ad 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -39,7 +39,7 @@ struct recovery_info static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass); -static int scan_revoke_records(journal_t *, struct buffer_head *, +static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *, tid_t, struct recovery_info *); #ifdef __KERNEL__ @@ -287,19 +287,20 @@ static int fc_do_one_pass(journal_t *journal, int jbd2_journal_recover(journal_t *journal) { int err, err2; - journal_superblock_t * sb; - struct recovery_info info; memset(&info, 0, sizeof(info)); - sb = journal->j_superblock; /* * The journal superblock's s_start field (the current log head) * is always zero if, and only if, the journal was cleanly - * unmounted. + * unmounted. We use its in-memory version j_tail here because + * jbd2_journal_wipe() could have updated it without updating journal + * superblock. */ - if (!sb->s_start) { + if (!journal->j_tail) { + journal_superblock_t *sb = journal->j_superblock; + jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; @@ -327,6 +328,12 @@ int jbd2_journal_recover(journal_t *journal) journal->j_transaction_sequence, journal->j_head); jbd2_journal_clear_revoke(journal); + /* Free revoke table allocated for replay */ + if (journal->j_revoke != journal->j_revoke_table[0] && + journal->j_revoke != journal->j_revoke_table[1]) { + jbd2_journal_destroy_revoke_table(journal->j_revoke); + journal->j_revoke = journal->j_revoke_table[1]; + } err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; @@ -612,6 +619,31 @@ static int do_one_pass(journal_t *journal, first_commit_ID = next_commit_ID; if (pass == PASS_SCAN) info->start_transaction = first_commit_ID; + else if (pass == PASS_REVOKE) { + /* + * Would the default revoke table have too long hash chains + * during replay? + */ + if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) { + unsigned int hash_size; + + /* + * Aim for average chain length of 8, limit at 1M + * entries to avoid problems with malicious + * filesystems. + */ + hash_size = min(roundup_pow_of_two(info->nr_revokes / 8), + 1U << 20); + journal->j_revoke = + jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke) { + printk(KERN_ERR + "JBD2: failed to allocate revoke table for replay with %u entries. " + "Journal replay may be slow.\n", hash_size); + journal->j_revoke = journal->j_revoke_table[1]; + } + } + } jbd2_debug(1, "Starting recovery pass %d\n", pass); @@ -852,6 +884,13 @@ chksum_ok: case JBD2_REVOKE_BLOCK: /* + * If we aren't in the SCAN or REVOKE pass, then we can + * just skip over this block. + */ + if (pass != PASS_REVOKE && pass != PASS_SCAN) + continue; + + /* * Check revoke block crc in pass_scan, if csum verify * failed, check commit block time later. */ @@ -863,12 +902,7 @@ chksum_ok: need_check_commit_time = true; } - /* If we aren't in the REVOKE pass, then we can - * just skip over this block. */ - if (pass != PASS_REVOKE) - continue; - - err = scan_revoke_records(journal, bh, + err = scan_revoke_records(journal, pass, bh, next_commit_ID, info); if (err) goto failed; @@ -922,8 +956,9 @@ chksum_ok: /* Scan a revoke record, marking all blocks mentioned as revoked. */ -static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, - tid_t sequence, struct recovery_info *info) +static int scan_revoke_records(journal_t *journal, enum passtype pass, + struct buffer_head *bh, tid_t sequence, + struct recovery_info *info) { jbd2_journal_revoke_header_t *header; int offset, max; @@ -944,6 +979,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, if (jbd2_has_feature_64bit(journal)) record_len = 8; + if (pass == PASS_SCAN) { + info->nr_revokes += (max - offset) / record_len; + return 0; + } + while (offset + record_len <= max) { unsigned long long blocknr; int err; @@ -956,7 +996,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, err = jbd2_journal_set_revoke(journal, blocknr, sequence); if (err) return err; - ++info->nr_revokes; } return 0; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index ce63d5fde9c3..bc328c39028a 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -215,7 +215,7 @@ int __init jbd2_journal_init_revoke_table_cache(void) return 0; } -static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) +struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; @@ -231,7 +231,7 @@ static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = - kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); + kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; @@ -245,7 +245,7 @@ out: return table; } -static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; @@ -255,7 +255,7 @@ static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) J_ASSERT(list_empty(hash_list)); } - kfree(table->hash_table); + kvfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } @@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, bh = bh_in; if (!bh) { - bh = __find_get_block(bdev, blocknr, journal->j_blocksize); + bh = __find_get_block_nonatomic(bdev, blocknr, + journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } @@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, /* If there is a different buffer_head lying around in * memory anywhere... */ - bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); + bh2 = __find_get_block_nonatomic(bdev, blocknr, + journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if (bh2 != bh && buffer_revokevalid(bh2)) @@ -466,7 +468,8 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) * state machine will get very upset later on. */ if (need_cancel) { struct buffer_head *bh2; - bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); + bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, + bh->b_size); if (bh2) { if (bh2 != bh) clear_buffer_revoked(bh2); @@ -495,9 +498,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal) struct jbd2_revoke_record_s *record; struct buffer_head *bh; record = (struct jbd2_revoke_record_s *)list_entry; - bh = __find_get_block(journal->j_fs_dev, - record->blocknr, - journal->j_blocksize); + bh = __find_get_block_nonatomic(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); if (bh) { clear_buffer_revoked(bh); __brelse(bh); diff --git a/fs/namespace.c b/fs/namespace.c index d401486fe95d..1d950974d67e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -778,15 +778,11 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); - smp_mb(); // see mntput_no_expire() + smp_mb(); // see mntput_no_expire() and do_umount() if (likely(!read_seqretry(&mount_lock, seq))) return 0; - if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { - mnt_add_count(mnt, -1); - return 1; - } lock_mount_hash(); - if (unlikely(bastard->mnt_flags & MNT_DOOMED)) { + if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; @@ -1956,6 +1952,7 @@ static int do_umount(struct mount *mnt, int flags) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { + smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { @@ -2640,56 +2637,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) struct vfsmount *mnt = path->mnt; struct dentry *dentry; struct mountpoint *mp = ERR_PTR(-ENOENT); + struct path under = {}; for (;;) { - struct mount *m; + struct mount *m = real_mount(mnt); if (beneath) { - m = real_mount(mnt); + path_put(&under); read_seqlock_excl(&mount_lock); - dentry = dget(m->mnt_mountpoint); + under.mnt = mntget(&m->mnt_parent->mnt); + under.dentry = dget(m->mnt_mountpoint); read_sequnlock_excl(&mount_lock); + dentry = under.dentry; } else { dentry = path->dentry; } inode_lock(dentry->d_inode); - if (unlikely(cant_mount(dentry))) { - inode_unlock(dentry->d_inode); - goto out; - } - namespace_lock(); - if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) { + if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) + break; // not to be mounted on + + if (beneath && unlikely(m->mnt_mountpoint != dentry || + &m->mnt_parent->mnt != under.mnt)) { namespace_unlock(); inode_unlock(dentry->d_inode); - goto out; + continue; // got moved } mnt = lookup_mnt(path); - if (likely(!mnt)) + if (unlikely(mnt)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + continue; // got overmounted + } + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) break; - - namespace_unlock(); - inode_unlock(dentry->d_inode); - if (beneath) - dput(dentry); - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - } - - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { - namespace_unlock(); - inode_unlock(dentry->d_inode); + if (beneath) { + /* + * @under duplicates the references that will stay + * at least until namespace_unlock(), so the path_put() + * below is safe (and OK to do under namespace_lock - + * we are not dropping the final references here). + */ + path_put(&under); + } + return mp; } - -out: + namespace_unlock(); + inode_unlock(dentry->d_inode); if (beneath) - dput(dentry); - + path_put(&under); return mp; } @@ -2700,14 +2703,11 @@ static inline struct mountpoint *lock_mount(struct path *path) static void unlock_mount(struct mountpoint *where) { - struct dentry *dentry = where->m_dentry; - + inode_unlock(where->m_dentry->d_inode); read_seqlock_excl(&mount_lock); put_mountpoint(where); read_sequnlock_excl(&mount_lock); - namespace_unlock(); - inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 4e3e62040831..70ecc8f5f210 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -127,11 +127,13 @@ static int __init netfs_init(void) if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0) goto error_subreqpool; +#ifdef CONFIG_PROC_FS if (!proc_mkdir("fs/netfs", NULL)) goto error_proc; if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL, &netfs_requests_seq_ops)) goto error_procfile; +#endif #ifdef CONFIG_FSCACHE_STATS if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL, netfs_stats_show)) @@ -144,9 +146,11 @@ static int __init netfs_init(void) return 0; error_fscache: +#ifdef CONFIG_PROC_FS error_procfile: remove_proc_subtree("fs/netfs", NULL); error_proc: +#endif mempool_exit(&netfs_subrequest_pool); error_subreqpool: kmem_cache_destroy(netfs_subrequest_slab); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 3b0918ade53c..a10d39150abc 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1100,6 +1100,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc) if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } + /* Linux 'subtree_check' borkenness mandates this setting */ + server->fh_expire_type = NFS_FH_VOL_RENAME; if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 325ba0663a6d..8bdbc4dca89c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -307,7 +307,8 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) if (delegation == NULL) goto out; spin_lock(&delegation->lock); - if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (delegation->inode && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); /* Refcount matched in nfs_end_delegation_return() */ ret = nfs_get_delegation(delegation); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2b04038b0e40..34f3471ce813 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2678,6 +2678,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) unblock_revalidate(new_dentry); } +static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry, + struct dentry *new_dentry) +{ + struct nfs_server *server = NFS_SB(old_dentry->d_sb); + + if (old_dentry->d_parent != new_dentry->d_parent) + return false; + if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE) + return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN); + return true; +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2765,7 +2777,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } - if (S_ISREG(old_inode->i_mode)) + if (S_ISREG(old_inode->i_mode) && + nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry)) nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4fa304fa5bc4..29d9234d5c08 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -76,6 +76,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct page *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; + struct net *net = server->nfs_client->cl_net; /* set up xdr stream */ scratch = alloc_page(gfp_flags); @@ -159,8 +160,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -170,7 +170,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_deviceid; } - dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!dsaddr->ds_list[i]) goto out_err_drain_dsaddrs; trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 98b45b636be3..646cda8e2e75 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1264,6 +1264,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -ECONNRESET: case -EHOSTDOWN: case -EHOSTUNREACH: + case -ENETDOWN: case -ENETUNREACH: case -EADDRINUSE: case -ENOBUFS: diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e58bedfb1dcc..4a304cf17c4b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -49,6 +49,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct nfs4_pnfs_ds_addr *da; struct nfs4_ff_layout_ds *new_ds = NULL; struct nfs4_ff_ds_version *ds_versions = NULL; + struct net *net = server->nfs_client->cl_net; u32 mp_count; u32 version_count; __be32 *p; @@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, for (i = 0; i < mp_count; i++) { /* multipath ds */ - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, new_ds->ds_versions = ds_versions; new_ds->ds_versions_cnt = version_count; - new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!new_ds->ds) goto out_err_drain_dsaddrs; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1aa67fca69b2..119e447758b9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -74,6 +74,8 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { + if (unlikely(nfs_current_task_exiting())) + return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 59bb4d0338f3..3cedb7979fcd 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -905,6 +905,11 @@ static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) NFS4_STATEID_OTHER_SIZE); } +static inline bool nfs_current_task_exiting(void) +{ + return (current->flags & PF_EXITING) != 0; +} + static inline bool nfs_error_is_fatal(int err) { switch (err) { diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 5c21caeae075..4ec952f9f47d 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -278,6 +278,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, new = __nfs_local_open_fh(clp, cred, fh, nfl, mode); if (IS_ERR(new)) return NULL; + rcu_read_lock(); /* try to swap in the pointer */ spin_lock(&clp->cl_uuid.lock); nf = rcu_dereference_protected(*pnf, 1); @@ -287,7 +288,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, rcu_assign_pointer(*pnf, nf); } spin_unlock(&clp->cl_uuid.lock); - rcu_read_lock(); } nf = nfs_local_file_get(nf); rcu_read_unlock(); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 0c3bc98cd999..c1736dbb92b6 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -39,7 +39,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; - } while (!fatal_signal_pending(current)); + } while (!fatal_signal_pending(current) && !nfs_current_task_exiting()); return res; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6e95db6c17e9..cfdbd521fc7b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -443,6 +443,8 @@ static int nfs4_delay_killable(long *timeout) { might_sleep(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(nfs4_update_delay(timeout)); if (!__fatal_signal_pending(current)) @@ -454,6 +456,8 @@ static int nfs4_delay_interruptible(long *timeout) { might_sleep(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(nfs4_update_delay(timeout)); if (!signal_pending(current)) @@ -1774,7 +1778,8 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, rcu_read_unlock(); trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); - if (!fatal_signal_pending(current)) { + if (!fatal_signal_pending(current) && + !nfs_current_task_exiting()) { if (schedule_timeout(5*HZ) == 0) status = -EAGAIN; else @@ -3578,7 +3583,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, write_sequnlock(&state->seqlock); trace_nfs4_close_stateid_update_wait(state->inode, dst, 0); - if (fatal_signal_pending(current)) + if (fatal_signal_pending(current) || nfs_current_task_exiting()) status = -EINTR; else if (schedule_timeout(5*HZ) != 0) @@ -7049,10 +7054,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_unlockdata *p; struct nfs4_state *state = lsp->ls_state; struct inode *inode = state->inode; + struct nfs_lock_context *l_ctx; p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return NULL; + l_ctx = nfs_get_lock_context(ctx); + if (!IS_ERR(l_ctx)) { + p->l_ctx = l_ctx; + } else { + kfree(p); + return NULL; + } p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; @@ -7060,7 +7073,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->lsp = lsp; /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); - p->l_ctx = nfs_get_lock_context(ctx); locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 542cdf71229f..04c726cc2900 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2739,7 +2739,15 @@ out_error: pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" " with error %d\n", section_sep, section, clp->cl_hostname, -status); - ssleep(1); + switch (status) { + case -ENETDOWN: + case -ENETUNREACH: + nfs_mark_client_ready(clp, -EIO); + break; + default: + ssleep(1); + break; + } out_drain: memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5f582713bf05..683e09be25ad 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -745,6 +745,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return remaining; } +static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); +} + static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, struct list_head *free_me, @@ -1292,6 +1300,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); pnfs_set_layout_stateid(lo, stateid, NULL, true); + pnfs_reset_return_info(lo); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 30d2613e912b..91ff877185c8 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -60,6 +60,7 @@ struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; + const struct net *ds_net; struct nfs_client *ds_clp; refcount_t ds_count; unsigned long ds_state; @@ -415,7 +416,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode, int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max); void pnfs_generic_write_commit_done(struct rpc_task *task, void *data); void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); -struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net, + struct list_head *dsaddrs, gfp_t gfp_flags); void nfs4_pnfs_v3_ds_connect_unload(void); int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index dbef837e871a..2ee20a0f0b36 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -604,12 +604,12 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1, * Lookup DS by addresses. nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct list_head *dsaddrs) +_data_server_lookup_locked(const struct net *net, const struct list_head *dsaddrs) { struct nfs4_pnfs_ds *ds; list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) - if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) + if (ds->ds_net == net && _same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) return ds; return NULL; } @@ -716,7 +716,7 @@ out_err: * uncached and return cached struct nfs4_pnfs_ds. */ struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags) { struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; char *remotestr; @@ -734,13 +734,14 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(dsaddrs); + tmp_ds = _data_server_lookup_locked(net, dsaddrs); if (tmp_ds == NULL) { INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); + ds->ds_net = net; ds->ds_clp = NULL; list_add(&ds->ds_node, &nfs4_data_server_cache); dprintk("%s add new data server %s\n", __func__, diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index cb01ea81724d..d0bcf744c553 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -705,8 +705,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb) int blocksize; int err; - down_write(&nilfs->ns_sem); - blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); if (!blocksize) { nilfs_err(sb, "unable to set blocksize"); @@ -779,7 +777,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb) set_nilfs_init(nilfs); err = 0; out: - up_write(&nilfs->ns_sem); return err; failed_sbh: diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index e9f701f884e7..9b6a3f8d2e7c 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -430,6 +430,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, } if (extend_init && !is_compressed(ni)) { + WARN_ON(ni->i_valid >= pos); err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos); if (err) goto out; @@ -1246,21 +1247,22 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; int err; - err = check_write_restriction(inode); - if (err) - return err; - - if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { - ntfs_inode_warn(inode, "direct i/o + compressed not supported"); - return -EOPNOTSUPP; - } - if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; inode_lock(inode); } + ret = check_write_restriction(inode); + if (ret) + goto out; + + if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { + ntfs_inode_warn(inode, "direct i/o + compressed not supported"); + ret = -EOPNOTSUPP; + goto out; + } + ret = generic_write_checks(iocb, from); if (ret <= 0) goto out; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b8ac85b548c7..821cb7874685 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6918,6 +6918,7 @@ static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end, if (IS_ERR(folios[numfolios])) { ret = PTR_ERR(folios[numfolios]); mlog_errno(ret); + folios[numfolios] = NULL; goto out; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f1b4b3e611cb..e5f58ff2175f 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -174,7 +174,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) struct ocfs2_recovery_map *rm; mutex_init(&osb->recovery_lock); - osb->disable_recovery = 0; + osb->recovery_state = OCFS2_REC_ENABLED; osb->recovery_thread_task = NULL; init_waitqueue_head(&osb->recovery_event); @@ -190,31 +190,53 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) return 0; } -/* we can't grab the goofy sem lock from inside wait_event, so we use - * memory barriers to make sure that we'll see the null task before - * being woken up */ static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) { - mb(); return osb->recovery_thread_task != NULL; } -void ocfs2_recovery_exit(struct ocfs2_super *osb) +static void ocfs2_recovery_disable(struct ocfs2_super *osb, + enum ocfs2_recovery_state state) { - struct ocfs2_recovery_map *rm; - - /* disable any new recovery threads and wait for any currently - * running ones to exit. Do this before setting the vol_state. */ mutex_lock(&osb->recovery_lock); - osb->disable_recovery = 1; + /* + * If recovery thread is not running, we can directly transition to + * final state. + */ + if (!ocfs2_recovery_thread_running(osb)) { + osb->recovery_state = state + 1; + goto out_lock; + } + osb->recovery_state = state; + /* Wait for recovery thread to acknowledge state transition */ + wait_event_cmd(osb->recovery_event, + !ocfs2_recovery_thread_running(osb) || + osb->recovery_state >= state + 1, + mutex_unlock(&osb->recovery_lock), + mutex_lock(&osb->recovery_lock)); +out_lock: mutex_unlock(&osb->recovery_lock); - wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); - /* At this point, we know that no more recovery threads can be - * launched, so wait for any recovery completion work to - * complete. */ + /* + * At this point we know that no more recovery work can be queued so + * wait for any recovery completion work to complete. + */ if (osb->ocfs2_wq) flush_workqueue(osb->ocfs2_wq); +} + +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb) +{ + ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE); +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE); /* * Now that recovery is shut down, and the osb is about to be @@ -1249,7 +1271,7 @@ static int ocfs2_force_read_journal(struct inode *inode) } for (i = 0; i < p_blocks; i++, p_blkno++) { - bh = __find_get_block(osb->sb->s_bdev, p_blkno, + bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno, osb->sb->s_blocksize); /* block not cached. */ if (!bh) @@ -1472,6 +1494,18 @@ static int __ocfs2_recovery_thread(void *arg) } } restart: + if (quota_enabled) { + mutex_lock(&osb->recovery_lock); + /* Confirm that recovery thread will no longer recover quotas */ + if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) { + osb->recovery_state = OCFS2_REC_QUOTA_DISABLED; + wake_up(&osb->recovery_event); + } + if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED) + quota_enabled = 0; + mutex_unlock(&osb->recovery_lock); + } + status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); @@ -1569,27 +1603,29 @@ bail: ocfs2_free_replay_slots(osb); osb->recovery_thread_task = NULL; - mb(); /* sync with ocfs2_recovery_thread_running */ + if (osb->recovery_state == OCFS2_REC_WANT_DISABLE) + osb->recovery_state = OCFS2_REC_DISABLED; wake_up(&osb->recovery_event); mutex_unlock(&osb->recovery_lock); - if (quota_enabled) - kfree(rm_quota); + kfree(rm_quota); return status; } void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) { + int was_set = -1; + mutex_lock(&osb->recovery_lock); + if (osb->recovery_state < OCFS2_REC_WANT_DISABLE) + was_set = ocfs2_recovery_map_set(osb, node_num); trace_ocfs2_recovery_thread(node_num, osb->node_num, - osb->disable_recovery, osb->recovery_thread_task, - osb->disable_recovery ? - -1 : ocfs2_recovery_map_set(osb, node_num)); + osb->recovery_state, osb->recovery_thread_task, was_set); - if (osb->disable_recovery) + if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE) goto out; if (osb->recovery_thread_task) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index e3c3a35dc5e0..6397170f302f 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -148,6 +148,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb); int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); void ocfs2_free_replay_slots(struct ocfs2_super *osb); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 51c52768132d..6aaa94c554c1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -308,6 +308,21 @@ enum ocfs2_journal_trigger_type { void ocfs2_initialize_journal_triggers(struct super_block *sb, struct ocfs2_triggers triggers[]); +enum ocfs2_recovery_state { + OCFS2_REC_ENABLED = 0, + OCFS2_REC_QUOTA_WANT_DISABLE, + /* + * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for + * ocfs2_recovery_disable_quota() to work. + */ + OCFS2_REC_QUOTA_DISABLED, + OCFS2_REC_WANT_DISABLE, + /* + * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work + */ + OCFS2_REC_DISABLED, +}; + struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; @@ -370,7 +385,7 @@ struct ocfs2_super struct ocfs2_recovery_map *recovery_map; struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; - int disable_recovery; + enum ocfs2_recovery_state recovery_state; wait_queue_head_t checkpoint_event; struct ocfs2_journal *journal; unsigned long osb_commit_interval; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 2956d888c131..e272429da3db 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -453,8 +453,7 @@ out: /* Sync changes in local quota file into global quota file and * reinitialize local quota file. - * The function expects local quota file to be already locked and - * s_umount locked in shared mode. */ + * The function expects local quota file to be already locked. */ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int type, struct ocfs2_quota_recovery *rec) @@ -588,7 +587,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, { unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; - struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct buffer_head *bh; handle_t *handle; @@ -600,7 +598,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " "slot %u\n", osb->dev_str, slot_num); - down_read(&sb->s_umount); for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; @@ -677,7 +674,6 @@ out_put: break; } out: - up_read(&sb->s_umount); kfree(rec); return status; } @@ -843,8 +839,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); /* - * s_umount held in exclusive mode protects us against racing with - * recovery thread... + * ocfs2_dismount_volume() has already aborted quota recovery... */ if (oinfo->dqi_rec) { ocfs2_free_quota_recovery(oinfo->dqi_rec); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f7b483f0de2a..6ac4dcd54588 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -698,10 +698,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, ac, cl); - if (PTR_ERR(bg_bh) == -ENOSPC) + if (PTR_ERR(bg_bh) == -ENOSPC) { + ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG; bg_bh = ocfs2_block_group_alloc_discontig(handle, alloc_inode, ac, cl); + } if (IS_ERR(bg_bh)) { status = PTR_ERR(bg_bh); bg_bh = NULL; @@ -1794,6 +1796,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, { int status; u16 chain; + u32 contig_bits; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; @@ -1819,10 +1822,21 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, status = -ENOSPC; /* for now, the chain search is a bit simplistic. We just use * the 1st group with any empty bits. */ - while ((status = ac->ac_group_search(alloc_inode, group_bh, - bits_wanted, min_bits, - ac->ac_max_block, - res)) == -ENOSPC) { + while (1) { + if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) { + contig_bits = le16_to_cpu(bg->bg_contig_free_bits); + if (!contig_bits) + contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, + le16_to_cpu(bg->bg_bits), 0); + if (bits_wanted > contig_bits && contig_bits >= min_bits) + bits_wanted = contig_bits; + } + + status = ac->ac_group_search(alloc_inode, group_bh, + bits_wanted, min_bits, + ac->ac_max_block, res); + if (status != -ENOSPC) + break; if (!bg->bg_next_group) break; @@ -1982,6 +1996,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, victim = ocfs2_find_victim_chain(cl); ac->ac_chain = victim; +search: status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) { @@ -2022,6 +2037,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, } } + /* Chains can't supply the bits_wanted contiguous space. + * We should switch to using every single bit when allocating + * from the global bitmap. */ + if (i == le16_to_cpu(cl->cl_next_free_rec) && + status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) { + ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG; + ac->ac_chain = victim; + goto search; + } + set_hint: if (status != -ENOSPC) { /* If the next search of this group is not likely to @@ -2365,7 +2390,8 @@ int __ocfs2_claim_clusters(handle_t *handle, BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL - && ac->ac_which != OCFS2_AC_USE_MAIN); + && ac->ac_which != OCFS2_AC_USE_MAIN + && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG); if (ac->ac_which == OCFS2_AC_USE_LOCAL) { WARN_ON(min_clusters > 1); diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index b481b834857d..bcf2ed4a8631 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -29,6 +29,7 @@ struct ocfs2_alloc_context { #define OCFS2_AC_USE_MAIN 2 #define OCFS2_AC_USE_INODE 3 #define OCFS2_AC_USE_META 4 +#define OCFS2_AC_USE_MAIN_DISCONTIG 5 u32 ac_which; /* these are used by the chain search */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8bb5022f3082..3d2533950bae 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1812,6 +1812,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); + /* Stop quota recovery so that we can disable quotas */ + ocfs2_recovery_disable_quota(osb); + ocfs2_disable_quotas(osb); /* All dquots should be freed by now */ diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index aae6d2b8767d..a244f397a68c 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -23,21 +23,22 @@ static int orangefs_writepage_locked(struct page *page, struct orangefs_write_range *wr = NULL; struct iov_iter iter; struct bio_vec bv; - size_t len, wlen; + size_t wlen; ssize_t ret; - loff_t off; + loff_t len, off; set_page_writeback(page); len = i_size_read(inode); if (PagePrivate(page)) { wr = (struct orangefs_write_range *)page_private(page); - WARN_ON(wr->pos >= len); off = wr->pos; - if (off + wr->len > len) + if ((off + wr->len > len) && (off <= len)) wlen = len - off; else wlen = wr->len; + if (wlen == 0) + wlen = wr->len; } else { WARN_ON(1); off = page_offset(page); @@ -46,8 +47,6 @@ static int orangefs_writepage_locked(struct page *page, else wlen = PAGE_SIZE; } - /* Should've been handled in orangefs_invalidate_folio. */ - WARN_ON(off == len || off + wlen > len); WARN_ON(wlen == 0); bvec_set_page(&bv, page, wlen, off % PAGE_SIZE); @@ -91,8 +90,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, struct orangefs_write_range *wrp, wr; struct iov_iter iter; ssize_t ret; - size_t len; - loff_t off; + loff_t len, off; int i; len = i_size_read(inode); @@ -341,6 +339,8 @@ static int orangefs_write_begin(struct file *file, wr->len += len; goto okay; } else { + wr->pos = pos; + wr->len = len; ret = orangefs_launder_folio(folio); if (ret) return ret; diff --git a/fs/pidfs.c b/fs/pidfs.c index c0478b3c55d9..9aa4c705776d 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -188,20 +188,21 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = pidfd_pid(file); - bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); /* - * Depending on PIDFD_THREAD, inform pollers when the thread - * or the whole thread-group exits. + * Don't wake waiters if the thread-group leader exited + * prematurely. They either get notified when the last subthread + * exits or not at all if one of the remaining subthreads execs + * and assumes the struct pid of the old thread-group leader. */ guard(rcu)(); task = pid_task(pid, PIDTYPE_PID); if (!task) poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; - else if (task->exit_state && (thread || thread_group_empty(task))) + else if (task->exit_state && !delay_group_leader(task)) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; diff --git a/fs/pipe.c b/fs/pipe.c index 4d0799e4e719..88e81f84e3ea 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1271,6 +1271,10 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) struct pipe_buffer *bufs; unsigned int head, tail, mask, n; + /* nr_slots larger than limits of pipe->{head,tail} */ + if (unlikely(nr_slots > (pipe_index_t)-1u)) + return -EINVAL; + bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 56815799ce79..9de6b280c4f4 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -265,7 +265,7 @@ static void parse_options(char *options) static int pstore_show_options(struct seq_file *m, struct dentry *root) { if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES) - seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes); + seq_printf(m, ",kmsg_bytes=%u", kmsg_bytes); return 0; } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 801d6c0b170c..a0fc51196910 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -6,7 +6,7 @@ #include <linux/time.h> #include <linux/pstore.h> -extern unsigned long kmsg_bytes; +extern unsigned int kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); @@ -35,7 +35,7 @@ static inline void pstore_unregister_pmsg(void) {} extern struct pstore_info *psinfo; -extern void pstore_set_kmsg_bytes(int); +extern void pstore_set_kmsg_bytes(unsigned int bytes); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f56b066ab80c..557cf9d40177 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -92,8 +92,8 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); /* How much of the kernel log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; -module_param(kmsg_bytes, ulong, 0444); +unsigned int kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, uint, 0444); MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); static void *compress_workspace; @@ -107,9 +107,9 @@ static void *compress_workspace; static char *big_oops_buf; static size_t max_compressed_size; -void pstore_set_kmsg_bytes(int bytes) +void pstore_set_kmsg_bytes(unsigned int bytes) { - kmsg_bytes = bytes; + WRITE_ONCE(kmsg_bytes, bytes); } /* Tag each group of saved records with a sequence number */ @@ -278,6 +278,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, struct kmsg_dump_detail *detail) { struct kmsg_dump_iter iter; + unsigned int remaining = READ_ONCE(kmsg_bytes); unsigned long total = 0; const char *why; unsigned int part = 1; @@ -300,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, kmsg_dump_rewind(&iter); oopscount++; - while (total < kmsg_bytes) { + while (total < remaining) { char *dst; size_t dst_size; int header_size; diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index fe738623cf1b..240d82c6f908 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -29,7 +29,6 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, { struct cached_fid *cfid; - spin_lock(&cfids->cfid_list_lock); list_for_each_entry(cfid, &cfids->entries, entry) { if (!strcmp(cfid->path, path)) { /* @@ -38,25 +37,20 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, * being deleted due to a lease break. */ if (!cfid->time || !cfid->has_lease) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } kref_get(&cfid->refcount); - spin_unlock(&cfids->cfid_list_lock); return cfid; } } if (lookup_only) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } if (cfids->num_entries >= max_cached_dirs) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } cfid = init_cached_dir(path); if (cfid == NULL) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } cfid->cfids = cfids; @@ -74,7 +68,6 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, */ cfid->has_lease = true; - spin_unlock(&cfids->cfid_list_lock); return cfid; } @@ -187,8 +180,10 @@ replay_again: if (!utf16_path) return -ENOMEM; + spin_lock(&cfids->cfid_list_lock); cfid = find_or_create_cached_dir(cfids, path, lookup_only, tcon->max_cached_dirs); if (cfid == NULL) { + spin_unlock(&cfids->cfid_list_lock); kfree(utf16_path); return -ENOENT; } @@ -197,7 +192,6 @@ replay_again: * Otherwise, it is either a new entry or laundromat worker removed it * from @cfids->entries. Caller will put last reference if the latter. */ - spin_lock(&cfids->cfid_list_lock); if (cfid->has_lease && cfid->time) { spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 64bd68f750f8..63b3b1290bed 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -811,7 +811,23 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, return; for (i = 0; i < num_aces; ++i) { + if (end_of_acl - acl_base < acl_size) + break; + ppace[i] = (struct smb_ace *) (acl_base + acl_size); + acl_base = (char *)ppace[i]; + acl_size = offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth); + + if (end_of_acl - acl_base < acl_size || + ppace[i]->sid.num_subauth == 0 || + ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || + (end_of_acl - acl_base < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) || + (le16_to_cpu(ppace[i]->size) < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth)) + break; + #ifdef CONFIG_CIFS_DEBUG2 dump_ace(ppace[i], end_of_acl); #endif @@ -855,7 +871,6 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, (void *)ppace[i], sizeof(struct smb_ace)); */ - acl_base = (char *)ppace[i]; acl_size = le16_to_cpu(ppace[i]->size); } @@ -1550,7 +1565,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, int rc = 0; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); struct smb_version_operations *ops; - const u32 info = 0; + const u32 info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); @@ -1604,7 +1619,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, struct tcon_link *tlink; struct smb_version_operations *ops; bool mode_from_sid, id_from_sid; - const u32 info = 0; + const u32 info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; bool posix; tlink = cifs_sb_tlink(cifs_sb); diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 48d0d6f439cf..cf9ca7e49b8b 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -1266,10 +1266,9 @@ typedef struct smb_com_query_information_rsp { typedef struct smb_com_setattr_req { struct smb_hdr hdr; /* wct = 8 */ __le16 attr; - __le16 time_low; - __le16 time_high; + __le32 last_write_time; __le16 reserved[5]; /* must be zero */ - __u16 ByteCount; + __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ unsigned char fileName[]; } __attribute__((packed)) SETATTR_REQ; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 278092a15f89..2dd5a485a1e0 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -31,6 +31,9 @@ extern void cifs_small_buf_release(void *); extern void free_rsp_buf(int, void *); extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, unsigned int /* length */); +extern int smb_send_kvec(struct TCP_Server_Info *server, + struct msghdr *msg, + size_t *sent); extern unsigned int _get_xid(void); extern void _free_xid(unsigned int); #define get_xid() \ @@ -392,6 +395,10 @@ extern int CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon); extern int CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon, struct kstatfs *FSData); +extern int SMBSetInformation(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, __le32 attributes, __le64 write_time, + const struct nls_table *nls_codepage, + struct cifs_sb_info *cifs_sb); extern int CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const FILE_BASIC_INFO *data, const struct nls_table *nls_codepage, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 4fc9485c5d91..4059550859a9 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1038,15 +1038,31 @@ static __u16 convert_disposition(int disposition) static int access_flags_to_smbopen_mode(const int access_flags) { - int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE); - - if (masked_flags == GENERIC_READ) - return SMBOPEN_READ; - else if (masked_flags == GENERIC_WRITE) + /* + * SYSTEM_SECURITY grants both read and write access to SACL, treat is as read/write. + * MAXIMUM_ALLOWED grants as many access as possible, so treat it as read/write too. + * SYNCHRONIZE as is does not grant any specific access, so do not check its mask. + * If only SYNCHRONIZE bit is specified then fallback to read access. + */ + bool with_write_flags = access_flags & (FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA | + FILE_DELETE_CHILD | FILE_WRITE_ATTRIBUTES | DELETE | + WRITE_DAC | WRITE_OWNER | SYSTEM_SECURITY | + MAXIMUM_ALLOWED | GENERIC_WRITE | GENERIC_ALL); + bool with_read_flags = access_flags & (FILE_READ_DATA | FILE_READ_EA | FILE_EXECUTE | + FILE_READ_ATTRIBUTES | READ_CONTROL | + SYSTEM_SECURITY | MAXIMUM_ALLOWED | GENERIC_ALL | + GENERIC_EXECUTE | GENERIC_READ); + bool with_execute_flags = access_flags & (FILE_EXECUTE | MAXIMUM_ALLOWED | GENERIC_ALL | + GENERIC_EXECUTE); + + if (with_write_flags && with_read_flags) + return SMBOPEN_READWRITE; + else if (with_write_flags) return SMBOPEN_WRITE; - - /* just go for read/write */ - return SMBOPEN_READWRITE; + else if (with_execute_flags) + return SMBOPEN_EXECUTE; + else + return SMBOPEN_READ; } int @@ -2709,6 +2725,9 @@ int cifs_query_reparse_point(const unsigned int xid, if (cap_unix(tcon->ses)) return -EOPNOTSUPP; + if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) + return -EOPNOTSUPP; + oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -3400,8 +3419,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, /* BB TEST with big acls that might need to be e.g. larger than 16K */ pSMB->MaxSetupCount = 0; pSMB->Fid = fid; /* file handle always le */ - pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP | - CIFS_ACL_DACL | info); + pSMB->AclFlags = cpu_to_le32(info); pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */ inc_rfc1001_len(pSMB, 11); iov[0].iov_base = (char *)pSMB; @@ -5150,6 +5168,63 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +int +SMBSetInformation(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, __le32 attributes, __le64 write_time, + const struct nls_table *nls_codepage, + struct cifs_sb_info *cifs_sb) +{ + SETATTR_REQ *pSMB; + SETATTR_RSP *pSMBr; + struct timespec64 ts; + int bytes_returned; + int name_len; + int rc; + + cifs_dbg(FYI, "In %s path %s\n", __func__, fileName); + +retry: + rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->fileName, + fileName, PATH_MAX, nls_codepage, + cifs_remap(cifs_sb)); + name_len++; /* trailing null */ + name_len *= 2; + } else { + name_len = copy_path_name(pSMB->fileName, fileName); + } + /* Only few attributes can be set by this command, others are not accepted by Win9x. */ + pSMB->attr = cpu_to_le16(le32_to_cpu(attributes) & + (ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | ATTR_ARCHIVE)); + /* Zero write time value (in both NT and SETATTR formats) means to not change it. */ + if (le64_to_cpu(write_time) != 0) { + ts = cifs_NTtimeToUnix(write_time); + pSMB->last_write_time = cpu_to_le32(ts.tv_sec); + } + pSMB->BufferFormat = 0x04; + name_len++; /* account for buffer type byte */ + inc_rfc1001_len(pSMB, (__u16)name_len); + pSMB->ByteCount = cpu_to_le16(name_len); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Send error in %s = %d\n", __func__, rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto retry; + + return rc; +} + /* Some legacy servers such as NT4 require that the file times be set on an open handle, rather than by pathname - this is awkward due to potential access conflicts on the open, but it is unavoidable for these diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index cc9c912db8de..22f29d272592 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3028,8 +3028,10 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * sessinit is sent but no second negprot */ struct rfc1002_session_packet req = {}; - struct smb_hdr *smb_buf = (struct smb_hdr *)&req; + struct msghdr msg = {}; + struct kvec iov = {}; unsigned int len; + size_t sent; req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name); @@ -3058,10 +3060,18 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * As per rfc1002, @len must be the number of bytes that follows the * length field of a rfc1002 session request payload. */ - len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req); + len = sizeof(req.trailer.session_req); + req.type = RFC1002_SESSION_REQUEST; + req.flags = 0; + req.length = cpu_to_be16(len); + len += offsetof(typeof(req), trailer.session_req); + iov.iov_base = &req; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, len); + rc = smb_send_kvec(server, &msg, &sent); + if (rc < 0 || len != sent) + return (rc == -EINTR || rc == -EAGAIN) ? rc : -ECONNABORTED; - smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len); - rc = smb_send(server, smb_buf, len); /* * RFC1001 layer in at least one server requires very short break before * negprot presumably because not expecting negprot to follow so fast. @@ -3070,7 +3080,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) */ usleep_range(1000, 2000); - return rc; + return 0; } static int @@ -3922,11 +3932,13 @@ int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, struct TCP_Server_Info *server) { + bool in_retry = false; int rc = 0; if (!server->ops->need_neg || !server->ops->negotiate) return -ENOSYS; +retry: /* only send once per connect */ spin_lock(&server->srv_lock); if (server->tcpStatus != CifsGood && @@ -3946,6 +3958,14 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, spin_unlock(&server->srv_lock); rc = server->ops->negotiate(xid, ses, server); + if (rc == -EAGAIN) { + /* Allow one retry attempt */ + if (!in_retry) { + in_retry = true; + goto retry; + } + rc = -EHOSTDOWN; + } if (rc == 0) { spin_lock(&server->srv_lock); if (server->tcpStatus == CifsInNegotiate) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index e38521a713a6..b877def5a366 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1118,6 +1118,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, int i, opt; bool is_smb3 = !strcmp(fc->fs_type->name, "smb3"); bool skip_parsing = false; + char *hostname; cifs_dbg(FYI, "CIFS: parsing cifs mount option '%s'\n", param->key); @@ -1327,6 +1328,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_rsize: ctx->rsize = result.uint_32; ctx->got_rsize = true; + ctx->vol_rsize = ctx->rsize; break; case Opt_wsize: ctx->wsize = result.uint_32; @@ -1342,6 +1344,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->wsize, PAGE_SIZE); } } + ctx->vol_wsize = ctx->wsize; break; case Opt_acregmax: if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) { @@ -1448,6 +1451,16 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } + hostname = extract_hostname(ctx->UNC); + if (IS_ERR(hostname)) { + cifs_errorf(fc, "Cannot extract hostname from UNC string\n"); + goto cifs_parse_mount_err; + } + /* last byte, type, is 0x20 for servr type */ + memset(ctx->target_rfc1001_name, 0x20, RFC1001_NAME_LEN_WITH_NULL); + for (i = 0; i < RFC1001_NAME_LEN && hostname[i] != 0; i++) + ctx->target_rfc1001_name[i] = toupper(hostname[i]); + kfree(hostname); break; case Opt_user: kfree(ctx->username); diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 881bfc08667e..7516ccdc69c7 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -279,6 +279,9 @@ struct smb3_fs_context { bool use_client_guid:1; /* reuse existing guid for multichannel */ u8 client_guid[SMB2_CLIENT_GUID_SIZE]; + /* User-specified original r/wsize value */ + unsigned int vol_rsize; + unsigned int vol_wsize; unsigned int bsize; unsigned int rasize; unsigned int rsize; diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index 6e6c09cc5ce7..769752ad2c5c 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -258,7 +258,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {0}; int buf_type = CIFS_NO_BUFFER; - FILE_ALL_INFO file_info; + struct cifs_open_info_data query_data; oparms = (struct cifs_open_parms) { .tcon = tcon, @@ -270,11 +270,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, .fid = &fid, }; - rc = CIFS_open(xid, &oparms, &oplock, &file_info); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &query_data); if (rc) return rc; - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + if (query_data.fi.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { rc = -ENOENT; /* it's not a symlink */ goto out; @@ -313,7 +313,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, .fid = &fid, }; - rc = CIFS_open(xid, &oparms, &oplock, NULL); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL); if (rc) return rc; @@ -643,7 +643,8 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, case CIFS_SYMLINK_TYPE_NATIVE: case CIFS_SYMLINK_TYPE_NFS: case CIFS_SYMLINK_TYPE_WSL: - if (server->ops->create_reparse_symlink) { + if (server->ops->create_reparse_symlink && + (le32_to_cpu(pTcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) { rc = server->ops->create_reparse_symlink(xid, inode, direntry, pTcon, diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 50f96259d9ad..787d6bcb5d1d 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -733,7 +733,10 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, else cifs_buf_release(cfile->srch_inf. ntwrk_buf_start); + /* Reset all pointers to the network buffer to prevent stale references */ cfile->srch_inf.ntwrk_buf_start = NULL; + cfile->srch_inf.srch_entries_start = NULL; + cfile->srch_inf.last_entry = NULL; } rc = initiate_cifs_search(xid, file, full_path); if (rc) { @@ -756,11 +759,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, rc = server->ops->query_dir_next(xid, tcon, &cfile->fid, search_flags, &cfile->srch_inf); + if (rc) + return -ENOENT; /* FindFirst/Next set last_entry to NULL on malformed reply */ if (cfile->srch_inf.last_entry) cifs_save_resume_key(cfile->srch_inf.last_entry, cfile); - if (rc) - return -ENOENT; } if (index_to_find < cfile->srch_inf.index_of_last_entry) { /* we found the buffer that contains the entry */ diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index eb70ebf38464..9d6b1a470477 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -679,6 +679,22 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) *pbcc_area = bcc_ptr; } +static void +ascii_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + + strcpy(bcc_ptr, "Linux version "); + bcc_ptr += strlen("Linux version "); + strcpy(bcc_ptr, init_utsname()->release); + bcc_ptr += strlen(init_utsname()->release) + 1; + + strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); + bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + + *pbcc_area = bcc_ptr; +} + static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, const struct nls_table *nls_cp) { @@ -703,6 +719,25 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, *pbcc_area = bcc_ptr; } +static void ascii_domain_string(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int len; + + /* copy domain */ + if (ses->domainName != NULL) { + len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + if (WARN_ON_ONCE(len < 0)) + len = CIFS_MAX_DOMAINNAME_LEN - 1; + bcc_ptr += len; + } /* else we send a null domain name so server will default to its own domain */ + *bcc_ptr = 0; + bcc_ptr++; + + *pbcc_area = bcc_ptr; +} + static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, const struct nls_table *nls_cp) { @@ -748,25 +783,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, *bcc_ptr = 0; bcc_ptr++; /* account for null termination */ - /* copy domain */ - if (ses->domainName != NULL) { - len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); - if (WARN_ON_ONCE(len < 0)) - len = CIFS_MAX_DOMAINNAME_LEN - 1; - bcc_ptr += len; - } /* else we send a null domain name so server will default to its own domain */ - *bcc_ptr = 0; - bcc_ptr++; - /* BB check for overflow here */ - strcpy(bcc_ptr, "Linux version "); - bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, init_utsname()->release); - bcc_ptr += strlen(init_utsname()->release) + 1; - - strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); - bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + ascii_domain_string(&bcc_ptr, ses, nls_cp); + ascii_oslm_strings(&bcc_ptr, nls_cp); *pbcc_area = bcc_ptr; } @@ -1569,7 +1589,7 @@ sess_auth_kerberos(struct sess_data *sess_data) sess_data->iov[1].iov_len = msg->secblob_len; pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); - if (ses->capabilities & CAP_UNICODE) { + if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) { /* unicode strings must be word aligned */ if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; @@ -1578,8 +1598,8 @@ sess_auth_kerberos(struct sess_data *sess_data) unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp); } else { - /* BB: is this right? */ - ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp); + ascii_domain_string(&bcc_ptr, ses, sess_data->nls_cp); } sess_data->iov[2].iov_len = (long) bcc_ptr - diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index d6e2fb669c40..47c815f34bc3 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -426,13 +426,6 @@ cifs_negotiate(const unsigned int xid, { int rc; rc = CIFSSMBNegotiate(xid, ses, server); - if (rc == -EAGAIN) { - /* retry only once on 1st time connection */ - set_credits(server, 1); - rc = CIFSSMBNegotiate(xid, ses, server); - if (rc == -EAGAIN) - rc = -EHOSTDOWN; - } return rc; } @@ -444,8 +437,8 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - if (ctx->wsize) - wsize = ctx->wsize; + if (ctx->got_wsize) + wsize = ctx->vol_wsize; else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) wsize = CIFS_DEFAULT_IOSIZE; else @@ -497,7 +490,7 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) else defsize = server->maxBuf - sizeof(READ_RSP); - rsize = ctx->rsize ? ctx->rsize : defsize; + rsize = ctx->got_rsize ? ctx->vol_rsize : defsize; /* * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to @@ -548,24 +541,104 @@ static int cifs_query_path_info(const unsigned int xid, const char *full_path, struct cifs_open_info_data *data) { - int rc; + int rc = -EOPNOTSUPP; FILE_ALL_INFO fi = {}; + struct cifs_search_info search_info = {}; + bool non_unicode_wildcard = false; data->reparse_point = false; data->adjust_tz = false; - /* could do find first instead but this returns more info */ - rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls, - cifs_remap(cifs_sb)); /* - * BB optimize code so we do not make the above call when server claims - * no NT SMB support and the above call failed at least once - set flag - * in tcon or mount. + * First try CIFSSMBQPathInfo() function which returns more info + * (NumberOfLinks) than CIFSFindFirst() fallback function. + * Some servers like Win9x do not support SMB_QUERY_FILE_ALL_INFO over + * TRANS2_QUERY_PATH_INFORMATION, but supports it with filehandle over + * TRANS2_QUERY_FILE_INFORMATION (function CIFSSMBQFileInfo(). But SMB + * Open command on non-NT servers works only for files, does not work + * for directories. And moreover Win9x SMB server returns bogus data in + * SMB_QUERY_FILE_ALL_INFO Attributes field. So for non-NT servers, + * do not even use CIFSSMBQPathInfo() or CIFSSMBQFileInfo() function. */ - if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { + if (tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + + /* + * Non-UNICODE variant of fallback functions below expands wildcards, + * so they cannot be used for querying paths with wildcard characters. + */ + if (rc && !(tcon->ses->capabilities & CAP_UNICODE) && strpbrk(full_path, "*?\"><")) + non_unicode_wildcard = true; + + /* + * Then fallback to CIFSFindFirst() which works also with non-NT servers + * but does not does not provide NumberOfLinks. + */ + if ((rc == -EOPNOTSUPP || rc == -EINVAL) && + !non_unicode_wildcard) { + if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find)) + search_info.info_level = SMB_FIND_FILE_INFO_STANDARD; + else + search_info.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO; + rc = CIFSFindFirst(xid, tcon, full_path, cifs_sb, NULL, + CIFS_SEARCH_CLOSE_ALWAYS | CIFS_SEARCH_CLOSE_AT_END, + &search_info, false); + if (rc == 0) { + if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find)) { + FIND_FILE_STANDARD_INFO *di; + int offset = tcon->ses->server->timeAdj; + + di = (FIND_FILE_STANDARD_INFO *)search_info.srch_entries_start; + fi.CreationTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( + di->CreationDate, di->CreationTime, offset))); + fi.LastAccessTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( + di->LastAccessDate, di->LastAccessTime, offset))); + fi.LastWriteTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( + di->LastWriteDate, di->LastWriteTime, offset))); + fi.ChangeTime = fi.LastWriteTime; + fi.Attributes = cpu_to_le32(le16_to_cpu(di->Attributes)); + fi.AllocationSize = cpu_to_le64(le32_to_cpu(di->AllocationSize)); + fi.EndOfFile = cpu_to_le64(le32_to_cpu(di->DataSize)); + } else { + FILE_FULL_DIRECTORY_INFO *di; + + di = (FILE_FULL_DIRECTORY_INFO *)search_info.srch_entries_start; + fi.CreationTime = di->CreationTime; + fi.LastAccessTime = di->LastAccessTime; + fi.LastWriteTime = di->LastWriteTime; + fi.ChangeTime = di->ChangeTime; + fi.Attributes = di->ExtFileAttributes; + fi.AllocationSize = di->AllocationSize; + fi.EndOfFile = di->EndOfFile; + fi.EASize = di->EaSize; + } + fi.NumberOfLinks = cpu_to_le32(1); + fi.DeletePending = 0; + fi.Directory = !!(le32_to_cpu(fi.Attributes) & ATTR_DIRECTORY); + cifs_buf_release(search_info.ntwrk_buf_start); + } else if (!full_path[0]) { + /* + * CIFSFindFirst() does not work on root path if the + * root path was exported on the server from the top + * level path (drive letter). + */ + rc = -EOPNOTSUPP; + } + } + + /* + * If everything failed then fallback to the legacy SMB command + * SMB_COM_QUERY_INFORMATION which works with all servers, but + * provide just few information. + */ + if ((rc == -EOPNOTSUPP || rc == -EINVAL) && !non_unicode_wildcard) { rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls, cifs_remap(cifs_sb)); data->adjust_tz = true; + } else if ((rc == -EOPNOTSUPP || rc == -EINVAL) && non_unicode_wildcard) { + /* Path with non-UNICODE wildcard character cannot exist. */ + rc = -ENOENT; } if (!rc) { @@ -573,6 +646,42 @@ static int cifs_query_path_info(const unsigned int xid, data->reparse_point = le32_to_cpu(fi.Attributes) & ATTR_REPARSE; } +#ifdef CONFIG_CIFS_XATTR + /* + * For WSL CHR and BLK reparse points it is required to fetch + * EA $LXDEV which contains major and minor device numbers. + */ + if (!rc && data->reparse_point) { + struct smb2_file_full_ea_info *ea; + + ea = (struct smb2_file_full_ea_info *)data->wsl.eas; + rc = CIFSSMBQAllEAs(xid, tcon, full_path, SMB2_WSL_XATTR_DEV, + &ea->ea_data[SMB2_WSL_XATTR_NAME_LEN + 1], + SMB2_WSL_XATTR_DEV_SIZE, cifs_sb); + if (rc == SMB2_WSL_XATTR_DEV_SIZE) { + ea->next_entry_offset = cpu_to_le32(0); + ea->flags = 0; + ea->ea_name_length = SMB2_WSL_XATTR_NAME_LEN; + ea->ea_value_length = cpu_to_le16(SMB2_WSL_XATTR_DEV_SIZE); + memcpy(&ea->ea_data[0], SMB2_WSL_XATTR_DEV, SMB2_WSL_XATTR_NAME_LEN + 1); + data->wsl.eas_len = sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 + + SMB2_WSL_XATTR_DEV_SIZE; + rc = 0; + } else if (rc >= 0) { + /* It is an error if EA $LXDEV has wrong size. */ + rc = -EINVAL; + } else { + /* + * In all other cases ignore error if fetching + * of EA $LXDEV failed. It is needed only for + * WSL CHR and BLK reparse points and wsl_to_fattr() + * handle the case when EA is missing. + */ + rc = 0; + } + } +#endif + return rc; } @@ -608,6 +717,13 @@ static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; FILE_ALL_INFO fi = {}; + /* + * CIFSSMBQFileInfo() for non-NT servers returns bogus data in + * Attributes fields. So do not use this command for non-NT servers. + */ + if (!(tcon->ses->capabilities & CAP_NT_SMBS)) + return -EOPNOTSUPP; + if (cfile->symlink_target) { data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!data->symlink_target) @@ -778,6 +894,9 @@ smb_set_file_info(struct inode *inode, const char *full_path, struct cifs_fid fid; struct cifs_open_parms oparms; struct cifsFileInfo *open_file; + FILE_BASIC_INFO new_buf; + struct cifs_open_info_data query_data; + __le64 write_time = buf->LastWriteTime; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; @@ -785,20 +904,58 @@ smb_set_file_info(struct inode *inode, const char *full_path, /* if the file is already open for write, just use that fileid */ open_file = find_writable_file(cinode, FIND_WR_FSUID_ONLY); + if (open_file) { fid.netfid = open_file->fid.netfid; netpid = open_file->pid; tcon = tlink_tcon(open_file->tlink); - goto set_via_filehandle; + } else { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + tlink = NULL; + goto out; + } + tcon = tlink_tcon(tlink); } - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) { - rc = PTR_ERR(tlink); - tlink = NULL; - goto out; + /* + * Non-NT servers interprets zero time value in SMB_SET_FILE_BASIC_INFO + * over TRANS2_SET_FILE_INFORMATION as a valid time value. NT servers + * interprets zero time value as do not change existing value on server. + * API of ->set_file_info() callback expects that zero time value has + * the NT meaning - do not change. Therefore if server is non-NT and + * some time values in "buf" are zero, then fetch missing time values. + */ + if (!(tcon->ses->capabilities & CAP_NT_SMBS) && + (!buf->CreationTime || !buf->LastAccessTime || + !buf->LastWriteTime || !buf->ChangeTime)) { + rc = cifs_query_path_info(xid, tcon, cifs_sb, full_path, &query_data); + if (rc) { + if (open_file) { + cifsFileInfo_put(open_file); + open_file = NULL; + } + goto out; + } + /* + * Original write_time from buf->LastWriteTime is preserved + * as SMBSetInformation() interprets zero as do not change. + */ + new_buf = *buf; + buf = &new_buf; + if (!buf->CreationTime) + buf->CreationTime = query_data.fi.CreationTime; + if (!buf->LastAccessTime) + buf->LastAccessTime = query_data.fi.LastAccessTime; + if (!buf->LastWriteTime) + buf->LastWriteTime = query_data.fi.LastWriteTime; + if (!buf->ChangeTime) + buf->ChangeTime = query_data.fi.ChangeTime; } - tcon = tlink_tcon(tlink); + + if (open_file) + goto set_via_filehandle; rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls, cifs_sb); @@ -819,8 +976,45 @@ smb_set_file_info(struct inode *inode, const char *full_path, .fid = &fid, }; - cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); - rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (S_ISDIR(inode->i_mode) && !(tcon->ses->capabilities & CAP_NT_SMBS)) { + /* Opening directory path is not possible on non-NT servers. */ + rc = -EOPNOTSUPP; + } else { + /* + * Use cifs_open_file() instead of CIFS_open() as the + * cifs_open_file() selects the correct function which + * works also on non-NT servers. + */ + rc = cifs_open_file(xid, &oparms, &oplock, NULL); + /* + * Opening path for writing on non-NT servers is not + * possible when the read-only attribute is already set. + * Non-NT server in this case returns -EACCES. For those + * servers the only possible way how to clear the read-only + * bit is via SMB_COM_SETATTR command. + */ + if (rc == -EACCES && + (cinode->cifsAttrs & ATTR_READONLY) && + le32_to_cpu(buf->Attributes) != 0 && /* 0 = do not change attrs */ + !(le32_to_cpu(buf->Attributes) & ATTR_READONLY) && + !(tcon->ses->capabilities & CAP_NT_SMBS)) + rc = -EOPNOTSUPP; + } + + /* Fallback to SMB_COM_SETATTR command when absolutelty needed. */ + if (rc == -EOPNOTSUPP) { + cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n"); + rc = SMBSetInformation(xid, tcon, full_path, + buf->Attributes != 0 ? buf->Attributes : cpu_to_le32(cinode->cifsAttrs), + write_time, + cifs_sb->local_nls, cifs_sb); + if (rc == 0) + cinode->cifsAttrs = le32_to_cpu(buf->Attributes); + else + rc = -EACCES; + goto out; + } + if (rc != 0) { if (rc == -EIO) rc = -EINVAL; @@ -828,6 +1022,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, } netpid = current->tgid; + cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for attrs/times not supported by this server\n"); set_via_filehandle: rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid); @@ -838,6 +1033,21 @@ set_via_filehandle: CIFSSMBClose(xid, tcon, fid.netfid); else cifsFileInfo_put(open_file); + + /* + * Setting the read-only bit is not honered on non-NT servers when done + * via open-semantics. So for setting it, use SMB_COM_SETATTR command. + * This command works only after the file is closed, so use it only when + * operation was called without the filehandle. + */ + if (open_file == NULL && + !(tcon->ses->capabilities & CAP_NT_SMBS) && + le32_to_cpu(buf->Attributes) & ATTR_READONLY) { + SMBSetInformation(xid, tcon, full_path, + buf->Attributes, + 0 /* do not change write time */, + cifs_sb->local_nls, cifs_sb); + } out: if (tlink != NULL) cifs_put_tlink(tlink); diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index d609a20fb98a..2d726e9b950c 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -152,16 +152,25 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 int err_buftype = CIFS_NO_BUFFER; struct cifs_fid *fid = oparms->fid; struct network_resiliency_req nr_ioctl_req; + bool retry_without_read_attributes = false; smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb); if (smb2_path == NULL) return -ENOMEM; - oparms->desired_access |= FILE_READ_ATTRIBUTES; + if (!(oparms->desired_access & FILE_READ_ATTRIBUTES)) { + oparms->desired_access |= FILE_READ_ATTRIBUTES; + retry_without_read_attributes = true; + } smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, &err_buftype); + if (rc == -EACCES && retry_without_read_attributes) { + oparms->desired_access &= ~FILE_READ_ATTRIBUTES; + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, + &err_buftype); + } if (rc && data) { struct smb2_hdr *hdr = err_iov.iov_base; diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 826b57a5a2a8..e9fd3e204a6f 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -1273,6 +1273,14 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, int rc; int i; + /* + * If server filesystem does not support reparse points then do not + * attempt to create reparse point. This will prevent creating unusable + * empty object on the server. + */ + if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) + return ERR_PTR(-EOPNOTSUPP); + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, SYNCHRONIZE | DELETE | FILE_READ_ATTRIBUTES | diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 7aeac8dd9a1d..fbb368629213 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -464,12 +464,20 @@ smb2_negotiate(const unsigned int xid, server->CurrentMid = 0; spin_unlock(&server->mid_lock); rc = SMB2_negotiate(xid, ses, server); - /* BB we probably don't need to retry with modern servers */ - if (rc == -EAGAIN) - rc = -EHOSTDOWN; return rc; } +static inline unsigned int +prevent_zero_iosize(unsigned int size, const char *type) +{ + if (size == 0) { + cifs_dbg(VFS, "SMB: Zero %ssize calculated, using minimum value %u\n", + type, CIFS_MIN_DEFAULT_IOSIZE); + return CIFS_MIN_DEFAULT_IOSIZE; + } + return size; +} + static unsigned int smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { @@ -477,12 +485,12 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - wsize = ctx->wsize ? ctx->wsize : CIFS_DEFAULT_IOSIZE; + wsize = ctx->got_wsize ? ctx->vol_wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); - return wsize; + return prevent_zero_iosize(wsize, "w"); } static unsigned int @@ -492,7 +500,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - wsize = ctx->wsize ? ctx->wsize : SMB3_DEFAULT_IOSIZE; + wsize = ctx->got_wsize ? ctx->vol_wsize : SMB3_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { @@ -514,7 +522,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); - return wsize; + return prevent_zero_iosize(wsize, "w"); } static unsigned int @@ -524,13 +532,13 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int rsize; /* start with specified rsize, or default */ - rsize = ctx->rsize ? ctx->rsize : CIFS_DEFAULT_IOSIZE; + rsize = ctx->got_rsize ? ctx->vol_rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); - return rsize; + return prevent_zero_iosize(rsize, "r"); } static unsigned int @@ -540,7 +548,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int rsize; /* start with specified rsize, or default */ - rsize = ctx->rsize ? ctx->rsize : SMB3_DEFAULT_IOSIZE; + rsize = ctx->got_rsize ? ctx->vol_rsize : SMB3_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { @@ -563,7 +571,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); - return rsize; + return prevent_zero_iosize(rsize, "r"); } /* @@ -5229,7 +5237,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - int rc; + int rc = -EOPNOTSUPP; /* * Check if mounted with mount parm 'sfu' mount parm. @@ -5240,7 +5248,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { rc = cifs_sfu_make_node(xid, inode, dentry, tcon, full_path, mode, dev); - } else { + } else if (le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) { rc = smb2_mknod_reparse(xid, inode, dentry, tcon, full_path, mode, dev); } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 163b8fea47e8..d080c777906b 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2920,6 +2920,7 @@ replay_again: req->CreateContextsOffset = cpu_to_le32( sizeof(struct smb2_create_req) + iov[1].iov_len); + le32_add_cpu(&req->CreateContextsLength, iov[n_iov-1].iov_len); pc_buf = iov[n_iov-1].iov_base; } @@ -2966,7 +2967,7 @@ replay_again: /* Eventually save off posix specific response info and timestamps */ err_free_rsp_buf: - free_rsp_buf(resp_buftype, rsp); + free_rsp_buf(resp_buftype, rsp_iov.iov_base); kfree(pc_buf); err_free_req: cifs_small_buf_release(req); @@ -3909,12 +3910,10 @@ SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, void **data, u32 *plen, u32 extra_info) { - __u32 additional_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO | - extra_info; *plen = 0; return query_info(xid, tcon, persistent_fid, volatile_fid, - 0, SMB2_O_INFO_SECURITY, additional_info, + 0, SMB2_O_INFO_SECURITY, extra_info, SMB2_MAX_BUFFER_SIZE, MIN_SEC_DESC_LEN, data, plen); } diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 0dc80959ce48..03434dbe9374 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -179,7 +179,7 @@ delete_mid(struct mid_q_entry *mid) * Our basic "send data to server" function. Should be called with srv_mutex * held. The caller is responsible for handling the results. */ -static int +int smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, size_t *sent) { diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index 58a584f0b27e..7d49f38f01f3 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -320,10 +320,17 @@ static int cifs_xattr_get(const struct xattr_handler *handler, if (pTcon->ses->server->ops->get_acl == NULL) goto out; /* rc already EOPNOTSUPP */ - if (handler->flags == XATTR_CIFS_NTSD_FULL) { - extra_info = SACL_SECINFO; - } else { - extra_info = 0; + switch (handler->flags) { + case XATTR_CIFS_NTSD_FULL: + extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO | SACL_SECINFO; + break; + case XATTR_CIFS_NTSD: + extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; + break; + case XATTR_CIFS_ACL: + default: + extra_info = DACL_SECINFO; + break; } pacl = pTcon->ses->server->ops->get_acl(cifs_sb, inode, full_path, &acllen, extra_info); diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 12f001333405..f79a5165a7cc 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -95,6 +95,9 @@ */ #define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) +/* According to MS-SMB2 specification The minimum recommended value is 65536.*/ +#define CIFS_MIN_DEFAULT_IOSIZE (65536) + /* * SMB2 Header Definition * diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 83caa3849749..b3d121052408 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -550,7 +550,19 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, retval = -ENOMEM; goto out; } - sess->user = user; + + if (!sess->user) { + /* First successful authentication */ + sess->user = user; + } else { + if (!ksmbd_compare_user(sess->user, user)) { + ksmbd_debug(AUTH, "different user tried to reuse session\n"); + retval = -EPERM; + ksmbd_free_user(user); + goto out; + } + ksmbd_free_user(user); + } memcpy(sess->sess_key, resp->payload, resp->session_key_len); memcpy(out_blob, resp->payload + resp->session_key_len, diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 3f45f28f6f0f..9dec4c2940bc 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -59,10 +59,12 @@ static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess) struct ksmbd_session_rpc *entry; long index; + down_write(&sess->rpc_lock); xa_for_each(&sess->rpc_handle_list, index, entry) { xa_erase(&sess->rpc_handle_list, index); __session_rpc_close(sess, entry); } + up_write(&sess->rpc_lock); xa_destroy(&sess->rpc_handle_list); } @@ -92,7 +94,7 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) { struct ksmbd_session_rpc *entry, *old; struct ksmbd_rpc_command *resp; - int method; + int method, id; method = __rpc_method(rpc_name); if (!method) @@ -102,26 +104,29 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) if (!entry) return -ENOMEM; + down_read(&sess->rpc_lock); entry->method = method; - entry->id = ksmbd_ipc_id_alloc(); - if (entry->id < 0) + entry->id = id = ksmbd_ipc_id_alloc(); + if (id < 0) goto free_entry; - old = xa_store(&sess->rpc_handle_list, entry->id, entry, KSMBD_DEFAULT_GFP); + old = xa_store(&sess->rpc_handle_list, id, entry, KSMBD_DEFAULT_GFP); if (xa_is_err(old)) goto free_id; - resp = ksmbd_rpc_open(sess, entry->id); + resp = ksmbd_rpc_open(sess, id); if (!resp) goto erase_xa; + up_read(&sess->rpc_lock); kvfree(resp); - return entry->id; + return id; erase_xa: xa_erase(&sess->rpc_handle_list, entry->id); free_id: ksmbd_rpc_id_free(entry->id); free_entry: kfree(entry); + up_read(&sess->rpc_lock); return -EINVAL; } @@ -129,9 +134,11 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; + down_write(&sess->rpc_lock); entry = xa_erase(&sess->rpc_handle_list, id); if (entry) __session_rpc_close(sess, entry); + up_write(&sess->rpc_lock); } int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) @@ -439,6 +446,7 @@ static struct ksmbd_session *__session_create(int protocol) sess->sequence_number = 1; rwlock_init(&sess->tree_conns_lock); atomic_set(&sess->refcnt, 2); + init_rwsem(&sess->rpc_lock); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index f21348381d59..c5749d6ec715 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -63,6 +63,7 @@ struct ksmbd_session { rwlock_t tree_conns_lock; atomic_t refcnt; + struct rw_semaphore rpc_lock; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 81a29857b1e3..d7a8a580d013 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -146,12 +146,9 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci) { struct oplock_info *opinfo; - if (list_empty(&ci->m_op_list)) - return NULL; - down_read(&ci->m_lock); - opinfo = list_first_entry(&ci->m_op_list, struct oplock_info, - op_entry); + opinfo = list_first_entry_or_null(&ci->m_op_list, struct oplock_info, + op_entry); if (opinfo) { if (opinfo->conn == NULL || !atomic_inc_not_zero(&opinfo->refcount)) @@ -1496,7 +1493,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req) if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) < sizeof(struct create_lease_v2) - 4) - return NULL; + goto err_out; memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); lreq->req_state = lc->lcontext.LeaseState; @@ -1512,7 +1509,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req) if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) < sizeof(struct create_lease)) - return NULL; + goto err_out; memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); lreq->req_state = lc->lcontext.LeaseState; @@ -1521,6 +1518,9 @@ struct lease_ctx_info *parse_lease_state(void *open_req) lreq->version = 1; } return lreq; +err_out: + kfree(lreq); + return NULL; } /** diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 57839f9708bb..f2a2be8467c6 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -633,6 +633,11 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls) return name; } + if (*name == '\0') { + kfree(name); + return ERR_PTR(-EINVAL); + } + if (*name == '\\') { pr_err("not allow directory name included leading slash\n"); kfree(name); @@ -1445,7 +1450,7 @@ static int ntlm_authenticate(struct ksmbd_work *work, { struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess = work->sess; - struct channel *chann = NULL; + struct channel *chann = NULL, *old; struct ksmbd_user *user; u64 prev_id; int sz, rc; @@ -1557,7 +1562,12 @@ binding_session: return -ENOMEM; chann->conn = conn; - xa_store(&sess->ksmbd_chann_list, (long)conn, chann, KSMBD_DEFAULT_GFP); + old = xa_store(&sess->ksmbd_chann_list, (long)conn, chann, + KSMBD_DEFAULT_GFP); + if (xa_is_err(old)) { + kfree(chann); + return xa_err(old); + } } } @@ -1602,11 +1612,6 @@ static int krb5_authenticate(struct ksmbd_work *work, if (prev_sess_id && prev_sess_id != sess->id) destroy_previous_session(conn, sess->user, prev_sess_id); - if (sess->state == SMB2_SESSION_VALID) { - ksmbd_free_user(sess->user); - sess->user = NULL; - } - retval = ksmbd_krb5_authenticate(sess, in_blob, in_len, out_blob, &out_len); if (retval) { @@ -2249,10 +2254,6 @@ int smb2_session_logoff(struct ksmbd_work *work) sess->state = SMB2_SESSION_EXPIRED; up_write(&conn->session_lock); - if (sess->user) { - ksmbd_free_user(sess->user); - sess->user = NULL; - } ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_SETUP); rsp->StructureSize = cpu_to_le16(4); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 9c765b973751..474dc6e122c8 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -426,10 +426,15 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n", *pos, count); + if (*pos >= XATTR_SIZE_MAX) { + pr_err("stream write position %lld is out of bounds\n", *pos); + return -EINVAL; + } + size = *pos + count; if (size > XATTR_SIZE_MAX) { size = XATTR_SIZE_MAX; - count = (*pos + count) - XATTR_SIZE_MAX; + count = XATTR_SIZE_MAX - *pos; } v_len = ksmbd_vfs_getcasexattr(idmap, diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 8d1f30dcba7e..dfed6fce8904 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -661,21 +661,40 @@ __close_file_table_ids(struct ksmbd_file_table *ft, bool (*skip)(struct ksmbd_tree_connect *tcon, struct ksmbd_file *fp)) { - unsigned int id; - struct ksmbd_file *fp; - int num = 0; + struct ksmbd_file *fp; + unsigned int id = 0; + int num = 0; + + while (1) { + write_lock(&ft->lock); + fp = idr_get_next(ft->idr, &id); + if (!fp) { + write_unlock(&ft->lock); + break; + } - idr_for_each_entry(ft->idr, fp, id) { - if (skip(tcon, fp)) + if (skip(tcon, fp) || + !atomic_dec_and_test(&fp->refcount)) { + id++; + write_unlock(&ft->lock); continue; + } set_close_state_blocked_works(fp); + idr_remove(ft->idr, fp->volatile_id); + fp->volatile_id = KSMBD_NO_FID; + write_unlock(&ft->lock); + + down_write(&fp->f_ci->m_lock); + list_del_init(&fp->node); + up_write(&fp->f_ci->m_lock); - if (!atomic_dec_and_test(&fp->refcount)) - continue; __ksmbd_close_fd(ft, fp); + num++; + id++; } + return num; } @@ -713,12 +732,8 @@ static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon, static bool ksmbd_durable_scavenger_alive(void) { - mutex_lock(&durable_scavenger_lock); - if (!durable_scavenger_running) { - mutex_unlock(&durable_scavenger_lock); + if (!durable_scavenger_running) return false; - } - mutex_unlock(&durable_scavenger_lock); if (kthread_should_stop()) return false; @@ -799,9 +814,7 @@ static int ksmbd_durable_scavenger(void *dummy) break; } - mutex_lock(&durable_scavenger_lock); durable_scavenger_running = false; - mutex_unlock(&durable_scavenger_lock); module_put(THIS_MODULE); diff --git a/fs/splice.c b/fs/splice.c index 23fa5561b944..bd6e889133f5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -45,7 +45,7 @@ * here if set to avoid blocking other users of this pipe if splice is * being done on it. */ -static noinline void noinline pipe_clear_nowait(struct file *file) +static noinline void pipe_clear_nowait(struct file *file) { fmode_t fmode = READ_ONCE(file->f_mode); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 4f33a4a48886..b4071c9cf8c9 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -115,7 +115,7 @@ void udf_truncate_tail_extent(struct inode *inode) } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ - if (ret == 0) + if (ret >= 0) iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d80f94346199..22f4bf956ba1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1585,8 +1585,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_copy, user_uffdio_copy, @@ -1641,8 +1644,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, @@ -1744,8 +1750,11 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) user_uffdio_continue = (struct uffdio_continue __user *)arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_continue, user_uffdio_continue, @@ -1801,8 +1810,11 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long user_uffdio_poison = (struct uffdio_poison __user *)arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_poison, user_uffdio_poison, @@ -1870,8 +1882,12 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, user_uffdio_move = (struct uffdio_move __user *) arg; - if (atomic_read(&ctx->mmap_changing)) - return -EAGAIN; + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_move->move))) + return -EFAULT; + goto out; + } if (copy_from_user(&uffdio_move, user_uffdio_move, /* don't copy "move" last field */ diff --git a/fs/xattr.c b/fs/xattr.c index 02bee149ad96..8ec5b0204bfd 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -703,7 +703,7 @@ static int path_setxattrat(int dfd, const char __user *pathname, return error; filename = getname_maybe_null(pathname, at_flags); - if (!filename) { + if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) error = -EBADF; @@ -847,7 +847,7 @@ static ssize_t path_getxattrat(int dfd, const char __user *pathname, return error; filename = getname_maybe_null(pathname, at_flags); - if (!filename) { + if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; @@ -1428,6 +1428,15 @@ static bool xattr_is_trusted(const char *name) return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } +static bool xattr_is_maclabel(const char *name) +{ + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(suffix); +} + /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs @@ -1460,6 +1469,17 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (err) return err; + err = security_inode_listsecurity(inode, buffer, remaining_size); + if (err < 0) + return err; + + if (buffer) { + if (remaining_size < err) + return -ERANGE; + buffer += err; + } + remaining_size -= err; + read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); @@ -1468,6 +1488,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (!trusted && xattr_is_trusted(xattr->name)) continue; + /* skip MAC labels; these are provided by LSM above */ + if (xattr_is_maclabel(xattr->name)) + continue; + err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; |