diff options
Diffstat (limited to 'fs')
483 files changed, 16385 insertions, 11647 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index a547307c1ae8..2685a4d0d353 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -235,6 +235,7 @@ config ARCH_SUPPORTS_HUGETLBFS config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN + depends on (SYSFS || SYSCTL) help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 21e154516bf2..93539aac0e5b 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on ARM || ((M68K || SUPERH) && !MMU) + depends on ARM || ((M68K || SUPERH || XTENSA) && !MMU) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load @@ -142,39 +142,6 @@ config BINFMT_ZFLAT help Support FLAT format compressed binaries -config HAVE_AOUT - def_bool n - -config BINFMT_AOUT - tristate "Kernel support for a.out and ECOFF binaries" - depends on HAVE_AOUT - help - A.out (Assembler.OUTput) is a set of formats for libraries and - executables used in the earliest versions of UNIX. Linux used - the a.out formats QMAGIC and ZMAGIC until they were replaced - with the ELF format. - - The conversion to ELF started in 1995. This option is primarily - provided for historical interest and for the benefit of those - who need to run binaries from that era. - - Most people should answer N here. If you think you may have - occasional use for this format, enable module support above - and answer M here to compile this support as a module called - binfmt_aout. - - If any crucial components of your system (such as /sbin/init - or /lib/ld.so) are still in a.out format, you will have to - say Y here. - -config OSF4_COMPAT - bool "OSF/1 v4 readv/writev compatibility" - depends on ALPHA && BINFMT_AOUT - help - Say Y if you are using OSF/1 binaries (like Netscape and Acrobat) - with v4 shared libraries freely available from Compaq. If you're - going to use shared libraries from Tru64 version 5.0 or later, say N. - config BINFMT_MISC tristate "Kernel support for MISC binaries" help diff --git a/fs/Makefile b/fs/Makefile index 93b80529f8e8..4dea17840761 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -38,7 +38,6 @@ obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o -obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o diff --git a/fs/affs/super.c b/fs/affs/super.c index 4c5f30a83336..58b391446ae1 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -276,7 +276,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, char *vol = match_strdup(&args[0]); if (!vol) return 0; - strlcpy(volume, vol, 32); + strscpy(volume, vol, 32); kfree(vol); break; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 56ae5cd5184f..230c2d19116d 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -24,9 +24,9 @@ static int afs_readdir(struct file *file, struct dir_context *ctx); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_iput(struct dentry *dentry, struct inode *inode); -static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, +static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, +static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); @@ -568,7 +568,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, +static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype) { struct afs_lookup_one_cookie *cookie = @@ -584,16 +584,16 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, if (cookie->name.len != nlen || memcmp(cookie->name.name, name, nlen) != 0) { - _leave(" = 0 [no]"); - return 0; + _leave(" = true [keep looking]"); + return true; } cookie->fid.vnode = ino; cookie->fid.unique = dtype; cookie->found = 1; - _leave(" = -1 [found]"); - return -1; + _leave(" = false [found]"); + return false; } /* @@ -636,12 +636,11 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, +static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype) { struct afs_lookup_cookie *cookie = container_of(ctx, struct afs_lookup_cookie, ctx); - int ret; _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, @@ -663,12 +662,10 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name, cookie->fids[1].unique = dtype; cookie->found = 1; if (cookie->one_only) - return -1; + return false; } - ret = cookie->nr_fids >= 50 ? -1 : 0; - _leave(" = %d", ret); - return ret; + return cookie->nr_fids < 50; } /* @@ -951,16 +951,13 @@ static bool __get_reqs_available(struct kioctx *ctx) local_irq_save(flags); kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { - int old, avail = atomic_read(&ctx->reqs_available); + int avail = atomic_read(&ctx->reqs_available); do { if (avail < ctx->req_batch) goto out; - - old = avail; - avail = atomic_cmpxchg(&ctx->reqs_available, - avail, avail - ctx->req_batch); - } while (avail != old); + } while (!atomic_try_cmpxchg(&ctx->reqs_available, + &avail, avail - ctx->req_batch)); kcpu->reqs_available += ctx->req_batch; } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index e0c3e33c4177..24192a7667ed 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -32,7 +32,7 @@ static struct inode *anon_inode_inode; */ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s", + return dynamic_dname(buffer, buflen, "anon_inode:%s", dentry->d_name.name); } diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 12b8fdcc445b..9d1cde8066cf 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -147,7 +147,7 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, } static int bad_inode_tmpfile(struct user_namespace *mnt_userns, - struct inode *inode, struct dentry *dentry, + struct inode *inode, struct file *file, umode_t mode) { return -EIO; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c deleted file mode 100644 index 0dcfc691e7e2..000000000000 --- a/fs/binfmt_aout.c +++ /dev/null @@ -1,342 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/fs/binfmt_aout.c - * - * Copyright (C) 1991, 1992, 1996 Linus Torvalds - */ - -#include <linux/module.h> - -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/a.out.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/binfmts.h> -#include <linux/personality.h> -#include <linux/init.h> -#include <linux/coredump.h> -#include <linux/slab.h> -#include <linux/sched/task_stack.h> - -#include <linux/uaccess.h> -#include <asm/cacheflush.h> - -static int load_aout_binary(struct linux_binprm *); -static int load_aout_library(struct file*); - -static struct linux_binfmt aout_format = { - .module = THIS_MODULE, - .load_binary = load_aout_binary, - .load_shlib = load_aout_library, -}; - -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) - -static int set_brk(unsigned long start, unsigned long end) -{ - start = PAGE_ALIGN(start); - end = PAGE_ALIGN(end); - if (end > start) - return vm_brk(start, end - start); - return 0; -} - -/* - * create_aout_tables() parses the env- and arg-strings in new user - * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. - */ -static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm) -{ - char __user * __user *argv; - char __user * __user *envp; - unsigned long __user *sp; - int argc = bprm->argc; - int envc = bprm->envc; - - sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); -#ifdef __alpha__ -/* whee.. test-programs are so much fun. */ - put_user(0, --sp); - put_user(0, --sp); - if (bprm->loader) { - put_user(0, --sp); - put_user(1003, --sp); - put_user(bprm->loader, --sp); - put_user(1002, --sp); - } - put_user(bprm->exec, --sp); - put_user(1001, --sp); -#endif - sp -= envc+1; - envp = (char __user * __user *) sp; - sp -= argc+1; - argv = (char __user * __user *) sp; -#ifndef __alpha__ - put_user((unsigned long) envp,--sp); - put_user((unsigned long) argv,--sp); -#endif - put_user(argc,--sp); - current->mm->arg_start = (unsigned long) p; - while (argc-->0) { - char c; - put_user(p,argv++); - do { - get_user(c,p++); - } while (c); - } - put_user(NULL,argv); - current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-->0) { - char c; - put_user(p,envp++); - do { - get_user(c,p++); - } while (c); - } - put_user(NULL,envp); - current->mm->env_end = (unsigned long) p; - return sp; -} - -/* - * These are the functions used to load a.out style executables and shared - * libraries. There is no binary dependent code anywhere else. - */ - -static int load_aout_binary(struct linux_binprm * bprm) -{ - struct pt_regs *regs = current_pt_regs(); - struct exec ex; - unsigned long error; - unsigned long fd_offset; - unsigned long rlim; - int retval; - - ex = *((struct exec *) bprm->buf); /* exec-header */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && - N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || - N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - return -ENOEXEC; - } - - /* - * Requires a mmap handler. This prevents people from using a.out - * as part of an exploit attack against /proc-related vulnerabilities. - */ - if (!bprm->file->f_op->mmap) - return -ENOEXEC; - - fd_offset = N_TXTOFF(ex); - - /* Check initial limits. This avoids letting people circumvent - * size limits imposed on them by creating programs with large - * arrays in the data or bss. - */ - rlim = rlimit(RLIMIT_DATA); - if (rlim >= RLIM_INFINITY) - rlim = ~0; - if (ex.a_data + ex.a_bss > rlim) - return -ENOMEM; - - /* Flush all traces of the currently running executable */ - retval = begin_new_exec(bprm); - if (retval) - return retval; - - /* OK, This is the point of no return */ -#ifdef __alpha__ - SET_AOUT_PERSONALITY(bprm, ex); -#else - set_personality(PER_LINUX); -#endif - setup_new_exec(bprm); - - current->mm->end_code = ex.a_text + - (current->mm->start_code = N_TXTADDR(ex)); - current->mm->end_data = ex.a_data + - (current->mm->start_data = N_DATADDR(ex)); - current->mm->brk = ex.a_bss + - (current->mm->start_brk = N_BSSADDR(ex)); - - retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) - return retval; - - - if (N_MAGIC(ex) == OMAGIC) { - unsigned long text_addr, map_size; - loff_t pos; - - text_addr = N_TXTADDR(ex); - -#ifdef __alpha__ - pos = fd_offset; - map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; -#else - pos = 32; - map_size = ex.a_text+ex.a_data; -#endif - error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error) - return error; - - error = read_code(bprm->file, text_addr, pos, - ex.a_text+ex.a_data); - if ((signed long)error < 0) - return error; - } else { - if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) - { - printk(KERN_NOTICE "executable not page aligned\n"); - } - - if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) - { - printk(KERN_WARNING - "fd_offset is not page aligned. Please convert program: %pD\n", - bprm->file); - } - - if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { - error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (error) - return error; - - read_code(bprm->file, N_TXTADDR(ex), fd_offset, - ex.a_text + ex.a_data); - goto beyond_if; - } - - error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, - PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, - fd_offset); - - if (error != N_TXTADDR(ex)) - return error; - - error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE, - fd_offset + ex.a_text); - if (error != N_DATADDR(ex)) - return error; - } -beyond_if: - set_binfmt(&aout_format); - - retval = set_brk(current->mm->start_brk, current->mm->brk); - if (retval < 0) - return retval; - - current->mm->start_stack = - (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); -#ifdef __alpha__ - regs->gp = ex.a_gpvalue; -#endif - finalize_exec(bprm); - start_thread(regs, ex.a_entry, current->mm->start_stack); - return 0; -} - -static int load_aout_library(struct file *file) -{ - struct inode * inode; - unsigned long bss, start_addr, len; - unsigned long error; - int retval; - struct exec ex; - loff_t pos = 0; - - inode = file_inode(file); - - retval = -ENOEXEC; - error = kernel_read(file, &ex, sizeof(ex), &pos); - if (error != sizeof(ex)) - goto out; - - /* We come in here for the regular a.out style of shared libraries */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || - N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || - i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - goto out; - } - - /* - * Requires a mmap handler. This prevents people from using a.out - * as part of an exploit attack against /proc-related vulnerabilities. - */ - if (!file->f_op->mmap) - goto out; - - if (N_FLAGS(ex)) - goto out; - - /* For QMAGIC, the starting address is 0x20 into the page. We mask - this off to get the starting address for the page */ - - start_addr = ex.a_entry & 0xfffff000; - - if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { - if (printk_ratelimit()) - { - printk(KERN_WARNING - "N_TXTOFF is not page aligned. Please convert library: %pD\n", - file); - } - retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (retval) - goto out; - - read_code(file, start_addr, N_TXTOFF(ex), - ex.a_text + ex.a_data); - retval = 0; - goto out; - } - /* Now use mmap to map the library into memory. */ - error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE, - N_TXTOFF(ex)); - retval = error; - if (error != start_addr) - goto out; - - len = PAGE_ALIGN(ex.a_text + ex.a_data); - bss = ex.a_text + ex.a_data + ex.a_bss; - if (bss > len) { - retval = vm_brk(start_addr + len, bss - len); - if (retval) - goto out; - } - retval = 0; -out: - return retval; -} - -static int __init init_aout_binfmt(void) -{ - register_binfmt(&aout_format); - return 0; -} - -static void __exit exit_aout_binfmt(void) -{ - unregister_binfmt(&aout_format); -} - -core_initcall(init_aout_binfmt); -module_exit(exit_aout_binfmt); -MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 63c7ebb0da89..6a11025e5850 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -911,7 +911,7 @@ static int load_elf_binary(struct linux_binprm *bprm) interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL); if (!interp_elf_ex) { retval = -ENOMEM; - goto out_free_ph; + goto out_free_file; } /* Get the exec headers */ @@ -1354,6 +1354,7 @@ out: out_free_dentry: kfree(interp_elf_ex); kfree(interp_elf_phdata); +out_free_file: allow_write_access(interpreter); if (interpreter) fput(interpreter); diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 99f9995670ea..fa9ddcc9eb0b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o + subpage.o tree-mod-log.o extent-io-tree.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index d385357e19b6..18374a6d05bd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -138,6 +138,7 @@ struct share_check { u64 root_objectid; u64 inum; int share_count; + bool have_delayed_delete_refs; }; static inline int extent_is_shared(struct share_check *sc) @@ -288,8 +289,10 @@ static void prelim_release(struct preftree *preftree) struct prelim_ref *ref, *next_ref; rbtree_postorder_for_each_entry_safe(ref, next_ref, - &preftree->root.rb_root, rbnode) + &preftree->root.rb_root, rbnode) { + free_inode_elem_list(ref->inode_list); free_pref(ref); + } preftree->root = RB_ROOT_CACHED; preftree->count = 0; @@ -647,6 +650,18 @@ unode_aux_to_inode_list(struct ulist_node *node) return (struct extent_inode_elem *)(uintptr_t)node->aux; } +static void free_leaf_list(struct ulist *ulist) +{ + struct ulist_node *node; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(ulist, &uiter))) + free_inode_elem_list(unode_aux_to_inode_list(node)); + + ulist_free(ulist); +} + /* * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not @@ -761,7 +776,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, cond_resched(); } out: - ulist_free(parents); + /* + * We may have inode lists attached to refs in the parents ulist, so we + * must free them before freeing the ulist and its refs. + */ + free_leaf_list(parents); return ret; } @@ -820,16 +839,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct btrfs_key key; - struct btrfs_key tmp_op_key; struct rb_node *n; int count; int ret = 0; - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); - spin_lock(&head->lock); for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, @@ -855,10 +869,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ struct btrfs_delayed_tree_ref *ref; + struct btrfs_key *key_ptr = NULL; + + if (head->extent_op && head->extent_op->update_key) { + btrfs_disk_key_to_cpu(&key, &head->extent_op->key); + key_ptr = &key; + } ref = btrfs_delayed_node_to_tree_ref(node); ret = add_indirect_ref(fs_info, preftrees, ref->root, - &tmp_op_key, ref->level + 1, + key_ptr, ref->level + 1, node->bytenr, count, sc, GFP_ATOMIC); break; @@ -884,13 +904,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, key.offset = ref->offset; /* - * Found a inum that doesn't match our known inum, we - * know it's shared. + * If we have a share check context and a reference for + * another inode, we can't exit immediately. This is + * because even if this is a BTRFS_ADD_DELAYED_REF + * reference we may find next a BTRFS_DROP_DELAYED_REF + * which cancels out this ADD reference. + * + * If this is a DROP reference and there was no previous + * ADD reference, then we need to signal that when we + * process references from the extent tree (through + * add_inline_refs() and add_keyed_refs()), we should + * not exit early if we find a reference for another + * inode, because one of the delayed DROP references + * may cancel that reference in the extent tree. */ - if (sc && sc->inum && ref->objectid != sc->inum) { - ret = BACKREF_FOUND_SHARED; - goto out; - } + if (sc && count < 0) + sc->have_delayed_delete_refs = true; ret = add_indirect_ref(fs_info, preftrees, ref->root, &key, 0, node->bytenr, count, sc, @@ -920,7 +949,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } if (!ret) ret = extent_is_shared(sc); -out: + spin_unlock(&head->lock); return ret; } @@ -1023,7 +1052,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1033,6 +1063,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ret = add_indirect_ref(fs_info, preftrees, root, &key, 0, bytenr, count, sc, GFP_NOFS); + break; } default: @@ -1122,7 +1153,8 @@ static int add_keyed_refs(struct btrfs_root *extent_root, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1354,6 +1386,12 @@ again: if (ret < 0) goto out; ref->inode_list = eie; + /* + * We transferred the list ownership to the ref, + * so set to NULL to avoid a double free in case + * an error happens after this. + */ + eie = NULL; } ret = ulist_add_merge_ptr(refs, ref->parent, ref->inode_list, @@ -1379,6 +1417,14 @@ again: eie->next = ref->inode_list; } eie = NULL; + /* + * We have transferred the inode list ownership from + * this ref to the ref we added to the 'refs' ulist. + * So set this ref's inode list to NULL to avoid + * use-after-free when our caller uses it or double + * frees in case an error happens before we return. + */ + ref->inode_list = NULL; } cond_resched(); } @@ -1395,24 +1441,6 @@ out: return ret; } -static void free_leaf_list(struct ulist *blocks) -{ - struct ulist_node *node = NULL; - struct extent_inode_elem *eie; - struct ulist_iterator uiter; - - ULIST_ITER_INIT(&uiter); - while ((node = ulist_next(blocks, &uiter))) { - if (!node->aux) - continue; - eie = unode_aux_to_inode_list(node); - free_inode_elem_list(eie); - node->aux = 0; - } - - ulist_free(blocks); -} - /* * Finds all leafs with a reference to the specified combination of bytenr and * offset. key_list_head will point to a list of corresponding keys (caller must @@ -1511,16 +1539,137 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } -/** - * Check if an extent is shared or not +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache, + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + + if (!cache->use_cache) + return false; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return false; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + entry = &cache->entries[level]; + + /* Unused cache entry or being used for some other extent buffer. */ + if (entry->bytenr != bytenr) + return false; + + /* + * We cached a false result, but the last snapshot generation of the + * root changed, so we now have a snapshot. Don't trust the result. + */ + if (!entry->is_shared && + entry->gen != btrfs_root_last_snapshot(&root->root_item)) + return false; + + /* + * If we cached a true result and the last generation used for dropping + * a root changed, we can not trust the result, because the dropped root + * could be a snapshot sharing this extent buffer. + */ + if (entry->is_shared && + entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + return false; + + *is_shared = entry->is_shared; + /* + * If the node at this level is shared, than all nodes below are also + * shared. Currently some of the nodes below may be marked as not shared + * because we have just switched from one leaf to another, and switched + * also other nodes above the leaf and below the current level, so mark + * them as shared. + */ + if (*is_shared) { + for (int i = 0; i < level; i++) { + cache->entries[i].is_shared = true; + cache->entries[i].gen = entry->gen; + } + } + + return true; +} + +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, + struct btrfs_root *root, + u64 bytenr, int level, bool is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + u64 gen; + + if (!cache->use_cache) + return; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + if (is_shared) + gen = btrfs_get_last_root_drop_gen(root->fs_info); + else + gen = btrfs_root_last_snapshot(&root->root_item); + + entry = &cache->entries[level]; + entry->bytenr = bytenr; + entry->is_shared = is_shared; + entry->gen = gen; + + /* + * If we found an extent buffer is shared, set the cache result for all + * extent buffers below it to true. As nodes in the path are COWed, + * their sharedness is moved to their children, and if a leaf is COWed, + * then the sharedness of a data extent becomes direct, the refcount of + * data extent is increased in the extent item at the extent tree. + */ + if (is_shared) { + for (int i = 0; i < level; i++) { + entry = &cache->entries[i]; + entry->is_shared = is_shared; + entry->gen = gen; + } + } +} + +/* + * Check if a data extent is shared or not. * - * @root: root inode belongs to - * @inum: inode number of the inode whose extent we are checking - * @bytenr: logical bytenr of the extent we are checking - * @roots: list of roots this extent is shared among - * @tmp: temporary list used for iteration + * @root: The root the inode belongs to. + * @inum: Number of the inode whose extent we are checking. + * @bytenr: Logical bytenr of the extent we are checking. + * @extent_gen: Generation of the extent (file extent item) or 0 if it is + * not known. + * @roots: List of roots this extent is shared among. + * @tmp: Temporary list used for iteration. + * @cache: A backref lookup result cache. * - * btrfs_check_shared uses the backref walking code but will short + * btrfs_is_data_extent_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the * one passed in. This provides a significant performance benefit for * callers (such as fiemap) which want to know whether the extent is @@ -1531,8 +1680,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, - struct ulist *roots, struct ulist *tmp) +int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + u64 extent_gen, + struct ulist *roots, struct ulist *tmp, + struct btrfs_backref_shared_cache *cache) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; @@ -1544,7 +1695,9 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, .root_objectid = root->root_key.objectid, .inum = inum, .share_count = 0, + .have_delayed_delete_refs = false, }; + int level; ulist_init(roots); ulist_init(tmp); @@ -1561,23 +1714,73 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, btrfs_get_tree_mod_seq(fs_info, &elem); } + /* -1 means we are in the bytenr of the data extent. */ + level = -1; ULIST_ITER_INIT(&uiter); + cache->use_cache = true; while (1) { + bool is_shared; + bool cached; + ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, roots, NULL, &shared, false); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; + if (level >= 0) + store_backref_shared_cache(cache, root, bytenr, + level, true); break; } if (ret < 0 && ret != -ENOENT) break; ret = 0; + /* + * If our data extent is not shared through reflinks and it was + * created in a generation after the last one used to create a + * snapshot of the inode's root, then it can not be shared + * indirectly through subtrees, as that can only happen with + * snapshots. In this case bail out, no need to check for the + * sharedness of extent buffers. + */ + if (level == -1 && + extent_gen > btrfs_root_last_snapshot(&root->root_item)) + break; + + /* + * If our data extent was not directly shared (without multiple + * reference items), than it might have a single reference item + * with a count > 1 for the same offset, which means there are 2 + * (or more) file extent items that point to the data extent - + * this happens when a file extent item needs to be split and + * then one item gets moved to another leaf due to a b+tree leaf + * split when inserting some item. In this case the file extent + * items may be located in different leaves and therefore some + * of the leaves may be referenced through shared subtrees while + * others are not. Since our extent buffer cache only works for + * a single path (by far the most common case and simpler to + * deal with), we can not use it if we have multiple leaves + * (which implies multiple paths). + */ + if (level == -1 && tmp->nnodes > 1) + cache->use_cache = false; + + if (level >= 0) + store_backref_shared_cache(cache, root, bytenr, + level, false); node = ulist_next(tmp, &uiter); if (!node) break; bytenr = node->val; + level++; + cached = lookup_backref_shared_cache(cache, root, bytenr, level, + &is_shared); + if (cached) { + ret = (is_shared ? 1 : 0); + break; + } shared.share_count = 0; + shared.have_delayed_delete_refs = false; cond_resched(); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2759de7d324c..8e69584d538d 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -17,6 +17,21 @@ struct inode_fs_paths { struct btrfs_data_container *fspath; }; +struct btrfs_backref_shared_cache_entry { + u64 bytenr; + u64 gen; + bool is_shared; +}; + +struct btrfs_backref_shared_cache { + /* + * A path from a root to a leaf that has a file extent item pointing to + * a given data extent should never exceed the maximum b+tree height. + */ + struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL]; + bool use_cache; +}; + typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); @@ -62,8 +77,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, - struct ulist *roots, struct ulist *tmp_ulist); +int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + u64 extent_gen, + struct ulist *roots, struct ulist *tmp, + struct btrfs_backref_shared_cache *cache); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e0375ba9d0fe..deebc8ddbd93 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -593,8 +593,6 @@ next: if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { - if (wakeup) - caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); @@ -618,9 +616,6 @@ next: key.objectid = last; key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; - - if (wakeup) - caching_ctl->progress = last; btrfs_release_path(path); goto next; } @@ -655,7 +650,6 @@ next: total_found += add_new_free_space(block_group, last, block_group->start + block_group->length); - caching_ctl->progress = (u64)-1; out: btrfs_free_path(path); @@ -725,8 +719,6 @@ done: } #endif - caching_ctl->progress = (u64)-1; - up_read(&fs_info->commit_root_sem); btrfs_free_excluded_extents(block_group); mutex_unlock(&caching_ctl->mutex); @@ -755,7 +747,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) mutex_init(&caching_ctl->mutex); init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; - caching_ctl->progress = cache->start; refcount_set(&caching_ctl->count, 2); btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); @@ -772,7 +763,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) WARN_ON(cache->caching_ctl); cache->caching_ctl = caching_ctl; cache->cached = BTRFS_CACHE_STARTED; - cache->has_caching_ctl = 1; spin_unlock(&cache->lock); write_lock(&fs_info->block_group_cache_lock); @@ -988,32 +978,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, kobject_put(kobj); } - if (block_group->has_caching_ctl) - caching_ctl = btrfs_get_caching_control(block_group); if (block_group->cached == BTRFS_CACHE_STARTED) btrfs_wait_block_group_cache_done(block_group); - if (block_group->has_caching_ctl) { - write_lock(&fs_info->block_group_cache_lock); - if (!caching_ctl) { - struct btrfs_caching_control *ctl; - - list_for_each_entry(ctl, - &fs_info->caching_block_groups, list) - if (ctl->block_group == block_group) { - caching_ctl = ctl; - refcount_inc(&caching_ctl->count); - break; - } - } - if (caching_ctl) - list_del_init(&caching_ctl->list); - write_unlock(&fs_info->block_group_cache_lock); - if (caching_ctl) { - /* Once for the caching bgs list and once for us. */ - btrfs_put_caching_control(caching_ctl); - btrfs_put_caching_control(caching_ctl); + + write_lock(&fs_info->block_group_cache_lock); + caching_ctl = btrfs_get_caching_control(block_group); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, &fs_info->caching_block_groups, list) { + if (ctl->block_group == block_group) { + caching_ctl = ctl; + refcount_inc(&caching_ctl->count); + break; + } } } + if (caching_ctl) + list_del_init(&caching_ctl->list); + write_unlock(&fs_info->block_group_cache_lock); + + if (caching_ctl) { + /* Once for the caching bgs list and once for us. */ + btrfs_put_caching_control(caching_ctl); + btrfs_put_caching_control(caching_ctl); + } spin_lock(&trans->transaction->dirty_bgs_lock); WARN_ON(!list_empty(&block_group->dirty_list)); @@ -1034,12 +1023,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); - WARN_ON(block_group->zone_is_active && + WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags) && block_group->space_info->active_total_bytes < block_group->length); } block_group->space_info->total_bytes -= block_group->length; - if (block_group->zone_is_active) + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) block_group->space_info->active_total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); @@ -1069,7 +1059,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; spin_lock(&block_group->lock); - block_group->removed = 1; + set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); + /* * At this point trimming or scrub can't start on this block group, * because we removed the block group from the rbtree @@ -1304,6 +1295,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + if (btrfs_fs_closing(fs_info)) + return; + /* * Long running balances can keep us blocked here for eternity, so * simply skip deletion if we're unable to get the mutex. @@ -1543,6 +1537,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + if (btrfs_fs_closing(fs_info)) + return; + if (!btrfs_should_reclaim(fs_info)) return; @@ -1890,16 +1887,6 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) return 0; } -static void link_block_group(struct btrfs_block_group *cache) -{ - struct btrfs_space_info *space_info = cache->space_info; - int index = btrfs_bg_flags_to_raid_index(cache->flags); - - down_write(&space_info->groups_sem); - list_add_tail(&cache->list, &space_info->block_groups[index]); - up_write(&space_info->groups_sem); -} - static struct btrfs_block_group *btrfs_create_block_group_cache( struct btrfs_fs_info *fs_info, u64 start) { @@ -1937,7 +1924,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); - btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); + cache->full_stripe_locks_root.root = RB_ROOT; + mutex_init(&cache->full_stripe_locks_root.lock); return cache; } @@ -2002,7 +1990,6 @@ static int read_one_block_group(struct btrfs_fs_info *info, int need_clear) { struct btrfs_block_group *cache; - struct btrfs_space_info *space_info; const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); int ret; @@ -2078,11 +2065,9 @@ static int read_one_block_group(struct btrfs_fs_info *info, /* Should not have any excluded extents. Just in case, though. */ btrfs_free_excluded_extents(cache); } else if (cache->length == cache->used) { - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, cache->start, cache->start + cache->length); @@ -2095,14 +2080,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } trace_btrfs_add_block_group(info, cache, 0); - btrfs_update_space_info(info, cache->flags, cache->length, - cache->used, cache->bytes_super, - cache->zone_unusable, cache->zone_is_active, - &space_info); - - cache->space_info = space_info; - - link_block_group(cache); + btrfs_add_bg_to_space_info(info, cache); set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_writeable(info, cache->start)) { @@ -2126,7 +2104,6 @@ error: static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct btrfs_space_info *space_info; struct rb_node *node; int ret = 0; @@ -2146,7 +2123,6 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) /* Fill dummy cache as FULL */ bg->length = em->len; bg->flags = map->type; - bg->last_byte_to_unpin = (u64)-1; bg->cached = BTRFS_CACHE_FINISHED; bg->used = em->len; bg->flags = map->type; @@ -2167,10 +2143,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) break; } - btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, 0, false, &space_info); - bg->space_info = space_info; - link_block_group(bg); + btrfs_add_bg_to_space_info(fs_info, bg); set_avail_alloc_bits(fs_info, bg->flags); } @@ -2190,7 +2163,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) int need_clear = 0; u64 cache_gen; - if (!root) + /* + * Either no extent root (with ibadroots rescue option) or we have + * unsupported RO options. The fs can never be mounted read-write, so no + * need to waste time searching block group items. + * + * This also allows new extent tree related changes to be RO compat, + * no need for a full incompat flag. + */ + if (!root || (btrfs_super_compat_ro_flags(info->super_copy) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP)) return fill_dummy_bgs(info); key.objectid = 0; @@ -2425,7 +2407,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); - if (!block_group->chunk_item_inserted) { + if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, + &block_group->runtime_flags)) { mutex_lock(&fs_info->chunk_mutex); ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group); mutex_unlock(&fs_info->chunk_mutex); @@ -2494,7 +2477,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran set_free_space_tree_thresholds(cache); cache->used = bytes_used; cache->flags = type; - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->global_root_id = calculate_global_root_id(fs_info, cache->start); @@ -2519,14 +2501,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran btrfs_free_excluded_extents(cache); -#ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(cache)) { - u64 new_bytes_used = size - bytes_used; - - bytes_used += new_bytes_used >> 1; - fragment_free_space(cache); - } -#endif /* * Ensure the corresponding space_info object is created and * assigned to our block group. We want our bg to be added to the rbtree @@ -2547,12 +2521,17 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran * the rbtree, update the space info's counters. */ trace_btrfs_add_block_group(fs_info, cache, 1); - btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, cache->zone_unusable, - cache->zone_is_active, &cache->space_info); + btrfs_add_bg_to_space_info(fs_info, cache); btrfs_update_global_block_rsv(fs_info); - link_block_group(cache); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(cache)) { + u64 new_bytes_used = size - bytes_used; + + cache->space_info->bytes_used += new_bytes_used >> 1; + fragment_free_space(cache); + } +#endif list_add_tail(&cache->bg_list, &trans->new_bgs); trans->delayed_ref_updates++; @@ -2869,7 +2848,7 @@ again: cache_size *= fs_info->sectorsize; ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, - cache_size); + cache_size, false); if (ret) goto out_put; @@ -3965,35 +3944,24 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, void btrfs_put_block_group_cache(struct btrfs_fs_info *info) { struct btrfs_block_group *block_group; - u64 last = 0; - while (1) { - struct inode *inode; + block_group = btrfs_lookup_first_block_group(info, 0); + while (block_group) { + btrfs_wait_block_group_cache_done(block_group); + spin_lock(&block_group->lock); + if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, + &block_group->runtime_flags)) { + struct inode *inode = block_group->inode; - block_group = btrfs_lookup_first_block_group(info, last); - while (block_group) { - btrfs_wait_block_group_cache_done(block_group); - spin_lock(&block_group->lock); - if (block_group->iref) - break; + block_group->inode = NULL; spin_unlock(&block_group->lock); - block_group = btrfs_next_block_group(block_group); - } - if (!block_group) { - if (last == 0) - break; - last = 0; - continue; - } - inode = block_group->inode; - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - ASSERT(block_group->io_ctl.inode == NULL); - iput(inode); - last = block_group->start + block_group->length; - btrfs_put_block_group(block_group); + ASSERT(block_group->io_ctl.inode == NULL); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + block_group = btrfs_next_block_group(block_group); } } @@ -4129,7 +4097,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) spin_lock(&block_group->lock); cleanup = (atomic_dec_and_test(&block_group->frozen) && - block_group->removed); + test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)); spin_unlock(&block_group->lock); if (cleanup) { @@ -4150,7 +4118,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) * tasks trimming this block group have left 1 entry each one. * Free them if any. */ - __btrfs_remove_free_space_cache(block_group->free_space_ctl); + btrfs_remove_free_space_cache(block_group); } } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 6b3cdc4cbc41..8fb14b99a1d1 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -46,19 +46,44 @@ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_FORCE_FOR_EXTENT, }; +/* Block group flags set at runtime */ +enum btrfs_block_group_flags { + BLOCK_GROUP_FLAG_IREF, + BLOCK_GROUP_FLAG_REMOVED, + BLOCK_GROUP_FLAG_TO_COPY, + BLOCK_GROUP_FLAG_RELOCATING_REPAIR, + BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, + BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, +}; + +enum btrfs_caching_type { + BTRFS_CACHE_NO, + BTRFS_CACHE_STARTED, + BTRFS_CACHE_FINISHED, + BTRFS_CACHE_ERROR, +}; + struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; struct btrfs_work work; struct btrfs_block_group *block_group; - u64 progress; refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ #define CACHING_CTL_WAKE_UP SZ_2M +/* + * Tree to record all locked full stripes of a RAID5/6 block group + */ +struct btrfs_full_stripe_locks_tree { + struct rb_root root; + struct mutex lock; +}; + struct btrfs_block_group { struct btrfs_fs_info *fs_info; struct inode *inode; @@ -95,23 +120,15 @@ struct btrfs_block_group { /* For raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; + unsigned long runtime_flags; unsigned int ro; - unsigned int iref:1; - unsigned int has_caching_ctl:1; - unsigned int removed:1; - unsigned int to_copy:1; - unsigned int relocating_repair:1; - unsigned int chunk_item_inserted:1; - unsigned int zone_is_active:1; - unsigned int zoned_data_reloc_ongoing:1; int disk_cache_state; /* Cache tracking stuff */ int cached; struct btrfs_caching_control *caching_ctl; - u64 last_byte_to_unpin; struct btrfs_space_info *space_info; @@ -305,8 +322,6 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); -void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, - struct btrfs_caching_control *caching_ctl); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, struct block_device *bdev, u64 physical, u64 **logical, int *naddrs, int *stripe_len); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 06be0644dd37..ec96285357e0 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -286,7 +286,7 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, */ if (block_rsv == delayed_rsv) target = global_rsv; - else if (block_rsv != global_rsv && !delayed_rsv->full) + else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv)) target = delayed_rsv; if (target && block_rsv->space_info != target->space_info) @@ -424,6 +424,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_CSUM_TREE_OBJECTID: case BTRFS_EXTENT_TREE_OBJECTID: case BTRFS_FREE_SPACE_TREE_OBJECTID: + case BTRFS_BLOCK_GROUP_TREE_OBJECTID: root->block_rsv = &fs_info->delayed_refs_rsv; break; case BTRFS_ROOT_TREE_OBJECTID: diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 0c183709be00..578c3497a455 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -92,4 +92,13 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL); } +/* + * Fast path to check if the reserve is full, may be carefully used outside of + * locks. + */ +static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv) +{ + return data_race(rsv->full); +} + #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b160b8e124e0..54c2ccb36b61 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -65,6 +65,8 @@ enum { * on the same file. */ BTRFS_INODE_VERITY_IN_PROGRESS, + /* Set when this inode is a free space inode. */ + BTRFS_INODE_FREE_SPACE_INODE, }; /* in memory btrfs inode */ @@ -94,7 +96,8 @@ struct btrfs_inode { /* special utility tree used to record which mirrors have already been * tried when checksums fail for a given block */ - struct extent_io_tree io_failure_tree; + struct rb_root io_failure_tree; + spinlock_t io_failure_lock; /* * Keep track of where the inode has extent items mapped in order to @@ -250,11 +253,6 @@ struct btrfs_inode { struct inode vfs_inode; }; -static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode) -{ - return inode->root->fs_info->sectorsize; -} - static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); @@ -272,13 +270,6 @@ static inline unsigned long btrfs_inode_hash(u64 objectid, return (unsigned long)h; } -static inline void btrfs_insert_inode_hash(struct inode *inode) -{ - unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root); - - __insert_inode_hash(inode, h); -} - #if BITS_PER_LONG == 32 /* @@ -312,13 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) { - struct btrfs_root *root = inode->root; - - if (root == root->fs_info->tree_root && - btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) - return true; - - return false; + return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags); } static inline bool is_data_inode(struct inode *inode) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e84d22c5c6a8..e6635fe70067 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -8,6 +8,7 @@ #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/time.h> @@ -15,6 +16,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/writeback.h> +#include <linux/psi.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/log2.h> @@ -152,9 +154,7 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) } /* Do io completion on the original bio */ - if (cb->status != BLK_STS_OK) - cb->orig_bio->bi_status = cb->status; - bio_endio(cb->orig_bio); + btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status); /* Finally free the cb struct */ kfree(cb->compressed_pages); @@ -166,16 +166,15 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) * before decompressing it into the original bio and freeing the uncompressed * pages. */ -static void end_compressed_bio_read(struct bio *bio) +static void end_compressed_bio_read(struct btrfs_bio *bbio) { - struct compressed_bio *cb = bio->bi_private; + struct compressed_bio *cb = bbio->private; struct inode *inode = cb->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_inode *bi = BTRFS_I(inode); bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - blk_status_t status = bio->bi_status; - struct btrfs_bio *bbio = btrfs_bio(bio); + blk_status_t status = bbio->bio.bi_status; struct bvec_iter iter; struct bio_vec bv; u32 offset; @@ -186,9 +185,8 @@ static void end_compressed_bio_read(struct bio *bio) if (!status && (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, bv.bv_offset))) { - clean_io_failure(fs_info, &bi->io_failure_tree, - &bi->io_tree, start, bv.bv_page, - btrfs_ino(bi), bv.bv_offset); + btrfs_clean_io_failure(bi, start, bv.bv_page, + bv.bv_offset); } else { int ret; @@ -209,7 +207,7 @@ static void end_compressed_bio_read(struct bio *bio) if (refcount_dec_and_test(&cb->pending_ios)) finish_compressed_bio_read(cb); btrfs_bio_free_csum(bbio); - bio_put(bio); + bio_put(&bbio->bio); } /* @@ -222,8 +220,7 @@ static noinline void end_compressed_writeback(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; - struct page *pages[16]; - unsigned long nr_pages = end_index - index + 1; + struct folio_batch fbatch; const int errno = blk_status_to_errno(cb->status); int i; int ret; @@ -231,24 +228,23 @@ static noinline void end_compressed_writeback(struct inode *inode, if (errno) mapping_set_error(inode->i_mapping, errno); - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - nr_pages -= 1; - index += 1; - continue; - } + folio_batch_init(&fbatch); + while (index <= end_index) { + ret = filemap_get_folios(inode->i_mapping, &index, end_index, + &fbatch); + + if (ret == 0) + return; + for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (errno) - SetPageError(pages[i]); - btrfs_page_clamp_clear_writeback(fs_info, pages[i], + folio_set_error(folio); + btrfs_page_clamp_clear_writeback(fs_info, &folio->page, cb->start, cb->len); - put_page(pages[i]); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); } /* the inode may be gone now */ } @@ -301,20 +297,20 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) * This also calls the writeback end hooks for the file pages so that metadata * and checksums can be updated in the file. */ -static void end_compressed_bio_write(struct bio *bio) +static void end_compressed_bio_write(struct btrfs_bio *bbio) { - struct compressed_bio *cb = bio->bi_private; + struct compressed_bio *cb = bbio->private; - if (bio->bi_status) - cb->status = bio->bi_status; + if (bbio->bio.bi_status) + cb->status = bbio->bio.bi_status; if (refcount_dec_and_test(&cb->pending_ios)) { struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - btrfs_record_physical_zoned(cb->inode, cb->start, bio); + btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio); queue_work(fs_info->compressed_write_workers, &cb->write_end_work); } - bio_put(bio); + bio_put(&bbio->bio); } /* @@ -335,7 +331,8 @@ static void end_compressed_bio_write(struct bio *bio) static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - blk_opf_t opf, bio_end_io_t endio_func, + blk_opf_t opf, + btrfs_bio_end_io_t endio_func, u64 *next_stripe_start) { struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); @@ -344,12 +341,8 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte struct bio *bio; int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS); - + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb); bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - bio->bi_opf = opf; - bio->bi_private = cb; - bio->bi_end_io = endio_func; em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); if (IS_ERR(em)) { @@ -478,8 +471,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, start, true); if (ret) { - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), ret); break; } } @@ -519,7 +511,8 @@ static u64 bio_end_offset(struct bio *bio) */ static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, - struct compressed_bio *cb) + struct compressed_bio *cb, + int *memstall, unsigned long *pflags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; @@ -588,6 +581,11 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } + if (!*memstall && PageWorkingset(page)) { + psi_memstall_enter(pflags); + *memstall = 1; + } + ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); @@ -596,7 +594,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, } page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; - lock_extent(tree, cur, page_end); + lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); @@ -610,7 +608,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, (cur + fs_info->sectorsize > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); - unlock_extent(tree, cur, page_end); + unlock_extent(tree, cur, page_end, NULL); unlock_page(page); put_page(page); break; @@ -630,7 +628,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, add_size = min(em->start + em->len, page_end + 1) - cur; ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); if (ret != add_size) { - unlock_extent(tree, cur, page_end); + unlock_extent(tree, cur, page_end, NULL); unlock_page(page); put_page(page); break; @@ -674,6 +672,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; + unsigned long pflags; + int memstall = 0; blk_status_t ret; int ret2; int i; @@ -729,7 +729,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto fail; } - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags); /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; @@ -799,8 +799,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); if (ret) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); + btrfs_bio_end_io(btrfs_bio(comp_bio), ret); break; } @@ -810,6 +809,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } } + if (memstall) + psi_memstall_leave(&pflags); + if (refcount_dec_and_test(&cb->pending_ios)) finish_compressed_bio_read(cb); return; @@ -826,8 +828,7 @@ fail: kfree(cb); out: free_extent_map(em); - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), ret); return; } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ebfa35fe1c38..dcb510f38dda 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -114,6 +114,22 @@ noinline void btrfs_release_path(struct btrfs_path *p) } /* + * We want the transaction abort to print stack trace only for errors where the + * cause could be a bug, eg. due to ENOSPC, and not for common errors that are + * caused by external factors. + */ +bool __cold abort_should_print_stack(int errno) +{ + switch (errno) { + case -EIO: + case -EROFS: + case -ENOMEM: + return false; + } + return true; +} + +/* * safely gets a reference on the root node of a tree. A lock * is not taken, so a concurrent writer may put a different node * at the root of the tree. See btrfs_lock_root_node for the @@ -1447,6 +1463,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, return 0; } + if (p->nowait) { + free_extent_buffer(tmp); + return -EAGAIN; + } + if (unlock_up) btrfs_unlock_up_safe(p, level + 1); @@ -1467,6 +1488,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, ret = -EAGAIN; goto out; + } else if (p->nowait) { + return -EAGAIN; } if (unlock_up) { @@ -1634,7 +1657,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = btrfs_read_lock_root_node(root); + if (p->nowait) { + b = btrfs_try_read_lock_root_node(root); + if (IS_ERR(b)) + return b; + } else { + b = btrfs_read_lock_root_node(root); + } level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -1910,6 +1939,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, WARN_ON(p->nodes[0] != NULL); BUG_ON(!cow && ins_len); + /* + * For now only allow nowait for read only operations. There's no + * strict reason why we can't, we just only need it for reads so it's + * only implemented for reads. + */ + ASSERT(!p->nowait || !cow); + if (ins_len < 0) { lowest_unlock = 2; @@ -1936,7 +1972,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (p->need_commit_sem) { ASSERT(p->search_commit_root); - down_read(&fs_info->commit_root_sem); + if (p->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) + return -EAGAIN; + } else { + down_read(&fs_info->commit_root_sem); + } } again: @@ -2082,7 +2123,15 @@ cow_done: btrfs_tree_lock(b); p->locks[level] = BTRFS_WRITE_LOCK; } else { - btrfs_tree_read_lock(b); + if (p->nowait) { + if (!btrfs_try_tree_read_lock(b)) { + free_extent_buffer(b); + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(b); + } p->locks[level] = BTRFS_READ_LOCK; } p->nodes[level] = b; @@ -2131,6 +2180,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, lowest_level = p->lowest_level; WARN_ON(p->nodes[0] != NULL); + ASSERT(!p->nowait); if (p->search_commit_root) { BUG_ON(time_seq); @@ -4432,6 +4482,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int ret = 1; int keep_locks = path->keep_locks; + ASSERT(!path->nowait); path->keep_locks = 1; again: cur = btrfs_read_lock_root_node(root); @@ -4612,6 +4663,13 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int ret; int i; + /* + * The nowait semantics are used only for write paths, where we don't + * use the tree mod log and sequence numbers. + */ + if (time_seq) + ASSERT(!path->nowait); + nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; @@ -4630,7 +4688,14 @@ again: if (path->need_commit_sem) { path->need_commit_sem = 0; need_commit_sem = true; - down_read(&fs_info->commit_root_sem); + if (path->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) { + ret = -EAGAIN; + goto done; + } + } else { + down_read(&fs_info->commit_root_sem); + } } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); } @@ -4706,7 +4771,7 @@ again: next = c; ret = read_block_for_search(root, path, &next, level, slot, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4716,6 +4781,10 @@ again: if (!path->skip_locking) { ret = btrfs_try_tree_read_lock(next); + if (!ret && path->nowait) { + ret = -EAGAIN; + goto done; + } if (!ret && time_seq) { /* * If we don't get the lock, we may be racing @@ -4746,7 +4815,7 @@ again: ret = read_block_for_search(root, path, &next, level, 0, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4754,8 +4823,16 @@ again: goto done; } - if (!path->skip_locking) - btrfs_tree_read_lock(next); + if (!path->skip_locking) { + if (path->nowait) { + if (!btrfs_try_tree_read_lock(next)) { + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(next); + } + } } ret = 0; done: diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index df8c99c99df9..9e6d48ff4597 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -42,7 +42,6 @@ struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; extern struct kmem_cache *btrfs_trans_handle_cachep; -extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; @@ -50,6 +49,11 @@ struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; struct btrfs_ioctl_encoded_io_args; +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_balance_control; +struct btrfs_delayed_root; +struct reloc_control; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ @@ -280,14 +284,9 @@ struct btrfs_super_block { /* the UUID written into btree blocks */ u8 metadata_uuid[BTRFS_FSID_SIZE]; - /* Extent tree v2 */ - __le64 block_group_root; - __le64 block_group_root_generation; - u8 block_group_root_level; - /* future expansion */ - u8 reserved8[7]; - __le64 reserved[25]; + u8 reserved8[8]; + __le64 reserved[27]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; @@ -307,7 +306,8 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); #define BTRFS_FEATURE_COMPAT_RO_SUPP \ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ - BTRFS_FEATURE_COMPAT_RO_VERITY) + BTRFS_FEATURE_COMPAT_RO_VERITY | \ + BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL @@ -443,9 +443,10 @@ struct btrfs_path { * header (ie. sizeof(struct btrfs_item) is not included). */ unsigned int search_for_extension:1; + /* Stop search if any locks need to be taken (for read) */ + unsigned int nowait:1; }; -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ - sizeof(struct btrfs_item)) + struct btrfs_dev_replace { u64 replace_state; /* see #define above */ time64_t time_started; /* seconds since 1-Jan-1970 */ @@ -502,21 +503,6 @@ struct btrfs_free_cluster { struct list_head block_group_list; }; -enum btrfs_caching_type { - BTRFS_CACHE_NO, - BTRFS_CACHE_STARTED, - BTRFS_CACHE_FINISHED, - BTRFS_CACHE_ERROR, -}; - -/* - * Tree to record all locked full stripes of a RAID5/6 block group - */ -struct btrfs_full_stripe_locks_tree { - struct rb_root root; - struct mutex lock; -}; - /* Discard control. */ /* * Async discard uses multiple lists to differentiate the discard filter @@ -548,42 +534,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); - -/* fs_info */ -struct reloc_control; -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_balance_control; -struct btrfs_delayed_root; - -/* - * Block group or device which contains an active swapfile. Used for preventing - * unsafe operations while a swapfile is active. - * - * These are sorted on (ptr, inode) (note that a block group or device can - * contain more than one swapfile). We compare the pointer values because we - * don't actually care what the object is, we just need a quick check whether - * the object exists in the rbtree. - */ -struct btrfs_swapfile_pin { - struct rb_node node; - void *ptr; - struct inode *inode; - /* - * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr - * points to a struct btrfs_device. - */ - bool is_block_group; - /* - * Only used when 'is_block_group' is true and it is the number of - * extents used by a swapfile for this block group ('ptr' field). - */ - int bg_extent_count; -}; - -bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); - enum { BTRFS_FS_CLOSING_START, BTRFS_FS_CLOSING_DONE, @@ -890,6 +840,7 @@ struct btrfs_fs_info { struct kobject *space_info_kobj; struct kobject *qgroups_kobj; + struct kobject *discard_kobj; /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; @@ -1005,6 +956,7 @@ struct btrfs_fs_info { struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ + u8 qgroup_drop_subtree_thres; /* filesystem state */ unsigned long fs_state; @@ -1092,6 +1044,23 @@ struct btrfs_fs_info { /* Updates are not protected by any lock */ struct btrfs_commit_stats commit_stats; + /* + * Last generation where we dropped a non-relocation root. + * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() + * to change it and to read it, respectively. + */ + u64 last_root_drop_gen; + + /* + * Annotations for transaction events (structures are empty when + * compiled without lockdep). + */ + struct lockdep_map btrfs_trans_num_writers_map; + struct lockdep_map btrfs_trans_num_extwriters_map; + struct lockdep_map btrfs_state_change_map[4]; + struct lockdep_map btrfs_trans_pending_ordered_map; + struct lockdep_map btrfs_ordered_extent_map; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1099,7 +1068,6 @@ struct btrfs_fs_info { #ifdef CONFIG_BTRFS_DEBUG struct kobject *debug_kobj; - struct kobject *discard_debug_kobj; struct list_head allocated_roots; spinlock_t eb_leak_lock; @@ -1107,12 +1075,85 @@ struct btrfs_fs_info { #endif }; +static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, + u64 gen) +{ + WRITE_ONCE(fs_info->last_root_drop_gen, gen); +} + +static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_root_drop_gen); +} + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } /* + * Take the number of bytes to be checksummed and figure out how many leaves + * it would require to store the csums for that many bytes. + */ +static inline u64 btrfs_csum_bytes_to_leaves( + const struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; + + return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); +} + +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; +} + +/* + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. + */ +static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; +} + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ + sizeof(struct btrfs_item)) + +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return fs_info->zone_size > 0; +} + +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + +/* * The state of btrfs root */ enum { @@ -1174,6 +1215,82 @@ enum { BTRFS_ROOT_RESET_LOCKDEP_CLASS, }; +enum btrfs_lockdep_trans_states { + BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_UNBLOCKED, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, + BTRFS_LOCKDEP_TRANS_COMPLETED, +}; + +/* + * Lockdep annotation for wait events. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * This macro is used to annotate a wait event. In this case a thread acquires + * the lockdep map as writer (exclusive lock) because it has to block until all + * the threads that hold the lock as readers signal the condition for the wait + * event and release their locks. + */ +#define btrfs_might_wait_for_event(owner, lock) \ + do { \ + rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ + rwsem_release(&owner->lock##_map, _THIS_IP_); \ + } while (0) + +/* + * Protection for the resource/condition of a wait event. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * Many threads can modify the condition for the wait event at the same time + * and signal the threads that block on the wait event. The threads that modify + * the condition and do the signaling acquire the lock as readers (shared + * lock). + */ +#define btrfs_lockdep_acquire(owner, lock) \ + rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) + +/* + * Used after signaling the condition for a wait event to release the lockdep + * map held by a reader thread. + */ +#define btrfs_lockdep_release(owner, lock) \ + rwsem_release(&owner->lock##_map, _THIS_IP_) + +/* + * Macros for the transaction states wait events, similar to the generic wait + * event macros. + */ +#define btrfs_might_wait_for_state(owner, i) \ + do { \ + rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ + } while (0) + +#define btrfs_trans_state_lockdep_acquire(owner, i) \ + rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) + +#define btrfs_trans_state_lockdep_release(owner, i) \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) + +/* Initialization of the lockdep map */ +#define btrfs_lockdep_init_map(owner, lock) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ + } while (0) + +/* Initialization of the transaction states lockdep maps. */ +#define btrfs_state_lockdep_init_map(owner, lock, state) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ + &lock##_key, 0); \ + } while (0) + static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) { clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); @@ -2391,17 +2508,6 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); -/* - * For extent tree v2 we overload the extent root with the block group root, as - * we will have multiple extent roots. - */ -BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup, - extent_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup, - extent_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level, - struct btrfs_root_backup, extent_root_level, 8); - /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); @@ -2534,13 +2640,6 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block, - block_group_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation, - struct btrfs_super_block, - block_group_root_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block, - block_group_root_level, 8); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); @@ -2761,45 +2860,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); -static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, - u64 offset) -{ - u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; - - return csums + offset_in_sectors * fs_info->csum_size; -} - -/* - * Take the number of bytes to be checksummed and figure out how many leaves - * it would require to store the csums for that many bytes. - */ -static inline u64 btrfs_csum_bytes_to_leaves( - const struct btrfs_fs_info *fs_info, u64 csum_bytes) -{ - const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; - - return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); -} - -/* - * Use this if we would be adding new items, as we could split nodes as we cow - * down the tree. - */ -static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; -} - -/* - * Doing a truncate or a modification won't result in new nodes or leaves, just - * what we need for COW. - */ -static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; -} int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes); @@ -3257,12 +3317,9 @@ int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); -int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 pos, - u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes, u64 offset, u64 ram_bytes, - u8 compression, u8 encryption, u16 other_encoding); +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 pos, + u64 num_bytes); int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -3273,7 +3330,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, u64 offset, bool one_ordered); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit); + struct list_head *list, int search_commit, + bool nowait); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, @@ -3299,11 +3357,9 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u64 start, u64 end); int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff); -struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, - u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool strict); + u64 *ram_bytes, bool nowait, bool strict); void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); @@ -3358,7 +3414,6 @@ void btrfs_split_delalloc_extent(struct inode *inode, void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); void btrfs_evict_inode(struct inode *inode); -int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); void btrfs_free_inode(struct inode *inode); @@ -3407,7 +3462,10 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); -ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); extern const struct dentry_operations btrfs_dentry_operations; @@ -3439,15 +3497,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op); - /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3457,8 +3506,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, - int skip_pinned); extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, @@ -3478,8 +3525,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes); + size_t *write_bytes, bool nowait); void btrfs_check_nocow_unlock(struct btrfs_inode *inode); +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + u64 *delalloc_start_ret, u64 *delalloc_end_ret); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -3745,21 +3794,26 @@ const char * __attribute_const__ btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno); + unsigned int line, int errno, bool first_hit); + +bool __cold abort_should_print_stack(int errno); /* * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact line number is reported. + * detected, that way the exact stack trace is reported for some errors. */ #define btrfs_abort_transaction(trans, errno) \ do { \ + bool first = false; \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ - if ((errno) != -EIO && (errno) != -EROFS) { \ - WARN(1, KERN_DEBUG \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_DEBUG \ "BTRFS: Transaction aborted (error %d)\n", \ - (errno)); \ + (errno))) { \ + /* Stack trace printed. */ \ } else { \ btrfs_debug((trans)->fs_info, \ "Transaction aborted (error %d)", \ @@ -3767,7 +3821,7 @@ do { \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno)); \ + __LINE__, (errno), first); \ } while (0) #ifdef CONFIG_PRINTK_INDEX @@ -3984,16 +4038,9 @@ int btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel_dev(struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); -static inline void btrfs_init_full_stripe_locks_tree( - struct btrfs_full_stripe_locks_tree *locks_root) -{ - locks_root->root = RB_ROOT; - mutex_init(&locks_root->lock); -} /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) @@ -4020,6 +4067,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, encryption, 8); @@ -4037,6 +4085,12 @@ static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) return 0; } +static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + return -EPERM; +} + #endif /* Sanity test specific functions */ @@ -4053,24 +4107,6 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) } #endif -static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) -{ - return fs_info->zone_size > 0; -} - -/* - * Count how many fs_info->max_extent_size cover the @size - */ -static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) -{ -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (!fs_info) - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); -#endif - - return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); -} - static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 1e8f17ff829e..118b2e20b2e1 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -127,9 +127,11 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) } int btrfs_check_data_free_space(struct btrfs_inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) + struct extent_changeset **reserved, u64 start, + u64 len, bool noflush) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; int ret; /* align the range */ @@ -137,7 +139,12 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - ret = btrfs_alloc_data_chunk_ondemand(inode, len); + if (noflush) + flush = BTRFS_RESERVE_NO_FLUSH; + else if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; + + ret = btrfs_reserve_data_bytes(fs_info, len, flush); if (ret < 0) return ret; @@ -454,7 +461,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, { int ret; - ret = btrfs_check_data_free_space(inode, reserved, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len, false); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(inode, len, len, false); diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 28bf5c3ef430..e07d46043455 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -7,7 +7,8 @@ struct extent_changeset; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct btrfs_inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); + struct extent_changeset **reserved, u64 start, u64 len, + bool noflush); void btrfs_free_reserved_data_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index e7f34871a132..cac5169eaf8d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -302,15 +302,21 @@ static inline void btrfs_release_prepared_delayed_node( __btrfs_release_delayed_node(node, 1); } -static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) +static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, + struct btrfs_delayed_node *node, + enum btrfs_delayed_item_type type) { struct btrfs_delayed_item *item; + item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); if (item) { item->data_len = data_len; - item->ins_or_del = 0; + item->type = type; item->bytes_reserved = 0; - item->delayed_node = NULL; + item->delayed_node = node; + RB_CLEAR_NODE(&item->rb_node); + INIT_LIST_HEAD(&item->log_list); + item->logged = false; refcount_set(&item->refs, 1); } return item; @@ -319,72 +325,32 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) /* * __btrfs_lookup_delayed_item - look up the delayed item by key * @delayed_node: pointer to the delayed node - * @key: the key to look up - * @prev: used to store the prev item if the right item isn't found - * @next: used to store the next item if the right item isn't found + * @index: the dir index value to lookup (offset of a dir index key) * * Note: if we don't find the right item, we will return the prev item and * the next item. */ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( struct rb_root *root, - struct btrfs_key *key, - struct btrfs_delayed_item **prev, - struct btrfs_delayed_item **next) + u64 index) { - struct rb_node *node, *prev_node = NULL; + struct rb_node *node = root->rb_node; struct btrfs_delayed_item *delayed_item = NULL; - int ret = 0; - - node = root->rb_node; while (node) { delayed_item = rb_entry(node, struct btrfs_delayed_item, rb_node); - prev_node = node; - ret = btrfs_comp_cpu_keys(&delayed_item->key, key); - if (ret < 0) + if (delayed_item->index < index) node = node->rb_right; - else if (ret > 0) + else if (delayed_item->index > index) node = node->rb_left; else return delayed_item; } - if (prev) { - if (!prev_node) - *prev = NULL; - else if (ret < 0) - *prev = delayed_item; - else if ((node = rb_prev(prev_node)) != NULL) { - *prev = rb_entry(node, struct btrfs_delayed_item, - rb_node); - } else - *prev = NULL; - } - - if (next) { - if (!prev_node) - *next = NULL; - else if (ret > 0) - *next = delayed_item; - else if ((node = rb_next(prev_node)) != NULL) { - *next = rb_entry(node, struct btrfs_delayed_item, - rb_node); - } else - *next = NULL; - } return NULL; } -static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key, - NULL, NULL); -} - static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct btrfs_delayed_item *ins) { @@ -392,15 +358,13 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct rb_node *parent_node = NULL; struct rb_root_cached *root; struct btrfs_delayed_item *item; - int cmp; bool leftmost = true; - if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) + if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; - else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) - root = &delayed_node->del_root; else - BUG(); + root = &delayed_node->del_root; + p = &root->rb_root.rb_node; node = &ins->rb_node; @@ -409,11 +373,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, item = rb_entry(parent_node, struct btrfs_delayed_item, rb_node); - cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); - if (cmp < 0) { + if (item->index < ins->index) { p = &(*p)->rb_right; leftmost = false; - } else if (cmp > 0) { + } else if (item->index > ins->index) { p = &(*p)->rb_left; } else { return -EEXIST; @@ -422,14 +385,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, rb_link_node(node, parent_node, p); rb_insert_color_cached(node, root, leftmost); - ins->delayed_node = delayed_node; - - /* Delayed items are always for dir index items. */ - ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY); - if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM && - ins->key.offset >= delayed_node->index_cnt) - delayed_node->index_cnt = ins->key.offset + 1; + if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && + ins->index >= delayed_node->index_cnt) + delayed_node->index_cnt = ins->index + 1; delayed_node->count++; atomic_inc(&delayed_node->root->fs_info->delayed_root->items); @@ -451,21 +410,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; - /* Not associated with any delayed_node */ - if (!delayed_item->delayed_node) + /* Not inserted, ignore it. */ + if (RB_EMPTY_NODE(&delayed_item->rb_node)) return; + delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); - BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM && - delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM); - if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) + if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_item->delayed_node->ins_root; else root = &delayed_item->delayed_node->del_root; rb_erase_cached(&delayed_item->rb_node, root); + RB_CLEAR_NODE(&delayed_item->rb_node); delayed_item->delayed_node->count--; finish_one_item(delayed_root); @@ -520,12 +479,11 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item( } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_delayed_item *item) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; u64 num_bytes; int ret; @@ -545,14 +503,14 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_item", - item->key.objectid, + item->delayed_node->inode_id, num_bytes, 1); /* * For insertions we track reserved metadata space by accounting * for the number of leaves that will be used, based on the delayed * node's index_items_size field. */ - if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) + if (item->type == BTRFS_DELAYED_DELETION_ITEM) item->bytes_reserved = num_bytes; } @@ -574,8 +532,8 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, * to release/reserve qgroup space. */ trace_btrfs_space_reservation(fs_info, "delayed_item", - item->key.objectid, item->bytes_reserved, - 0); + item->delayed_node->inode_id, + item->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); } @@ -688,6 +646,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_delayed_item *next; const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_item_batch batch; + struct btrfs_key first_key; + const u32 first_data_size = first_item->data_len; int total_size; char *ins_data = NULL; int ret; @@ -716,9 +676,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, ASSERT(first_item->bytes_reserved == 0); list_add_tail(&first_item->tree_list, &item_list); - batch.total_data_size = first_item->data_len; + batch.total_data_size = first_data_size; batch.nr = 1; - total_size = first_item->data_len + sizeof(struct btrfs_item); + total_size = first_data_size + sizeof(struct btrfs_item); curr = first_item; while (true) { @@ -732,8 +692,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, * We cannot allow gaps in the key space if we're doing log * replay. */ - if (continuous_keys_only && - (next->key.offset != curr->key.offset + 1)) + if (continuous_keys_only && (next->index != curr->index + 1)) break; ASSERT(next->bytes_reserved == 0); @@ -750,8 +709,11 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, } if (batch.nr == 1) { - batch.keys = &first_item->key; - batch.data_sizes = &first_item->data_len; + first_key.objectid = node->inode_id; + first_key.type = BTRFS_DIR_INDEX_KEY; + first_key.offset = first_item->index; + batch.keys = &first_key; + batch.data_sizes = &first_data_size; } else { struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -768,7 +730,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, batch.keys = ins_keys; batch.data_sizes = ins_sizes; list_for_each_entry(curr, &item_list, tree_list) { - ins_keys[i] = curr->key; + ins_keys[i].objectid = node->inode_id; + ins_keys[i].type = BTRFS_DIR_INDEX_KEY; + ins_keys[i].offset = curr->index; ins_sizes[i] = curr->data_len; i++; } @@ -864,6 +828,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { + const u64 ino = item->delayed_node->inode_id; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; struct extent_buffer *leaf = path->nodes[0]; @@ -902,7 +867,9 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, slot++; btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_comp_cpu_keys(&next->key, &key) != 0) + if (key.objectid != ino || + key.type != BTRFS_DIR_INDEX_KEY || + key.offset != next->index) break; nitems++; curr = next; @@ -920,9 +887,8 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, * Check btrfs_delayed_item_reserve_metadata() to see why we * don't need to release/reserve qgroup space. */ - trace_btrfs_space_reservation(fs_info, "delayed_item", - item->key.objectid, total_reserved_size, - 0); + trace_btrfs_space_reservation(fs_info, "delayed_item", ino, + total_reserved_size, 0); btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, total_reserved_size, NULL); } @@ -940,8 +906,12 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_delayed_node *node) { + struct btrfs_key key; int ret = 0; + key.objectid = node->inode_id; + key.type = BTRFS_DIR_INDEX_KEY; + while (ret == 0) { struct btrfs_delayed_item *item; @@ -952,7 +922,8 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, break; } - ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1); + key.offset = item->index; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { /* * There's no matching item in the leaf. This means we @@ -1457,16 +1428,15 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); - delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len); + delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len, + delayed_node, + BTRFS_DELAYED_INSERTION_ITEM); if (!delayed_item) { ret = -ENOMEM; goto release_node; } - delayed_item->key.objectid = btrfs_ino(dir); - delayed_item->key.type = BTRFS_DIR_INDEX_KEY; - delayed_item->key.offset = index; - delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM; + delayed_item->index = index; dir_item = (struct btrfs_dir_item *)delayed_item->data; dir_item->location = *disk_key; @@ -1490,8 +1460,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, } if (reserve_leaf_space) { - ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, - delayed_item); + ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item); /* * Space was reserved for a dir index item insertion when we * started the transaction, so getting a failure here should be @@ -1538,12 +1507,12 @@ release_node: static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, struct btrfs_delayed_node *node, - struct btrfs_key *key) + u64 index) { struct btrfs_delayed_item *item; mutex_lock(&node->mutex); - item = __btrfs_lookup_delayed_insertion_item(node, key); + item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index); if (!item) { mutex_unlock(&node->mutex); return 1; @@ -1589,32 +1558,25 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, { struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; - struct btrfs_key item_key; int ret; node = btrfs_get_or_create_delayed_node(dir); if (IS_ERR(node)) return PTR_ERR(node); - item_key.objectid = btrfs_ino(dir); - item_key.type = BTRFS_DIR_INDEX_KEY; - item_key.offset = index; - - ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, - &item_key); + ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); if (!ret) goto end; - item = btrfs_alloc_delayed_item(0); + item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM); if (!item) { ret = -ENOMEM; goto end; } - item->key = item_key; - item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM; + item->index = index; - ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item); + ret = btrfs_delayed_item_reserve_metadata(trans, item); /* * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible. @@ -1743,9 +1705,9 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, int ret = 0; list_for_each_entry(curr, del_list, readdir_list) { - if (curr->key.offset > index) + if (curr->index > index) break; - if (curr->key.offset == index) { + if (curr->index == index) { ret = 1; break; } @@ -1779,13 +1741,13 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (curr->key.offset < ctx->pos) { + if (curr->index < ctx->pos) { if (refcount_dec_and_test(&curr->refs)) kfree(curr); continue; } - ctx->pos = curr->key.offset; + ctx->pos = curr->index; di = (struct btrfs_dir_item *)curr->data; name = (char *)(di + 1); @@ -2085,3 +2047,113 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) } } +void btrfs_log_get_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + + node = btrfs_get_delayed_node(inode); + if (!node) + return; + + mutex_lock(&node->mutex); + item = __btrfs_first_delayed_insertion_item(node); + while (item) { + /* + * It's possible that the item is already in a log list. This + * can happen in case two tasks are trying to log the same + * directory. For example if we have tasks A and task B: + * + * Task A collected the delayed items into a log list while + * under the inode's log_mutex (at btrfs_log_inode()), but it + * only releases the items after logging the inodes they point + * to (if they are new inodes), which happens after unlocking + * the log mutex; + * + * Task B enters btrfs_log_inode() and acquires the log_mutex + * of the same directory inode, before task B releases the + * delayed items. This can happen for example when logging some + * inode we need to trigger logging of its parent directory, so + * logging two files that have the same parent directory can + * lead to this. + * + * If this happens, just ignore delayed items already in a log + * list. All the tasks logging the directory are under a log + * transaction and whichever finishes first can not sync the log + * before the other completes and leaves the log transaction. + */ + if (!item->logged && list_empty(&item->log_list)) { + refcount_inc(&item->refs); + list_add_tail(&item->log_list, ins_list); + } + item = __btrfs_next_delayed_item(item); + } + + item = __btrfs_first_delayed_deletion_item(node); + while (item) { + /* It may be non-empty, for the same reason mentioned above. */ + if (!item->logged && list_empty(&item->log_list)) { + refcount_inc(&item->refs); + list_add_tail(&item->log_list, del_list); + } + item = __btrfs_next_delayed_item(item); + } + mutex_unlock(&node->mutex); + + /* + * We are called during inode logging, which means the inode is in use + * and can not be evicted before we finish logging the inode. So we never + * have the last reference on the delayed inode. + * Also, we don't use btrfs_release_delayed_node() because that would + * requeue the delayed inode (change its order in the list of prepared + * nodes) and we don't want to do such change because we don't create or + * delete delayed items. + */ + ASSERT(refcount_read(&node->refs) > 1); + refcount_dec(&node->refs); +} + +void btrfs_log_put_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list) +{ + struct btrfs_delayed_node *node; + struct btrfs_delayed_item *item; + struct btrfs_delayed_item *next; + + node = btrfs_get_delayed_node(inode); + if (!node) + return; + + mutex_lock(&node->mutex); + + list_for_each_entry_safe(item, next, ins_list, log_list) { + item->logged = true; + list_del_init(&item->log_list); + if (refcount_dec_and_test(&item->refs)) + kfree(item); + } + + list_for_each_entry_safe(item, next, del_list, log_list) { + item->logged = true; + list_del_init(&item->log_list); + if (refcount_dec_and_test(&item->refs)) + kfree(item); + } + + mutex_unlock(&node->mutex); + + /* + * We are called during inode logging, which means the inode is in use + * and can not be evicted before we finish logging the inode. So we never + * have the last reference on the delayed inode. + * Also, we don't use btrfs_release_delayed_node() because that would + * requeue the delayed inode (change its order in the list of prepared + * nodes) and we don't want to do such change because we don't create or + * delete delayed items. + */ + ASSERT(refcount_read(&node->refs) > 1); + refcount_dec(&node->refs); +} diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 9795dc295a18..0163ca637a96 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -16,9 +16,10 @@ #include <linux/refcount.h> #include "ctree.h" -/* types of the delayed item */ -#define BTRFS_DELAYED_INSERTION_ITEM 1 -#define BTRFS_DELAYED_DELETION_ITEM 2 +enum btrfs_delayed_item_type { + BTRFS_DELAYED_INSERTION_ITEM, + BTRFS_DELAYED_DELETION_ITEM +}; struct btrfs_delayed_root { spinlock_t lock; @@ -73,14 +74,27 @@ struct btrfs_delayed_node { struct btrfs_delayed_item { struct rb_node rb_node; - struct btrfs_key key; + /* Offset value of the corresponding dir index key. */ + u64 index; struct list_head tree_list; /* used for batch insert/delete items */ struct list_head readdir_list; /* used for readdir items */ + /* + * Used when logging a directory. + * Insertions and deletions to this list are protected by the parent + * delayed node's mutex. + */ + struct list_head log_list; u64 bytes_reserved; struct btrfs_delayed_node *delayed_node; refcount_t refs; - int ins_or_del; - u32 data_len; + enum btrfs_delayed_item_type type:8; + /* + * Track if this delayed item was already logged. + * Protected by the mutex of the parent delayed inode. + */ + bool logged; + /* The maximum leaf size is 64K, so u16 is more than enough. */ + u16 data_len; char data[]; }; @@ -144,6 +158,14 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list); +/* Used during directory logging. */ +void btrfs_log_get_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list); +void btrfs_log_put_delayed_items(struct btrfs_inode *inode, + struct list_head *ins_list, + struct list_head *del_list); + /* for init */ int __init btrfs_delayed_inode_init(void); void __cold btrfs_delayed_inode_exit(void); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 41cddd3ff059..61e58066b5fd 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -545,10 +545,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, if (!cache) continue; - spin_lock(&cache->lock); - cache->to_copy = 1; - spin_unlock(&cache->lock); - + set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); btrfs_put_block_group(cache); } if (iter_ret < 0) @@ -577,7 +574,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, return true; spin_lock(&cache->lock); - if (cache->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { spin_unlock(&cache->lock); return true; } @@ -610,9 +607,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, } /* Last stripe on this device */ - spin_lock(&cache->lock); - cache->to_copy = 0; - spin_unlock(&cache->lock); + clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); return true; } @@ -1288,11 +1283,6 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) return 1; } -void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) -{ - percpu_counter_inc(&fs_info->dev_replace.bio_counter); -} - void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) { percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 3911049a5f23..6084b313056a 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -7,6 +7,10 @@ #define BTRFS_DEV_REPLACE_H struct btrfs_ioctl_dev_replace_args; +struct btrfs_fs_info; +struct btrfs_trans_handle; +struct btrfs_dev_replace; +struct btrfs_block_group; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1af28b066b42..d99bf7c64611 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -131,8 +131,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; - lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); + lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; @@ -145,8 +144,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 1; clear_extent_buffer_uptodate(eb); out: - unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); return ret; } @@ -167,11 +166,9 @@ static bool btrfs_supported_super_csum(u16 csum_type) * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. */ -static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, - char *raw_disk_sb) +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb) { - struct btrfs_super_block *disk_sb = - (struct btrfs_super_block *)raw_disk_sb; char result[BTRFS_CSUM_SIZE]; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); @@ -182,7 +179,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. */ - crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE, + crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); if (memcmp(disk_sb->csum, result, fs_info->csum_size)) @@ -647,16 +644,14 @@ static void run_one_async_start(struct btrfs_work *work) */ static void run_one_async_done(struct btrfs_work *work) { - struct async_submit_bio *async; - struct inode *inode; - - async = container_of(work, struct async_submit_bio, work); - inode = async->inode; + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct inode *inode = async->inode; + struct btrfs_bio *bbio = btrfs_bio(async->bio); /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { - async->bio->bi_status = async->status; - bio_endio(async->bio); + btrfs_bio_end_io(bbio, async->status); return; } @@ -757,6 +752,7 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t ret; bio->bi_opf |= REQ_META; @@ -776,8 +772,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ ret = btree_csum_one_bio(bio); if (ret) { - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(bbio, ret); return; } @@ -1524,6 +1519,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, if (objectid == BTRFS_UUID_TREE_OBJECTID) return btrfs_grab_root(fs_info->uuid_root) ? fs_info->uuid_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID) + return btrfs_grab_root(fs_info->block_group_root) ? + fs_info->block_group_root : ERR_PTR(-ENOENT); if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { struct btrfs_root *root = btrfs_global_root(fs_info, &key); @@ -1980,14 +1978,7 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) { - btrfs_set_backup_block_group_root(root_backup, - info->block_group_root->node->start); - btrfs_set_backup_block_group_root_gen(root_backup, - btrfs_header_generation(info->block_group_root->node)); - btrfs_set_backup_block_group_root_level(root_backup, - btrfs_header_level(info->block_group_root->node)); - } else { + if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) { struct btrfs_root *extent_root = btrfs_extent_root(info, 0); struct btrfs_root *csum_root = btrfs_csum_root(info, 0); @@ -2225,6 +2216,8 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) { struct inode *inode = fs_info->btree_inode; + unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, + fs_info->tree_root); inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; set_nlink(inode, 1); @@ -2238,8 +2231,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO, inode); - BTRFS_I(inode)->io_tree.track_uptodate = false; + IO_TREE_BTREE_INODE_IO, NULL); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); @@ -2247,7 +2239,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->location.type = 0; BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); - btrfs_insert_inode_hash(inode); + __insert_inode_hash(inode, hash); } static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) @@ -2266,6 +2258,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; + fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2529,10 +2522,24 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (ret) return ret; - location.objectid = BTRFS_DEV_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->block_group_root = root; + } + } + + location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { @@ -2544,7 +2551,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->dev_root = root; } /* Initialize fs_info for all devices in any case */ - btrfs_init_devices_late(fs_info); + ret = btrfs_init_devices_late(fs_info); + if (ret) + goto out; /* * This tree can share blocks with some other fs tree during relocation @@ -2600,8 +2609,8 @@ out: * 1, 2 2nd and 3rd backup copy * -1 skip bytenr check */ -static int validate_super(struct btrfs_fs_info *fs_info, - struct btrfs_super_block *sb, int mirror_num) +int btrfs_validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num) { u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); @@ -2703,6 +2712,18 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + /* + * Artificial requirement for block-group-tree to force newer features + * (free-space-tree, no-holes) so the test matrix is smaller. + */ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && + (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES))) { + btrfs_err(fs_info, + "block-group-tree feature requires fres-space-tree and no-holes"); + ret = -EINVAL; + } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, @@ -2785,7 +2806,7 @@ static int validate_super(struct btrfs_fs_info *fs_info, */ static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) { - return validate_super(fs_info, fs_info->super_copy, 0); + return btrfs_validate_super(fs_info, fs_info->super_copy, 0); } /* @@ -2799,7 +2820,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, { int ret; - ret = validate_super(fs_info, sb, -1); + ret = btrfs_validate_super(fs_info, sb, -1); if (ret < 0) goto out; if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { @@ -2860,17 +2881,7 @@ static int load_important_roots(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "couldn't read tree root"); return ret; } - - if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) - return 0; - - bytenr = btrfs_super_block_group_root(sb); - gen = btrfs_super_block_group_root_generation(sb); - level = btrfs_super_block_group_root_level(sb); - ret = load_super_root(fs_info->block_group_root, bytenr, gen, level); - if (ret) - btrfs_warn(fs_info, "couldn't read block group root"); - return ret; + return 0; } static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) @@ -2882,16 +2893,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) int ret = 0; int i; - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - struct btrfs_root *root; - - root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID, - GFP_KERNEL); - if (!root) - return -ENOMEM; - fs_info->block_group_root = root; - } - for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { if (handle_error) { if (!IS_ERR(tree_root->node)) @@ -2990,6 +2991,19 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->zoned_data_reloc_io_lock); seqlock_init(&fs_info->profiles_lock); + btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered); + btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start, + BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked, + BTRFS_LOCKDEP_TRANS_UNBLOCKED); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed, + BTRFS_LOCKDEP_TRANS_COMPLETED); + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); @@ -3279,6 +3293,112 @@ out: return ret; } +/* + * Do various sanity and dependency checks of different features. + * + * This is the place for less strict checks (like for subpage or artificial + * feature dependencies). + * + * For strict checks or possible corruption detection, see + * btrfs_validate_super(). + * + * This should be called after btrfs_parse_options(), as some mount options + * (space cache related) can modify on-disk format like free space tree and + * screw up certain feature dependencies. + */ +int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb) +{ + struct btrfs_super_block *disk_super = fs_info->super_copy; + u64 incompat = btrfs_super_incompat_flags(disk_super); + const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super); + const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP); + + if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + btrfs_err(fs_info, + "cannot mount because of unknown incompat features (0x%llx)", + incompat); + return -EINVAL; + } + + /* Runtime limitation for mixed block groups. */ + if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (fs_info->sectorsize != fs_info->nodesize)) { + btrfs_err(fs_info, +"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", + fs_info->nodesize, fs_info->sectorsize); + return -EINVAL; + } + + /* Mixed backref is an always-enabled feature. */ + incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + + /* Set compression related flags just in case. */ + if (fs_info->compress_type == BTRFS_COMPRESS_LZO) + incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) + incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; + + /* + * An ancient flag, which should really be marked deprecated. + * Such runtime limitation doesn't really need a incompat flag. + */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) + incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + + if (compat_ro_unsupp && !sb_rdonly(sb)) { + btrfs_err(fs_info, + "cannot mount read-write because of unknown compat_ro features (0x%llx)", + compat_ro); + return -EINVAL; + } + + /* + * We have unsupported RO compat features, although RO mounted, we + * should not cause any metadata writes, including log replay. + * Or we could screw up whatever the new feature requires. + */ + if (compat_ro_unsupp && btrfs_super_log_root(disk_super) && + !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_err(fs_info, +"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", + compat_ro); + return -EINVAL; + } + + /* + * Artificial limitations for block group tree, to force + * block-group-tree to rely on no-holes and free-space-tree. + */ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && + (!btrfs_fs_incompat(fs_info, NO_HOLES) || + !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) { + btrfs_err(fs_info, +"block-group-tree feature requires no-holes and free-space-tree features"); + return -EINVAL; + } + + /* + * Subpage runtime limitation on v1 cache. + * + * V1 space cache still has some hard codeed PAGE_SIZE usage, while + * we're already defaulting to v2 cache, no need to bother v1 as it's + * going to be deprecated anyway. + */ + if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + btrfs_warn(fs_info, + "v1 space cache is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + return -EINVAL; + } + + /* This can be called by remount, we need to protect the super block. */ + spin_lock(&fs_info->super_lock); + btrfs_set_super_incompat_flags(disk_super, incompat); + spin_unlock(&fs_info->super_lock); + + return 0; +} + int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { @@ -3359,7 +3479,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ - if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { + if (btrfs_check_super_csum(fs_info, disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; btrfs_release_disk_super(disk_super); @@ -3428,72 +3548,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } - features = btrfs_super_incompat_flags(disk_super) & - ~BTRFS_FEATURE_INCOMPAT_SUPP; - if (features) { - btrfs_err(fs_info, - "cannot mount because of unsupported optional features (0x%llx)", - features); - err = -EINVAL; - goto fail_alloc; - } - - features = btrfs_super_incompat_flags(disk_super); - features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - if (fs_info->compress_type == BTRFS_COMPRESS_LZO) - features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; - else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) - features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; - - /* - * Flag our filesystem as having big metadata blocks if they are bigger - * than the page size. - */ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - - /* - * mixed block groups end up with duplicate but slightly offset - * extent buffers for the same range. It leads to corruptions - */ - if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (sectorsize != nodesize)) { - btrfs_err(fs_info, -"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", - nodesize, sectorsize); - goto fail_alloc; - } - - /* - * Needn't use the lock because there is no other task which will - * update the flag. - */ - btrfs_set_super_incompat_flags(disk_super, features); - - features = btrfs_super_compat_ro_flags(disk_super) & - ~BTRFS_FEATURE_COMPAT_RO_SUPP; - if (!sb_rdonly(sb) && features) { - btrfs_err(fs_info, - "cannot mount read-write because of unsupported optional features (0x%llx)", - features); - err = -EINVAL; - goto fail_alloc; - } - /* - * We have unsupported RO compat features, although RO mounted, we - * should not cause any metadata write, including log replay. - * Or we could screw up whatever the new feature requires. - */ - if (unlikely(features && btrfs_super_log_root(disk_super) && - !btrfs_test_opt(fs_info, NOLOGREPLAY))) { - btrfs_err(fs_info, -"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", - features); - err = -EINVAL; + ret = btrfs_check_features(fs_info, sb); + if (ret < 0) { + err = ret; goto fail_alloc; } - if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; @@ -3833,7 +3893,7 @@ static void btrfs_end_super_write(struct bio *bio) } struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num) + int copy_num, bool drop_cache) { struct btrfs_super_block *super; struct page *page; @@ -3851,6 +3911,19 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); + if (drop_cache) { + /* This should only be called with the primary sb. */ + ASSERT(copy_num == 0); + + /* + * Drop the page of the primary superblock, so later read will + * always read from the device. + */ + invalidate_inode_pages2_range(mapping, + bytenr >> PAGE_SHIFT, + (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); + } + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); if (IS_ERR(page)) return ERR_CAST(page); @@ -3882,7 +3955,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - super = btrfs_read_dev_one_super(bdev, i); + super = btrfs_read_dev_one_super(bdev, i, false); if (IS_ERR(super)) continue; @@ -4475,6 +4548,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + * We must do this before stopping the block group reclaim task, because + * at btrfs_relocate_block_group() we wait for this bit, and after the + * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we + * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will + * return 1. + */ + btrfs_wake_unfinished_drop(fs_info); + + /* * We may have the reclaim task running and relocating a data block group, * in which case it may create delayed iputs. So stop it before we park * the cleaner kthread otherwise we can get new delayed iputs after @@ -4492,12 +4576,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ kthread_park(fs_info->cleaner_kthread); - /* - * If we had UNFINISHED_DROPS we could still be processing them, so - * clear that bit and wake up relocation so it can stop. - */ - btrfs_wake_unfinished_drop(fs_info); - /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); @@ -4520,6 +4598,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); + /* + * After we parked the cleaner kthread, ordered extents may have + * completed and created new delayed iputs. If one of the async reclaim + * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we + * can hang forever trying to stop it, because if a delayed iput is + * added after it ran btrfs_run_delayed_iputs() and before it called + * btrfs_wait_on_delayed_iputs(), it will hang forever since there is + * no one else to run iputs. + * + * So wait for all ongoing ordered extents to complete and then run + * delayed iputs. This works because once we reach this point no one + * can either create new ordered extents nor create delayed iputs + * through some other means. + * + * Also note that btrfs_wait_ordered_roots() is not safe here, because + * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, + * but the delayed iput for the respective inode is made only when doing + * the final btrfs_put_ordered_extent() (which must happen at + * btrfs_finish_ordered_io() when we are unmounting). + */ + btrfs_flush_workqueue(fs_info->endio_write_workers); + /* Ordered extents for free space inodes. */ + btrfs_flush_workqueue(fs_info->endio_freespace_worker); + btrfs_run_delayed_iputs(fs_info); + cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 47ad8e0a2d33..9fa923e005a3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -42,14 +42,19 @@ struct extent_buffer *btrfs_find_create_tree_block( void btrfs_clean_tree_block(struct extent_buffer *buf); void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); +int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *disk_sb); int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); void __cold close_ctree(struct btrfs_fs_info *fs_info); +int btrfs_validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num); +int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num); + int copy_num, bool drop_cache); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_key *key); @@ -103,7 +108,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) { - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) return fs_info->block_group_root; return btrfs_extent_root(fs_info, 0); } diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1d4c2397d0d6..fab7eb76e53b 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, } struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, + u64 root_objectid, u64 generation, int check_generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index f32f4113c976..5afb7ca42828 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -19,7 +19,7 @@ struct btrfs_fid { } __attribute__ ((packed)); struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, + u64 root_objectid, u64 generation, int check_generation); struct dentry *btrfs_get_parent(struct dentry *child); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c new file mode 100644 index 000000000000..83cb0378096f --- /dev/null +++ b/fs/btrfs/extent-io-tree.c @@ -0,0 +1,1674 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/slab.h> +#include <trace/events/btrfs.h> +#include "ctree.h" +#include "extent-io-tree.h" +#include "btrfs_inode.h" +#include "misc.h" + +static struct kmem_cache *extent_state_cache; + +static inline bool extent_state_in_tree(const struct extent_state *state) +{ + return !RB_EMPTY_NODE(&state->rb_node); +} + +#ifdef CONFIG_BTRFS_DEBUG +static LIST_HEAD(states); +static DEFINE_SPINLOCK(leak_lock); + +static inline void btrfs_leak_debug_add_state(struct extent_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline void btrfs_leak_debug_del_state(struct extent_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline void btrfs_extent_state_leak_debug_check(void) +{ + struct extent_state *state; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", + state->start, state->end, state->state, + extent_state_in_tree(state), + refcount_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + } +} + +#define btrfs_debug_check_extent_io_range(tree, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) +static inline void __btrfs_debug_check_extent_io_range(const char *caller, + struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct inode *inode = tree->private_data; + u64 isize; + + if (!inode) + return; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } +} +#else +#define btrfs_leak_debug_add_state(state) do {} while (0) +#define btrfs_leak_debug_del_state(state) do {} while (0) +#define btrfs_extent_state_leak_debug_check() do {} while (0) +#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) +#endif + +/* + * For the file_extent_tree, we want to hold the inode lock when we lookup and + * update the disk_i_size, but lockdep will complain because our io_tree we hold + * the tree lock and get the inode lock when setting delalloc. These two things + * are unrelated, so make a class for the file_extent_tree so we don't get the + * two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner, + void *private_data) +{ + tree->fs_info = fs_info; + tree->state = RB_ROOT; + spin_lock_init(&tree->lock); + tree->private_data = private_data; + tree->owner = owner; + if (owner == IO_TREE_INODE_FILE_EXTENT) + lockdep_set_class(&tree->lock, &file_extent_tree_class); +} + +void extent_io_tree_release(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once extent_io_tree_release is + * called. + */ + smp_mb(); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + + cond_resched_lock(&tree->lock); + } + spin_unlock(&tree->lock); +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; + + /* + * The given mask might be not appropriate for the slab allocator, + * drop the unsupported bits + */ + mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + RB_CLEAR_NODE(&state->rb_node); + btrfs_leak_debug_add_state(state); + refcount_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + trace_alloc_extent_state(state, mask, _RET_IP_); + return state; +} + +static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc) +{ + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + + return prealloc; +} + +void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (refcount_dec_and_test(&state->refs)) { + WARN_ON(extent_state_in_tree(state)); + btrfs_leak_debug_del_state(state); + trace_free_extent_state(state, _RET_IP_); + kmem_cache_free(extent_state_cache, state); + } +} + +static int add_extent_changeset(struct extent_state *state, u32 bits, + struct extent_changeset *changeset, + int set) +{ + int ret; + + if (!changeset) + return 0; + if (set && (state->state & bits) == bits) + return 0; + if (!set && (state->state & bits) == 0) + return 0; + changeset->bytes_changed += state->end - state->start + 1; + ret = ulist_add(&changeset->range_changed, state->start, state->end, + GFP_ATOMIC); + return ret; +} + +static inline struct extent_state *next_state(struct extent_state *state) +{ + struct rb_node *next = rb_next(&state->rb_node); + + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + +static inline struct extent_state *prev_state(struct extent_state *state) +{ + struct rb_node *next = rb_prev(&state->rb_node); + + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + +/* + * Search @tree for an entry that contains @offset. Such entry would have + * entry->start <= offset && entry->end >= offset. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @node_ret: pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret: points to entry which would have been the parent of the entry, + * containing @offset + * + * Return a pointer to the entry that contains @offset byte address and don't change + * @node_ret and @parent_ret. + * + * If no such entry exists, return pointer to entry that ends before @offset + * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. + */ +static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***node_ret, + struct rb_node **parent_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct rb_node *prev = NULL; + struct extent_state *entry = NULL; + + while (*node) { + prev = *node; + entry = rb_entry(prev, struct extent_state, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return entry; + } + + if (node_ret) + *node_ret = node; + if (parent_ret) + *parent_ret = prev; + + /* Search neighbors until we find the first one past the end */ + while (entry && offset > entry->end) + entry = next_state(entry); + + return entry; +} + +/* + * Search offset in the tree or fill neighbor rbtree node pointers. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * + * Return a pointer to the entry that contains @offset byte address. If no + * such entry exists, then return NULL and fill @prev_ret and @next_ret. + * Otherwise return the found entry and other pointers are left untouched. + */ +static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree, + u64 offset, + struct extent_state **prev_ret, + struct extent_state **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct extent_state *orig_prev; + struct extent_state *entry = NULL; + + ASSERT(prev_ret); + ASSERT(next_ret); + + while (*node) { + entry = rb_entry(*node, struct extent_state, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return entry; + } + + orig_prev = entry; + while (entry && offset > entry->end) + entry = next_state(entry); + *next_ret = entry; + entry = orig_prev; + + while (entry && offset < entry->start) + entry = prev_state(entry); + *prev_ret = entry; + + return NULL; +} + +/* + * Inexact rb-tree search, return the next entry if @offset is not found + */ +static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + +static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +{ + btrfs_panic(tree->fs_info, err, + "locking error: extent tree was modified by another thread while locked"); +} + +/* + * Utility function to look for merge candidates inside a given range. Any + * extents with matching state are merged together into a single extent in the + * tree. Extents with EXTENT_IO in their state field are not merged because + * the end_io handlers need to be able to do operations on them without + * sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. + */ +static void merge_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *other; + + if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + return; + + other = prev_state(state); + if (other && other->end == state->start - 1 && + other->state == state->state) { + if (tree->private_data) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); + state->start = other->start; + rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); + free_extent_state(other); + } + other = next_state(state); + if (other && other->start == state->end + 1 && + other->state == state->state) { + if (tree->private_data) + btrfs_merge_delalloc_extent(tree->private_data, state, + other); + state->end = other->end; + rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); + free_extent_state(other); + } +} + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) +{ + u32 bits_to_set = bits & ~EXTENT_CTLBITS; + int ret; + + if (tree->private_data) + btrfs_set_delalloc_extent(tree->private_data, state, bits); + + ret = add_extent_changeset(state, bits_to_set, changeset, 1); + BUG_ON(ret < 0); + state->state |= bits_to_set; +} + +/* + * Insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) +{ + struct rb_node **node; + struct rb_node *parent; + const u64 end = state->end; + + set_state_bits(tree, state, bits, changeset); + + node = &tree->state.rb_node; + while (*node) { + struct extent_state *entry; + + parent = *node; + entry = rb_entry(parent, struct extent_state, rb_node); + + if (end < entry->start) { + node = &(*node)->rb_left; + } else if (end > entry->end) { + node = &(*node)->rb_right; + } else { + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", + entry->start, entry->end, state->start, end); + return -EEXIST; + } + } + + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + + merge_state(tree, state); + return 0; +} + +/* + * Insert state to @tree to the location given by @node and @parent. + */ +static void insert_state_fast(struct extent_io_tree *tree, + struct extent_state *state, struct rb_node **node, + struct rb_node *parent, unsigned bits, + struct extent_changeset *changeset) +{ + set_state_bits(tree, state, bits, changeset); + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); +} + +/* + * Split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. + */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *parent = NULL; + struct rb_node **node; + + if (tree->private_data) + btrfs_split_delalloc_extent(tree->private_data, orig, split); + + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + parent = &orig->rb_node; + node = &parent; + while (*node) { + struct extent_state *entry; + + parent = *node; + entry = rb_entry(parent, struct extent_state, rb_node); + + if (prealloc->end < entry->start) { + node = &(*node)->rb_left; + } else if (prealloc->end > entry->end) { + node = &(*node)->rb_right; + } else { + free_extent_state(prealloc); + return -EEXIST; + } + } + + rb_link_node(&prealloc->rb_node, parent, node); + rb_insert_color(&prealloc->rb_node, &tree->state); + + return 0; +} + +/* + * Utility function to clear some bits in an extent state struct. It will + * optionally wake up anyone waiting on this state (wake == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static struct extent_state *clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, int wake, + struct extent_changeset *changeset) +{ + struct extent_state *next; + u32 bits_to_clear = bits & ~EXTENT_CTLBITS; + int ret; + + if (tree->private_data) + btrfs_clear_delalloc_extent(tree->private_data, state, bits); + + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); + BUG_ON(ret < 0); + state->state &= ~bits_to_clear; + if (wake) + wake_up(&state->wq); + if (state->state == 0) { + next = next_state(state); + if (extent_state_in_tree(state)) { + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + next = next_state(state); + } + return next; +} + +/* + * Clear some bits on a range in the tree. This may require splitting or + * inserting elements in the tree, so the gfp mask is used to indicate which + * allocations or sleeping are allowed. + * + * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given + * range from the tree regardless of state (ie for truncate). + * + * The range [start, end] is inclusive. + * + * This takes the tree lock, and returns 0 on success and < 0 on error. + */ +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) +{ + struct extent_state *state; + struct extent_state *cached; + struct extent_state *prealloc = NULL; + u64 last_end; + int err; + int clear = 0; + int wake; + int delete = (bits & EXTENT_CLEAR_ALL_BITS); + + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); + + if (delete) + bits |= ~EXTENT_CTLBITS; + + if (bits & EXTENT_DELALLOC) + bits |= EXTENT_NORESERVE; + + wake = (bits & EXTENT_LOCKED) ? 1 : 0; + if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + clear = 1; +again: + if (!prealloc && gfpflags_allow_blocking(mask)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ + prealloc = alloc_extent_state(mask); + } + + spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + + if (clear) { + *cached_state = NULL; + cached_state = NULL; + } + + if (cached && extent_state_in_tree(cached) && + cached->start <= start && cached->end > start) { + if (clear) + refcount_dec(&cached->refs); + state = cached; + goto hit_next; + } + if (clear) + free_extent_state(cached); + } + + /* This search will find the extents that end after our range starts. */ + state = tree_search(tree, start); + if (!state) + goto out; +hit_next: + if (state->start > end) + goto out; + WARN_ON(state->end < start); + last_end = state->end; + + /* The state doesn't have the wanted bits, go ahead. */ + if (!(state->state & bits)) { + state = next_state(state); + goto next; + } + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we clear the desired bit + * on it. + */ + + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + state = clear_state_bit(tree, state, bits, wake, changeset); + goto next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit on the first half. + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + if (wake) + wake_up(&state->wq); + + clear_state_bit(tree, prealloc, bits, wake, changeset); + + prealloc = NULL; + goto out; + } + + state = clear_state_bit(tree, state, bits, wake, changeset); +next: + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start <= end && state && !need_resched()) + goto hit_next; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return 0; + +} + +static void wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); +} + +/* + * Wait for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. + * The tree lock is taken by this function + */ +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) +{ + struct extent_state *state; + + btrfs_debug_check_extent_io_range(tree, start, end); + + spin_lock(&tree->lock); +again: + while (1) { + /* + * This search will find all the extents that end after our + * range starts. + */ + state = tree_search(tree, start); +process_node: + if (!state) + break; + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + refcount_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (!cond_resched_lock(&tree->lock)) { + state = next_state(state); + goto process_node; + } + } +out: + spin_unlock(&tree->lock); +} + +static void cache_state_if_flags(struct extent_state *state, + struct extent_state **cached_ptr, + unsigned flags) +{ + if (cached_ptr && !(*cached_ptr)) { + if (!flags || (state->state & flags)) { + *cached_ptr = state; + refcount_inc(&state->refs); + } + } +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + return cache_state_if_flags(state, cached_ptr, + EXTENT_LOCKED | EXTENT_BOUNDARY); +} + +/* + * Find the first state struct with 'bits' set after 'start', and return it. + * tree->lock must be held. NULL will returned if nothing was found after + * 'start'. + */ +static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, u32 bits) +{ + struct extent_state *state; + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search(tree, start); + while (state) { + if (state->end >= start && (state->state & bits)) + return state; + state = next_state(state); + } + return NULL; +} + +/* + * Find the first offset in the io tree with one or more @bits set. + * + * Note: If there are multiple bits set in @bits, any of them will match. + * + * Return 0 if we find something, and update @start_ret and @end_ret. + * Return 1 if we found nothing. + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->end == start - 1 && extent_state_in_tree(state)) { + while ((state = next_state(state)) != NULL) { + if (state->state & bits) + goto got_it; + } + free_extent_state(*cached_state); + *cached_state = NULL; + goto out; + } + free_extent_state(*cached_state); + *cached_state = NULL; + } + + state = find_first_extent_bit_state(tree, start, bits); +got_it: + if (state) { + cache_state_if_flags(state, cached_state, 0); + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * Find a contiguous area of bits + * + * @tree: io tree to check + * @start: offset to start the search from + * @start_ret: the first offset we found with the bits set + * @end_ret: the final contiguous range of the bits that were set + * @bits: bits to look for + * + * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges + * to set bits appropriately, and then merge them again. During this time it + * will drop the tree->lock, so use this helper if you want to find the actual + * contiguous area for given bits. We will search to the first bit we find, and + * then walk down the tree until we find a non-contiguous area. The area + * returned will be the full contiguous area with the bits set. + */ +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, bits); + if (state) { + *start_ret = state->start; + *end_ret = state->end; + while ((state = next_state(state)) != NULL) { + if (state->start > (*end_ret + 1)) + break; + *end_ret = state->end; + } + ret = 0; + } + spin_unlock(&tree->lock); + return ret; +} + +/* + * Find a contiguous range of bytes in the file marked as delalloc, not more + * than 'max_bytes'. start and end are used to return the range, + * + * True is returned if we find something, false if nothing was in the tree. + */ +bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, + u64 *end, u64 max_bytes, + struct extent_state **cached_state) +{ + struct extent_state *state; + u64 cur_start = *start; + bool found = false; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search(tree, cur_start); + if (!state) { + *end = (u64)-1; + goto out; + } + + while (state) { + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) { + *start = state->start; + *cached_state = state; + refcount_inc(&state->refs); + } + found = true; + *end = state->end; + cur_start = state->end + 1; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + state = next_state(state); + } +out: + spin_unlock(&tree->lock); + return found; +} + +/* + * Set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. + * + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The start of the + * existing range is returned in failed_start in this case. + * + * [start, end] is inclusive This takes the tree lock. + */ +static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u64 *failed_start, + struct extent_state **cached_state, + struct extent_changeset *changeset, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node **p; + struct rb_node *parent; + int err = 0; + u64 last_start; + u64 last_end; + u32 exclusive_bits = (bits & EXTENT_LOCKED); + + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); + + if (exclusive_bits) + ASSERT(failed_start); + else + ASSERT(failed_start == NULL); +again: + if (!prealloc && gfpflags_allow_blocking(mask)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ + prealloc = alloc_extent_state(mask); + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + extent_state_in_tree(state)) + goto hit_next; + } + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search_for_insert(tree, start, &p, &parent); + if (!state) { + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, changeset); + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + if (state->state & exclusive_bits) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + + set_state_bits(tree, state, bits, changeset); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we set the desired bit + * on it. + */ + if (state->start < start) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + + /* + * If this extent already has all the bits we want set, then + * skip it, not necessary to split it or do anything with it. + */ + if ((state->state & bits) == bits) { + start = state->end + 1; + cache_state(state, cached_state); + goto search_again; + } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits, changeset); + cache_state(state, cached_state); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and ignore the + * extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + + /* + * Avoid to free 'prealloc' if it can be merged with the later + * extent. + */ + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, changeset); + if (err) + extent_io_tree_panic(tree, err); + + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * + * We need to split the extent, and set the bit on the first half + */ + if (state->start <= end && state->end > end) { + if (state->state & exclusive_bits) { + *failed_start = start; + err = -EEXIST; + goto out; + } + + prealloc = alloc_extent_state_atomic(prealloc); + BUG_ON(!prealloc); + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, bits, changeset); + cache_state(prealloc, cached_state); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +} + +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, gfp_t mask) +{ + return __set_extent_bit(tree, start, end, bits, NULL, cached_state, + NULL, mask); +} + +/* + * Convert all bits in a given range from one bit to another + * + * @tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @cached_state: state that we're going to cache + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie. + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. + * + * All allocations are done with GFP_NOFS. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node **p; + struct rb_node *parent; + int err = 0; + u64 last_start; + u64 last_end; + bool first_iteration = true; + + btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, + clear_bits); + +again: + if (!prealloc) { + /* + * Best effort, don't worry if extent state allocation fails + * here for the first iteration. We might have a cached state + * that matches exactly the target range, in which case no + * extent state allocations are needed. We'll only know this + * after locking the tree. + */ + prealloc = alloc_extent_state(GFP_NOFS); + if (!prealloc && !first_iteration) + return -ENOMEM; + } + + spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start <= start && state->end > start && + extent_state_in_tree(state)) + goto hit_next; + } + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search_for_insert(tree, start, &p, &parent); + if (!state) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, NULL); + cache_state(prealloc, cached_state); + prealloc = NULL; + goto out; + } +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going. + */ + if (state->start == start && state->end <= end) { + set_state_bits(tree, state, bits, NULL); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on second + * half. + * + * If the extent we found extends past our range, we just split and + * search again. It'll get split again the next time though. + * + * If the extent we found is inside our range, we set the desired bit + * on it. + */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = split_state(tree, state, prealloc, start); + if (err) + extent_io_tree_panic(tree, err); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits, NULL); + cache_state(state, cached_state); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and ignore the + * extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + /* + * Avoid to free 'prealloc' if it can be merged with the later + * extent. + */ + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, NULL); + if (err) + extent_io_tree_panic(tree, err); + cache_state(prealloc, cached_state); + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * + * We need to split the extent, and set the bit on the first half. + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + err = split_state(tree, state, prealloc, end + 1); + if (err) + extent_io_tree_panic(tree, err); + + set_state_bits(tree, prealloc, bits, NULL); + cache_state(prealloc, cached_state); + clear_state_bit(tree, prealloc, clear_bits, 0, NULL); + prealloc = NULL; + goto out; + } + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + cond_resched(); + first_iteration = false; + goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; +} + +/* + * Find the first range that has @bits not set. This range could start before + * @start. + * + * @tree: the tree to search + * @start: offset at/after which the found extent should start + * @start_ret: records the beginning of the range + * @end_ret: records the end of the range (inclusive) + * @bits: the set of bits which must be unset + * + * Since unallocated range is also considered one which doesn't have the bits + * set it's possible that @end_ret contains -1, this happens in case the range + * spans (last_range_end, end of device]. In this case it's up to the caller to + * trim @end_ret to the appropriate size. + */ +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) +{ + struct extent_state *state; + struct extent_state *prev = NULL, *next; + + spin_lock(&tree->lock); + + /* Find first extent with bits cleared */ + while (1) { + state = tree_search_prev_next(tree, start, &prev, &next); + if (!state && !next && !prev) { + /* + * Tree is completely empty, send full range and let + * caller deal with it + */ + *start_ret = 0; + *end_ret = -1; + goto out; + } else if (!state && !next) { + /* + * We are past the last allocated chunk, set start at + * the end of the last extent. + */ + *start_ret = prev->end + 1; + *end_ret = -1; + goto out; + } else if (!state) { + state = next; + } + + /* + * At this point 'state' either contains 'start' or start is + * before 'state' + */ + if (in_range(start, state->start, state->end - state->start + 1)) { + if (state->state & bits) { + /* + * |--range with bits sets--| + * | + * start + */ + start = state->end + 1; + } else { + /* + * 'start' falls within a range that doesn't + * have the bits set, so take its start as the + * beginning of the desired range + * + * |--range with bits cleared----| + * | + * start + */ + *start_ret = state->start; + break; + } + } else { + /* + * |---prev range---|---hole/unset---|---node range---| + * | + * start + * + * or + * + * |---hole/unset--||--first node--| + * 0 | + * start + */ + if (prev) + *start_ret = prev->end + 1; + else + *start_ret = 0; + break; + } + } + + /* + * Find the longest stretch from start until an entry which has the + * bits set + */ + while (state) { + if (state->end >= start && !(state->state & bits)) { + *end_ret = state->end; + } else { + *end_ret = state->start - 1; + break; + } + state = next_state(state); + } +out: + spin_unlock(&tree->lock); +} + +/* + * Count the number of bytes in the tree that have a given bit(s) set. This + * can be fairly slow, except for EXTENT_DIRTY which is cached. The total + * number found is returned. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + u32 bits, int contig) +{ + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + u64 last = 0; + int found = 0; + + if (WARN_ON(search_end <= cur_start)) + return 0; + + spin_lock(&tree->lock); + + /* + * This search will find all the extents that end after our range + * starts. + */ + state = tree_search(tree, cur_start); + while (state) { + if (state->start > search_end) + break; + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = max(cur_start, state->start); + found = 1; + } + last = state->end; + } else if (contig && found) { + break; + } + state = next_state(state); + } + spin_unlock(&tree->lock); + return total_bytes; +} + +/* + * Searche a range in the state tree for a given mask. If 'filled' == 1, this + * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 + * is returned if any bit in the range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, int filled, struct extent_state *cached) +{ + struct extent_state *state = NULL; + int bitset = 0; + + spin_lock(&tree->lock); + if (cached && extent_state_in_tree(cached) && cached->start <= start && + cached->end > start) + state = cached; + else + state = tree_search(tree, start); + while (state && start <= end) { + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + + if (state->end == (u64)-1) + break; + + start = state->end + 1; + if (start > end) + break; + state = next_state(state); + } + + /* We ran out of states and were still inside of our range. */ + if (filled && !state) + bitset = 0; + spin_unlock(&tree->lock); + return bitset; +} + +/* Wrappers around set/clear extent bit */ +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) +{ + /* + * We don't support EXTENT_LOCKED yet, as current changeset will + * record any bits changed, so for EXTENT_LOCKED case, it will + * either fail with -EEXIST or changeset will record the whole + * range. + */ + ASSERT(!(bits & EXTENT_LOCKED)); + + return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset, + GFP_NOFS); +} + +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) +{ + /* + * Don't support EXTENT_LOCKED case, same reason as + * set_record_extent_bits(). + */ + ASSERT(!(bits & EXTENT_LOCKED)); + + return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS, + changeset); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + int err; + u64 failed_start; + + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + NULL, NULL, GFP_NOFS); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, NULL); + return 0; + } + return 1; +} + +/* + * Either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. + */ +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state) +{ + int err; + u64 failed_start; + + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + cached_state, NULL, GFP_NOFS); + while (err == -EEXIST) { + if (failed_start != start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, cached_state); + + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, + &failed_start, cached_state, NULL, + GFP_NOFS); + } + return err; +} + +void __cold extent_state_free_cachep(void) +{ + btrfs_extent_state_leak_debug_check(); + kmem_cache_destroy(extent_state_cache); +} + +int __init extent_state_init_cachep(void) +{ + extent_state_cache = kmem_cache_create("btrfs_extent_state", + sizeof(struct extent_state), 0, + SLAB_MEM_SPREAD, NULL); + if (!extent_state_cache) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index c3eb52dbe61c..a855f40dd61d 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -17,7 +17,6 @@ struct io_failure_record; #define EXTENT_NODATASUM (1U << 7) #define EXTENT_CLEAR_META_RESV (1U << 8) #define EXTENT_NEED_WAIT (1U << 9) -#define EXTENT_DAMAGED (1U << 10) #define EXTENT_NORESERVE (1U << 11) #define EXTENT_QGROUP_RESERVED (1U << 12) #define EXTENT_CLEAR_DATA_RESV (1U << 13) @@ -35,10 +34,18 @@ struct io_failure_record; * delalloc bytes decremented, in an atomic way to prevent races with stat(2). */ #define EXTENT_ADD_INODE_BYTES (1U << 15) + +/* + * Set during truncate when we're clearing an entire range and we just want the + * extent states to go away. + */ +#define EXTENT_CLEAR_ALL_BITS (1U << 16) + #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \ - EXTENT_ADD_INODE_BYTES) + EXTENT_ADD_INODE_BYTES | \ + EXTENT_CLEAR_ALL_BITS) /* * Redefined bits above which are used only in the device allocation tree, @@ -56,7 +63,6 @@ enum { IO_TREE_FS_EXCLUDED_EXTENTS, IO_TREE_BTREE_INODE_IO, IO_TREE_INODE_IO, - IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, IO_TREE_TRANS_DIRTY_PAGES, IO_TREE_ROOT_DIRTY_LOG_PAGES, @@ -70,8 +76,6 @@ struct extent_io_tree { struct rb_root state; struct btrfs_fs_info *fs_info; void *private_data; - u64 dirty_bytes; - bool track_uptodate; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; @@ -89,33 +93,23 @@ struct extent_state { refcount_t refs; u32 state; - struct io_failure_record *failrec; - #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif }; -int __init extent_state_cache_init(void); -void __cold extent_state_cache_exit(void); - void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner, void *private_data); void extent_io_tree_release(struct extent_io_tree *tree); -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); - -static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return lock_extent_bits(tree, start, end, NULL); -} +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); -int __init extent_io_init(void); -void __cold extent_io_exit(void); +int __init extent_state_init_cachep(void); +void __cold extent_state_free_cachep(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, @@ -126,72 +120,66 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, int filled, struct extent_state *cached_state); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached, gfp_t mask, - struct extent_changeset *changeset); + u32 bits, struct extent_state **cached, gfp_t mask, + struct extent_changeset *changeset); -static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) +static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits, + struct extent_state **cached) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL); + return __clear_extent_bit(tree, start, end, bits, cached, + GFP_NOFS, NULL); } -static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, + GFP_NOFS, NULL); } -static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree, - u64 start, u64 end, struct extent_state **cached) +static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - GFP_ATOMIC, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, + GFP_ATOMIC, NULL); } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { - int wake = 0; - - if (bits & EXTENT_LOCKED) - wake = 1; - - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL); + return clear_extent_bit(tree, start, end, bits, NULL); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, unsigned exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask, - struct extent_changeset *changeset); -int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits); + u32 bits, struct extent_state **cached_state, gfp_t mask); + +static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits) +{ + return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT); +} static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - NULL); + return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { - return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, + cached_state, GFP_NOFS, NULL); } static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL, - mask, NULL); + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask); } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -199,7 +187,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, cached); + EXTENT_DO_ACCOUNTING, cached); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -211,30 +199,29 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, - 0, NULL, cached_state, GFP_NOFS, NULL); + EXTENT_DELALLOC | extra_bits, + cached_state, GFP_NOFS); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - 0, NULL, cached_state, GFP_NOFS, NULL); + EXTENT_DELALLOC | EXTENT_DEFRAG, + cached_state, GFP_NOFS); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL, - GFP_NOFS, NULL); + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); } static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - cached_state, mask, NULL); + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, + cached_state, mask); } int find_first_extent_bit(struct extent_io_tree *tree, u64 start, @@ -244,24 +231,9 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); -int extent_invalidate_folio(struct extent_io_tree *tree, - struct folio *folio, size_t offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); - -/* This should be reworked in the future and put elsewhere. */ -struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start); -int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec); -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, - u64 end); -int free_io_failure(struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, - struct io_failure_record *rec); -int clean_io_failure(struct btrfs_fs_info *fs_info, - struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, u64 start, - struct page *page, u64 ino, unsigned int pg_offset); +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits); #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6914cd8024ba..2801c991814f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2220,6 +2220,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { + if (path->nowait) { + spin_unlock(&delayed_refs->lock); + btrfs_put_transaction(cur_trans); + return -EAGAIN; + } + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); @@ -2686,13 +2692,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len = cache->start + cache->length - start; len = min(len, end + 1 - start); - down_read(&fs_info->commit_root_sem); - if (start < cache->last_byte_to_unpin && return_free_space) { - u64 add_len = min(len, cache->last_byte_to_unpin - start); - - btrfs_add_free_space(cache, start, add_len); - } - up_read(&fs_info->commit_root_sem); + if (return_free_space) + btrfs_add_free_space(cache, start, len); start += len; total_unpinned += len; @@ -3294,21 +3295,22 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } /* - * If this is a leaf and there are tree mod log users, we may - * have recorded mod log operations that point to this leaf. - * So we must make sure no one reuses this leaf's extent before - * mod log operations are applied to a node, otherwise after - * rewinding a node using the mod log operations we get an - * inconsistent btree, as the leaf's extent may now be used as - * a node or leaf for another different btree. + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * * We are safe from races here because at this point no other * node or root points to this extent buffer, so if after this - * check a new tree mod log user joins, it will not be able to - * find a node pointing to this leaf and record operations that - * point to this leaf. + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. */ - if (btrfs_header_level(buf) == 0 && - test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) must_pin = true; if (must_pin || btrfs_is_zoned(fs_info)) { @@ -3804,7 +3806,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro || block_group->zoned_data_reloc_ongoing) { + if (block_group->ro || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { ret = 1; goto out; } @@ -3881,7 +3884,7 @@ out: * regular extents) at the same time to the same zone, which * easily break the write pointer. */ - block_group->zoned_data_reloc_ongoing = 1; + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); fs_info->data_reloc_bg = 0; } spin_unlock(&fs_info->relocation_bg_lock); @@ -4888,6 +4891,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) lockdep_owner = BTRFS_FS_TREE_OBJECTID; + /* btrfs_clean_tree_block() accesses generation field. */ + btrfs_set_header_generation(buf, trans->transid); + /* * This needs to stay, because we could allocate a freed block from an * old tree into a new tree, so we need to make sure this new block is @@ -5639,6 +5645,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, */ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { + const bool is_reloc_root = (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5798,6 +5806,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out_end_trans; } + if (!is_reloc_root) + btrfs_set_last_root_drop_gen(fs_info, trans->transid); + btrfs_end_transaction_throttle(trans); if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, @@ -5832,7 +5843,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out_end_trans; } - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { @@ -5864,6 +5875,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_put_root(root); root_dropped = true; out_end_trans: + if (!is_reloc_root) + btrfs_set_last_root_drop_gen(fs_info, trans->transid); + btrfs_end_transaction_throttle(trans); out_free: kfree(wc); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cf4f19e80e2f..4dcf22e051ff 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -31,38 +31,27 @@ #include "block-group.h" #include "compression.h" -static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; -static struct bio_set btrfs_bioset; - -static inline bool extent_state_in_tree(const struct extent_state *state) -{ - return !RB_EMPTY_NODE(&state->rb_node); -} #ifdef CONFIG_BTRFS_DEBUG -static LIST_HEAD(states); -static DEFINE_SPINLOCK(leak_lock); - -static inline void btrfs_leak_debug_add(spinlock_t *lock, - struct list_head *new, - struct list_head *head) +static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; unsigned long flags; - spin_lock_irqsave(lock, flags); - list_add(new, head); - spin_unlock_irqrestore(lock, flags); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_add(&eb->leak_list, &fs_info->allocated_ebs); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } -static inline void btrfs_leak_debug_del(spinlock_t *lock, - struct list_head *entry) +static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; unsigned long flags; - spin_lock_irqsave(lock, flags); - list_del(entry); - spin_unlock_irqrestore(lock, flags); + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) @@ -91,53 +80,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) } spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); } - -static inline void btrfs_extent_state_leak_debug_check(void) -{ - struct extent_state *state; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", - state->start, state->end, state->state, - extent_state_in_tree(state), - refcount_read(&state->refs)); - list_del(&state->leak_list); - kmem_cache_free(extent_state_cache, state); - } -} - -#define btrfs_debug_check_extent_io_range(tree, start, end) \ - __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) -static inline void __btrfs_debug_check_extent_io_range(const char *caller, - struct extent_io_tree *tree, u64 start, u64 end) -{ - struct inode *inode = tree->private_data; - u64 isize; - - if (!inode || !is_data_inode(inode)) - return; - - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } -} #else -#define btrfs_leak_debug_add(lock, new, head) do {} while (0) -#define btrfs_leak_debug_del(lock, entry) do {} while (0) -#define btrfs_extent_state_leak_debug_check() do {} while (0) -#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) +#define btrfs_leak_debug_add_eb(eb) do {} while (0) +#define btrfs_leak_debug_del_eb(eb) do {} while (0) #endif -struct tree_entry { - u64 start; - u64 end; - struct rb_node rb_node; -}; - /* * Structure to record info about the bio being assembled, and other info like * how many bytes are there before stripe/ordered extent boundary. @@ -148,6 +95,7 @@ struct btrfs_bio_ctrl { enum btrfs_compression_type compress_type; u32 len_to_stripe_boundary; u32 len_to_oe_boundary; + btrfs_bio_end_io_t end_io_func; }; struct extent_page_data { @@ -161,24 +109,6 @@ struct extent_page_data { unsigned int sync_io:1; }; -static int add_extent_changeset(struct extent_state *state, u32 bits, - struct extent_changeset *changeset, - int set) -{ - int ret; - - if (!changeset) - return 0; - if (set && (state->state & bits) == bits) - return 0; - if (!set && (state->state & bits) == 0) - return 0; - changeset->bytes_changed += state->end - state->start + 1; - ret = ulist_add(&changeset->range_changed, state->start, state->end, - GFP_ATOMIC); - return ret; -} - static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; @@ -207,7 +137,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); - /* The bio is owned by the bi_end_io handler now */ + /* The bio is owned by the end_io handler now */ bio_ctrl->bio = NULL; } @@ -223,26 +153,15 @@ static void submit_write_bio(struct extent_page_data *epd, int ret) if (ret) { ASSERT(ret < 0); - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - /* The bio is owned by the bi_end_io handler now */ + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + /* The bio is owned by the end_io handler now */ epd->bio_ctrl.bio = NULL; } else { submit_one_bio(&epd->bio_ctrl); } } -int __init extent_state_cache_init(void) -{ - extent_state_cache = kmem_cache_create("btrfs_extent_state", - sizeof(struct extent_state), 0, - SLAB_MEM_SPREAD, NULL); - if (!extent_state_cache) - return -ENOMEM; - return 0; -} - -int __init extent_io_init(void) +int __init extent_buffer_init_cachep(void) { extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, @@ -250,32 +169,10 @@ int __init extent_io_init(void) if (!extent_buffer_cache) return -ENOMEM; - if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - goto free_buffer_cache; - - if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) - goto free_bioset; - return 0; - -free_bioset: - bioset_exit(&btrfs_bioset); - -free_buffer_cache: - kmem_cache_destroy(extent_buffer_cache); - extent_buffer_cache = NULL; - return -ENOMEM; -} - -void __cold extent_state_cache_exit(void) -{ - btrfs_extent_state_leak_debug_check(); - kmem_cache_destroy(extent_state_cache); } -void __cold extent_io_exit(void) +void __cold extent_buffer_free_cachep(void) { /* * Make sure all delayed rcu free are flushed before we @@ -283,1244 +180,6 @@ void __cold extent_io_exit(void) */ rcu_barrier(); kmem_cache_destroy(extent_buffer_cache); - bioset_exit(&btrfs_bioset); -} - -/* - * For the file_extent_tree, we want to hold the inode lock when we lookup and - * update the disk_i_size, but lockdep will complain because our io_tree we hold - * the tree lock and get the inode lock when setting delalloc. These two things - * are unrelated, so make a class for the file_extent_tree so we don't get the - * two locking patterns mixed up. - */ -static struct lock_class_key file_extent_tree_class; - -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data) -{ - tree->fs_info = fs_info; - tree->state = RB_ROOT; - tree->dirty_bytes = 0; - spin_lock_init(&tree->lock); - tree->private_data = private_data; - tree->owner = owner; - if (owner == IO_TREE_INODE_FILE_EXTENT) - lockdep_set_class(&tree->lock, &file_extent_tree_class); -} - -void extent_io_tree_release(struct extent_io_tree *tree) -{ - spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once extent_io_tree_release is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. - */ - ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); - - cond_resched_lock(&tree->lock); - } - spin_unlock(&tree->lock); -} - -static struct extent_state *alloc_extent_state(gfp_t mask) -{ - struct extent_state *state; - - /* - * The given mask might be not appropriate for the slab allocator, - * drop the unsupported bits - */ - mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); - state = kmem_cache_alloc(extent_state_cache, mask); - if (!state) - return state; - state->state = 0; - state->failrec = NULL; - RB_CLEAR_NODE(&state->rb_node); - btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); - refcount_set(&state->refs, 1); - init_waitqueue_head(&state->wq); - trace_alloc_extent_state(state, mask, _RET_IP_); - return state; -} - -void free_extent_state(struct extent_state *state) -{ - if (!state) - return; - if (refcount_dec_and_test(&state->refs)) { - WARN_ON(extent_state_in_tree(state)); - btrfs_leak_debug_del(&leak_lock, &state->leak_list); - trace_free_extent_state(state, _RET_IP_); - kmem_cache_free(extent_state_cache, state); - } -} - -/** - * Search @tree for an entry that contains @offset. Such entry would have - * entry->start <= offset && entry->end >= offset. - * - * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree - * @node_ret: pointer where new node should be anchored (used when inserting an - * entry in the tree) - * @parent_ret: points to entry which would have been the parent of the entry, - * containing @offset - * - * Return a pointer to the entry that contains @offset byte address and don't change - * @node_ret and @parent_ret. - * - * If no such entry exists, return pointer to entry that ends before @offset - * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. - */ -static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree, - u64 offset, - struct rb_node ***node_ret, - struct rb_node **parent_ret) -{ - struct rb_root *root = &tree->state; - struct rb_node **node = &root->rb_node; - struct rb_node *prev = NULL; - struct tree_entry *entry; - - while (*node) { - prev = *node; - entry = rb_entry(prev, struct tree_entry, rb_node); - - if (offset < entry->start) - node = &(*node)->rb_left; - else if (offset > entry->end) - node = &(*node)->rb_right; - else - return *node; - } - - if (node_ret) - *node_ret = node; - if (parent_ret) - *parent_ret = prev; - - /* Search neighbors until we find the first one past the end */ - while (prev && offset > entry->end) { - prev = rb_next(prev); - entry = rb_entry(prev, struct tree_entry, rb_node); - } - - return prev; -} - -/* - * Inexact rb-tree search, return the next entry if @offset is not found - */ -static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) -{ - return tree_search_for_insert(tree, offset, NULL, NULL); -} - -/** - * Search offset in the tree or fill neighbor rbtree node pointers. - * - * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree - * @next_ret: pointer to the first entry whose range ends after @offset - * @prev_ret: pointer to the first entry whose range begins before @offset - * - * Return a pointer to the entry that contains @offset byte address. If no - * such entry exists, then return NULL and fill @prev_ret and @next_ret. - * Otherwise return the found entry and other pointers are left untouched. - */ -static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree, - u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) -{ - struct rb_root *root = &tree->state; - struct rb_node **node = &root->rb_node; - struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; - struct tree_entry *entry; - - ASSERT(prev_ret); - ASSERT(next_ret); - - while (*node) { - prev = *node; - entry = rb_entry(prev, struct tree_entry, rb_node); - - if (offset < entry->start) - node = &(*node)->rb_left; - else if (offset > entry->end) - node = &(*node)->rb_right; - else - return *node; - } - - orig_prev = prev; - while (prev && offset > entry->end) { - prev = rb_next(prev); - entry = rb_entry(prev, struct tree_entry, rb_node); - } - *next_ret = prev; - prev = orig_prev; - - entry = rb_entry(prev, struct tree_entry, rb_node); - while (prev && offset < entry->start) { - prev = rb_prev(prev); - entry = rb_entry(prev, struct tree_entry, rb_node); - } - *prev_ret = prev; - - return NULL; -} - -/* - * utility function to look for merge candidates inside a given range. - * Any extents with matching state are merged together into a single - * extent in the tree. Extents with EXTENT_IO in their state field - * are not merged because the end_io handlers need to be able to do - * operations on them without sleeping (or doing allocations/splits). - * - * This should be called with the tree lock held. - */ -static void merge_state(struct extent_io_tree *tree, - struct extent_state *state) -{ - struct extent_state *other; - struct rb_node *other_node; - - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) - return; - - other_node = rb_prev(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->end == state->start - 1 && - other->state == state->state) { - if (tree->private_data && - is_data_inode(tree->private_data)) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); - state->start = other->start; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - } - other_node = rb_next(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->start == state->end + 1 && - other->state == state->state) { - if (tree->private_data && - is_data_inode(tree->private_data)) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); - state->end = other->end; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - } -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, u32 bits, - struct extent_changeset *changeset); - -/* - * insert an extent_state struct into the tree. 'bits' are set on the - * struct before it is inserted. - * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. - * - * The tree lock is not taken internally. This is a utility function and - * probably isn't what you want to call (see set/clear_extent_bit). - */ -static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, struct extent_changeset *changeset) -{ - struct rb_node **node; - struct rb_node *parent; - const u64 end = state->end; - - set_state_bits(tree, state, bits, changeset); - - node = &tree->state.rb_node; - while (*node) { - struct tree_entry *entry; - - parent = *node; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (end < entry->start) { - node = &(*node)->rb_left; - } else if (end > entry->end) { - node = &(*node)->rb_right; - } else { - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, state->start, end); - return -EEXIST; - } - } - - rb_link_node(&state->rb_node, parent, node); - rb_insert_color(&state->rb_node, &tree->state); - - merge_state(tree, state); - return 0; -} - -/* - * Insert state to @tree to the location given by @node and @parent. - */ -static void insert_state_fast(struct extent_io_tree *tree, - struct extent_state *state, struct rb_node **node, - struct rb_node *parent, unsigned bits, - struct extent_changeset *changeset) -{ - set_state_bits(tree, state, bits, changeset); - rb_link_node(&state->rb_node, parent, node); - rb_insert_color(&state->rb_node, &tree->state); - merge_state(tree, state); -} - -/* - * split a given extent state struct in two, inserting the preallocated - * struct 'prealloc' as the newly created second half. 'split' indicates an - * offset inside 'orig' where it should be split. - * - * Before calling, - * the tree has 'orig' at [orig->start, orig->end]. After calling, there - * are two extent state structs in the tree: - * prealloc: [orig->start, split - 1] - * orig: [ split, orig->end ] - * - * The tree locks are not taken by this function. They need to be held - * by the caller. - */ -static int split_state(struct extent_io_tree *tree, struct extent_state *orig, - struct extent_state *prealloc, u64 split) -{ - struct rb_node *parent = NULL; - struct rb_node **node; - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_split_delalloc_extent(tree->private_data, orig, split); - - prealloc->start = orig->start; - prealloc->end = split - 1; - prealloc->state = orig->state; - orig->start = split; - - parent = &orig->rb_node; - node = &parent; - while (*node) { - struct tree_entry *entry; - - parent = *node; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (prealloc->end < entry->start) { - node = &(*node)->rb_left; - } else if (prealloc->end > entry->end) { - node = &(*node)->rb_right; - } else { - free_extent_state(prealloc); - return -EEXIST; - } - } - - rb_link_node(&prealloc->rb_node, parent, node); - rb_insert_color(&prealloc->rb_node, &tree->state); - - return 0; -} - -static struct extent_state *next_state(struct extent_state *state) -{ - struct rb_node *next = rb_next(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; -} - -/* - * utility function to clear some bits in an extent state struct. - * it will optionally wake up anyone waiting on this state (wake == 1). - * - * If no bits are set on the state struct after clearing things, the - * struct is freed and removed from the tree - */ -static struct extent_state *clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, int wake, - struct extent_changeset *changeset) -{ - struct extent_state *next; - u32 bits_to_clear = bits & ~EXTENT_CTLBITS; - int ret; - - if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - WARN_ON(range > tree->dirty_bytes); - tree->dirty_bytes -= range; - } - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_clear_delalloc_extent(tree->private_data, state, bits); - - ret = add_extent_changeset(state, bits_to_clear, changeset, 0); - BUG_ON(ret < 0); - state->state &= ~bits_to_clear; - if (wake) - wake_up(&state->wq); - if (state->state == 0) { - next = next_state(state); - if (extent_state_in_tree(state)) { - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - free_extent_state(state); - } else { - WARN_ON(1); - } - } else { - merge_state(tree, state); - next = next_state(state); - } - return next; -} - -static struct extent_state * -alloc_extent_state_atomic(struct extent_state *prealloc) -{ - if (!prealloc) - prealloc = alloc_extent_state(GFP_ATOMIC); - - return prealloc; -} - -static void extent_io_tree_panic(struct extent_io_tree *tree, int err) -{ - btrfs_panic(tree->fs_info, err, - "locking error: extent tree was modified by another thread while locked"); -} - -/* - * clear some bits on a range in the tree. This may require splitting - * or inserting elements in the tree, so the gfp mask is used to - * indicate which allocations or sleeping are allowed. - * - * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove - * the given range from the tree regardless of state (ie for truncate). - * - * the range [start, end] is inclusive. - * - * This takes the tree lock, and returns 0 on success and < 0 on error. - */ -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) -{ - struct extent_state *state; - struct extent_state *cached; - struct extent_state *prealloc = NULL; - struct rb_node *node; - u64 last_end; - int err; - int clear = 0; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); - - if (bits & EXTENT_DELALLOC) - bits |= EXTENT_NORESERVE; - - if (delete) - bits |= ~EXTENT_CTLBITS; - - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) - clear = 1; -again: - if (!prealloc && gfpflags_allow_blocking(mask)) { - /* - * Don't care for allocation failure here because we might end - * up not needing the pre-allocated extent state at all, which - * is the case if we only have in the tree extent states that - * cover our input range and don't cover too any other range. - * If we end up needing a new extent state we allocate it later. - */ - prealloc = alloc_extent_state(mask); - } - - spin_lock(&tree->lock); - if (cached_state) { - cached = *cached_state; - - if (clear) { - *cached_state = NULL; - cached_state = NULL; - } - - if (cached && extent_state_in_tree(cached) && - cached->start <= start && cached->end > start) { - if (clear) - refcount_dec(&cached->refs); - state = cached; - goto hit_next; - } - if (clear) - free_extent_state(cached); - } - /* - * this search will find the extents that end after - * our range starts - */ - node = tree_search(tree, start); - if (!node) - goto out; - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - if (state->start > end) - goto out; - WARN_ON(state->end < start); - last_end = state->end; - - /* the state doesn't have the wanted bits, go ahead */ - if (!(state->state & bits)) { - state = next_state(state); - goto next; - } - - /* - * | ---- desired range ---- | - * | state | or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip - * bits on second half. - * - * If the extent we found extends past our range, we - * just split and search again. It'll get split again - * the next time though. - * - * If the extent we found is inside our range, we clear - * the desired bit on it. - */ - - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - state = clear_state_bit(tree, state, bits, wake, changeset); - goto next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and clear the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - if (wake) - wake_up(&state->wq); - - clear_state_bit(tree, prealloc, bits, wake, changeset); - - prealloc = NULL; - goto out; - } - - state = clear_state_bit(tree, state, bits, wake, changeset); -next: - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start <= end && state && !need_resched()) - goto hit_next; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; - -} - -static void wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) - __releases(tree->lock) - __acquires(tree->lock) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&tree->lock); - schedule(); - spin_lock(&tree->lock); - finish_wait(&state->wq, &wait); -} - -/* - * waits for one or more bits to clear on a range in the state tree. - * The range [start, end] is inclusive. - * The tree lock is taken by this function - */ -static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits) -{ - struct extent_state *state; - struct rb_node *node; - - btrfs_debug_check_extent_io_range(tree, start, end); - - spin_lock(&tree->lock); -again: - while (1) { - /* - * this search will find all the extents that end after - * our range starts - */ - node = tree_search(tree, start); -process_node: - if (!node) - break; - - state = rb_entry(node, struct extent_state, rb_node); - - if (state->start > end) - goto out; - - if (state->state & bits) { - start = state->start; - refcount_inc(&state->refs); - wait_on_state(tree, state); - free_extent_state(state); - goto again; - } - start = state->end + 1; - - if (start > end) - break; - - if (!cond_resched_lock(&tree->lock)) { - node = rb_next(node); - goto process_node; - } - } -out: - spin_unlock(&tree->lock); -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, struct extent_changeset *changeset) -{ - u32 bits_to_set = bits & ~EXTENT_CTLBITS; - int ret; - - if (tree->private_data && is_data_inode(tree->private_data)) - btrfs_set_delalloc_extent(tree->private_data, state, bits); - - if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - tree->dirty_bytes += range; - } - ret = add_extent_changeset(state, bits_to_set, changeset, 1); - BUG_ON(ret < 0); - state->state |= bits_to_set; -} - -static void cache_state_if_flags(struct extent_state *state, - struct extent_state **cached_ptr, - unsigned flags) -{ - if (cached_ptr && !(*cached_ptr)) { - if (!flags || (state->state & flags)) { - *cached_ptr = state; - refcount_inc(&state->refs); - } - } -} - -static void cache_state(struct extent_state *state, - struct extent_state **cached_ptr) -{ - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); -} - -/* - * set some bits on a range in the tree. This may require allocations or - * sleeping, so the gfp mask is used to indicate what is allowed. - * - * If any of the exclusive bits are set, this will fail with -EEXIST if some - * part of the range already has the desired bits set. The start of the - * existing range is returned in failed_start in this case. - * - * [start, end] is inclusive This takes the tree lock. - */ -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - u32 exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask, - struct extent_changeset *changeset) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - struct rb_node **p; - struct rb_node *parent; - int err = 0; - u64 last_start; - u64 last_end; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); - - if (exclusive_bits) - ASSERT(failed_start); - else - ASSERT(failed_start == NULL); -again: - if (!prealloc && gfpflags_allow_blocking(mask)) { - /* - * Don't care for allocation failure here because we might end - * up not needing the pre-allocated extent state at all, which - * is the case if we only have in the tree extent states that - * cover our input range and don't cover too any other range. - * If we end up needing a new extent state we allocate it later. - */ - prealloc = alloc_extent_state(mask); - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - extent_state_in_tree(state)) { - node = &state->rb_node; - goto hit_next; - } - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search_for_insert(tree, start, &p, &parent); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - prealloc->start = start; - prealloc->end = end; - insert_state_fast(tree, prealloc, p, parent, bits, changeset); - cache_state(prealloc, cached_state); - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - if (state->state & exclusive_bits) { - *failed_start = state->start; - err = -EEXIST; - goto out; - } - - set_state_bits(tree, state, bits, changeset); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - /* - * If this extent already has all the bits we want set, then - * skip it, not necessary to split it or do anything with it. - */ - if ((state->state & bits) == bits) { - start = state->end + 1; - cache_state(state, cached_state); - goto search_again; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, bits, changeset); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - - /* - * Avoid to free 'prealloc' if it can be merged with - * the later extent. - */ - prealloc->start = start; - prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, bits, changeset); - cache_state(prealloc, cached_state); - merge_state(tree, prealloc); - prealloc = NULL; - goto out; - } - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; - -} - -/** - * convert_extent_bit - convert all bits in a given range from one bit to - * another - * @tree: the io tree to search - * @start: the start offset in bytes - * @end: the end offset in bytes (inclusive) - * @bits: the bits to set in this range - * @clear_bits: the bits to clear in this range - * @cached_state: state that we're going to cache - * - * This will go through and set bits for the given range. If any states exist - * already in this range they are set with the given bit and cleared of the - * clear_bits. This is only meant to be used by things that are mergeable, ie - * converting from say DELALLOC to DIRTY. This is not meant to be used with - * boundary bits like LOCK. - * - * All allocations are done with GFP_NOFS. - */ -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - struct rb_node **p; - struct rb_node *parent; - int err = 0; - u64 last_start; - u64 last_end; - bool first_iteration = true; - - btrfs_debug_check_extent_io_range(tree, start, end); - trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, - clear_bits); - -again: - if (!prealloc) { - /* - * Best effort, don't worry if extent state allocation fails - * here for the first iteration. We might have a cached state - * that matches exactly the target range, in which case no - * extent state allocations are needed. We'll only know this - * after locking the tree. - */ - prealloc = alloc_extent_state(GFP_NOFS); - if (!prealloc && !first_iteration) - return -ENOMEM; - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - extent_state_in_tree(state)) { - node = &state->rb_node; - goto hit_next; - } - } - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search_for_insert(tree, start, &p, &parent); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - prealloc->start = start; - prealloc->end = end; - insert_state_fast(tree, prealloc, p, parent, bits, NULL); - cache_state(prealloc, cached_state); - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - set_state_bits(tree, state, bits, NULL); - cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, bits, NULL); - cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) - goto hit_next; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - /* - * Avoid to free 'prealloc' if it can be merged with - * the later extent. - */ - prealloc->start = start; - prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, NULL); - if (err) - extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, bits, NULL); - cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, clear_bits, 0, NULL); - prealloc = NULL; - goto out; - } - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - cond_resched(); - first_iteration = false; - goto again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; -} - -/* wrappers around set/clear extent bit */ -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) -{ - /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. - */ - BUG_ON(bits & EXTENT_LOCKED); - - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - changeset); -} - -int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits) -{ - return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, - GFP_NOWAIT, NULL); -} - -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int wake, int delete, - struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, bits, wake, delete, - cached, GFP_NOFS, NULL); -} - -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) -{ - /* - * Don't support EXTENT_LOCKED case, same reason as - * set_record_extent_bits(). - */ - BUG_ON(bits & EXTENT_LOCKED); - - return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, - changeset); -} - -/* - * either insert or lock state struct between start and end use mask to tell - * us if waiting is desired. - */ -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) -{ - int err; - u64 failed_start; - - while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, - EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS, NULL); - if (err == -EEXIST) { - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); - start = failed_start; - } else - break; - WARN_ON(start > end); - } - return err; -} - -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - int err; - u64 failed_start; - - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS, NULL); - if (err == -EEXIST) { - if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL); - return 0; - } - return 1; } void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) @@ -1554,295 +213,6 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) } } -/* find the first state struct with 'bits' set after 'start', and - * return it. tree->lock must be held. NULL will returned if - * nothing was found after 'start' - */ -static struct extent_state * -find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits) -{ - struct rb_node *node; - struct extent_state *state; - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) - return state; - - node = rb_next(node); - if (!node) - break; - } -out: - return NULL; -} - -/* - * Find the first offset in the io tree with one or more @bits set. - * - * Note: If there are multiple bits set in @bits, any of them will match. - * - * Return 0 if we find something, and update @start_ret and @end_ret. - * Return 1 if we found nothing. - */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->end == start - 1 && extent_state_in_tree(state)) { - while ((state = next_state(state)) != NULL) { - if (state->state & bits) - goto got_it; - } - free_extent_state(*cached_state); - *cached_state = NULL; - goto out; - } - free_extent_state(*cached_state); - *cached_state = NULL; - } - - state = find_first_extent_bit_state(tree, start, bits); -got_it: - if (state) { - cache_state_if_flags(state, cached_state, 0); - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - } -out: - spin_unlock(&tree->lock); - return ret; -} - -/** - * Find a contiguous area of bits - * - * @tree: io tree to check - * @start: offset to start the search from - * @start_ret: the first offset we found with the bits set - * @end_ret: the final contiguous range of the bits that were set - * @bits: bits to look for - * - * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges - * to set bits appropriately, and then merge them again. During this time it - * will drop the tree->lock, so use this helper if you want to find the actual - * contiguous area for given bits. We will search to the first bit we find, and - * then walk down the tree until we find a non-contiguous area. The area - * returned will be the full contiguous area with the bits set. - */ -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, start, bits); - if (state) { - *start_ret = state->start; - *end_ret = state->end; - while ((state = next_state(state)) != NULL) { - if (state->start > (*end_ret + 1)) - break; - *end_ret = state->end; - } - ret = 0; - } - spin_unlock(&tree->lock); - return ret; -} - -/** - * Find the first range that has @bits not set. This range could start before - * @start. - * - * @tree: the tree to search - * @start: offset at/after which the found extent should start - * @start_ret: records the beginning of the range - * @end_ret: records the end of the range (inclusive) - * @bits: the set of bits which must be unset - * - * Since unallocated range is also considered one which doesn't have the bits - * set it's possible that @end_ret contains -1, this happens in case the range - * spans (last_range_end, end of device]. In this case it's up to the caller to - * trim @end_ret to the appropriate size. - */ -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) -{ - struct extent_state *state; - struct rb_node *node, *prev = NULL, *next; - - spin_lock(&tree->lock); - - /* Find first extent with bits cleared */ - while (1) { - node = tree_search_prev_next(tree, start, &prev, &next); - if (!node && !next && !prev) { - /* - * Tree is completely empty, send full range and let - * caller deal with it - */ - *start_ret = 0; - *end_ret = -1; - goto out; - } else if (!node && !next) { - /* - * We are past the last allocated chunk, set start at - * the end of the last extent. - */ - state = rb_entry(prev, struct extent_state, rb_node); - *start_ret = state->end + 1; - *end_ret = -1; - goto out; - } else if (!node) { - node = next; - } - /* - * At this point 'node' either contains 'start' or start is - * before 'node' - */ - state = rb_entry(node, struct extent_state, rb_node); - - if (in_range(start, state->start, state->end - state->start + 1)) { - if (state->state & bits) { - /* - * |--range with bits sets--| - * | - * start - */ - start = state->end + 1; - } else { - /* - * 'start' falls within a range that doesn't - * have the bits set, so take its start as - * the beginning of the desired range - * - * |--range with bits cleared----| - * | - * start - */ - *start_ret = state->start; - break; - } - } else { - /* - * |---prev range---|---hole/unset---|---node range---| - * | - * start - * - * or - * - * |---hole/unset--||--first node--| - * 0 | - * start - */ - if (prev) { - state = rb_entry(prev, struct extent_state, - rb_node); - *start_ret = state->end + 1; - } else { - *start_ret = 0; - } - break; - } - } - - /* - * Find the longest stretch from start until an entry which has the - * bits set - */ - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && !(state->state & bits)) { - *end_ret = state->end; - } else { - *end_ret = state->start - 1; - break; - } - - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); -} - -/* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. start and end are used to return the range, - * - * true is returned if we find something, false if nothing was in the tree - */ -bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, - u64 *end, u64 max_bytes, - struct extent_state **cached_state) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - bool found = false; - u64 total_bytes = 0; - - spin_lock(&tree->lock); - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) { - *end = (u64)-1; - goto out; - } - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (found && (state->start != cur_start || - (state->state & EXTENT_BOUNDARY))) { - goto out; - } - if (!(state->state & EXTENT_DELALLOC)) { - if (!found) - *end = state->end; - goto out; - } - if (!found) { - *start = state->start; - *cached_state = state; - refcount_inc(&state->refs); - } - found = true; - *end = state->end; - cur_start = state->end + 1; - node = rb_next(node); - total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) - break; - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return found; -} - /* * Process one page for __process_pages_contig(). * @@ -1900,9 +270,8 @@ static int __process_pages_contig(struct address_space *mapping, pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long nr_pages = end_index - start_index + 1; unsigned long pages_processed = 0; - struct page *pages[16]; + struct folio_batch fbatch; int err = 0; int i; @@ -1911,16 +280,17 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) mapping_set_error(mapping, -EIO); - while (nr_pages > 0) { - int found_pages; + folio_batch_init(&fbatch); + while (index <= end_index) { + int found_folios; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); - found_pages = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (found_pages == 0) { + if (found_folios == 0) { /* * Only if we're going to lock these pages, we can find * nothing at @index. @@ -1930,23 +300,20 @@ static int __process_pages_contig(struct address_space *mapping, goto out; } - for (i = 0; i < found_pages; i++) { + for (i = 0; i < found_folios; i++) { int process_ret; - + struct folio *folio = fbatch.folios[i]; process_ret = process_one_page(fs_info, mapping, - pages[i], locked_page, page_ops, + &folio->page, locked_page, page_ops, start, end); if (process_ret < 0) { - for (; i < found_pages; i++) - put_page(pages[i]); err = -EAGAIN; + folio_batch_release(&fbatch); goto out; } - put_page(pages[i]); - pages_processed++; + pages_processed += folio_nr_pages(folio); } - nr_pages -= found_pages; - index += found_pages; + folio_batch_release(&fbatch); cond_resched(); } out: @@ -2094,14 +461,14 @@ again: } /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); + lock_extent(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, EXTENT_DELALLOC, 1, cached_state); if (!ret) { - unlock_extent_cached(tree, delalloc_start, delalloc_end, - &cached_state); + unlock_extent(tree, delalloc_start, delalloc_end, + &cached_state); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); @@ -2118,210 +485,46 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, start, end, page_ops, NULL); } -/* - * count the number of bytes in the tree that have a given bit(s) - * set. This can be fairly slow, except for EXTENT_DIRTY which is - * cached. The total number found is returned. - */ -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig) +static int insert_failrec(struct btrfs_inode *inode, + struct io_failure_record *failrec) { - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 total_bytes = 0; - u64 last = 0; - int found = 0; - - if (WARN_ON(search_end <= cur_start)) - return 0; + struct rb_node *exist; - spin_lock(&tree->lock); - if (cur_start == 0 && bits == EXTENT_DIRTY) { - total_bytes = tree->dirty_bytes; - goto out; - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) - goto out; + spin_lock(&inode->io_failure_lock); + exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr, + &failrec->rb_node); + spin_unlock(&inode->io_failure_lock); - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->start > search_end) - break; - if (contig && found && state->start > last + 1) - break; - if (state->end >= cur_start && (state->state & bits) == bits) { - total_bytes += min(search_end, state->end) + 1 - - max(cur_start, state->start); - if (total_bytes >= max_bytes) - break; - if (!found) { - *start = max(cur_start, state->start); - found = 1; - } - last = state->end; - } else if (contig && found) { - break; - } - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return total_bytes; + return (exist == NULL) ? 0 : -EEXIST; } -/* - * set the private field for a given byte offset in the tree. If there isn't - * an extent_state there already, this does nothing. - */ -int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec) +static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start) { struct rb_node *node; - struct extent_state *state; - int ret = 0; + struct io_failure_record *failrec = ERR_PTR(-ENOENT); - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - state->failrec = failrec; -out: - spin_unlock(&tree->lock); - return ret; -} - -struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) -{ - struct rb_node *node; - struct extent_state *state; - struct io_failure_record *failrec; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - failrec = ERR_PTR(-ENOENT); - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - failrec = ERR_PTR(-ENOENT); - goto out; - } - - failrec = state->failrec; -out: - spin_unlock(&tree->lock); + spin_lock(&inode->io_failure_lock); + node = rb_simple_search(&inode->io_failure_tree, start); + if (node) + failrec = rb_entry(node, struct io_failure_record, rb_node); + spin_unlock(&inode->io_failure_lock); return failrec; } -/* - * searches a range in the state tree for a given mask. - * If 'filled' == 1, this returns 1 only if every extent in the tree - * has the bits set. Otherwise, 1 is returned if any bit in the - * range is found set. - */ -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached) -{ - struct extent_state *state = NULL; - struct rb_node *node; - int bitset = 0; - - spin_lock(&tree->lock); - if (cached && extent_state_in_tree(cached) && cached->start <= start && - cached->end > start) - node = &cached->rb_node; - else - node = tree_search(tree, start); - while (node && start <= end) { - state = rb_entry(node, struct extent_state, rb_node); - - if (filled && state->start > start) { - bitset = 0; - break; - } - - if (state->start > end) - break; - - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; - break; - } - - if (state->end == (u64)-1) - break; - - start = state->end + 1; - if (start > end) - break; - node = rb_next(node); - if (!node) { - if (filled) - bitset = 0; - break; - } - } - spin_unlock(&tree->lock); - return bitset; -} - -int free_io_failure(struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, - struct io_failure_record *rec) +static void free_io_failure(struct btrfs_inode *inode, + struct io_failure_record *rec) { - int ret; - int err = 0; - - set_state_failrec(failure_tree, rec->start, NULL); - ret = clear_extent_bits(failure_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret) - err = ret; - - ret = clear_extent_bits(io_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_DAMAGED); - if (ret && !err) - err = ret; + spin_lock(&inode->io_failure_lock); + rb_erase(&rec->rb_node, &inode->io_failure_tree); + spin_unlock(&inode->io_failure_lock); kfree(rec); - return err; } /* @@ -2456,24 +659,18 @@ static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -int clean_io_failure(struct btrfs_fs_info *fs_info, - struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, u64 start, - struct page *page, u64 ino, unsigned int pg_offset) +int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, + struct page *page, unsigned int pg_offset) { - u64 private; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + u64 ino = btrfs_ino(inode); + u64 locked_start, locked_end; struct io_failure_record *failrec; - struct extent_state *state; int mirror; int ret; - private = 0; - ret = count_range_bits(failure_tree, &private, (u64)-1, 1, - EXTENT_DIRTY, 0); - if (!ret) - return 0; - - failrec = get_state_failrec(failure_tree, start); + failrec = get_failrec(inode, start); if (IS_ERR(failrec)) return 0; @@ -2482,14 +679,10 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, if (sb_rdonly(fs_info->sb)) goto out; - spin_lock(&io_tree->lock); - state = find_first_extent_bit_state(io_tree, - failrec->start, - EXTENT_LOCKED); - spin_unlock(&io_tree->lock); - - if (!state || state->start > failrec->start || - state->end < failrec->start + failrec->len - 1) + ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start, + &locked_end, EXTENT_LOCKED, NULL); + if (ret || locked_start > failrec->bytenr || + locked_end < failrec->bytenr + failrec->len - 1) goto out; mirror = failrec->this_mirror; @@ -2500,7 +693,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, } while (mirror != failrec->failed_mirror); out: - free_io_failure(failure_tree, io_tree, failrec); + free_io_failure(inode, failrec); return 0; } @@ -2512,30 +705,26 @@ out: */ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) { - struct extent_io_tree *failure_tree = &inode->io_failure_tree; struct io_failure_record *failrec; - struct extent_state *state, *next; + struct rb_node *node, *next; - if (RB_EMPTY_ROOT(&failure_tree->state)) + if (RB_EMPTY_ROOT(&inode->io_failure_tree)) return; - spin_lock(&failure_tree->lock); - state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); - while (state) { - if (state->start > end) + spin_lock(&inode->io_failure_lock); + node = rb_simple_search_first(&inode->io_failure_tree, start); + while (node) { + failrec = rb_entry(node, struct io_failure_record, rb_node); + if (failrec->bytenr > end) break; - ASSERT(state->end <= end); - - next = next_state(state); - - failrec = state->failrec; - free_extent_state(state); + next = rb_next(node); + rb_erase(&failrec->rb_node, &inode->io_failure_tree); kfree(failrec); - state = next; + node = next; } - spin_unlock(&failure_tree->lock); + spin_unlock(&inode->io_failure_lock); } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, @@ -2545,16 +734,14 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 start = bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u32 sectorsize = fs_info->sectorsize; int ret; - failrec = get_state_failrec(failure_tree, start); + failrec = get_failrec(BTRFS_I(inode), start); if (!IS_ERR(failrec)) { btrfs_debug(fs_info, "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", - failrec->logical, failrec->start, failrec->len); + failrec->logical, failrec->bytenr, failrec->len); /* * when data can be on disk more than twice, add to failrec here * (e.g. with a list for failed_mirror) to make @@ -2569,7 +756,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode if (!failrec) return ERR_PTR(-ENOMEM); - failrec->start = start; + RB_CLEAR_NODE(&failrec->rb_node); + failrec->bytenr = start; failrec->len = sectorsize; failrec->failed_mirror = bbio->mirror_num; failrec->this_mirror = bbio->mirror_num; @@ -2594,14 +782,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode } /* Set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret >= 0) { - ret = set_state_failrec(failure_tree, start, failrec); - /* Set the bits in the inode's tree */ - ret = set_extent_bits(tree, start, start + sectorsize - 1, - EXTENT_DAMAGED); - } else if (ret < 0) { + ret = insert_failrec(BTRFS_I(inode), failrec); + if (ret) { kfree(failrec); return ERR_PTR(ret); } @@ -2616,8 +798,6 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; @@ -2646,17 +826,15 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, btrfs_debug(fs_info, "failed to repair num_copies %d this_mirror %d failed_mirror %d", failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(failure_tree, tree, failrec); + free_io_failure(BTRFS_I(inode), failrec); return -EIO; } - repair_bio = btrfs_bio_alloc(1); + repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io, + failed_bbio->private); repair_bbio = btrfs_bio(repair_bio); repair_bbio->file_offset = start; - repair_bio->bi_opf = REQ_OP_READ; - repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; - repair_bio->bi_private = failed_bio->bi_private; if (failed_bbio->csum) { const u32 csum_size = fs_info->csum_size; @@ -2720,8 +898,8 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) if (uptodate) set_extent_uptodate(&inode->io_tree, offset, offset + sectorsize - 1, &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(&inode->io_tree, offset, - offset + sectorsize - 1, &cached); + unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1, + &cached); } static void submit_data_read_repair(struct inode *inode, @@ -2823,8 +1001,9 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct bio *bio) +static void end_bio_extent_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; @@ -2924,11 +1103,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed, * Now we don't have range contiguous to the processed range, release * the processed range now. */ - if (processed->uptodate && tree->track_uptodate) - set_extent_uptodate(tree, processed->start, processed->end, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(tree, processed->start, processed->end, - &cached); + unlock_extent_atomic(tree, processed->start, processed->end, &cached); update: /* Update processed to current range */ @@ -2988,11 +1163,10 @@ static struct extent_buffer *find_extent_buffer_readpage( * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_readpage(struct bio *bio) +static void end_bio_extent_readpage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct bio_vec *bvec; - struct btrfs_bio *bbio = btrfs_bio(bio); - struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; /* * The offset to the beginning of a bio, since one bio can never be @@ -3019,8 +1193,6 @@ static void end_bio_extent_readpage(struct bio *bio) "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); - tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; /* * We always issue full-sector reads, but if some block in a @@ -3061,9 +1233,7 @@ static void end_bio_extent_readpage(struct bio *bio) loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, tree, start, page, - btrfs_ino(BTRFS_I(inode)), 0); + btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0); /* * Zero out the remaining part if this range straddles @@ -3162,50 +1332,6 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) return 0; } -/* - * Initialize the members up to but not including 'bio'. Use after allocating a - * new bio by bio_alloc_bioset as it does not initialize the bytes outside of - * 'bio' because use of __GFP_ZERO is not supported. - */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio) -{ - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); -} - -/* - * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs. - * - * The bio allocation is backed by bioset and does not fail. - */ -struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) -{ - struct bio *bio; - - ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); - bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio)); - return bio; -} - -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) -{ - struct bio *bio; - struct btrfs_bio *bbio; - - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); - - /* this will never fail when it's backed by a bioset */ - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - ASSERT(bio); - - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio); - - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; - return bio; -} - /** * Attempt to add a page to bio * @@ -3351,7 +1477,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, struct btrfs_bio_ctrl *bio_ctrl, struct writeback_control *wbc, blk_opf_t opf, - bio_end_io_t end_io_func, u64 disk_bytenr, u32 offset, u64 file_offset, enum btrfs_compression_type compress_type) { @@ -3359,7 +1484,9 @@ static int alloc_new_bio(struct btrfs_inode *inode, struct bio *bio; int ret; - bio = btrfs_bio_alloc(BIO_MAX_VECS); + ASSERT(bio_ctrl->end_io_func); + + bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. @@ -3370,8 +1497,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; - bio->bi_end_io = end_io_func; - bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) goto error; @@ -3410,31 +1535,30 @@ static int alloc_new_bio(struct btrfs_inode *inode, return 0; error: bio_ctrl->bio = NULL; - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); return ret; } /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting - * @page: page to add to the bio * @disk_bytenr: logical bytenr where the write will be + * @page: page to add to the bio * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @bio_ret: must be valid pointer, newly allocated bio will be stored there - * @end_io_func: end_io callback for new bio - * @mirror_num: desired mirror to read/write - * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @compress_type: compress type for current bio + * + * The will either add the page into the existing @bio_ctrl->bio, or allocate a + * new one in @bio_ctrl->bio. + * The mirror number for this IO should already be initizlied in + * @bio_ctrl->mirror_num. */ static int submit_extent_page(blk_opf_t opf, struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + u64 disk_bytenr, struct page *page, size_t size, unsigned long pg_offset, - bio_end_io_t end_io_func, enum btrfs_compression_type compress_type, bool force_bio_submit) { @@ -3446,6 +1570,9 @@ static int submit_extent_page(blk_opf_t opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); + + ASSERT(bio_ctrl->end_io_func); + if (force_bio_submit) submit_one_bio(bio_ctrl); @@ -3456,7 +1583,7 @@ static int submit_extent_page(blk_opf_t opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - end_io_func, disk_bytenr, offset, + disk_bytenr, offset, page_offset(page) + cur, compress_type); if (ret < 0) @@ -3613,7 +1740,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 extent_offset; u64 last_byte = i_size_read(inode); u64 block_start; - u64 cur_end; struct extent_map *em; int ret = 0; size_t pg_offset = 0; @@ -3623,7 +1749,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = set_page_extent_mapped(page); if (ret < 0) { - unlock_extent(tree, start, end); + unlock_extent(tree, start, end, NULL); btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); unlock_page(page); goto out; @@ -3637,6 +1763,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, memzero_page(page, zero_offset, iosize); } } + bio_ctrl->end_io_func = end_bio_extent_readpage; begin_page_read(fs_info, page); while (cur <= end) { unsigned long this_bio_flag = 0; @@ -3651,15 +1778,14 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, memzero_page(page, pg_offset, iosize); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, - cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, &cached); end_page_read(page, true, cur, iosize); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); ret = PTR_ERR(em); break; @@ -3672,7 +1798,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, this_bio_flag = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); - cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); if (this_bio_flag != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; @@ -3735,43 +1860,31 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, - cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, &cached); end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, - EXTENT_UPTODATE, 1, NULL)) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; - continue; - } - /* we have an inline extent but it didn't get marked up - * to date. Error out - */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + unlock_extent(tree, cur, cur + iosize - 1, NULL); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - bio_ctrl, page, disk_bytenr, iosize, - pg_offset, end_bio_extent_readpage, - this_bio_flag, force_bio_submit); + bio_ctrl, disk_bytenr, page, iosize, + pg_offset, this_bio_flag, + force_bio_submit); if (ret) { /* * We have to unlock the remaining range, or the page * will never be unlocked. */ - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); goto out; } @@ -3984,6 +2097,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ wbc->nr_to_write--; + epd->bio_ctrl.end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; u64 em_end; @@ -4077,10 +2191,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_page_clear_dirty(fs_info, page, cur, iosize); ret = submit_extent_page(op | write_flags, wbc, - &epd->bio_ctrl, page, - disk_bytenr, iosize, + &epd->bio_ctrl, disk_bytenr, + page, iosize, cur - page_offset(page), - end_bio_extent_writepage, 0, false); if (ret) { has_error = true; @@ -4431,8 +2544,9 @@ static struct extent_buffer *find_extent_buffer_nolock( * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() * after all extent buffers in the page has finished their writeback. */ -static void end_bio_subpage_eb_writepage(struct bio *bio) +static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -4488,8 +2602,9 @@ static void end_bio_subpage_eb_writepage(struct bio *bio) bio_put(bio); } -static void end_bio_extent_buffer_writepage(struct bio *bio) +static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio) { + struct bio *bio = &bbio->bio; struct bio_vec *bvec; struct extent_buffer *eb; int done; @@ -4571,10 +2686,11 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); + epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage; + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, page, eb->start, eb->len, - eb->start - page_offset(page), - end_bio_subpage_eb_writepage, 0, false); + &epd->bio_ctrl, eb->start, page, eb->len, + eb->start - page_offset(page), 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4605,6 +2721,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, prepare_eb_write(eb); + epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage; + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -4612,10 +2730,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, p, disk_bytenr, - PAGE_SIZE, 0, - end_bio_extent_buffer_writepage, - 0, false); + &epd->bio_ctrl, disk_bytenr, p, + PAGE_SIZE, 0, 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -5236,7 +3352,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, &cached_state); + lock_extent(tree, start, end, &cached_state); folio_wait_writeback(folio); /* @@ -5244,7 +3360,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * so here we only need to unlock the extent range to free any * existing extent state. */ - unlock_extent_cached(tree, start, end, &cached_state); + unlock_extent(tree, start, end, &cached_state); return 0; } @@ -5263,15 +3379,17 @@ static int try_release_extent_state(struct extent_io_tree *tree, if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { ret = 0; } else { + u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | + EXTENT_DELALLOC_NEW | EXTENT_CTLBITS); + /* * At this point we can safely clear everything except the * locked bit, the nodatasum bit and the delalloc new bit. * The delalloc new bit will be cleared by ordered extent * completion. */ - ret = __clear_extent_bit(tree, start, end, - ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), - 0, 0, NULL, mask, NULL); + ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, + mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -5370,42 +3488,6 @@ next: } /* - * helper function for fiemap, which doesn't want to see any holes. - * This maps until we find something past 'last' - */ -static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, - u64 offset, u64 last) -{ - u64 sectorsize = btrfs_inode_sectorsize(inode); - struct extent_map *em; - u64 len; - - if (offset >= last) - return NULL; - - while (1) { - len = last - offset; - if (len == 0) - break; - len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(inode, offset, len); - if (IS_ERR(em)) - return em; - - /* if this isn't a hole return it */ - if (em->block_start != EXTENT_MAP_HOLE) - return em; - - /* this is a hole, advance to the next extent */ - offset = extent_map_end(em); - free_extent_map(em); - if (offset >= last) - break; - } - return NULL; -} - -/* * To cache previous fiemap extent * * Will be used for merging fiemap extent @@ -5434,6 +3516,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, { int ret = 0; + /* Set at the end of extent_fiemap(). */ + ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); + if (!cache->cached) goto assign; @@ -5457,16 +3542,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * So truly compressed (physical size smaller than logical size) * extents won't get merged with each other * - * 3) Share same flags except FIEMAP_EXTENT_LAST - * So regular extent won't get merged with prealloc extent + * 3) Share same flags */ if (cache->offset + cache->len == offset && cache->phys + cache->len == phys && - (cache->flags & ~FIEMAP_EXTENT_LAST) == - (flags & ~FIEMAP_EXTENT_LAST)) { + cache->flags == flags) { cache->len += len; - cache->flags |= flags; - goto try_submit_last; + return 0; } /* Not mergeable, need to submit cached one */ @@ -5481,13 +3563,8 @@ assign: cache->phys = phys; cache->len = len; cache->flags = flags; -try_submit_last: - if (cache->flags & FIEMAP_EXTENT_LAST) { - ret = fiemap_fill_next_extent(fieinfo, cache->offset, - cache->phys, cache->len, cache->flags); - cache->cached = false; - } - return ret; + + return 0; } /* @@ -5517,215 +3594,534 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) +static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path) { - int ret = 0; - u64 off; - u64 max = start + len; - u32 flags = 0; - u32 found_type; - u64 last; - u64 last_for_get_extent = 0; - u64 disko = 0; - u64 isize = i_size_read(&inode->vfs_inode); - struct btrfs_key found_key; - struct extent_map *em = NULL; - struct extent_state *cached_state = NULL; - struct btrfs_path *path; - struct btrfs_root *root = inode->root; - struct fiemap_cache cache = { 0 }; - struct ulist *roots; - struct ulist *tmp_ulist; - int end = 0; - u64 em_start = 0; - u64 em_len = 0; - u64 em_end = 0; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; - if (len == 0) - return -EINVAL; + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) + return 0; - path = btrfs_alloc_path(); - if (!path) + ret = btrfs_next_leaf(inode->root, path); + if (ret != 0) + return ret; + + /* + * Don't bother with cloning if there are no more file extent items for + * our inode. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; + + /* See the comment at fiemap_search_slot() about why we clone. */ + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) return -ENOMEM; - roots = ulist_alloc(GFP_KERNEL); - tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!roots || !tmp_ulist) { - ret = -ENOMEM; - goto out_free_ulist; + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Search for the first file extent item that starts at a given file offset or + * the one that starts immediately before that offset. + * Returns: 0 on success, < 0 on error, 1 if not found. + */ +static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path, + u64 file_offset) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *clone; + struct btrfs_key key; + int slot; + int ret; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = file_offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret != 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + return 1; } /* - * We can't initialize that to 'start' as this could miss extents due - * to extent item merging + * We clone the leaf and use it during fiemap. This is because while + * using the leaf we do expensive things like checking if an extent is + * shared, which can take a long time. In order to prevent blocking + * other tasks for too long, we use a clone of the leaf. We have locked + * the file range in the inode's io tree, so we know none of our file + * extent items can change. This way we avoid blocking other tasks that + * want to insert items for other inodes in the same leaf or b+tree + * rebalance operations (triggered for example when someone is trying + * to push items into this leaf when trying to insert an item in a + * neighbour leaf). + * We also need the private clone because holding a read lock on an + * extent buffer of the subvolume's b+tree will make lockdep unhappy + * when we call fiemap_fill_next_extent(), because that may cause a page + * fault when filling the user space buffer with fiemap data. */ - off = 0; - start = round_down(start, btrfs_inode_sectorsize(inode)); - len = round_up(max, btrfs_inode_sectorsize(inode)) - start; + clone = btrfs_clone_extent_buffer(path->nodes[0]); + if (!clone) + return -ENOMEM; + + slot = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = clone; + path->slots[0] = slot; + + return 0; +} + +/* + * Process a range which is a hole or a prealloc extent in the inode's subvolume + * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc + * extent. The end offset (@end) is inclusive. + */ +static int fiemap_process_hole(struct btrfs_inode *inode, + struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache, + struct btrfs_backref_shared_cache *backref_cache, + u64 disk_bytenr, u64 extent_offset, + u64 extent_gen, + struct ulist *roots, struct ulist *tmp_ulist, + u64 start, u64 end) +{ + const u64 i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + u64 cur_offset = start; + u64 last_delalloc_end = 0; + u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; + bool checked_extent_shared = false; + int ret; /* - * lookup the last file extent. We're not using i_size here - * because there might be preallocation past i_size + * There can be no delalloc past i_size, so don't waste time looking for + * it beyond i_size. */ - ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, - 0); - if (ret < 0) { - goto out_free_ulist; - } else { - WARN_ON(!ret); - if (ret == 1) - ret = 0; - } + while (cur_offset < end && cur_offset < i_size) { + u64 delalloc_start; + u64 delalloc_end; + u64 prealloc_start; + u64 prealloc_len = 0; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - found_type = found_key.type; - - /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(inode) || - found_type != BTRFS_EXTENT_DATA_KEY) { - /* have to trust i_size as the end */ - last = (u64)-1; - last_for_get_extent = isize; - } else { /* - * remember the start of the last extent. There are a - * bunch of different factors that go into the length of the - * extent, so its much less complex to remember where it started + * If this is a prealloc extent we have to report every section + * of it that has no delalloc. */ - last = found_key.offset; - last_for_get_extent = last + 1; + if (disk_bytenr != 0) { + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = delalloc_start - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = delalloc_start - prealloc_start; + } + } + + if (prealloc_len > 0) { + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode->root, + ino, disk_bytenr, + extent_gen, roots, + tmp_ulist, + backref_cache); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + + checked_extent_shared = true; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; + extent_offset += prealloc_len; + } + + ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0, + delalloc_end + 1 - delalloc_start, + FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ret) + return ret; + + last_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + extent_offset += cur_offset - delalloc_start; + cond_resched(); } - btrfs_release_path(path); /* - * we might have some extents allocated but more delalloc past those - * extents. so, we trust isize unless the start of the last extent is - * beyond isize + * Either we found no delalloc for the whole prealloc extent or we have + * a prealloc extent that spans i_size or starts at or after i_size. */ - if (last < isize) { - last = (u64)-1; - last_for_get_extent = isize; + if (disk_bytenr != 0 && last_delalloc_end < end) { + u64 prealloc_start; + u64 prealloc_len; + + if (last_delalloc_end == 0) { + prealloc_start = start; + prealloc_len = end + 1 - start; + } else { + prealloc_start = last_delalloc_end + 1; + prealloc_len = end + 1 - prealloc_start; + } + + if (!checked_extent_shared && fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(inode->root, + ino, disk_bytenr, + extent_gen, roots, + tmp_ulist, + backref_cache); + if (ret < 0) + return ret; + else if (ret > 0) + prealloc_flags |= FIEMAP_EXTENT_SHARED; + } + ret = emit_fiemap_extent(fieinfo, cache, prealloc_start, + disk_bytenr + extent_offset, + prealloc_len, prealloc_flags); + if (ret) + return ret; } - lock_extent_bits(&inode->io_tree, start, start + len - 1, - &cached_state); + return 0; +} - em = get_extent_skip_holes(inode, start, last_for_get_extent); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); +static int fiemap_find_last_extent_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + u64 *last_extent_end_ret) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 disk_bytenr; + int ret; + + /* + * Lookup the last file extent. We're not using i_size here because + * there might be preallocation past i_size. + */ + ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0); + /* There can't be a file extent item at offset (u64)-1 */ + ASSERT(ret != 0); + if (ret < 0) + return ret; + + /* + * For a non-existing key, btrfs_search_slot() always leaves us at a + * slot > 0, except if the btree is empty, which is impossible because + * at least it has the inode item for this inode and all the items for + * the root inode 256. + */ + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* No file extent items in the subvolume tree. */ + *last_extent_end_ret = 0; + return 0; + } + + /* + * For an inline extent, the disk_bytenr is where inline data starts at, + * so first check if we have an inline extent item before checking if we + * have an implicit hole (disk_bytenr == 0). + */ + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; + } + + /* + * Find the last file extent item that is not a hole (when NO_HOLES is + * not enabled). This should take at most 2 iterations in the worst + * case: we have one hole file extent item at slot 0 of a leaf and + * another hole file extent item as the last item in the previous leaf. + * This is because we merge file extent items that represent holes. + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + while (disk_bytenr == 0) { + ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); + if (ret < 0) { + return ret; + } else if (ret > 0) { + /* No file extent items that are not holes. */ + *last_extent_end_ret = 0; + return 0; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + } + + *last_extent_end_ret = btrfs_file_extent_end(path); + return 0; +} + +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + const u64 ino = btrfs_ino(inode); + struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct fiemap_cache cache = { 0 }; + struct btrfs_backref_shared_cache *backref_cache; + struct ulist *roots; + struct ulist *tmp_ulist; + u64 last_extent_end; + u64 prev_extent_end; + u64 lockstart; + u64 lockend; + bool stopped = false; + int ret; + + backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL); + path = btrfs_alloc_path(); + roots = ulist_alloc(GFP_KERNEL); + tmp_ulist = ulist_alloc(GFP_KERNEL); + if (!backref_cache || !path || !roots || !tmp_ulist) { + ret = -ENOMEM; goto out; } - while (!end) { - u64 offset_in_extent = 0; + lockstart = round_down(start, root->fs_info->sectorsize); + lockend = round_up(start + len, root->fs_info->sectorsize); + prev_extent_end = lockstart; - /* break if the extent we found is outside the range */ - if (em->start >= max || extent_map_end(em) < off) - break; + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); - /* - * get_extent may return an extent that starts before our - * requested range. We have to make sure the ranges - * we return to fiemap always move forward and don't - * overlap, so adjust the offsets here - */ - em_start = max(em->start, off); + ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); + if (ret < 0) + goto out_unlock; + btrfs_release_path(path); + path->reada = READA_FORWARD; + ret = fiemap_search_slot(inode, path, lockstart); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { /* - * record the offset from the start of the extent - * for adjusting the disk offset below. Only do this if the - * extent isn't compressed since our in ram offset may be past - * what we have actually allocated on disk. + * No file extent item found, but we may have delalloc between + * the current offset and i_size. So check for that. */ - if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - offset_in_extent = em_start - em->start; - em_end = extent_map_end(em); - em_len = em_end - em_start; - flags = 0; - if (em->block_start < EXTENT_MAP_LAST_BYTE) - disko = em->block_start + offset_in_extent; - else - disko = 0; + ret = 0; + goto check_eof_delalloc; + } + + while (prev_extent_end < lockend) { + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; + struct btrfs_key key; + u64 extent_end; + u64 extent_len; + u64 extent_offset = 0; + u64 extent_gen; + u64 disk_bytenr = 0; + u64 flags = 0; + int extent_type; + u8 compression; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + extent_end = btrfs_file_extent_end(path); /* - * bump off for our next call to get_extent + * The first iteration can leave us at an extent item that ends + * before our range's start. Move to the next item. */ - off = extent_map_end(em); - if (off >= max) - end = 1; - - if (em->block_start == EXTENT_MAP_LAST_BYTE) { - end = 1; - flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_INLINE) { - flags |= (FIEMAP_EXTENT_DATA_INLINE | - FIEMAP_EXTENT_NOT_ALIGNED); - } else if (em->block_start == EXTENT_MAP_DELALLOC) { - flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - } else if (fieinfo->fi_extents_max) { - u64 bytenr = em->block_start - - (em->start - em->orig_start); + if (extent_end <= lockstart) + goto next_item; - /* - * As btrfs supports shared space, this information - * can be exported to userspace tools via - * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 - * then we're just getting a count and we can skip the - * lookup stuff. - */ - ret = btrfs_check_shared(root, btrfs_ino(inode), - bytenr, roots, tmp_ulist); - if (ret < 0) - goto out_free; - if (ret) - flags |= FIEMAP_EXTENT_SHARED; - ret = 0; + /* We have in implicit hole (NO_HOLES feature enabled). */ + if (prev_extent_end < key.offset) { + const u64 range_end = min(key.offset, lockend) - 1; + + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, 0, 0, 0, + roots, tmp_ulist, + prev_extent_end, range_end); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; + } + + /* We've reached the end of the fiemap range, stop. */ + if (key.offset >= lockend) { + stopped = true; + break; + } } - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + + extent_len = extent_end - key.offset; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + compression = btrfs_file_extent_compression(leaf, ei); + extent_type = btrfs_file_extent_type(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (compression == BTRFS_COMPRESS_NONE) + extent_offset = btrfs_file_extent_offset(leaf, ei); + } + + if (compression != BTRFS_COMPRESS_NONE) flags |= FIEMAP_EXTENT_ENCODED; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - free_extent_map(em); - em = NULL; - if ((em_start >= last) || em_len == (u64)-1 || - (last == (u64)-1 && isize <= em_end)) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + flags |= FIEMAP_EXTENT_DATA_INLINE; + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0, + extent_len, flags); + } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, + disk_bytenr, extent_offset, + extent_gen, roots, tmp_ulist, + key.offset, extent_end - 1); + } else if (disk_bytenr == 0) { + /* We have an explicit hole. */ + ret = fiemap_process_hole(inode, fieinfo, &cache, + backref_cache, 0, 0, 0, + roots, tmp_ulist, + key.offset, extent_end - 1); + } else { + /* We have a regular extent. */ + if (fieinfo->fi_extents_max) { + ret = btrfs_is_data_extent_shared(root, ino, + disk_bytenr, + extent_gen, + roots, + tmp_ulist, + backref_cache); + if (ret < 0) + goto out_unlock; + else if (ret > 0) + flags |= FIEMAP_EXTENT_SHARED; + } + + ret = emit_fiemap_extent(fieinfo, &cache, key.offset, + disk_bytenr + extent_offset, + extent_len, flags); } - /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(inode, off, last_for_get_extent); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* fiemap_fill_next_extent() told us to stop. */ + stopped = true; + break; } - if (!em) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; + + prev_extent_end = extent_end; +next_item: + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out_unlock; } - ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, - em_len, flags); - if (ret) { - if (ret == 1) - ret = 0; - goto out_free; + + ret = fiemap_next_leaf_item(inode, path); + if (ret < 0) { + goto out_unlock; + } else if (ret > 0) { + /* No more file extent items for this inode. */ + break; } + cond_resched(); } -out_free: - if (!ret) - ret = emit_last_fiemap_cache(fieinfo, &cache); - free_extent_map(em); -out: - unlock_extent_cached(&inode->io_tree, start, start + len - 1, - &cached_state); -out_free_ulist: +check_eof_delalloc: + /* + * Release (and free) the path before emitting any final entries to + * fiemap_fill_next_extent() to keep lockdep happy. This is because + * once we find no more file extent items exist, we may have a + * non-cloned leaf, and fiemap_fill_next_extent() can trigger page + * faults when copying data to the user space buffer. + */ + btrfs_free_path(path); + path = NULL; + + if (!stopped && prev_extent_end < lockend) { + ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache, + 0, 0, 0, roots, tmp_ulist, + prev_extent_end, lockend - 1); + if (ret < 0) + goto out_unlock; + prev_extent_end = lockend; + } + + if (cache.cached && cache.offset + cache.len >= last_extent_end) { + const u64 i_size = i_size_read(&inode->vfs_inode); + + if (prev_extent_end < i_size) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, + prev_extent_end, + i_size - 1, + &delalloc_start, + &delalloc_end); + if (!delalloc) + cache.flags |= FIEMAP_EXTENT_LAST; + } else { + cache.flags |= FIEMAP_EXTENT_LAST; + } + } + + ret = emit_last_fiemap_cache(fieinfo, &cache); + +out_unlock: + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); +out: + kfree(backref_cache); btrfs_free_path(path); ulist_free(roots); ulist_free(tmp_ulist); @@ -5856,7 +4252,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { btrfs_release_extent_buffer_pages(eb); - btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); + btrfs_leak_debug_del_eb(eb); __free_extent_buffer(eb); } @@ -5873,8 +4269,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->bflags = 0; init_rwsem(&eb->lock); - btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, - &fs_info->allocated_ebs); + btrfs_leak_debug_add_eb(eb); INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); @@ -6342,7 +4737,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } - btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); + btrfs_leak_debug_del_eb(eb); /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -6362,18 +4757,16 @@ static int release_extent_buffer(struct extent_buffer *eb) void free_extent_buffer(struct extent_buffer *eb) { int refs; - int old; if (!eb) return; + refs = atomic_read(&eb->refs); while (1) { - refs = atomic_read(&eb->refs); if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs == 1)) break; - old = atomic_cmpxchg(&eb->refs, refs, refs - 1); - if (old == refs) + if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1)) return; } @@ -6569,7 +4962,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) return -EAGAIN; } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); if (ret < 0) return ret; } @@ -6579,7 +4972,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); return ret; } @@ -6587,13 +4980,14 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, eb->read_mirror = 0; atomic_set(&eb->io_pages, 1); check_buffer_tree_ref(eb); + bio_ctrl.end_io_func = end_bio_extent_readpage; + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, - page, eb->start, eb->len, - eb->start - page_offset(page), - end_bio_extent_readpage, 0, true); + eb->start, page, eb->len, + eb->start - page_offset(page), 0, true); if (ret) { /* * In the endio function, if we hit something wrong we will @@ -6684,6 +5078,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) * set io_pages. See check_buffer_tree_ref for a more detailed comment. */ check_buffer_tree_ref(eb); + bio_ctrl.end_io_func = end_bio_extent_readpage; for (i = 0; i < num_pages; i++) { page = eb->pages[i]; @@ -6696,9 +5091,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) ClearPageError(page); err = submit_extent_page(REQ_OP_READ, NULL, - &bio_ctrl, page, page_offset(page), - PAGE_SIZE, 0, end_bio_extent_readpage, - 0, false); + &bio_ctrl, page_offset(page), page, + PAGE_SIZE, 0, 0, false); if (err) { /* * We failed to submit the bio so it's the diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4bc72a87b9a9..7929f054dda3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -60,11 +60,13 @@ enum { struct btrfs_bio; struct btrfs_root; struct btrfs_inode; -struct btrfs_io_bio; struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; +int __init extent_buffer_init_cachep(void); +void __cold extent_buffer_free_cachep(void); + typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); @@ -240,10 +242,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset); int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); -struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); @@ -257,8 +259,12 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); * bio end_io callback is called to indicate things have failed. */ struct io_failure_record { + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; struct page *page; - u64 start; u64 len; u64 logical; int this_mirror; @@ -269,6 +275,9 @@ struct io_failure_record { int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook); +void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); +int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, + struct page *page, unsigned int pg_offset); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6fee14ce2e6b..6092a4eedc92 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -7,6 +7,7 @@ #include "volumes.h" #include "extent_map.h" #include "compression.h" +#include "btrfs_inode.h" static struct kmem_cache *extent_map_cache; @@ -54,9 +55,7 @@ struct extent_map *alloc_extent_map(void) if (!em) return NULL; RB_CLEAR_NODE(&em->rb_node); - em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; - em->generation = 0; refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; @@ -73,7 +72,6 @@ void free_extent_map(struct extent_map *em) { if (!em) return; - WARN_ON(refcount_read(&em->refs) == 0); if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); @@ -143,8 +141,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em) * it can't be found, try to find some neighboring extents */ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) + struct rb_node **prev_or_next_ret) { struct rb_node *n = root->rb_node; struct rb_node *prev = NULL; @@ -152,6 +149,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct extent_map *entry; struct extent_map *prev_entry = NULL; + ASSERT(prev_or_next_ret); + while (n) { entry = rb_entry(n, struct extent_map, rb_node); prev = n; @@ -165,24 +164,29 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return n; } - if (prev_ret) { - orig_prev = prev; - while (prev && offset >= extent_map_end(prev_entry)) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct extent_map, rb_node); - } - *prev_ret = prev; - prev = orig_prev; + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + + /* + * Previous extent map found, return as in this case the caller does not + * care about the next one. + */ + if (prev) { + *prev_or_next_ret = prev; + return NULL; } - if (next_ret) { + prev = orig_prev; + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); prev_entry = rb_entry(prev, struct extent_map, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct extent_map, rb_node); - } - *next_ret = prev; } + *prev_or_next_ret = prev; + return NULL; } @@ -336,6 +340,8 @@ out: void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { + lockdep_assert_held_write(&tree->lock); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); if (extent_map_in_tree(em)) try_merge_map(tree, em); @@ -382,7 +388,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) __clear_extent_bit(&device->alloc_state, stripe->physical, stripe->physical + stripe_size - 1, bits, - 0, 0, NULL, GFP_NOWAIT, NULL); + NULL, GFP_NOWAIT, NULL); } } @@ -425,16 +431,13 @@ __lookup_extent_mapping(struct extent_map_tree *tree, { struct extent_map *em; struct rb_node *rb_node; - struct rb_node *prev = NULL; - struct rb_node *next = NULL; + struct rb_node *prev_or_next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next); + rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next); if (!rb_node) { - if (prev) - rb_node = prev; - else if (next) - rb_node = next; + if (prev_or_next) + rb_node = prev_or_next; else return NULL; } @@ -658,3 +661,293 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, ASSERT(ret == 0 || ret == -EEXIST); return ret; } + +/* + * Drop all extent maps from a tree in the fastest possible way, rescheduling + * if needed. This avoids searching the tree, from the root down to the first + * extent map, before each deletion. + */ +static void drop_all_extent_maps_fast(struct extent_map_tree *tree) +{ + write_lock(&tree->lock); + while (!RB_EMPTY_ROOT(&tree->map.rb_root)) { + struct extent_map *em; + struct rb_node *node; + + node = rb_first_cached(&tree->map); + em = rb_entry(node, struct extent_map, rb_node); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + remove_extent_mapping(tree, em); + free_extent_map(em); + cond_resched_rwlock_write(&tree->lock); + } + write_unlock(&tree->lock); +} + +/* + * Drop all extent maps in a given range. + * + * @inode: The target inode. + * @start: Start offset of the range. + * @end: End offset of the range (inclusive value). + * @skip_pinned: Indicate if pinned extent maps should be ignored or not. + * + * This drops all the extent maps that intersect the given range [@start, @end]. + * Extent maps that partially overlap the range and extend behind or beyond it, + * are split. + * The caller should have locked an appropriate file range in the inode's io + * tree before calling this function. + */ +void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, + bool skip_pinned) +{ + struct extent_map *split; + struct extent_map *split2; + struct extent_map *em; + struct extent_map_tree *em_tree = &inode->extent_tree; + u64 len = end - start + 1; + + WARN_ON(end < start); + if (end == (u64)-1) { + if (start == 0 && !skip_pinned) { + drop_all_extent_maps_fast(em_tree); + return; + } + len = (u64)-1; + } else { + /* Make end offset exclusive for use in the loop below. */ + end++; + } + + /* + * It's ok if we fail to allocate the extent maps, see the comment near + * the bottom of the loop below. We only need two spare extent maps in + * the worst case, where the first extent map that intersects our range + * starts before the range and the last extent map that intersects our + * range ends after our range (and they might be the same extent map), + * because we need to split those two extent maps at the boundaries. + */ + split = alloc_extent_map(); + split2 = alloc_extent_map(); + + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + + while (em) { + /* extent_map_end() returns exclusive value (last byte + 1). */ + const u64 em_end = extent_map_end(em); + struct extent_map *next_em = NULL; + u64 gen; + unsigned long flags; + bool modified; + bool compressed; + + if (em_end < end) { + next_em = next_extent_map(em); + if (next_em) { + if (next_em->start < end) + refcount_inc(&next_em->refs); + else + next_em = NULL; + } + } + + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + start = em_end; + if (end != (u64)-1) + len = start + len - em_end; + goto next; + } + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &flags); + modified = !list_empty(&em->list); + + /* + * The extent map does not cross our target range, so no need to + * split it, we can remove it directly. + */ + if (em->start >= start && em_end <= end) + goto remove_em; + + flags = em->flags; + gen = em->generation; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + if (em->start < start) { + if (!split) { + split = split2; + split2 = NULL; + if (!split) + goto remove_em; + } + split->start = em->start; + split->len = start - em->start; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->orig_block_len = max(split->block_len, + em->orig_block_len); + split->ram_bytes = em->ram_bytes; + } else { + split->orig_start = split->start; + split->block_len = 0; + split->block_start = em->block_start; + split->orig_block_len = 0; + split->ram_bytes = split->len; + } + + split->generation = gen; + split->flags = flags; + split->compress_type = em->compress_type; + replace_extent_mapping(em_tree, em, split, modified); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em_end > end) { + if (!split) { + split = split2; + split2 = NULL; + if (!split) + goto remove_em; + } + split->start = start + len; + split->len = em_end - (start + len); + split->block_start = em->block_start; + split->flags = flags; + split->compress_type = em->compress_type; + split->generation = gen; + + if (em->block_start < EXTENT_MAP_LAST_BYTE) { + split->orig_block_len = max(em->block_len, + em->orig_block_len); + + split->ram_bytes = em->ram_bytes; + if (compressed) { + split->block_len = em->block_len; + split->orig_start = em->orig_start; + } else { + const u64 diff = start + len - em->start; + + split->block_len = split->len; + split->block_start += diff; + split->orig_start = em->orig_start; + } + } else { + split->ram_bytes = split->len; + split->orig_start = split->start; + split->block_len = 0; + split->orig_block_len = 0; + } + + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + int ret; + + ret = add_extent_mapping(em_tree, split, + modified); + /* Logic error, shouldn't happen. */ + ASSERT(ret == 0); + if (WARN_ON(ret != 0) && modified) + btrfs_set_inode_full_sync(inode); + } + free_extent_map(split); + split = NULL; + } +remove_em: + if (extent_map_in_tree(em)) { + /* + * If the extent map is still in the tree it means that + * either of the following is true: + * + * 1) It fits entirely in our range (doesn't end beyond + * it or starts before it); + * + * 2) It starts before our range and/or ends after our + * range, and we were not able to allocate the extent + * maps for split operations, @split and @split2. + * + * If we are at case 2) then we just remove the entire + * extent map - this is fine since if anyone needs it to + * access the subranges outside our range, will just + * load it again from the subvolume tree's file extent + * item. However if the extent map was in the list of + * modified extents, then we must mark the inode for a + * full fsync, otherwise a fast fsync will miss this + * extent if it's new and needs to be logged. + */ + if ((em->start < start || em_end > end) && modified) { + ASSERT(!split); + btrfs_set_inode_full_sync(inode); + } + remove_extent_mapping(em_tree, em); + } + + /* + * Once for the tree reference (we replaced or removed the + * extent map from the tree). + */ + free_extent_map(em); +next: + /* Once for us (for our lookup reference). */ + free_extent_map(em); + + em = next_em; + } + + write_unlock(&em_tree->lock); + + free_extent_map(split); + free_extent_map(split2); +} + +/* + * Replace a range in the inode's extent map tree with a new extent map. + * + * @inode: The target inode. + * @new_em: The new extent map to add to the inode's extent map tree. + * @modified: Indicate if the new extent map should be added to the list of + * modified extents (for fast fsync tracking). + * + * Drops all the extent maps in the inode's extent map tree that intersect the + * range of the new extent map and adds the new extent map to the tree. + * The caller should have locked an appropriate file range in the inode's io + * tree before calling this function. + */ +int btrfs_replace_extent_map_range(struct btrfs_inode *inode, + struct extent_map *new_em, + bool modified) +{ + const u64 end = new_em->start + new_em->len - 1; + struct extent_map_tree *tree = &inode->extent_tree; + int ret; + + ASSERT(!extent_map_in_tree(new_em)); + + /* + * The caller has locked an appropriate file range in the inode's io + * tree, but getting -EEXIST when adding the new extent map can still + * happen in case there are extents that partially cover the range, and + * this is due to two tasks operating on different parts of the extent. + * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from + * btrfs_get_extent") for an example and details. + */ + do { + btrfs_drop_extent_map_range(inode, new_em->start, end, false); + write_lock(&tree->lock); + ret = add_extent_mapping(tree, new_em, modified); + write_unlock(&tree->lock); + } while (ret == -EEXIST); + + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index d2fa32ffe304..ad311864272a 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -63,6 +63,8 @@ struct extent_map_tree { rwlock_t lock; }; +struct btrfs_inode; + static inline int extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); @@ -104,5 +106,11 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree, struct extent_map **em_in, u64 start, u64 len); +void btrfs_drop_extent_map_range(struct btrfs_inode *inode, + u64 start, u64 end, + bool skip_pinned); +int btrfs_replace_extent_map_range(struct btrfs_inode *inode, + struct extent_map *new_em, + bool modified); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c828f971a346..6bb9fa961a6a 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -118,7 +118,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; return clear_extent_bit(&inode->file_extent_tree, start, - start + len - 1, EXTENT_DIRTY, 0, 0, NULL); + start + len - 1, EXTENT_DIRTY, NULL); } static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, @@ -129,12 +129,20 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, return ncsums * fs_info->sectorsize; } -int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, +/* + * Calculate the total size needed to allocate for an ordered sum structure + * spanning @bytes in the file. + */ +static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) +{ + int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); + + return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; +} + +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 objectid, u64 pos, - u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes, u64 offset, u64 ram_bytes, - u8 compression, u8 encryption, u16 other_encoding) + u64 objectid, u64 pos, u64 num_bytes) { int ret = 0; struct btrfs_file_extent_item *item; @@ -157,16 +165,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); - btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); - btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_disk_bytenr(leaf, item, 0); + btrfs_set_file_extent_disk_num_bytes(leaf, item, 0); + btrfs_set_file_extent_offset(leaf, item, 0); btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); - btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes); btrfs_set_file_extent_generation(leaf, item, trans->transid); btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_compression(leaf, item, compression); - btrfs_set_file_extent_encryption(leaf, item, encryption); - btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + btrfs_set_file_extent_compression(leaf, item, 0); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); btrfs_mark_buffer_dirty(leaf); out: @@ -503,7 +511,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit) + struct list_head *list, int search_commit, + bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -525,6 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (!path) return -ENOMEM; + path->nowait = nowait; if (search_commit) { path->skip_locking = 1; path->reada = READA_FORWARD; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5a3f6e0d9688..d01631d47806 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -473,7 +473,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, */ clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached); + cached); err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); @@ -499,159 +499,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, } /* - * this drops all the extents in the cache that intersect the range - * [start, end]. Existing extents are split as required. - */ -void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, - int skip_pinned) -{ - struct extent_map *em; - struct extent_map *split = NULL; - struct extent_map *split2 = NULL; - struct extent_map_tree *em_tree = &inode->extent_tree; - u64 len = end - start + 1; - u64 gen; - int ret; - int testend = 1; - unsigned long flags; - int compressed = 0; - bool modified; - - WARN_ON(end < start); - if (end == (u64)-1) { - len = (u64)-1; - testend = 0; - } - while (1) { - int no_splits = 0; - - modified = false; - if (!split) - split = alloc_extent_map(); - if (!split2) - split2 = alloc_extent_map(); - if (!split || !split2) - no_splits = 1; - - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) { - write_unlock(&em_tree->lock); - break; - } - flags = em->flags; - gen = em->generation; - if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - if (testend && em->start + em->len >= start + len) { - free_extent_map(em); - write_unlock(&em_tree->lock); - break; - } - start = em->start + em->len; - if (testend) - len = start + len - (em->start + em->len); - free_extent_map(em); - write_unlock(&em_tree->lock); - continue; - } - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &flags); - modified = !list_empty(&em->list); - if (no_splits) - goto next; - - if (em->start < start) { - split->start = em->start; - split->len = start - em->start; - - if (em->block_start < EXTENT_MAP_LAST_BYTE) { - split->orig_start = em->orig_start; - split->block_start = em->block_start; - - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - split->orig_block_len = max(split->block_len, - em->orig_block_len); - split->ram_bytes = em->ram_bytes; - } else { - split->orig_start = split->start; - split->block_len = 0; - split->block_start = em->block_start; - split->orig_block_len = 0; - split->ram_bytes = split->len; - } - - split->generation = gen; - split->flags = flags; - split->compress_type = em->compress_type; - replace_extent_mapping(em_tree, em, split, modified); - free_extent_map(split); - split = split2; - split2 = NULL; - } - if (testend && em->start + em->len > start + len) { - u64 diff = start + len - em->start; - - split->start = start + len; - split->len = em->start + em->len - (start + len); - split->flags = flags; - split->compress_type = em->compress_type; - split->generation = gen; - - if (em->block_start < EXTENT_MAP_LAST_BYTE) { - split->orig_block_len = max(em->block_len, - em->orig_block_len); - - split->ram_bytes = em->ram_bytes; - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; - } else { - split->block_len = split->len; - split->block_start = em->block_start - + diff; - split->orig_start = em->orig_start; - } - } else { - split->ram_bytes = split->len; - split->orig_start = split->start; - split->block_len = 0; - split->block_start = em->block_start; - split->orig_block_len = 0; - } - - if (extent_map_in_tree(em)) { - replace_extent_mapping(em_tree, em, split, - modified); - } else { - ret = add_extent_mapping(em_tree, split, - modified); - ASSERT(ret == 0); /* Logic error */ - } - free_extent_map(split); - split = NULL; - } -next: - if (extent_map_in_tree(em)) - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* once for us */ - free_extent_map(em); - /* once for the tree*/ - free_extent_map(em); - } - if (split) - free_extent_map(split); - if (split2) - free_extent_map(split2); -} - -/* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number * that would be a good hint to the block allocator for this file. @@ -708,7 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, } if (args->drop_cache) - btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0); + btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; @@ -1339,26 +1186,54 @@ static int prepare_uptodate_page(struct inode *inode, return 0; } +static unsigned int get_prepare_fgp_flags(bool nowait) +{ + unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + + if (nowait) + fgp_flags |= FGP_NOWAIT; + + return fgp_flags; +} + +static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) +{ + gfp_t gfp; + + gfp = btrfs_alloc_write_mask(inode->i_mapping); + if (nowait) { + gfp &= ~__GFP_DIRECT_RECLAIM; + gfp |= GFP_NOWAIT; + } + + return gfp; +} + /* * this just gets pages into the page cache and locks them down. */ static noinline int prepare_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, - size_t write_bytes, bool force_uptodate) + size_t write_bytes, bool force_uptodate, + bool nowait) { int i; unsigned long index = pos >> PAGE_SHIFT; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + gfp_t mask = get_prepare_gfp_flags(inode, nowait); + unsigned int fgp_flags = get_prepare_fgp_flags(nowait); int err = 0; int faili; for (i = 0; i < num_pages; i++) { again: - pages[i] = find_or_create_page(inode->i_mapping, index + i, - mask | __GFP_WRITE); + pages[i] = pagecache_get_page(inode->i_mapping, index + i, + fgp_flags, mask | __GFP_WRITE); if (!pages[i]) { faili = i - 1; - err = -ENOMEM; + if (nowait) + err = -EAGAIN; + else + err = -ENOMEM; goto fail; } @@ -1376,7 +1251,7 @@ again: pos + write_bytes, false); if (err) { put_page(pages[i]); - if (err == -EAGAIN) { + if (!nowait && err == -EAGAIN) { err = 0; goto again; } @@ -1411,7 +1286,7 @@ static noinline int lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, - u64 *lockstart, u64 *lockend, + u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1426,15 +1301,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(&inode->io_tree, start_pos, last_pos, - cached_state); + if (nowait) { + if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) { + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + pages[i] = NULL; + } + + return -EAGAIN; + } + } else { + lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); + } + ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { - unlock_extent_cached(&inode->io_tree, start_pos, - last_pos, cached_state); + unlock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); @@ -1481,7 +1368,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. */ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) + size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1500,17 +1387,22 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, fs_info->sectorsize) - 1; num_bytes = lockend - lockstart + 1; - btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); + if (nowait) { + if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) { + btrfs_drew_write_unlock(&root->snapshot_lock); + return -EAGAIN; + } + } else { + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); + } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, NULL, NULL, false); - if (ret <= 0) { - ret = 0; + NULL, NULL, NULL, nowait, false); + if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); - } else { + else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - } - unlock_extent(&inode->io_tree, lockstart, lockend); + unlock_extent(&inode->io_tree, lockstart, lockend, NULL); return ret; } @@ -1607,8 +1499,10 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, bool force_page_uptodate = false; loff_t old_isize = i_size_read(inode); unsigned int ilock_flags = 0; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); + unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); - if (iocb->ki_flags & IOCB_NOWAIT) + if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; ret = btrfs_inode_lock(inode, ilock_flags); @@ -1664,18 +1558,29 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, extent_changeset_release(data_reserved); ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, pos, - write_bytes); + write_bytes, nowait); if (ret < 0) { + int can_nocow; + + if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) { + ret = -EAGAIN; + break; + } + /* * If we don't have to COW at the offset, reserve * metadata only. write_bytes may get smaller than * requested here. */ - if (btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes) > 0) - only_release_metadata = true; - else + can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos, + &write_bytes, nowait); + if (can_nocow < 0) + ret = can_nocow; + if (can_nocow > 0) + ret = 0; + if (ret) break; + only_release_metadata = true; } num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); @@ -1685,7 +1590,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes, - reserve_bytes, false); + reserve_bytes, nowait); if (ret) { if (!only_release_metadata) btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -1693,19 +1598,27 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, write_bytes); else btrfs_check_nocow_unlock(BTRFS_I(inode)); + + if (nowait && ret == -ENOSPC) + ret = -EAGAIN; break; } release_bytes = reserve_bytes; again: + ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); + if (ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); + break; + } + /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the * contents of pages from loop to loop */ ret = prepare_pages(inode, pages, num_pages, - pos, write_bytes, - force_page_uptodate); + pos, write_bytes, force_page_uptodate, false); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); @@ -1715,10 +1628,11 @@ again: extents_locked = lock_and_cleanup_extent_if_need( BTRFS_I(inode), pages, num_pages, pos, write_bytes, &lockstart, - &lockend, &cached_state); + &lockend, nowait, &cached_state); if (extents_locked < 0) { - if (extents_locked == -EAGAIN) + if (!nowait && extents_locked == -EAGAIN) goto again; + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); ret = extents_locked; @@ -1782,8 +1696,8 @@ again: * possible cached extent state to avoid a memory leak. */ if (extents_locked) - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); else free_extent_state(cached_state); @@ -1801,8 +1715,6 @@ again: cond_resched(); - balance_dirty_pages_ratelimited(inode->i_mapping); - pos += copied; num_written += copied; } @@ -1858,6 +1770,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; ssize_t err; unsigned int ilock_flags = 0; + struct iomap_dio *dio; if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1918,11 +1831,22 @@ relock: * So here we disable page faults in the iov_iter and then retry if we * got -EFAULT, faulting in the pages before the retry. */ -again: from->nofault = true; - err = btrfs_dio_rw(iocb, from, written); + dio = btrfs_dio_write(iocb, from, written); from->nofault = false; + /* + * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync + * iocb, and that needs to lock the inode. So unlock it before calling + * iomap_dio_complete() to avoid a deadlock. + */ + btrfs_inode_unlock(inode, ilock_flags); + + if (IS_ERR_OR_NULL(dio)) + err = PTR_ERR_OR_ZERO(dio); + else + err = iomap_dio_complete(dio); + /* No increment (+=) because iomap returns a cumulative value. */ if (err > 0) written = err; @@ -1948,12 +1872,10 @@ again: } else { fault_in_iov_iter_readable(from, left); prev_left = left; - goto again; + goto relock; } } - btrfs_inode_unlock(inode, ilock_flags); - /* * If 'err' is -ENOTBLK or we have not written all data, then it means * we must fallback to buffered IO. @@ -2045,7 +1967,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; if (sync) @@ -2201,14 +2123,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * Always check for the full sync flag while holding the inode's lock, - * to avoid races with other tasks. The flag must be either set all the - * time during logging or always off all the time while logging. - */ - full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); - - /* * Before we acquired the inode's lock and the mmap lock, someone may * have dirtied more pages in the target range. We need to make sure * that writeback for any such pages does not start while we are logging @@ -2233,6 +2147,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* + * Always check for the full sync flag while holding the inode's lock, + * to avoid races with other tasks. The flag must be either set all the + * time during logging or always off all the time while logging. + * We check the flag here after starting delalloc above, because when + * running delalloc the full sync flag may be set if we need to drop + * extra extent map ranges due to temporary memory allocation failures. + */ + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + + /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * @@ -2380,6 +2305,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_commit_transaction(trans); out: ASSERT(list_empty(&ctx.list)); + ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); if (!ret) ret = err; @@ -2448,7 +2374,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct extent_map *hole_em; - struct extent_map_tree *em_tree = &inode->extent_tree; struct btrfs_key key; int ret; @@ -2505,8 +2430,8 @@ static int fill_holes(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), - offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, + end - offset); if (ret) return ret; @@ -2515,7 +2440,7 @@ out: hole_em = alloc_extent_map(); if (!hole_em) { - btrfs_drop_extent_cache(inode, offset, end - 1, 0); + btrfs_drop_extent_map_range(inode, offset, end - 1, false); btrfs_set_inode_full_sync(inode); } else { hole_em->start = offset; @@ -2529,12 +2454,7 @@ out: hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; - do { - btrfs_drop_extent_cache(inode, offset, end - 1, 0); - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, hole_em, 1); - write_unlock(&em_tree->lock); - } while (ret == -EEXIST); + ret = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); if (ret) btrfs_set_inode_full_sync(inode); @@ -2591,8 +2511,8 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, while (1) { truncate_pagecache_range(inode, lockstart, lockend); - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2607,8 +2527,8 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, page_lockend)) break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); } btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); @@ -3008,9 +2928,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) if (ret) goto out_only_mutex; - lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); - lockend = round_down(offset + len, - btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; + lockstart = round_up(offset, fs_info->sectorsize); + lockend = round_down(offset + len, fs_info->sectorsize) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* @@ -3108,8 +3027,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret) { /* @@ -3212,7 +3131,7 @@ enum { static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = inode->root->fs_info->sectorsize; struct extent_map *em; int ret; @@ -3242,7 +3161,7 @@ static int btrfs_zero_range(struct inode *inode, struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; - const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); + const u64 sectorsize = fs_info->sectorsize; u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -3382,16 +3301,16 @@ reserve_space: ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); goto out; } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, i_blocksize(inode), offset + len, &alloc_hint); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); /* btrfs_prealloc_file_range releases reserved space on error */ if (ret) { space_reserved = false; @@ -3428,7 +3347,7 @@ static long btrfs_fallocate(struct file *file, int mode, u64 data_space_reserved = 0; u64 qgroup_reserved = 0; struct extent_map *em; - int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); + int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; int ret; /* Do not allow fallocate in ZONED mode */ @@ -3502,8 +3421,8 @@ static long btrfs_fallocate(struct file *file, int mode, } locked_end = alloc_end - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); @@ -3592,31 +3511,290 @@ static long btrfs_fallocate(struct file *file, int mode, */ ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); out: btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); return ret; } +/* + * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range + * that has unflushed and/or flushing delalloc. There might be other adjacent + * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps + * looping while it gets adjacent subranges, and merging them together. + */ +static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, + u64 *delalloc_start_ret, u64 *delalloc_end_ret) +{ + const u64 len = end + 1 - start; + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + u64 em_end; + u64 delalloc_len; + + /* + * Search the io tree first for EXTENT_DELALLOC. If we find any, it + * means we have delalloc (dirty pages) for which writeback has not + * started yet. + */ + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1); + /* + * If delalloc was found then *delalloc_start_ret has a sector size + * aligned value (rounded down). + */ + if (delalloc_len > 0) + *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; + + /* + * Now also check if there's any extent map in the range that does not + * map to a hole or prealloc extent. We do this because: + * + * 1) When delalloc is flushed, the file range is locked, we clear the + * EXTENT_DELALLOC bit from the io tree and create an extent map for + * an allocated extent. So we might just have been called after + * delalloc is flushed and before the ordered extent completes and + * inserts the new file extent item in the subvolume's btree; + * + * 2) We may have an extent map created by flushing delalloc for a + * subrange that starts before the subrange we found marked with + * EXTENT_DELALLOC in the io tree. + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + read_unlock(&em_tree->lock); + + /* extent_map_end() returns a non-inclusive end offset. */ + em_end = em ? extent_map_end(em) : 0; + + /* + * If we have a hole/prealloc extent map, check the next one if this one + * ends before our range's end. + */ + if (em && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { + struct extent_map *next_em; + + read_lock(&em_tree->lock); + next_em = lookup_extent_mapping(em_tree, em_end, len - em_end); + read_unlock(&em_tree->lock); + + free_extent_map(em); + em_end = next_em ? extent_map_end(next_em) : 0; + em = next_em; + } + + if (em && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + em = NULL; + } + + /* + * No extent map or one for a hole or prealloc extent. Use the delalloc + * range we found in the io tree if we have one. + */ + if (!em) + return (delalloc_len > 0); + + /* + * We don't have any range as EXTENT_DELALLOC in the io tree, so the + * extent map is the only subrange representing delalloc. + */ + if (delalloc_len == 0) { + *delalloc_start_ret = em->start; + *delalloc_end_ret = min(end, em_end - 1); + free_extent_map(em); + return true; + } + + /* + * The extent map represents a delalloc range that starts before the + * delalloc range we found in the io tree. + */ + if (em->start < *delalloc_start_ret) { + *delalloc_start_ret = em->start; + /* + * If the ranges are adjacent, return a combined range. + * Otherwise return the extent map's range. + */ + if (em_end < *delalloc_start_ret) + *delalloc_end_ret = min(end, em_end - 1); + + free_extent_map(em); + return true; + } + + /* + * The extent map starts after the delalloc range we found in the io + * tree. If it's adjacent, return a combined range, otherwise return + * the range found in the io tree. + */ + if (*delalloc_end_ret + 1 == em->start) + *delalloc_end_ret = min(end, em_end - 1); + + free_extent_map(em); + return true; +} + +/* + * Check if there's delalloc in a given range. + * + * @inode: The inode. + * @start: The start offset of the range. It does not need to be + * sector size aligned. + * @end: The end offset (inclusive value) of the search range. + * It does not need to be sector size aligned. + * @delalloc_start_ret: Output argument, set to the start offset of the + * subrange found with delalloc (may not be sector size + * aligned). + * @delalloc_end_ret: Output argument, set to he end offset (inclusive value) + * of the subrange found with delalloc. + * + * Returns true if a subrange with delalloc is found within the given range, and + * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and + * end offsets of the subrange. + */ +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + u64 *delalloc_start_ret, u64 *delalloc_end_ret) +{ + u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); + u64 prev_delalloc_end = 0; + bool ret = false; + + while (cur_offset < end) { + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = find_delalloc_subrange(inode, cur_offset, end, + &delalloc_start, + &delalloc_end); + if (!delalloc) + break; + + if (prev_delalloc_end == 0) { + /* First subrange found. */ + *delalloc_start_ret = max(delalloc_start, start); + *delalloc_end_ret = delalloc_end; + ret = true; + } else if (delalloc_start == prev_delalloc_end + 1) { + /* Subrange adjacent to the previous one, merge them. */ + *delalloc_end_ret = delalloc_end; + } else { + /* Subrange not adjacent to the previous one, exit. */ + break; + } + + prev_delalloc_end = delalloc_end; + cur_offset = delalloc_end + 1; + cond_resched(); + } + + return ret; +} + +/* + * Check if there's a hole or delalloc range in a range representing a hole (or + * prealloc extent) found in the inode's subvolume btree. + * + * @inode: The inode. + * @whence: Seek mode (SEEK_DATA or SEEK_HOLE). + * @start: Start offset of the hole region. It does not need to be sector + * size aligned. + * @end: End offset (inclusive value) of the hole region. It does not + * need to be sector size aligned. + * @start_ret: Return parameter, used to set the start of the subrange in the + * hole that matches the search criteria (seek mode), if such + * subrange is found (return value of the function is true). + * The value returned here may not be sector size aligned. + * + * Returns true if a subrange matching the given seek mode is found, and if one + * is found, it updates @start_ret with the start of the subrange. + */ +static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, + u64 start, u64 end, u64 *start_ret) +{ + u64 delalloc_start; + u64 delalloc_end; + bool delalloc; + + delalloc = btrfs_find_delalloc_in_range(inode, start, end, + &delalloc_start, &delalloc_end); + if (delalloc && whence == SEEK_DATA) { + *start_ret = delalloc_start; + return true; + } + + if (delalloc && whence == SEEK_HOLE) { + /* + * We found delalloc but it starts after out start offset. So we + * have a hole between our start offset and the delalloc start. + */ + if (start < delalloc_start) { + *start_ret = start; + return true; + } + /* + * Delalloc range starts at our start offset. + * If the delalloc range's length is smaller than our range, + * then it means we have a hole that starts where the delalloc + * subrange ends. + */ + if (delalloc_end < end) { + *start_ret = delalloc_end + 1; + return true; + } + + /* There's delalloc for the whole range. */ + return false; + } + + if (!delalloc && whence == SEEK_HOLE) { + *start_ret = start; + return true; + } + + /* + * No delalloc in the range and we are seeking for data. The caller has + * to iterate to the next extent item in the subvolume btree. + */ + return false; +} + static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, int whence) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - loff_t i_size = inode->vfs_inode.i_size; + const loff_t i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + struct btrfs_root *root = inode->root; + struct btrfs_path *path; + struct btrfs_key key; + u64 last_extent_end; u64 lockstart; u64 lockend; u64 start; - u64 len; - int ret = 0; + int ret; + bool found = false; if (i_size == 0 || offset >= i_size) return -ENXIO; /* + * Quick path. If the inode has no prealloc extents and its number of + * bytes used matches its i_size, then it can not have holes. + */ + if (whence == SEEK_HOLE && + !(inode->flags & BTRFS_INODE_PREALLOC) && + inode_get_bytes(&inode->vfs_inode) == i_size) + return i_size; + + /* * offset can be negative, in this case we start finding DATA/HOLE from * the very start of the file. */ @@ -3627,45 +3805,164 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, if (lockend <= lockstart) lockend = lockstart + fs_info->sectorsize; lockend--; - len = lockend - lockstart + 1; - lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = READA_FORWARD; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + last_extent_end = lockstart; + + lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } while (start < i_size) { - em = btrfs_get_extent_fiemap(inode, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - em = NULL; - break; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *extent; + u64 extent_end; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + + leaf = path->nodes[0]; } - if (whence == SEEK_HOLE && - (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) - break; - else if (whence == SEEK_DATA && - (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) break; - start = em->start + em->len; - free_extent_map(em); - em = NULL; + extent_end = btrfs_file_extent_end(path); + + /* + * In the first iteration we may have a slot that points to an + * extent that ends before our start offset, so skip it. + */ + if (extent_end <= start) { + path->slots[0]++; + continue; + } + + /* We have an implicit hole, NO_HOLES feature is likely set. */ + if (last_extent_end < key.offset) { + u64 search_start = last_extent_end; + u64 found_start; + + /* + * First iteration, @start matches @offset and it's + * within the hole. + */ + if (start == offset) + search_start = offset; + + found = find_desired_extent_in_hole(inode, whence, + search_start, + key.offset - 1, + &found_start); + if (found) { + start = found_start; + break; + } + /* + * Didn't find data or a hole (due to delalloc) in the + * implicit hole range, so need to analyze the extent. + */ + } + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 || + btrfs_file_extent_type(leaf, extent) == + BTRFS_FILE_EXTENT_PREALLOC) { + /* + * Explicit hole or prealloc extent, search for delalloc. + * A prealloc extent is treated like a hole. + */ + u64 search_start = key.offset; + u64 found_start; + + /* + * First iteration, @start matches @offset and it's + * within the hole. + */ + if (start == offset) + search_start = offset; + + found = find_desired_extent_in_hole(inode, whence, + search_start, + extent_end - 1, + &found_start); + if (found) { + start = found_start; + break; + } + /* + * Didn't find data or a hole (due to delalloc) in the + * implicit hole range, so need to analyze the next + * extent item. + */ + } else { + /* + * Found a regular or inline extent. + * If we are seeking for data, adjust the start offset + * and stop, we're done. + */ + if (whence == SEEK_DATA) { + start = max_t(u64, key.offset, offset); + found = true; + break; + } + /* + * Else, we are seeking for a hole, check the next file + * extent item. + */ + } + + start = extent_end; + last_extent_end = extent_end; + path->slots[0]++; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } cond_resched(); } - free_extent_map(em); - unlock_extent_cached(&inode->io_tree, lockstart, lockend, - &cached_state); - if (ret) { - offset = ret; - } else { - if (whence == SEEK_DATA && start >= i_size) - offset = -ENXIO; - else - offset = min_t(loff_t, start, i_size); + + /* We have an implicit hole from the last extent found up to i_size. */ + if (!found && start < i_size) { + found = find_desired_extent_in_hole(inode, whence, start, + i_size - 1, &start); + if (!found) + start = i_size; } - return offset; +out: + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_free_path(path); + + if (ret < 0) + return ret; + + if (whence == SEEK_DATA && start >= i_size) + return -ENXIO; + + return min_t(loff_t, start, i_size); } static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) @@ -3693,7 +3990,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; ret = fsverity_file_open(inode, filp); if (ret) @@ -3753,7 +4050,7 @@ again: */ pagefault_disable(); to->nofault = true; - ret = btrfs_dio_rw(iocb, to, read); + ret = btrfs_dio_read(iocb, to, read); to->nofault = false; pagefault_enable(); @@ -3810,6 +4107,7 @@ const struct file_operations btrfs_file_operations = { .mmap = btrfs_file_mmap, .open = btrfs_file_open, .release = btrfs_release_file, + .get_unmapped_area = thp_get_unmapped_area, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 996da650ecdc..f4023651dd68 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -48,6 +48,24 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, bool update_stats); +static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + while ((node = rb_last(&ctl->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info, true); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } + + cond_resched_lock(&ctl->tree_lock); + } +} + static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, u64 offset) @@ -126,10 +144,8 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, block_group->disk_cache_state = BTRFS_DC_CLEAR; } - if (!block_group->iref) { + if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) block_group->inode = igrab(inode); - block_group->iref = 1; - } spin_unlock(&block_group->lock); return inode; @@ -241,8 +257,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); - if (block_group->iref) { - block_group->iref = 0; + if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { block_group->inode = NULL; spin_unlock(&block_group->lock); iput(inode); @@ -333,8 +348,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, btrfs_i_size_write(inode, 0); truncate_pagecache(vfs_inode, 0); - lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); /* * We skip the throttling logic for free space cache inodes, so we don't @@ -345,7 +360,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state); + unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; @@ -693,6 +708,12 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) max_bitmaps = max_t(u64, max_bitmaps, 1); + if (ctl->total_bitmaps > max_bitmaps) + btrfs_err(block_group->fs_info, +"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", + block_group->start, block_group->length, + ctl->total_bitmaps, ctl->unit, max_bitmaps, + bytes_per_bg); ASSERT(ctl->total_bitmaps <= max_bitmaps); /* @@ -875,7 +896,10 @@ out: return ret; free_cache: io_ctl_drop_pages(&io_ctl); + + spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache(ctl); + spin_unlock(&ctl->tree_lock); goto out; } @@ -914,6 +938,8 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, return ret; } +static struct lock_class_key btrfs_free_space_inode_key; + int load_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -983,6 +1009,14 @@ int load_free_space_cache(struct btrfs_block_group *block_group) } spin_unlock(&block_group->lock); + /* + * Reinitialize the class of struct inode's mapping->invalidate_lock for + * free space inodes to prevent false positives related to locks for normal + * inodes. + */ + lockdep_set_class(&(&inode->i_data)->invalidate_lock, + &btrfs_free_space_inode_key); + ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, path, block_group->start); btrfs_free_path(path); @@ -1001,7 +1035,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group) if (ret == 0) ret = 1; } else { + /* + * We need to call the _locked variant so we don't try to update + * the discard counters. + */ + spin_lock(&tmp_ctl.tree_lock); __btrfs_remove_free_space_cache(&tmp_ctl); + spin_unlock(&tmp_ctl.tree_lock); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->start); @@ -1123,7 +1163,7 @@ update_cache_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, NULL); goto fail; } leaf = path->nodes[0]; @@ -1135,8 +1175,8 @@ update_cache_item(struct btrfs_trans_handle *trans, if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, EXTENT_DELALLOC, 0, - 0, NULL); + inode->i_size - 1, EXTENT_DELALLOC, + NULL); btrfs_release_path(path); goto fail; } @@ -1232,7 +1272,7 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, NULL); return ret; } @@ -1252,8 +1292,8 @@ cleanup_write_cache_enospc(struct inode *inode, struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, @@ -1378,8 +1418,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (ret) goto out_unlock; - lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1434,8 +1474,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_drop_pages(io_ctl); io_ctl_free(io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); /* * at this point the pages are under IO and we're happy, @@ -2860,7 +2900,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, if (btrfs_is_zoned(fs_info)) { btrfs_info(fs_info, "free space %llu active %d", block_group->zone_capacity - block_group->alloc_offset, - block_group->zone_is_active); + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags)); return; } @@ -2964,34 +3005,6 @@ static void __btrfs_return_cluster_to_free_space( btrfs_put_block_group(block_group); } -static void __btrfs_remove_free_space_cache_locked( - struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_free_space *info; - struct rb_node *node; - - while ((node = rb_last(&ctl->free_space_offset)) != NULL) { - info = rb_entry(node, struct btrfs_free_space, offset_index); - if (!info->bitmap) { - unlink_free_space(ctl, info, true); - kmem_cache_free(btrfs_free_space_cachep, info); - } else { - free_bitmap(ctl, info); - } - - cond_resched_lock(&ctl->tree_lock); - } -} - -void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) -{ - spin_lock(&ctl->tree_lock); - __btrfs_remove_free_space_cache_locked(ctl); - if (ctl->block_group) - btrfs_discard_update_discardable(ctl->block_group); - spin_unlock(&ctl->tree_lock); -} - void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -3009,7 +3022,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) cond_resched_lock(&ctl->tree_lock); } - __btrfs_remove_free_space_cache_locked(ctl); + __btrfs_remove_free_space_cache(ctl); btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); @@ -3992,7 +4005,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -4022,7 +4035,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } @@ -4044,7 +4057,7 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, *trimmed = 0; spin_lock(&block_group->lock); - if (block_group->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 15591b299895..6d419ba53e95 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -113,7 +113,6 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); -void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group); bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 1bf89aa67216..367bcfcf68f5 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1453,8 +1453,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - caching_ctl->progress = key.objectid; - offset = key.objectid; while (offset < key.objectid + key.offset) { bit = free_space_test_bit(block_group, path, offset); @@ -1490,8 +1488,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, goto out; } - caching_ctl->progress = (u64)-1; - ret = 0; out: return ret; @@ -1531,8 +1527,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - caching_ctl->progress = key.objectid; - total_found += add_new_free_space(block_group, key.objectid, key.objectid + key.offset); if (total_found > CACHING_CTL_WAKE_UP) { @@ -1552,8 +1546,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, goto out; } - caching_ctl->progress = (u64)-1; - ret = 0; out: return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1372210869b1..0e516aefbf51 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -977,7 +977,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, if (!(start >= locked_page_end || end <= locked_page_start)) locked_page = async_chunk->locked_page; } - lock_extent(io_tree, start, end); + lock_extent(io_tree, start, end, NULL); /* We have fall back to uncompressed write */ if (!async_extent->pages) @@ -1024,7 +1024,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, 1 << BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { - btrfs_drop_extent_cache(inode, start, end, 0); + btrfs_drop_extent_map_range(inode, start, end, false); goto out_free_reserve; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1254,7 +1254,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); - btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); /* * Relocation relies on the relocated extents to have exactly the same @@ -1319,8 +1318,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * skip current ordered extent. */ if (ret) - btrfs_drop_extent_cache(inode, start, - start + ram_size - 1, 0); + btrfs_drop_extent_map_range(inode, start, + start + ram_size - 1, + false); } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1360,7 +1360,7 @@ out: return ret; out_drop_extent_cache: - btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); + btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1524,7 +1524,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, unsigned nofs_flag; const blk_opf_t write_flags = wbc_to_write_flags(wbc); - unlock_extent(&inode->io_tree, start, end); + unlock_extent(&inode->io_tree, start, end, NULL); if (inode->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { @@ -1666,7 +1666,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, } static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num_bytes) + u64 bytenr, u64 num_bytes, bool nowait) { struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); struct btrfs_ordered_sum *sums; @@ -1674,7 +1674,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, LIST_HEAD(list); ret = btrfs_lookup_csums_range(csum_root, bytenr, - bytenr + num_bytes - 1, &list, 0); + bytenr + num_bytes - 1, &list, 0, + nowait); if (ret == 0 && list_empty(&list)) return 0; @@ -1747,7 +1748,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, if (count > 0) clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, - 0, 0, NULL); + NULL); } return cow_file_range(inode, locked_page, start, end, page_started, @@ -1800,6 +1801,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, u8 extent_type; int can_nocow = 0; int ret = 0; + bool nowait = path->nowait; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); @@ -1876,7 +1878,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. */ - ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes); + ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes, + nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -2099,8 +2102,8 @@ out_check: 1 << BTRFS_ORDERED_PREALLOC, BTRFS_COMPRESS_NONE); if (ret) { - btrfs_drop_extent_cache(inode, cur_offset, - nocow_end, 0); + btrfs_drop_extent_map_range(inode, cur_offset, + nocow_end, false); goto error; } } else { @@ -2548,7 +2551,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, ASSERT(pre + post < len); - lock_extent(&inode->io_tree, start, start + len - 1); + lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (!em) { @@ -2622,7 +2625,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, out_unlock: write_unlock(&em_tree->lock); - unlock_extent(&inode->io_tree, start, start + len - 1); + unlock_extent(&inode->io_tree, start, start + len - 1, NULL); out: free_extent_map(split_pre); free_extent_map(split_mid); @@ -2700,8 +2703,10 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro if (bio_op(bio) == REQ_OP_ZONE_APPEND) { ret = extract_ordered_extent(bi, bio, page_offset(bio_first_bvec_all(bio)->bv_page)); - if (ret) - goto out; + if (ret) { + btrfs_bio_end_io(btrfs_bio(bio), ret); + return; + } } /* @@ -2721,16 +2726,12 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro return; ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); - if (ret) - goto out; + if (ret) { + btrfs_bio_end_io(btrfs_bio(bio), ret); + return; + } } btrfs_submit_bio(fs_info, bio, mirror_num); - return; -out: - if (ret) { - bio->bi_status = ret; - bio_endio(bio); - } } void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, @@ -2757,8 +2758,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, */ ret = btrfs_lookup_bio_sums(inode, bio, NULL); if (ret) { - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), ret); return; } @@ -2818,8 +2818,8 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, - EXTENT_DELALLOC_NEW, 0, NULL, cached_state, - GFP_NOFS, NULL); + EXTENT_DELALLOC_NEW, cached_state, + GFP_NOFS); next: search_start = extent_map_end(em); free_extent_map(em); @@ -2931,7 +2931,7 @@ again: if (ret) goto out_page; - lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PageOrdered(page)) @@ -2939,8 +2939,8 @@ again: ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent_cached(&inode->io_tree, page_start, page_end, - &cached_state); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); unlock_page(page); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); @@ -2966,8 +2966,7 @@ out_reserved: if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); - unlock_extent_cached(&inode->io_tree, page_start, page_end, - &cached_state); + unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* @@ -3225,6 +3224,8 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); + if (!freespace_inode) + btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; @@ -3269,7 +3270,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } clear_bits |= EXTENT_LOCKED; - lock_extent_bits(io_tree, start, end, &cached_state); + lock_extent(io_tree, start, end, &cached_state); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -3325,7 +3326,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) clear_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, - 0, 0, &cached_state); + &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, root, inode); @@ -3336,7 +3337,6 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = 0; out: clear_extent_bit(&inode->io_tree, start, end, clear_bits, - (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, &cached_state); if (trans) @@ -3361,8 +3361,8 @@ out: unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* Drop the cache for the part of the extent we didn't write. */ - btrfs_drop_extent_cache(inode, unwritten_start, end, 0); + /* Drop extent maps for the part of the extent we didn't write. */ + btrfs_drop_extent_map_range(inode, unwritten_start, end, false); /* * If the ordered extent had an IOERR or something else went @@ -3439,6 +3439,13 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, return 0; } +static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset) +{ + u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; + + return csums + offset_in_sectors * fs_info->csum_size; +} + /* * check_data_csum - verify checksum of one sector of uncompressed data * @inode: inode @@ -4878,9 +4885,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, - blocksize); + blocksize, false); if (ret < 0) { - if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) { + if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { /* For nocow case, no need to reserve data space */ only_release_metadata = true; } else { @@ -4922,12 +4929,11 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, block_start, block_end, &cached_state); + lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - unlock_extent_cached(io_tree, block_start, block_end, - &cached_state); + unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); btrfs_start_ordered_extent(ordered, 1); @@ -4937,13 +4943,12 @@ again: clear_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { - unlock_extent_cached(io_tree, block_start, block_end, - &cached_state); + unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } @@ -4960,11 +4965,11 @@ again: btrfs_page_clear_checked(fs_info, page, block_start, block_end + 1 - block_start); btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); - unlock_extent_cached(io_tree, block_start, block_end, &cached_state); + unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL); + EXTENT_NORESERVE, NULL, GFP_NOFS); out_unlock: if (ret) { @@ -5021,8 +5026,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, return ret; } - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), - offset, 0, 0, len, 0, len, 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len); if (ret) { btrfs_abort_transaction(trans, ret); } else { @@ -5046,7 +5050,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) struct extent_io_tree *io_tree = &inode->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - struct extent_map_tree *em_tree = &inode->extent_tree; u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); u64 block_end = ALIGN(size, fs_info->sectorsize); u64 last_byte; @@ -5094,10 +5097,11 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (err) break; - btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { + btrfs_drop_extent_map_range(inode, cur_offset, + cur_offset + hole_size - 1, + false); btrfs_set_inode_full_sync(inode); goto next; } @@ -5112,16 +5116,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = fs_info->generation; - while (1) { - write_lock(&em_tree->lock); - err = add_extent_mapping(em_tree, hole_em, 1); - write_unlock(&em_tree->lock); - if (err != -EEXIST) - break; - btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + - hole_size - 1, 0); - } + err = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); } else { err = btrfs_inode_set_file_extent_range(inode, @@ -5137,7 +5132,7 @@ next: break; } free_extent_map(em); - unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state); + unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); return err; } @@ -5271,7 +5266,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr * While truncating the inode pages during eviction, we get the VFS * calling btrfs_invalidate_folio() against each folio of the inode. This * is slow because the calls to btrfs_invalidate_folio() result in a - * huge amount of calls to lock_extent_bits() and clear_extent_bit(), + * huge amount of calls to lock_extent() and clear_extent_bit(), * which keep merging and splitting extent_state structures over and over, * wasting lots of time. * @@ -5283,29 +5278,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr static void evict_inode_truncate_pages(struct inode *inode) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; struct rb_node *node; ASSERT(inode->i_state & I_FREEING); truncate_inode_pages_final(&inode->i_data); - write_lock(&map_tree->lock); - while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { - struct extent_map *em; - - node = rb_first_cached(&map_tree->map); - em = rb_entry(node, struct extent_map, rb_node); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - remove_extent_mapping(map_tree, em); - free_extent_map(em); - if (need_resched()) { - write_unlock(&map_tree->lock); - cond_resched(); - write_lock(&map_tree->lock); - } - } - write_unlock(&map_tree->lock); + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* * Keep looping until we have no more ranges in the io tree. @@ -5338,7 +5316,7 @@ static void evict_inode_truncate_pages(struct inode *inode) state_flags = state->state; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, start, end, &cached_state); + lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5353,8 +5331,7 @@ static void evict_inode_truncate_pages(struct inode *inode) end - start + 1); clear_extent_bit(io_tree, start, end, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, &cached_state); cond_resched(); @@ -5707,6 +5684,11 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) BTRFS_I(inode)->location.offset = 0; BTRFS_I(inode)->root = btrfs_grab_root(args->root); BUG_ON(args->root && !BTRFS_I(inode)->root); + + if (args->root && args->root == args->root->fs_info->tree_root && + args->ino != BTRFS_BTREE_INODE_OBJECTID) + set_bit(BTRFS_INODE_FREE_SPACE_INODE, + &BTRFS_I(inode)->runtime_flags); return 0; } @@ -6867,7 +6849,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_io_tree *io_tree = &inode->io_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -7030,8 +7011,6 @@ next: } flush_dcache_page(page); } - set_extent_uptodate(io_tree, em->start, - extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; } not_found: @@ -7065,133 +7044,6 @@ out: return em; } -struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, - u64 start, u64 len) -{ - struct extent_map *em; - struct extent_map *hole_em = NULL; - u64 delalloc_start = start; - u64 end; - u64 delalloc_len; - u64 delalloc_end; - int err = 0; - - em = btrfs_get_extent(inode, NULL, 0, start, len); - if (IS_ERR(em)) - return em; - /* - * If our em maps to: - * - a hole or - * - a pre-alloc extent, - * there might actually be delalloc bytes behind it. - */ - if (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return em; - else - hole_em = em; - - /* check to see if we've wrapped (len == -1 or similar) */ - end = start + len; - if (end < start) - end = (u64)-1; - else - end -= 1; - - em = NULL; - - /* ok, we didn't find anything, lets look for delalloc */ - delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start, - end, len, EXTENT_DELALLOC, 1); - delalloc_end = delalloc_start + delalloc_len; - if (delalloc_end < delalloc_start) - delalloc_end = (u64)-1; - - /* - * We didn't find anything useful, return the original results from - * get_extent() - */ - if (delalloc_start > end || delalloc_end <= start) { - em = hole_em; - hole_em = NULL; - goto out; - } - - /* - * Adjust the delalloc_start to make sure it doesn't go backwards from - * the start they passed in - */ - delalloc_start = max(start, delalloc_start); - delalloc_len = delalloc_end - delalloc_start; - - if (delalloc_len > 0) { - u64 hole_start; - u64 hole_len; - const u64 hole_end = extent_map_end(hole_em); - - em = alloc_extent_map(); - if (!em) { - err = -ENOMEM; - goto out; - } - - ASSERT(hole_em); - /* - * When btrfs_get_extent can't find anything it returns one - * huge hole - * - * Make sure what it found really fits our range, and adjust to - * make sure it is based on the start from the caller - */ - if (hole_end <= start || hole_em->start > end) { - free_extent_map(hole_em); - hole_em = NULL; - } else { - hole_start = max(hole_em->start, start); - hole_len = hole_end - hole_start; - } - - if (hole_em && delalloc_start > hole_start) { - /* - * Our hole starts before our delalloc, so we have to - * return just the parts of the hole that go until the - * delalloc starts - */ - em->len = min(hole_len, delalloc_start - hole_start); - em->start = hole_start; - em->orig_start = hole_start; - /* - * Don't adjust block start at all, it is fixed at - * EXTENT_MAP_HOLE - */ - em->block_start = hole_em->block_start; - em->block_len = hole_len; - if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); - } else { - /* - * Hole is out of passed range or it starts after - * delalloc range - */ - em->start = delalloc_start; - em->len = delalloc_len; - em->orig_start = delalloc_start; - em->block_start = EXTENT_MAP_DELALLOC; - em->block_len = delalloc_len; - } - } else { - return hole_em; - } -out: - - free_extent_map(hole_em); - if (err) { - free_extent_map(em); - return ERR_PTR(err); - } - return em; -} - static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, const u64 start, const u64 len, @@ -7221,7 +7073,8 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, if (ret) { if (em) { free_extent_map(em); - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + btrfs_drop_extent_map_range(inode, start, + start + len - 1, false); } em = ERR_PTR(ret); } @@ -7292,7 +7145,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool strict) + u64 *ram_bytes, bool nowait, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct can_nocow_file_extent_args nocow_args = { 0 }; @@ -7308,6 +7161,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->nowait = nowait; ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(inode)), offset, 0); @@ -7404,7 +7258,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, if (!try_lock_extent(io_tree, lockstart, lockend)) return -EAGAIN; } else { - lock_extent_bits(io_tree, lockstart, lockend, cached_state); + lock_extent(io_tree, lockstart, lockend, cached_state); } /* * We're concerned with the entire range that we're going to be @@ -7426,7 +7280,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, lockstart, lockend))) break; - unlock_extent_cached(io_tree, lockstart, lockend, cached_state); + unlock_extent(io_tree, lockstart, lockend, cached_state); if (ordered) { if (nowait) { @@ -7488,7 +7342,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type) { - struct extent_map_tree *em_tree; struct extent_map *em; int ret; @@ -7497,7 +7350,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); - em_tree = &inode->extent_tree; em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7518,18 +7370,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, em->compress_type = compress_type; } - do { - btrfs_drop_extent_cache(inode, em->start, - em->start + em->len - 1, 0); - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - /* - * The caller has taken lock_extent(), who could race with us - * to add em? - */ - } while (ret == -EEXIST); - + ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { free_extent_map(em); return ERR_PTR(ret); @@ -7577,7 +7418,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false) == 1) { + &orig_block_len, &ram_bytes, false, false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; @@ -7762,7 +7603,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write && !(flags & IOMAP_NOWAIT)) { ret = btrfs_check_data_free_space(BTRFS_I(inode), &dio_data->data_reserved, - start, data_alloc_len); + start, data_alloc_len, false); if (!ret) dio_data->data_space_reserved = true; else if (ret && !(BTRFS_I(inode)->flags & @@ -7884,8 +7725,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, } if (unlock_extents) - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); else free_extent_state(cached_state); @@ -7914,8 +7755,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, return 0; unlock_err: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -7938,7 +7779,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, + NULL); return 0; } @@ -7950,7 +7792,7 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1); + pos + length - 1, NULL); ret = -ENOTBLK; } @@ -7975,7 +7817,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1); + dip->file_offset + dip->bytes - 1, NULL); } kfree(dip->csums); @@ -7986,7 +7828,7 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type) { - struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_dio_private *dip = btrfs_bio(bio)->private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); BUG_ON(bio_op(bio) == REQ_OP_WRITE); @@ -8001,8 +7843,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); blk_status_t err = BLK_STS_OK; struct bvec_iter iter; @@ -8015,9 +7855,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, if (uptodate && (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, bv.bv_offset))) { - clean_io_failure(fs_info, failure_tree, io_tree, start, - bv.bv_page, btrfs_ino(BTRFS_I(inode)), - bv.bv_offset); + btrfs_clean_io_failure(BTRFS_I(inode), start, + bv.bv_page, bv.bv_offset); } else { int ret; @@ -8039,10 +7878,10 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); } -static void btrfs_end_dio_bio(struct bio *bio) +static void btrfs_end_dio_bio(struct btrfs_bio *bbio) { - struct btrfs_dio_private *dip = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_dio_private *dip = bbio->private; + struct bio *bio = &bbio->bio; blk_status_t err = bio->bi_status; if (err) @@ -8068,7 +7907,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_dio_private *dip = btrfs_bio(bio)->private; blk_status_t ret; /* Save the original iter for read repair */ @@ -8091,8 +7930,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, */ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); if (ret) { - bio->bi_status = ret; - bio_endio(bio); + btrfs_bio_end_io(btrfs_bio(bio), ret); return; } } else { @@ -8142,7 +7980,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, */ status = BLK_STS_RESOURCE; dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); - if (!dip) + if (!dip->csums) goto out_err; status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); @@ -8175,9 +8013,8 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, * This will never fail as it's passing GPF_NOFS and * the allocation is backed by btrfs_bioset. */ - bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; + bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, + btrfs_end_dio_bio, dip); btrfs_bio(bio)->file_offset = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { @@ -8241,13 +8078,21 @@ static const struct iomap_dio_ops btrfs_dio_ops = { .bio_set = &btrfs_dio_bioset, }; -ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { struct btrfs_dio_data data; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC, - &data, done_before); + IOMAP_DIO_PARTIAL, &data, done_before); +} + +struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) +{ + struct btrfs_dio_data data; + + return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -8259,6 +8104,25 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; + /* + * fiemap_prep() called filemap_write_and_wait() for the whole possible + * file range (0 to LLONG_MAX), but that is not enough if we have + * compression enabled. The first filemap_fdatawrite_range() only kicks + * in the compression of data (in an async thread) and will return + * before the compression is done and writeback is started. A second + * filemap_fdatawrite_range() is needed to wait for the compression to + * complete and writeback to start. We also need to wait for ordered + * extents to complete, because our fiemap implementation uses mainly + * file extent items to list the extents, searching for extent maps + * only for file ranges with holes or prealloc extents to figure out + * if we have delalloc in those ranges. + */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); + if (ret) + return ret; + } + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } @@ -8391,14 +8255,14 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, } if (!inode_evicting) - lock_extent_bits(tree, page_start, page_end, &cached_state); + lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { struct btrfs_ordered_extent *ordered; - bool delete_states; u64 range_end; u32 range_len; + u32 extra_flags = 0; ordered = btrfs_lookup_first_ordered_range(inode, cur, page_end + 1 - cur); @@ -8408,7 +8272,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * No ordered extent covering this range, we are safe * to delete all extent states in the range. */ - delete_states = true; + extra_flags = EXTENT_CLEAR_ALL_BITS; goto next; } if (ordered->file_offset > cur) { @@ -8419,7 +8283,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * the ordered extent in the next iteration. */ range_end = ordered->file_offset - 1; - delete_states = true; + extra_flags = EXTENT_CLEAR_ALL_BITS; goto next; } @@ -8434,7 +8298,6 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. */ - delete_states = false; goto next; } btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); @@ -8451,7 +8314,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state); + EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree.lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8459,6 +8322,12 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, cur - ordered->file_offset); spin_unlock_irq(&inode->ordered_tree.lock); + /* + * If the ordered extent has finished, we're safe to delete all + * the extent states of the range, otherwise + * btrfs_finish_ordered_io() will get executed by endio for + * other pages, so we can't delete extent states. + */ if (btrfs_dec_test_ordered_pending(inode, &ordered, cur, range_end + 1 - cur)) { btrfs_finish_ordered_io(ordered); @@ -8466,14 +8335,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * The ordered extent has finished, now we're again * safe to delete all extent states of the range. */ - delete_states = true; - } else { - /* - * btrfs_finish_ordered_io() will get executed by endio - * of other pages, thus we can't delete extent states - * anymore - */ - delete_states = false; + extra_flags = EXTENT_CLEAR_ALL_BITS; } next: if (ordered) @@ -8497,8 +8359,8 @@ next: if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, - delete_states, &cached_state); + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | + extra_flags, &cached_state); } cur = range_end + 1; } @@ -8589,11 +8451,11 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, &cached_state); + lock_extent(io_tree, page_start, page_end, &cached_state); ret2 = set_page_extent_mapped(page); if (ret2 < 0) { ret = vmf_error(ret2); - unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -8604,8 +8466,7 @@ again: ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { - unlock_extent_cached(io_tree, page_start, page_end, - &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered, 1); @@ -8633,13 +8494,12 @@ again: */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, &cached_state); + EXTENT_DEFRAG, &cached_state); ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); if (ret2) { - unlock_extent_cached(io_tree, page_start, page_end, - &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); ret = VM_FAULT_SIGBUS; goto out_unlock; } @@ -8659,7 +8519,7 @@ again: btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); @@ -8760,24 +8620,24 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. */ - btrfs_drop_extent_cache(BTRFS_I(inode), - ALIGN(new_size, fs_info->sectorsize), - (u64)-1, 0); + btrfs_drop_extent_map_range(BTRFS_I(inode), + ALIGN(new_size, fs_info->sectorsize), + (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); inode_sub_bytes(inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, - (u64)-1, &cached_state); + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) @@ -8908,6 +8768,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_log_commit = 0; spin_lock_init(&ei->lock); + spin_lock_init(&ei->io_failure_lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, @@ -8924,12 +8785,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); - extent_io_tree_init(fs_info, &ei->io_failure_tree, - IO_TREE_INODE_IO_FAILURE, inode); extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT, inode); - ei->io_tree.track_uptodate = true; - ei->io_failure_tree.track_uptodate = true; + IO_TREE_INODE_FILE_EXTENT, NULL); + ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -8944,7 +8802,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode) { - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif @@ -8959,6 +8817,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) struct btrfs_ordered_extent *ordered; struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_root *root = inode->root; + bool freespace_inode; WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); WARN_ON(vfs_inode->i_data.nrpages); @@ -8980,6 +8839,12 @@ void btrfs_destroy_inode(struct inode *vfs_inode) if (!root) return; + /* + * If this is a free space inode do not take the ordered extents lockdep + * map. + */ + freespace_inode = btrfs_is_free_space_inode(inode); + while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) @@ -8988,6 +8853,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); + + if (!freespace_inode) + btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent); + btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -8995,7 +8864,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) } btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); btrfs_put_root(inode->root); } @@ -10008,7 +9877,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -10064,11 +9932,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } - btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, - cur_offset + ins.offset -1, 0); - em = alloc_extent_map(); if (!em) { + btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, + cur_offset + ins.offset - 1, false); btrfs_set_inode_full_sync(BTRFS_I(inode)); goto next; } @@ -10083,16 +9950,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 1); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) - break; - btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, - cur_offset + ins.offset - 1, - 0); - } + ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); free_extent_map(em); next: num_bytes -= ins.offset; @@ -10168,7 +10026,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns, } static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -10176,7 +10034,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct inode *inode; struct btrfs_new_inode_args new_inode_args = { .dir = dir, - .dentry = dentry, + .dentry = file->f_path.dentry, .orphan = true, }; unsigned int trans_num_items; @@ -10213,7 +10071,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, set_nlink(inode, 1); if (!ret) { - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); mark_inode_dirty(inode); } @@ -10225,7 +10083,7 @@ out_new_inode_args: out_inode: if (ret) iput(inode); - return ret; + return finish_open_simple(file, ret); } void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) @@ -10346,7 +10204,7 @@ static ssize_t btrfs_encoded_read_inline( } read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); - unlock_extent_cached(io_tree, start, lockend, cached_state); + unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -10371,7 +10229,7 @@ struct btrfs_encoded_read_private { static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_encoded_read_private *priv = bio->bi_private; + struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; @@ -10389,7 +10247,7 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) { const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); - struct btrfs_encoded_read_private *priv = bbio->bio.bi_private; + struct btrfs_encoded_read_private *priv = bbio->private; struct btrfs_inode *inode = priv->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u32 sectorsize = fs_info->sectorsize; @@ -10417,10 +10275,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) return BLK_STS_OK; } -static void btrfs_encoded_read_endio(struct bio *bio) +static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) { - struct btrfs_encoded_read_private *priv = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_encoded_read_private *priv = bbio->private; blk_status_t status; status = btrfs_encoded_read_verify_csum(bbio); @@ -10438,7 +10295,7 @@ static void btrfs_encoded_read_endio(struct bio *bio) if (!atomic_dec_return(&priv->pending)) wake_up(&priv->wait); btrfs_bio_free_csum(bbio); - bio_put(bio); + bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, @@ -10485,12 +10342,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, remaining, PAGE_SIZE); if (!bio) { - bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, + btrfs_encoded_read_endio, + &priv); bio->bi_iter.bi_sector = (disk_bytenr + cur) >> SECTOR_SHIFT; - bio->bi_end_io = btrfs_encoded_read_endio; - bio->bi_private = &priv; - bio->bi_opf = REQ_OP_READ; } if (!bytes || @@ -10551,7 +10407,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, if (ret) goto out; - unlock_extent_cached(io_tree, start, lockend, cached_state); + unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -10621,13 +10477,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, lockend - start + 1); if (ret) goto out_unlock_inode; - lock_extent_bits(io_tree, start, lockend, &cached_state); + lock_extent(io_tree, start, lockend, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); if (!ordered) break; btrfs_put_ordered_extent(ordered); - unlock_extent_cached(io_tree, start, lockend, &cached_state); + unlock_extent(io_tree, start, lockend, &cached_state); cond_resched(); } @@ -10701,7 +10557,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, em = NULL; if (disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent_cached(io_tree, start, lockend, &cached_state); + unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); @@ -10722,7 +10578,7 @@ out_em: free_extent_map(em); out_unlock_extent: if (!unlocked) - unlock_extent_cached(io_tree, start, lockend, &cached_state); + unlock_extent(io_tree, start, lockend, &cached_state); out_unlock_inode: if (!unlocked) btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); @@ -10860,14 +10716,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, end >> PAGE_SHIFT); if (ret) goto out_pages; - lock_extent_bits(io_tree, start, end, &cached_state); + lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered) btrfs_put_ordered_extent(ordered); - unlock_extent_cached(io_tree, start, end, &cached_state); + unlock_extent(io_tree, start, end, &cached_state); cond_resched(); } @@ -10921,7 +10777,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, (1 << BTRFS_ORDERED_COMPRESSED), compression); if (ret) { - btrfs_drop_extent_cache(inode, start, end, 0); + btrfs_drop_extent_map_range(inode, start, end, false); goto out_free_reserved; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -10929,7 +10785,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start + encoded->len > inode->vfs_inode.i_size) i_size_write(&inode->vfs_inode, start + encoded->len); - unlock_extent_cached(io_tree, start, end, &cached_state); + unlock_extent(io_tree, start, end, &cached_state); btrfs_delalloc_release_extents(inode, num_bytes); @@ -10960,7 +10816,7 @@ out_free_data_space: if (!extent_reserved) btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); out_unlock: - unlock_extent_cached(io_tree, start, end, &cached_state); + unlock_extent(io_tree, start, end, &cached_state); out_pages: for (i = 0; i < nr_pages; i++) { if (pages[i]) @@ -11201,7 +11057,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); - lock_extent_bits(io_tree, 0, isize - 1, &cached_state); + lock_extent(io_tree, 0, isize - 1, &cached_state); start = 0; while (start < isize) { u64 logical_block_start, physical_block_start; @@ -11242,7 +11098,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, free_extent_map(em); em = NULL; - ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true); + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true); if (ret < 0) { goto out; } else if (ret) { @@ -11338,7 +11194,7 @@ out: if (!IS_ERR_OR_NULL(em)) free_extent_map(em); - unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); + unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fe0cc816b4eb..5ba2e810dc6e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1218,10 +1218,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, /* get the big lock and read metadata off disk */ if (!locked) - lock_extent_bits(io_tree, start, end, &cached); + lock_extent(io_tree, start, end, &cached); em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) - unlock_extent_cached(io_tree, start, end, &cached); + unlock_extent(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -1333,10 +1333,10 @@ again: while (1) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent_cached(&inode->io_tree, page_start, page_end, - &cached_state); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); if (!ordered) break; @@ -1616,7 +1616,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, return ret; clear_extent_bit(&inode->io_tree, start, start + len - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, cached_state); + EXTENT_DEFRAG, cached_state); set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); /* Update the page status */ @@ -1666,9 +1666,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, wait_on_page_writeback(pages[i]); /* Lock the pages range */ - lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); /* * Now we have a consistent view about the extent map, re-check * which range really needs to be defragged. @@ -1694,9 +1694,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, kfree(entry); } unlock_extent: - unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); free_pages: for (i = 0; i < nr_pages; i++) { if (pages[i]) { @@ -3105,6 +3105,8 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) } } + btrfs_free_path(path); + path = NULL; if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) ret = -EFAULT; @@ -3194,6 +3196,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, } out: + btrfs_free_path(path); + if (!ret || ret == -EOVERFLOW) { rootrefs->num_items = found; /* update min_treeid for next search */ @@ -3205,7 +3209,6 @@ out: } kfree(rootrefs); - btrfs_free_path(path); return ret; } @@ -4231,6 +4234,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } + btrfs_free_path(path); + path = NULL; ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, ipath->fspath, size); if (ret) { @@ -4281,21 +4286,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, size = min_t(u32, loi->size, SZ_16M); } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); - inodes = NULL; - goto out; + goto out_loi; } + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, inodes, ignore_offset); + btrfs_free_path(path); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4307,7 +4311,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, ret = -EFAULT; out: - btrfs_free_path(path); kvfree(inodes); out_loi: kfree(loi); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9063072b399b..0eab3cb274a1 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -286,6 +286,31 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) } /* + * Loop around taking references on and locking the root node of the tree in + * nowait mode until we end up with a lock on the root node or returning to + * avoid blocking. + * + * Return: root extent buffer with read lock held or -EAGAIN. + */ +struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + if (!btrfs_try_tree_read_lock(eb)) { + free_extent_buffer(eb); + return ERR_PTR(-EAGAIN); + } + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* * DREW locks * ========== * diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index ab268be09bb5..490c7a79e995 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -94,6 +94,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 340f995652f2..f9850edfd726 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -88,6 +88,41 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr) return NULL; } +/* + * Search @root from an entry that starts or comes after @bytenr. + * + * @root: the root to search. + * @bytenr: bytenr to search from. + * + * Return the rb_node that start at or after @bytenr. If there is no entry at + * or after @bytner return NULL. + */ +static inline struct rb_node *rb_simple_search_first(struct rb_root *root, + u64 bytenr) +{ + struct rb_node *node = root->rb_node, *ret = NULL; + struct rb_simple_node *entry, *ret_entry = NULL; + + while (node) { + entry = rb_entry(node, struct rb_simple_node, rb_node); + + if (bytenr < entry->bytenr) { + if (!ret || entry->bytenr < ret_entry->bytenr) { + ret = node; + ret_entry = entry; + } + + node = node->rb_left; + } else if (bytenr > entry->bytenr) { + node = node->rb_right; + } else { + return node; + } + } + + return ret; +} + static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, struct rb_node *node) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1952ac85222c..e54f8280031f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -524,7 +524,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; bool pending; + bool freespace_inode; + /* + * If this is a free space inode the thread has not acquired the ordered + * extents lockdep map. + */ + freespace_inode = btrfs_is_free_space_inode(btrfs_inode); + + btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); @@ -580,6 +588,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, } } + btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered); + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; @@ -594,6 +604,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, } spin_unlock(&root->ordered_extent_lock); wake_up(&entry->wait); + if (!freespace_inode) + btrfs_lockdep_release(fs_info, btrfs_ordered_extent); } static void btrfs_run_ordered_extent_work(struct btrfs_work *work) @@ -712,10 +724,17 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; struct btrfs_inode *inode = BTRFS_I(entry->inode); + bool freespace_inode; trace_btrfs_ordered_extent_start(inode, entry); /* + * If this is a free space inode do not take the ordered extents lockdep + * map. + */ + freespace_inode = btrfs_is_free_space_inode(inode); + + /* * pages in the range can be dirty, clean or writeback. We * start IO on any dirty ones so the wait doesn't stall waiting * for the flusher thread to find them @@ -723,6 +742,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); if (wait) { + if (!freespace_inode) + btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); } @@ -1022,7 +1043,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, cachedp = cached_state; while (1) { - lock_extent_bits(&inode->io_tree, start, end, cachedp); + lock_extent(&inode->io_tree, start, end, cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); if (!ordered) { @@ -1035,12 +1056,37 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, refcount_dec(&cache->refs); break; } - unlock_extent_cached(&inode->io_tree, start, end, cachedp); + unlock_extent(&inode->io_tree, start, end, cachedp); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); } } +/* + * Lock the passed range and ensure all pending ordered extents in it are run + * to completion in nowait mode. + * + * Return true if btrfs_lock_ordered_range does not return any extents, + * otherwise false. + */ +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) +{ + struct btrfs_ordered_extent *ordered; + + if (!try_lock_extent(&inode->io_tree, start, end)) + return false; + + ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); + if (!ordered) + return true; + + btrfs_put_ordered_extent(ordered); + unlock_extent(&inode->io_tree, start, end, NULL); + + return false; +} + + static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, u64 len) { diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 87792f85e2c4..f59f2dbdb25e 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -160,18 +160,6 @@ struct btrfs_ordered_extent { struct block_device *bdev; }; -/* - * calculates the total size you need to allocate for an ordered sum - * structure spanning 'bytes' in the file - */ -static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, - unsigned long bytes) -{ - int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - - return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; -} - static inline void btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) { @@ -218,6 +206,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end); int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, u64 post); int __init ordered_data_init(void); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index a2ec8ecae8de..055a631276ce 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -270,11 +270,8 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 ino = btrfs_ino(BTRFS_I(inode)); - int ret; - - ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode); - return ret; + return iterate_object_props(root, path, ino, inode_prop_iterator, inode); } static int prop_compression_validate(const struct btrfs_inode *inode, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index db723c0026bd..b74105a10f16 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -275,7 +275,7 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p } /* - * Add relation specified by two qgoup ids. + * Add relation specified by two qgroup ids. * * Must be called with qgroup_lock held. * @@ -333,6 +333,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, } #endif +static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) +{ + fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | + BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); +} + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -401,7 +408,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) } if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { - flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); } @@ -419,7 +426,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { btrfs_err(fs_info, "inconsistent qgroup config"); - flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); } if (!qgroup) { qgroup = add_qgroup_rb(fs_info, found_key.offset); @@ -878,7 +885,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) l = path->nodes[0]; slot = path->slots[0]; ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); - btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); @@ -1052,7 +1060,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); btrfs_mark_buffer_dirty(leaf); @@ -1174,6 +1183,21 @@ out_add_root: fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); + } else { + /* + * We have set both BTRFS_FS_QUOTA_ENABLED and + * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with + * -EINPROGRESS. That can happen because someone started the + * rescan worker by calling quota rescan ioctl before we + * attempted to initialize the rescan worker. Failure due to + * quotas disabled in the meanwhile is not possible, because + * we are holding a write lock on fs_info->subvol_sem, which + * is also acquired when disabling quotas. + * Ignore such error, and any other error would need to undo + * everything we did in the transaction we just committed. + */ + ASSERT(ret == -EINPROGRESS); + ret = 0; } out_free_path: @@ -1255,6 +1279,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) quota_root = fs_info->quota_root; fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -1717,7 +1742,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, ret = update_qgroup_limit_item(trans, qgroup); if (ret) { - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } @@ -1790,10 +1815,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, */ ASSERT(trans != NULL); + if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + return 0; + ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, true); if (ret < 0) { - trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(trans->fs_info); btrfs_warn(trans->fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); @@ -2269,7 +2297,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, out: btrfs_free_path(dst_path); if (ret < 0) - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -2280,6 +2308,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; int level; + u8 drop_subptree_thres; struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; @@ -2289,6 +2318,23 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; + spin_lock(&fs_info->qgroup_lock); + drop_subptree_thres = fs_info->qgroup_drop_subtree_thres; + spin_unlock(&fs_info->qgroup_lock); + + /* + * This function only gets called for snapshot drop, if we hit a high + * node here, it means we are going to change ownership for quite a lot + * of extents, which will greatly slow down btrfs_commit_transaction(). + * + * So here if we find a high tree here, we just skip the accounting and + * mark qgroup inconsistent. + */ + if (root_level >= drop_subptree_thres) { + qgroup_mark_inconsistent(fs_info); + return 0; + } + if (!extent_buffer_uptodate(root_eb)) { ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL); if (ret) @@ -2604,7 +2650,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) goto out_free; if (new_roots) { @@ -2700,7 +2747,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); - if (!ret) { + if (!ret && !(fs_info->qgroup_flags & + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { /* * Old roots should be searched when inserting qgroup * extent record @@ -2773,12 +2821,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_info_item(trans, qgroup); if (ret) - fs_info->qgroup_flags |= - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); ret = update_qgroup_limit_item(trans, qgroup); if (ret) - fs_info->qgroup_flags |= - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); spin_lock(&fs_info->qgroup_lock); } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) @@ -2789,7 +2835,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) ret = update_qgroup_status_item(trans); if (ret) - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -2905,14 +2951,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, dstgroup->rsv_rfer = inherit->lim.rsv_rfer; dstgroup->rsv_excl = inherit->lim.rsv_excl; - ret = update_qgroup_limit_item(trans, dstgroup); - if (ret) { - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - btrfs_info(fs_info, - "unable to update quota limit for %llu", - dstgroup->qgroupid); - goto unlock; - } + qgroup_dirty(fs_info, dstgroup); } if (srcid) { @@ -3013,7 +3052,7 @@ out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -3286,7 +3325,8 @@ static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { return btrfs_fs_closing(fs_info) || test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || - !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3351,7 +3391,8 @@ out: } mutex_lock(&fs_info->qgroup_rescan_lock); - if (!stopped) + if (!stopped || + fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (trans) { ret = update_qgroup_status_item(trans); @@ -3362,6 +3403,7 @@ out: } } fs_info->qgroup_rescan_running = false; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; complete_all(&fs_info->qgroup_rescan_completion); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3372,6 +3414,8 @@ out: if (stopped) { btrfs_info(fs_info, "qgroup scan paused"); + } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { + btrfs_info(fs_info, "qgroup scan cancelled"); } else if (err >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", err > 0 ? " (inconsistency flag cleared)" : ""); @@ -3434,6 +3478,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_progress, 0, sizeof(fs_info->qgroup_rescan_progress)); + fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); fs_info->qgroup_rescan_progress.objectid = progress_objectid; init_completion(&fs_info->qgroup_rescan_completion); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -4231,8 +4277,7 @@ out_unlock: spin_unlock(&blocks->lock); out: if (ret < 0) - fs_info->qgroup_flags |= - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); return ret; } @@ -4319,7 +4364,7 @@ out: btrfs_err_rl(fs_info, "failed to account subtree at bytenr %llu: %d", subvol_eb->start, ret); - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + qgroup_mark_inconsistent(fs_info); } return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 0c4dd2a9af96..578c77e94200 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -100,6 +100,9 @@ * subtree rescan for them. */ +#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3) +#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4) + /* * Record a dirty extent, and info qgroup to update quota on it * TODO: Use kmem cache to alloc it. diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2feb5c20641a..82c8e991300e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -275,7 +275,6 @@ static void merge_rbio(struct btrfs_raid_bio *dest, /* Also inherit the bitmaps from @victim. */ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, dest->stripe_nsectors); - dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -814,8 +813,6 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; - if (rbio->generic_bio_cnt) - btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); /* * Clear the data bitmap, as the rbio may be cached for later usage. * do this before before unlock_stripe() so there will be no new bio @@ -946,6 +943,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); + btrfs_get_bioc(bioc); rbio->bioc = bioc; rbio->nr_pages = num_pages; rbio->nr_sectors = num_sectors; @@ -1634,10 +1632,8 @@ static int full_stripe_write(struct btrfs_raid_bio *rbio) int ret; ret = alloc_rbio_parity_pages(rbio); - if (ret) { - __free_raid_bio(rbio); + if (ret) return ret; - } ret = lock_stripe_add(rbio); if (ret == 0) @@ -1813,23 +1809,22 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - btrfs_put_bioc(bioc); ret = PTR_ERR(rbio); - goto out_dec_counter; + goto fail; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); - rbio->generic_bio_cnt = 1; - /* * don't plug on full rbios, just get them out the door * as quickly as we can */ if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); - if (ret) - goto out_dec_counter; + if (ret) { + __free_raid_bio(rbio); + goto fail; + } return; } @@ -1843,14 +1838,15 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) list_add_tail(&rbio->plug_list, &plug->rbio_list); } else { ret = __raid56_parity_write(rbio); - if (ret) - goto out_dec_counter; + if (ret) { + __free_raid_bio(rbio); + goto fail; + } } return; -out_dec_counter: - btrfs_bio_counter_dec(fs_info); +fail: bio->bi_status = errno_to_blk_status(ret); bio_endio(bio); } @@ -2198,18 +2194,11 @@ cleanup: * of the drive. */ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - int mirror_num, bool generic_io) + int mirror_num) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - if (generic_io) { - ASSERT(bioc->mirror_num == mirror_num); - btrfs_bio(bio)->mirror_num = mirror_num; - } else { - btrfs_get_bioc(bioc); - } - rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); @@ -2225,14 +2214,11 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bioc->map_type); - kfree(rbio); + __free_raid_bio(rbio); bio->bi_status = BLK_STS_IOERR; goto out_end_bio; } - if (generic_io) - rbio->generic_bio_cnt = 1; - /* * Loop retry: * for 'mirror == 2', reconstruct from all other stripes. @@ -2261,8 +2247,6 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, return; out_end_bio: - btrfs_bio_counter_dec(fs_info); - btrfs_put_bioc(bioc); bio_endio(bio); } @@ -2326,13 +2310,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, ASSERT(i < rbio->real_stripes); bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); - - /* - * We have already increased bio_counter when getting bioc, record it - * so we can free it at rbio_orig_end_io(). - */ - rbio->generic_bio_cnt = 1; - return rbio; } @@ -2767,17 +2744,13 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { - BUG(); - kfree(rbio); + btrfs_warn_rl(fs_info, + "can not determine the failed stripe number for full stripe %llu", + bioc->raid_map[0]); + __free_raid_bio(rbio); return NULL; } - /* - * When we get bioc, we have already increased bio_counter, record it - * so we can free it at rbio_orig_end_io() - */ - rbio->generic_bio_cnt = 1; - return rbio; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 6f48f9e4c869..91d5c0adad15 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -89,8 +89,6 @@ struct btrfs_raid_bio { */ int bio_list_bytes; - int generic_bio_cnt; - refcount_t refs; atomic_t stripes_pending; @@ -166,7 +164,7 @@ static inline int nr_data_stripes(const struct map_lookup *map) struct btrfs_device; void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - int mirror_num, bool generic_io); + int mirror_num); void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9acf47b11fe6..f50586ff85c8 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -92,7 +92,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, NULL); + NULL); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -615,8 +615,8 @@ out: static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, struct inode *inode2, u64 loff2, u64 len) { - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL); } static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, @@ -634,8 +634,8 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, swap(range1_end, range2_end); } - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end); + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL); btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 45c02aba2492..666a37a0ee89 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1124,10 +1124,10 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (!ret) continue; - btrfs_drop_extent_cache(BTRFS_I(inode), - key.offset, end, 1); + btrfs_drop_extent_map_range(BTRFS_I(inode), + key.offset, end, true); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end); + key.offset, end, NULL); } } @@ -1566,9 +1566,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end); - btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); } return 0; } @@ -2869,13 +2869,13 @@ static noinline_for_stack int prealloc_file_extent_cluster( else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end); + lock_extent(&inode->io_tree, start, end, NULL); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end); + unlock_extent(&inode->io_tree, start, end, NULL); if (ret) break; } @@ -2890,7 +2890,6 @@ static noinline_for_stack int prealloc_file_extent_cluster( static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, u64 start, u64 end, u64 block_start) { - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; int ret = 0; @@ -2904,18 +2903,11 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->block_start = block_start; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end); - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); - } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + free_extent_map(em); + return ret; } @@ -3006,7 +2998,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, goto release_page; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); + lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, clamped_end, 0, NULL); if (ret) { @@ -3039,7 +3031,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, boundary_start, boundary_end, EXTENT_BOUNDARY); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); + unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; @@ -4339,7 +4331,7 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + inode->index_cnt; csum_root = btrfs_csum_root(fs_info, disk_bytenr); ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0); + disk_bytenr + len - 1, &list, 0, false); if (ret) goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index d647cb2938c0..e1f599d7a916 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -337,7 +337,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, struct extent_buffer *leaf; struct btrfs_key key; unsigned long ptr; - int err = 0; int ret; path = btrfs_alloc_path(); @@ -350,7 +349,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) { - err = ret; goto out; } else if (ret == 0) { leaf = path->nodes[0]; @@ -360,18 +358,18 @@ again: if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || (btrfs_root_ref_name_len(leaf, ref) != name_len) || memcmp_extent_buffer(leaf, name, ptr, name_len)) { - err = -ENOENT; + ret = -ENOENT; goto out; } *sequence = btrfs_root_ref_sequence(leaf, ref); ret = btrfs_del_item(trans, tree_root, path); - if (ret) { - err = ret; + if (ret) goto out; - } - } else - err = -ENOENT; + } else { + ret = -ENOENT; + goto out; + } if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); @@ -383,7 +381,7 @@ again: out: btrfs_free_path(path); - return err; + return ret; } /* diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3afe5fa50a63..196c4c6ed1ed 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -54,6 +54,8 @@ struct scrub_ctx; */ #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) +#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) + struct scrub_recover { refcount_t refs; struct btrfs_io_context *bioc; @@ -62,16 +64,12 @@ struct scrub_recover { struct scrub_sector { struct scrub_block *sblock; - struct page *page; - struct btrfs_device *dev; struct list_head list; u64 flags; /* extent flags */ u64 generation; - u64 logical; - u64 physical; - u64 physical_for_dev_replace; + /* Offset in bytes to @sblock. */ + u32 offset; atomic_t refs; - u8 mirror_num; unsigned int have_csum:1; unsigned int io_error:1; u8 csum[BTRFS_CSUM_SIZE]; @@ -94,8 +92,22 @@ struct scrub_bio { }; struct scrub_block { + /* + * Each page will have its page::private used to record the logical + * bytenr. + */ + struct page *pages[SCRUB_MAX_PAGES]; struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK]; + struct btrfs_device *dev; + /* Logical bytenr of the sblock */ + u64 logical; + u64 physical; + u64 physical_for_dev_replace; + /* Length of sblock in bytes */ + u32 len; int sector_count; + int mirror_num; + atomic_t outstanding_sectors; refcount_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; @@ -202,8 +214,174 @@ struct full_stripe_lock { struct mutex mutex; }; +#ifndef CONFIG_64BIT +/* This structure is for archtectures whose (void *) is smaller than u64 */ +struct scrub_page_private { + u64 logical; +}; +#endif + +static int attach_scrub_page_private(struct page *page, u64 logical) +{ +#ifdef CONFIG_64BIT + attach_page_private(page, (void *)logical); + return 0; +#else + struct scrub_page_private *spp; + + spp = kmalloc(sizeof(*spp), GFP_KERNEL); + if (!spp) + return -ENOMEM; + spp->logical = logical; + attach_page_private(page, (void *)spp); + return 0; +#endif +} + +static void detach_scrub_page_private(struct page *page) +{ +#ifdef CONFIG_64BIT + detach_page_private(page); + return; +#else + struct scrub_page_private *spp; + + spp = detach_page_private(page); + kfree(spp); + return; +#endif +} + +static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx, + struct btrfs_device *dev, + u64 logical, u64 physical, + u64 physical_for_dev_replace, + int mirror_num) +{ + struct scrub_block *sblock; + + sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); + if (!sblock) + return NULL; + refcount_set(&sblock->refs, 1); + sblock->sctx = sctx; + sblock->logical = logical; + sblock->physical = physical; + sblock->physical_for_dev_replace = physical_for_dev_replace; + sblock->dev = dev; + sblock->mirror_num = mirror_num; + sblock->no_io_error_seen = 1; + /* + * Scrub_block::pages will be allocated at alloc_scrub_sector() when + * the corresponding page is not allocated. + */ + return sblock; +} + +/* + * Allocate a new scrub sector and attach it to @sblock. + * + * Will also allocate new pages for @sblock if needed. + */ +static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, + u64 logical, gfp_t gfp) +{ + const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT; + struct scrub_sector *ssector; + + /* We must never have scrub_block exceed U32_MAX in size. */ + ASSERT(logical - sblock->logical < U32_MAX); + + ssector = kzalloc(sizeof(*ssector), gfp); + if (!ssector) + return NULL; + + /* Allocate a new page if the slot is not allocated */ + if (!sblock->pages[page_index]) { + int ret; + + sblock->pages[page_index] = alloc_page(gfp); + if (!sblock->pages[page_index]) { + kfree(ssector); + return NULL; + } + ret = attach_scrub_page_private(sblock->pages[page_index], + sblock->logical + (page_index << PAGE_SHIFT)); + if (ret < 0) { + kfree(ssector); + __free_page(sblock->pages[page_index]); + sblock->pages[page_index] = NULL; + return NULL; + } + } + + atomic_set(&ssector->refs, 1); + ssector->sblock = sblock; + /* The sector to be added should not be used */ + ASSERT(sblock->sectors[sblock->sector_count] == NULL); + ssector->offset = logical - sblock->logical; + + /* The sector count must be smaller than the limit */ + ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK); + + sblock->sectors[sblock->sector_count] = ssector; + sblock->sector_count++; + sblock->len += sblock->sctx->fs_info->sectorsize; + + return ssector; +} + +static struct page *scrub_sector_get_page(struct scrub_sector *ssector) +{ + struct scrub_block *sblock = ssector->sblock; + pgoff_t index; + /* + * When calling this function, ssector must be alreaday attached to the + * parent sblock. + */ + ASSERT(sblock); + + /* The range should be inside the sblock range */ + ASSERT(ssector->offset < sblock->len); + + index = ssector->offset >> PAGE_SHIFT; + ASSERT(index < SCRUB_MAX_PAGES); + ASSERT(sblock->pages[index]); + ASSERT(PagePrivate(sblock->pages[index])); + return sblock->pages[index]; +} + +static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector) +{ + struct scrub_block *sblock = ssector->sblock; + + /* + * When calling this function, ssector must be already attached to the + * parent sblock. + */ + ASSERT(sblock); + + /* The range should be inside the sblock range */ + ASSERT(ssector->offset < sblock->len); + + return offset_in_page(ssector->offset); +} + +static char *scrub_sector_get_kaddr(struct scrub_sector *ssector) +{ + return page_address(scrub_sector_get_page(ssector)) + + scrub_sector_get_page_offset(ssector); +} + +static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector, + unsigned int len) +{ + return bio_add_page(bio, scrub_sector_get_page(ssector), len, + scrub_sector_get_page_offset(ssector)); +} + static int scrub_setup_recheck_block(struct scrub_block *original_sblock, - struct scrub_block *sblocks_for_recheck); + struct scrub_block *sblocks_for_recheck[]); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int retry_failed_mirror); @@ -533,10 +711,8 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; - for (i = 0; i < sbio->sector_count; i++) { - WARN_ON(!sbio->sectors[i]->page); + for (i = 0; i < sbio->sector_count; i++) scrub_block_put(sbio->sectors[i]->sblock); - } bio_put(sbio->bio); } @@ -726,15 +902,22 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) int ret; WARN_ON(sblock->sector_count < 1); - dev = sblock->sectors[0]->dev; + dev = sblock->dev; fs_info = sblock->sctx->fs_info; + /* Super block error, no need to search extent tree. */ + if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", + errstr, rcu_str_deref(dev->name), + sblock->physical); + return; + } path = btrfs_alloc_path(); if (!path) return; - swarn.physical = sblock->sectors[0]->physical; - swarn.logical = sblock->sectors[0]->logical; + swarn.physical = sblock->physical; + swarn.logical = sblock->logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -804,13 +987,14 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) { struct scrub_ctx *sctx = sblock_to_check->sctx; - struct btrfs_device *dev; + struct btrfs_device *dev = sblock_to_check->dev; struct btrfs_fs_info *fs_info; u64 logical; unsigned int failed_mirror_index; unsigned int is_metadata; unsigned int have_csum; - struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ + /* One scrub_block for each mirror */ + struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 }; struct scrub_block *sblock_bad; int ret; int mirror_index; @@ -825,22 +1009,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) fs_info = sctx->fs_info; if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { /* - * if we find an error in a super block, we just report it. + * If we find an error in a super block, we just report it. * They will get written with the next transaction commit * anyway */ + scrub_print_warning("super block error", sblock_to_check); spin_lock(&sctx->stat_lock); ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); return 0; } - logical = sblock_to_check->sectors[0]->logical; - BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1); - failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1; + logical = sblock_to_check->logical; + ASSERT(sblock_to_check->mirror_num); + failed_mirror_index = sblock_to_check->mirror_num - 1; is_metadata = !(sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA); have_csum = sblock_to_check->sectors[0]->have_csum; - dev = sblock_to_check->sectors[0]->dev; if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) return 0; @@ -902,17 +1087,28 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * repaired area is verified in order to correctly maintain * the statistics. */ - - sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, - sizeof(*sblocks_for_recheck), GFP_KERNEL); - if (!sblocks_for_recheck) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - goto out; + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { + /* + * Note: the two members refs and outstanding_sectors are not + * used in the blocks that are used for the recheck procedure. + * + * But alloc_scrub_block() will initialize sblock::ref anyway, + * so we can use scrub_block_put() to clean them up. + * + * And here we don't setup the physical/dev for the sblock yet, + * they will be correctly initialized in scrub_setup_recheck_block(). + */ + sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL, + logical, 0, 0, mirror_index); + if (!sblocks_for_recheck[mirror_index]) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + goto out; + } } /* Setup the context, map the logical blocks and alloc the sectors */ @@ -926,7 +1122,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto out; } BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); - sblock_bad = sblocks_for_recheck + failed_mirror_index; + sblock_bad = sblocks_for_recheck[failed_mirror_index]; /* build and submit the bios for the failed mirror, check checksums */ scrub_recheck_block(fs_info, sblock_bad, 1); @@ -1011,22 +1207,22 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { if (mirror_index >= BTRFS_MAX_MIRRORS) break; - if (!sblocks_for_recheck[mirror_index].sector_count) + if (!sblocks_for_recheck[mirror_index]->sector_count) break; - sblock_other = sblocks_for_recheck + mirror_index; + sblock_other = sblocks_for_recheck[mirror_index]; } else { struct scrub_recover *r = sblock_bad->sectors[0]->recover; int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; if (mirror_index >= max_allowed) break; - if (!sblocks_for_recheck[1].sector_count) + if (!sblocks_for_recheck[1]->sector_count) break; ASSERT(failed_mirror_index == 0); - sblock_other = sblocks_for_recheck + 1; - sblock_other->sectors[0]->mirror_num = 1 + mirror_index; + sblock_other = sblocks_for_recheck[1]; + sblock_other->mirror_num = 1 + mirror_index; } /* build and submit the bios, check checksums */ @@ -1097,12 +1293,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* Try to find no-io-error sector in mirrors */ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].sector_count > 0; + sblocks_for_recheck[mirror_index]->sector_count > 0; mirror_index++) { - if (!sblocks_for_recheck[mirror_index]. + if (!sblocks_for_recheck[mirror_index]-> sectors[sector_num]->io_error) { - sblock_other = sblocks_for_recheck + - mirror_index; + sblock_other = sblocks_for_recheck[mirror_index]; break; } } @@ -1176,25 +1371,28 @@ did_not_correct_error: } out: - if (sblocks_for_recheck) { - for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; - mirror_index++) { - struct scrub_block *sblock = sblocks_for_recheck + - mirror_index; - struct scrub_recover *recover; - int i; - - for (i = 0; i < sblock->sector_count; i++) { - sblock->sectors[i]->sblock = NULL; - recover = sblock->sectors[i]->recover; - if (recover) { - scrub_put_recover(fs_info, recover); - sblock->sectors[i]->recover = NULL; - } - scrub_sector_put(sblock->sectors[i]); + for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { + struct scrub_block *sblock = sblocks_for_recheck[mirror_index]; + struct scrub_recover *recover; + int sector_index; + + /* Not allocated, continue checking the next mirror */ + if (!sblock) + continue; + + for (sector_index = 0; sector_index < sblock->sector_count; + sector_index++) { + /* + * Here we just cleanup the recover, each sector will be + * properly cleaned up by later scrub_block_put() + */ + recover = sblock->sectors[sector_index]->recover; + if (recover) { + scrub_put_recover(fs_info, recover); + sblock->sectors[sector_index]->recover = NULL; } } - kfree(sblocks_for_recheck); + scrub_block_put(sblock); } ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); @@ -1244,12 +1442,12 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, } static int scrub_setup_recheck_block(struct scrub_block *original_sblock, - struct scrub_block *sblocks_for_recheck) + struct scrub_block *sblocks_for_recheck[]) { struct scrub_ctx *sctx = original_sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; + u64 logical = original_sblock->logical; u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; - u64 logical = original_sblock->sectors[0]->logical; u64 generation = original_sblock->sectors[0]->generation; u64 flags = original_sblock->sectors[0]->flags; u64 have_csum = original_sblock->sectors[0]->have_csum; @@ -1264,11 +1462,6 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, int nmirrors; int ret; - /* - * Note: the two members refs and outstanding_sectors are not used (and - * not set) in the blocks that are used for the recheck procedure. - */ - while (length > 0) { sublen = min_t(u64, length, fs_info->sectorsize); mapped_length = sublen; @@ -1307,24 +1500,19 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblock; struct scrub_sector *sector; - sblock = sblocks_for_recheck + mirror_index; + sblock = sblocks_for_recheck[mirror_index]; sblock->sctx = sctx; - sector = kzalloc(sizeof(*sector), GFP_NOFS); + sector = alloc_scrub_sector(sblock, logical, GFP_NOFS); if (!sector) { -leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); scrub_put_recover(fs_info, recover); return -ENOMEM; } - scrub_sector_get(sector); - sblock->sectors[sector_index] = sector; - sector->sblock = sblock; sector->flags = flags; sector->generation = generation; - sector->logical = logical; sector->have_csum = have_csum; if (have_csum) memcpy(sector->csum, @@ -1339,21 +1527,20 @@ leave_nomem: mirror_index, &stripe_index, &stripe_offset); - sector->physical = bioc->stripes[stripe_index].physical + - stripe_offset; - sector->dev = bioc->stripes[stripe_index].dev; + /* + * We're at the first sector, also populate @sblock + * physical and dev. + */ + if (sector_index == 0) { + sblock->physical = + bioc->stripes[stripe_index].physical + + stripe_offset; + sblock->dev = bioc->stripes[stripe_index].dev; + sblock->physical_for_dev_replace = + original_sblock->physical_for_dev_replace; + } BUG_ON(sector_index >= original_sblock->sector_count); - sector->physical_for_dev_replace = - original_sblock->sectors[sector_index]-> - physical_for_dev_replace; - /* For missing devices, dev->bdev is NULL */ - sector->mirror_num = mirror_index + 1; - sblock->sector_count++; - sector->page = alloc_page(GFP_NOFS); - if (!sector->page) - goto leave_nomem; - scrub_get_recover(recover); sector->recover = recover; } @@ -1377,11 +1564,11 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, { DECLARE_COMPLETION_ONSTACK(done); - bio->bi_iter.bi_sector = sector->logical >> 9; + bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >> + SECTOR_SHIFT; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - raid56_parity_recover(bio, sector->recover->bioc, - sector->sblock->sectors[0]->mirror_num, false); + raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num); wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); @@ -1395,17 +1582,16 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, int i; /* All sectors in sblock belong to the same stripe on the same device. */ - ASSERT(first_sector->dev); - if (!first_sector->dev->bdev) + ASSERT(sblock->dev); + if (!sblock->dev->bdev) goto out; - bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); + bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); for (i = 0; i < sblock->sector_count; i++) { struct scrub_sector *sector = sblock->sectors[i]; - WARN_ON(!sector->page); - bio_add_page(bio, sector->page, PAGE_SIZE, 0); + bio_add_scrub_sector(bio, sector, fs_info->sectorsize); } if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { @@ -1449,16 +1635,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct bio bio; struct bio_vec bvec; - if (sector->dev->bdev == NULL) { + if (sblock->dev->bdev == NULL) { sector->io_error = 1; sblock->no_io_error_seen = 0; continue; } - WARN_ON(!sector->page); - bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ); - bio_add_page(&bio, sector->page, fs_info->sectorsize, 0); - bio.bi_iter.bi_sector = sector->physical >> 9; + bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ); + bio_add_scrub_sector(&bio, sector, fs_info->sectorsize); + bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >> + SECTOR_SHIFT; btrfsic_check_bio(&bio); if (submit_bio_wait(&bio)) { @@ -1475,7 +1661,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) { - struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices; + struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices; int ret; ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -1521,30 +1707,29 @@ static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; const u32 sectorsize = fs_info->sectorsize; - BUG_ON(sector_bad->page == NULL); - BUG_ON(sector_good->page == NULL); if (force_write || sblock_bad->header_error || sblock_bad->checksum_error || sector_bad->io_error) { struct bio bio; struct bio_vec bvec; int ret; - if (!sector_bad->dev->bdev) { + if (!sblock_bad->dev->bdev) { btrfs_warn_rl(fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); return -EIO; } - bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); - bio.bi_iter.bi_sector = sector_bad->physical >> 9; - __bio_add_page(&bio, sector_good->page, sectorsize, 0); + bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); + bio.bi_iter.bi_sector = (sblock_bad->physical + + sector_bad->offset) >> SECTOR_SHIFT; + ret = bio_add_scrub_sector(&bio, sector_good, sectorsize); btrfsic_check_bio(&bio); ret = submit_bio_wait(&bio); bio_uninit(&bio); if (ret) { - btrfs_dev_stat_inc_and_print(sector_bad->dev, + btrfs_dev_stat_inc_and_print(sblock_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); atomic64_inc(&fs_info->dev_replace.num_write_errors); return -EIO; @@ -1577,11 +1762,11 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) { + const u32 sectorsize = sblock->sctx->fs_info->sectorsize; struct scrub_sector *sector = sblock->sectors[sector_num]; - BUG_ON(sector->page == NULL); if (sector->io_error) - clear_page(page_address(sector->page)); + memset(scrub_sector_get_kaddr(sector), 0, sectorsize); return scrub_add_sector_to_wr_bio(sblock->sctx, sector); } @@ -1608,9 +1793,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } +static void scrub_block_get(struct scrub_block *sblock) +{ + refcount_inc(&sblock->refs); +} + static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, struct scrub_sector *sector) { + struct scrub_block *sblock = sector->sblock; struct scrub_bio *sbio; int ret; const u32 sectorsize = sctx->fs_info->sectorsize; @@ -1629,14 +1820,15 @@ again: } sbio = sctx->wr_curr_bio; if (sbio->sector_count == 0) { - ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace); + ret = fill_writer_pointer_gap(sctx, sector->offset + + sblock->physical_for_dev_replace); if (ret) { mutex_unlock(&sctx->wr_lock); return ret; } - sbio->physical = sector->physical_for_dev_replace; - sbio->logical = sector->logical; + sbio->physical = sblock->physical_for_dev_replace + sector->offset; + sbio->logical = sblock->logical + sector->offset; sbio->dev = sctx->wr_tgtdev; if (!sbio->bio) { sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, @@ -1647,14 +1839,14 @@ again: sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; } else if (sbio->physical + sbio->sector_count * sectorsize != - sector->physical_for_dev_replace || + sblock->physical_for_dev_replace + sector->offset || sbio->logical + sbio->sector_count * sectorsize != - sector->logical) { + sblock->logical + sector->offset) { scrub_wr_submit(sctx); goto again; } - ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); + ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); if (ret != sectorsize) { if (sbio->sector_count < 1) { bio_put(sbio->bio); @@ -1668,6 +1860,13 @@ again: sbio->sectors[sbio->sector_count] = sector; scrub_sector_get(sector); + /* + * Since ssector no longer holds a page, but uses sblock::pages, we + * have to ensure the sblock had not been freed before our write bio + * finished. + */ + scrub_block_get(sector->sblock); + sbio->sector_count++; if (sbio->sector_count == sctx->sectors_per_bio) scrub_wr_submit(sctx); @@ -1729,8 +1928,14 @@ static void scrub_wr_bio_end_io_worker(struct work_struct *work) } } - for (i = 0; i < sbio->sector_count; i++) + /* + * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in + * endio we should put the sblock. + */ + for (i = 0; i < sbio->sector_count; i++) { + scrub_block_put(sbio->sectors[i]->sblock); scrub_sector_put(sbio->sectors[i]); + } bio_put(sbio->bio); kfree(sbio); @@ -1762,7 +1967,7 @@ static int scrub_checksum(struct scrub_block *sblock) else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) ret = scrub_checksum_tree_block(sblock); else if (flags & BTRFS_EXTENT_FLAG_SUPER) - (void)scrub_checksum_super(sblock); + ret = scrub_checksum_super(sblock); else WARN_ON(1); if (ret) @@ -1785,15 +1990,11 @@ static int scrub_checksum_data(struct scrub_block *sblock) if (!sector->have_csum) return 0; - kaddr = page_address(sector->page); + kaddr = scrub_sector_get_kaddr(sector); shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - /* - * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector - * only contains one sector of data. - */ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); if (memcmp(csum, sector->csum, fs_info->csum_size)) @@ -1826,7 +2027,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) ASSERT(sblock->sector_count == num_sectors); sector = sblock->sectors[0]; - kaddr = page_address(sector->page); + kaddr = scrub_sector_get_kaddr(sector); h = (struct btrfs_header *)kaddr; memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); @@ -1835,7 +2036,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (sector->logical != btrfs_stack_header_bytenr(h)) + if (sblock->logical != btrfs_stack_header_bytenr(h)) sblock->header_error = 1; if (sector->generation != btrfs_stack_header_generation(h)) { @@ -1856,7 +2057,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) sectorsize - BTRFS_CSUM_SIZE); for (i = 1; i < num_sectors; i++) { - kaddr = page_address(sblock->sectors[i]->page); + kaddr = scrub_sector_get_kaddr(sblock->sectors[i]); crypto_shash_update(shash, kaddr, sectorsize); } @@ -1881,10 +2082,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) BUG_ON(sblock->sector_count < 1); sector = sblock->sectors[0]; - kaddr = page_address(sector->page); + kaddr = scrub_sector_get_kaddr(sector); s = (struct btrfs_super_block *)kaddr; - if (sector->logical != btrfs_super_bytenr(s)) + if (sblock->logical != btrfs_super_bytenr(s)) ++fail_cor; if (sector->generation != btrfs_super_generation(s)) @@ -1901,31 +2102,9 @@ static int scrub_checksum_super(struct scrub_block *sblock) if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size)) ++fail_cor; - if (fail_cor + fail_gen) { - /* - * if we find an error in a super block, we just report it. - * They will get written with the next transaction commit - * anyway - */ - spin_lock(&sctx->stat_lock); - ++sctx->stat.super_errors; - spin_unlock(&sctx->stat_lock); - if (fail_cor) - btrfs_dev_stat_inc_and_print(sector->dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - else - btrfs_dev_stat_inc_and_print(sector->dev, - BTRFS_DEV_STAT_GENERATION_ERRS); - } - return fail_cor + fail_gen; } -static void scrub_block_get(struct scrub_block *sblock) -{ - refcount_inc(&sblock->refs); -} - static void scrub_block_put(struct scrub_block *sblock) { if (refcount_dec_and_test(&sblock->refs)) { @@ -1936,6 +2115,12 @@ static void scrub_block_put(struct scrub_block *sblock) for (i = 0; i < sblock->sector_count; i++) scrub_sector_put(sblock->sectors[i]); + for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) { + if (sblock->pages[i]) { + detach_scrub_page_private(sblock->pages[i]); + __free_page(sblock->pages[i]); + } + } kfree(sblock); } } @@ -1947,11 +2132,8 @@ static void scrub_sector_get(struct scrub_sector *sector) static void scrub_sector_put(struct scrub_sector *sector) { - if (atomic_dec_and_test(§or->refs)) { - if (sector->page) - __free_page(sector->page); + if (atomic_dec_and_test(§or->refs)) kfree(sector); - } } /* @@ -2056,9 +2238,9 @@ again: } sbio = sctx->bios[sctx->curr]; if (sbio->sector_count == 0) { - sbio->physical = sector->physical; - sbio->logical = sector->logical; - sbio->dev = sector->dev; + sbio->physical = sblock->physical + sector->offset; + sbio->logical = sblock->logical + sector->offset; + sbio->dev = sblock->dev; if (!sbio->bio) { sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, REQ_OP_READ, GFP_NOFS); @@ -2068,16 +2250,16 @@ again: sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; } else if (sbio->physical + sbio->sector_count * sectorsize != - sector->physical || + sblock->physical + sector->offset || sbio->logical + sbio->sector_count * sectorsize != - sector->logical || - sbio->dev != sector->dev) { + sblock->logical + sector->offset || + sbio->dev != sblock->dev) { scrub_submit(sctx); goto again; } sbio->sectors[sbio->sector_count] = sector; - ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); + ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); if (ret != sectorsize) { if (sbio->sector_count < 1) { bio_put(sbio->bio); @@ -2102,6 +2284,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio) struct scrub_block *sblock = bio->bi_private; struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; + btrfs_bio_counter_dec(fs_info); if (bio->bi_status) sblock->no_io_error_seen = 0; @@ -2118,8 +2301,8 @@ static void scrub_missing_raid56_worker(struct work_struct *work) u64 logical; struct btrfs_device *dev; - logical = sblock->sectors[0]->logical; - dev = sblock->sectors[0]->dev; + logical = sblock->logical; + dev = sblock->dev; if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); @@ -2157,7 +2340,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) struct scrub_ctx *sctx = sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; u64 length = sblock->sector_count << fs_info->sectorsize_bits; - u64 logical = sblock->sectors[0]->logical; + u64 logical = sblock->logical; struct btrfs_io_context *bioc = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; @@ -2193,17 +2376,16 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) for (i = 0; i < sblock->sector_count; i++) { struct scrub_sector *sector = sblock->sectors[i]; - /* - * For now, our scrub is still one page per sector, so pgoff - * is always 0. - */ - raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical); + raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector), + scrub_sector_get_page_offset(sector), + sector->offset + sector->sblock->logical); } INIT_WORK(&sblock->work, scrub_missing_raid56_worker); scrub_block_get(sblock); scrub_pending_bio_inc(sctx); raid56_submit_missing_rbio(rbio); + btrfs_put_bioc(bioc); return; rbio_out: @@ -2225,7 +2407,8 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, const u32 sectorsize = sctx->fs_info->sectorsize; int index; - sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); + sblock = alloc_scrub_block(sctx, dev, logical, physical, + physical_for_dev_replace, mirror_num); if (!sblock) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2233,12 +2416,6 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, return -ENOMEM; } - /* one ref inside this function, plus one for each page added to - * a bio later on */ - refcount_set(&sblock->refs, 1); - sblock->sctx = sctx; - sblock->no_io_error_seen = 1; - for (index = 0; len > 0; index++) { struct scrub_sector *sector; /* @@ -2248,36 +2425,22 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, */ u32 l = min(sectorsize, len); - sector = kzalloc(sizeof(*sector), GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); if (!sector) { -leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); - scrub_sector_get(sector); - sblock->sectors[index] = sector; - sector->sblock = sblock; - sector->dev = dev; sector->flags = flags; sector->generation = gen; - sector->logical = logical; - sector->physical = physical; - sector->physical_for_dev_replace = physical_for_dev_replace; - sector->mirror_num = mirror_num; if (csum) { sector->have_csum = 1; memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { sector->have_csum = 0; } - sblock->sector_count++; - sector->page = alloc_page(GFP_KERNEL); - if (!sector->page) - goto leave_nomem; len -= l; logical += l; physical += l; @@ -2423,8 +2586,9 @@ static void scrub_block_complete(struct scrub_block *sblock) } if (sblock->sparity && corrupted && !sblock->data_corrected) { - u64 start = sblock->sectors[0]->logical; - u64 end = sblock->sectors[sblock->sector_count - 1]->logical + + u64 start = sblock->logical; + u64 end = sblock->logical + + sblock->sectors[sblock->sector_count - 1]->offset + sblock->sctx->fs_info->sectorsize; ASSERT(end - start <= U32_MAX); @@ -2578,7 +2742,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity, ASSERT(IS_ALIGNED(len, sectorsize)); - sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); + sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num); if (!sblock) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2586,51 +2750,32 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity, return -ENOMEM; } - /* one ref inside this function, plus one for each page added to - * a bio later on */ - refcount_set(&sblock->refs, 1); - sblock->sctx = sctx; - sblock->no_io_error_seen = 1; sblock->sparity = sparity; scrub_parity_get(sparity); for (index = 0; len > 0; index++) { struct scrub_sector *sector; - sector = kzalloc(sizeof(*sector), GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); if (!sector) { -leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); - /* For scrub block */ - scrub_sector_get(sector); sblock->sectors[index] = sector; /* For scrub parity */ scrub_sector_get(sector); list_add_tail(§or->list, &sparity->sectors_list); - sector->sblock = sblock; - sector->dev = dev; sector->flags = flags; sector->generation = gen; - sector->logical = logical; - sector->physical = physical; - sector->mirror_num = mirror_num; if (csum) { sector->have_csum = 1; memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { sector->have_csum = 0; } - sblock->sector_count++; - sector->page = alloc_page(GFP_KERNEL); - if (!sector->page) - goto leave_nomem; - /* Iterate over the stripe range in sectorsize steps */ len -= sectorsize; @@ -2774,6 +2919,7 @@ static void scrub_parity_bio_endio_worker(struct work_struct *work) work); struct scrub_ctx *sctx = sparity->sctx; + btrfs_bio_counter_dec(sctx->fs_info); scrub_free_parity(sparity); scrub_pending_bio_dec(sctx); } @@ -2824,6 +2970,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) sparity->scrub_dev, &sparity->dbitmap, sparity->nsectors); + btrfs_put_bioc(bioc); if (!rbio) goto rbio_out; @@ -2835,7 +2982,6 @@ rbio_out: bio_put(bio); bioc_out: btrfs_bio_counter_dec(fs_info); - btrfs_put_bioc(bioc); bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); @@ -3077,7 +3223,7 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, ret = btrfs_lookup_csums_range(csum_root, extent_start, extent_start + extent_size - 1, - &sctx->csum_list, 1); + &sctx->csum_list, 1, false); if (ret) { scrub_parity_mark_sectors_error(sparity, extent_start, extent_size); @@ -3266,7 +3412,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, } /* Block group removed? */ spin_lock(&bg->lock); - if (bg->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { spin_unlock(&bg->lock); ret = 0; break; @@ -3303,7 +3449,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { ret = btrfs_lookup_csums_range(csum_root, cur_logical, cur_logical + scrub_len - 1, - &sctx->csum_list, 1); + &sctx->csum_list, 1, false); if (ret) break; } @@ -3606,7 +3752,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, * kthread or relocation. */ spin_lock(&bg->lock); - if (!bg->removed) + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) ret = -EINVAL; spin_unlock(&bg->lock); @@ -3764,13 +3910,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, } if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { - spin_lock(&cache->lock); - if (!cache->to_copy) { - spin_unlock(&cache->lock); + if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) { btrfs_put_block_group(cache); goto skip; } - spin_unlock(&cache->lock); } /* @@ -3782,7 +3925,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * repair extents. */ spin_lock(&cache->lock); - if (cache->removed) { + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { spin_unlock(&cache->lock); btrfs_put_block_group(cache); goto skip; @@ -3942,8 +4085,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * balance is triggered or it becomes used and unused again. */ spin_lock(&cache->lock); - if (!cache->removed && !cache->ro && cache->reserved == 0 && - cache->used == 0) { + if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) && + !cache->ro && cache->reserved == 0 && cache->used == 0) { spin_unlock(&cache->lock); if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_discard_queue_work(&fs_info->discard_ctl, @@ -4102,36 +4245,21 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, int ret; struct btrfs_device *dev; unsigned int nofs_flag; + bool need_commit = false; if (btrfs_fs_closing(fs_info)) return -EAGAIN; - if (fs_info->nodesize > BTRFS_STRIPE_LEN) { - /* - * in this case scrub is unable to calculate the checksum - * the way scrub is implemented. Do not handle this - * situation at all because it won't ever happen. - */ - btrfs_err(fs_info, - "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", - fs_info->nodesize, - BTRFS_STRIPE_LEN); - return -EINVAL; - } + /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */ + ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN); - if (fs_info->nodesize > - SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits || - fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) { - /* - * Would exhaust the array bounds of sectorv member in - * struct scrub_block - */ - btrfs_err(fs_info, -"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails", - fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK, - fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK); - return -EINVAL; - } + /* + * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible + * value (max nodesize / min sectorsize), thus nodesize should always + * be fine. + */ + ASSERT(fs_info->nodesize <= + SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits); /* Allocate outside of device_list_mutex */ sctx = scrub_setup_ctx(fs_info, is_dev_replace); @@ -4205,6 +4333,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, */ nofs_flag = memalloc_nofs_save(); if (!is_dev_replace) { + u64 old_super_errors; + + spin_lock(&sctx->stat_lock); + old_super_errors = sctx->stat.super_errors; + spin_unlock(&sctx->stat_lock); + btrfs_info(fs_info, "scrub: started on devid %llu", devid); /* * by holding device list mutex, we can @@ -4213,6 +4347,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->fs_devices->device_list_mutex); ret = scrub_supers(sctx, dev); mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + spin_lock(&sctx->stat_lock); + /* + * Super block errors found, but we can not commit transaction + * at current context, since btrfs_commit_transaction() needs + * to pause the current running scrub (hold by ourselves). + */ + if (sctx->stat.super_errors > old_super_errors && !sctx->readonly) + need_commit = true; + spin_unlock(&sctx->stat_lock); } if (!ret) @@ -4239,6 +4383,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); scrub_put_ctx(sctx); + /* + * We found some super block errors before, now try to force a + * transaction commit, as scrub has finished. + */ + if (need_commit) { + struct btrfs_trans_handle *trans; + + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_err(fs_info, + "scrub: failed to start transaction to fix super block errors: %d", ret); + return ret; + } + ret = btrfs_commit_transaction(trans); + if (ret < 0) + btrfs_err(fs_info, + "scrub: failed to commit transaction to fix super block errors: %d", ret); + } return ret; out: scrub_workers_put(fs_info); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e7671afcee4f..1c4b693ee4a3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -15,6 +15,7 @@ #include <linux/string.h> #include <linux/compat.h> #include <linux/crc32c.h> +#include <linux/fsverity.h> #include "send.h" #include "ctree.h" @@ -127,6 +128,8 @@ struct send_ctx { bool cur_inode_new_gen; bool cur_inode_deleted; bool ignore_cur_inode; + bool cur_inode_needs_verity; + void *verity_descriptor; u64 send_progress; @@ -345,6 +348,7 @@ static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) switch (sctx->proto) { case 1: return cmd <= BTRFS_SEND_C_MAX_V1; case 2: return cmd <= BTRFS_SEND_C_MAX_V2; + case 3: return cmd <= BTRFS_SEND_C_MAX_V3; default: return false; } } @@ -624,6 +628,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(8) TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) @@ -842,17 +847,32 @@ out: return ret; } +struct btrfs_inode_info { + u64 size; + u64 gen; + u64 mode; + u64 uid; + u64 gid; + u64 rdev; + u64 fileattr; + u64 nlink; +}; + /* * Helper function to retrieve some fields from an inode item. */ -static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, - u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, - u64 *gid, u64 *rdev, u64 *fileattr) +static int get_inode_info(struct btrfs_root *root, u64 ino, + struct btrfs_inode_info *info) { int ret; + struct btrfs_path *path; struct btrfs_inode_item *ii; struct btrfs_key key; + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; @@ -860,47 +880,43 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, if (ret) { if (ret > 0) ret = -ENOENT; - return ret; + goto out; } + if (!info) + goto out; + ii = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - if (size) - *size = btrfs_inode_size(path->nodes[0], ii); - if (gen) - *gen = btrfs_inode_generation(path->nodes[0], ii); - if (mode) - *mode = btrfs_inode_mode(path->nodes[0], ii); - if (uid) - *uid = btrfs_inode_uid(path->nodes[0], ii); - if (gid) - *gid = btrfs_inode_gid(path->nodes[0], ii); - if (rdev) - *rdev = btrfs_inode_rdev(path->nodes[0], ii); + info->size = btrfs_inode_size(path->nodes[0], ii); + info->gen = btrfs_inode_generation(path->nodes[0], ii); + info->mode = btrfs_inode_mode(path->nodes[0], ii); + info->uid = btrfs_inode_uid(path->nodes[0], ii); + info->gid = btrfs_inode_gid(path->nodes[0], ii); + info->rdev = btrfs_inode_rdev(path->nodes[0], ii); + info->nlink = btrfs_inode_nlink(path->nodes[0], ii); /* * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's * otherwise logically split to 32/32 parts. */ - if (fileattr) - *fileattr = btrfs_inode_flags(path->nodes[0], ii); + info->fileattr = btrfs_inode_flags(path->nodes[0], ii); +out: + btrfs_free_path(path); return ret; } -static int get_inode_info(struct btrfs_root *root, - u64 ino, u64 *size, u64 *gen, - u64 *mode, u64 *uid, u64 *gid, - u64 *rdev, u64 *fileattr) +static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) { - struct btrfs_path *path; int ret; + struct btrfs_inode_info info; - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, - rdev, fileattr); - btrfs_free_path(path); + if (!gen) + return -EPERM; + + ret = get_inode_info(root, ino, &info); + if (!ret) + *gen = info.gen; return ret; } @@ -1643,21 +1659,22 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) int right_ret; u64 left_gen; u64 right_gen; + struct btrfs_inode_info info; - ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, - NULL, NULL, NULL); + ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0 && ret != -ENOENT) goto out; - left_ret = ret; + left_ret = (info.nlink == 0) ? -ENOENT : ret; + left_gen = info.gen; if (!sctx->parent_root) { right_ret = -ENOENT; } else { - ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, - NULL, NULL, NULL, NULL, NULL); + ret = get_inode_info(sctx->parent_root, ino, &info); if (ret < 0 && ret != -ENOENT) goto out; - right_ret = ret; + right_ret = (info.nlink == 0) ? -ENOENT : ret; + right_gen = info.gen; } if (!left_ret && !right_ret) { @@ -1816,8 +1833,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, btrfs_release_path(path); if (dir_gen) { - ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(root, parent_dir, dir_gen); if (ret < 0) goto out; } @@ -1874,6 +1890,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, int ret = 0; u64 gen; u64 other_inode = 0; + struct btrfs_inode_info info; if (!sctx->parent_root) goto out; @@ -1888,8 +1905,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, * and we can just unlink this entry. */ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, dir, &gen); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1916,13 +1932,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, */ if (other_inode > sctx->send_progress || is_waiting_for_move(sctx, other_inode)) { - ret = get_inode_info(sctx->parent_root, other_inode, NULL, - who_gen, who_mode, NULL, NULL, NULL, NULL); + ret = get_inode_info(sctx->parent_root, other_inode, &info); if (ret < 0) goto out; ret = 1; *who_ino = other_inode; + *who_gen = info.gen; + *who_mode = info.mode; } else { ret = 0; } @@ -1955,8 +1972,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; if (dir != BTRFS_FIRST_FREE_OBJECTID) { - ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, dir, &gen); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1978,8 +1994,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; } - ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, - NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, ow_inode, &gen); if (ret < 0) goto out; @@ -2645,6 +2660,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) int ret = 0; struct fs_path *p; int cmd; + struct btrfs_inode_info info; u64 gen; u64 mode; u64 rdev; @@ -2656,10 +2672,12 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) return -ENOMEM; if (ino != sctx->cur_ino) { - ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, - NULL, NULL, &rdev, NULL); + ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0) goto out; + gen = info.gen; + mode = info.mode; + rdev = info.rdev; } else { gen = sctx->cur_inode_gen; mode = sctx->cur_inode_mode; @@ -3359,8 +3377,7 @@ finish: /* * The parent inode might have been deleted in the send snapshot */ - ret = get_inode_info(sctx->send_root, cur->dir, NULL, - NULL, NULL, NULL, NULL, NULL, NULL); + ret = get_inode_info(sctx->send_root, cur->dir, NULL); if (ret == -ENOENT) { ret = 0; continue; @@ -3534,12 +3551,10 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, - &left_gen, NULL, NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); if (ret < 0) goto out; - ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, - &right_gen, NULL, NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); if (ret < 0) { if (ret == -ENOENT) ret = 0; @@ -3669,8 +3684,7 @@ static int is_ancestor(struct btrfs_root *root, cur_offset = item_size; } - ret = get_inode_info(root, parent, NULL, &parent_gen, - NULL, NULL, NULL, NULL, NULL); + ret = get_inode_gen(root, parent, &parent_gen); if (ret < 0) goto out; ret = check_ino_in_path(root, ino1, ino1_gen, @@ -3760,9 +3774,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, memcmp(path_before->start, path_after->start, len1))) { u64 parent_ino_gen; - ret = get_inode_info(sctx->parent_root, ino, NULL, - &parent_ino_gen, NULL, NULL, NULL, - NULL, NULL); + ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen); if (ret < 0) goto out; if (ino_gen == parent_ino_gen) { @@ -4441,8 +4453,7 @@ static int record_new_ref_if_needed(int num, u64 dir, int index, struct recorded_ref *ref; u64 dir_gen; - ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, dir, &dir_gen); if (ret < 0) goto out; @@ -4472,8 +4483,7 @@ static int record_deleted_ref_if_needed(int num, u64 dir, int index, struct recorded_ref *ref; u64 dir_gen; - ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); if (ret < 0) goto out; @@ -4886,6 +4896,84 @@ static int process_all_new_xattrs(struct send_ctx *sctx) return ret; } +static int send_verity(struct send_ctx *sctx, struct fs_path *path, + struct fsverity_descriptor *desc) +{ + int ret; + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, + le8_to_cpu(desc->hash_algorithm)); + TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE, + 1U << le8_to_cpu(desc->log_blocksize)); + TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt, + le8_to_cpu(desc->salt_size)); + TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature, + le32_to_cpu(desc->sig_size)); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + return ret; +} + +static int process_verity(struct send_ctx *sctx) +{ + int ret = 0; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + struct inode *inode; + struct fs_path *p; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ret = btrfs_get_verity_descriptor(inode, NULL, 0); + if (ret < 0) + goto iput; + + if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) { + ret = -EMSGSIZE; + goto iput; + } + if (!sctx->verity_descriptor) { + sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE, + GFP_KERNEL); + if (!sctx->verity_descriptor) { + ret = -ENOMEM; + goto iput; + } + } + + ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); + if (ret < 0) + goto iput; + + p = fs_path_alloc(); + if (!p) { + ret = -ENOMEM; + goto iput; + } + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto free_path; + + ret = send_verity(sctx, p, sctx->verity_descriptor); + if (ret < 0) + goto free_path; + +free_path: + fs_path_free(p); +iput: + iput(inode); + return ret; +} + static inline u64 max_send_read_size(const struct send_ctx *sctx) { return sctx->send_max_size - SZ_16K; @@ -5056,8 +5144,7 @@ static int send_clone(struct send_ctx *sctx, TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); if (clone_root->root == sctx->send_root) { - ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, - &gen, NULL, NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); @@ -5536,6 +5623,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, struct btrfs_path *path; struct btrfs_key key; int ret; + struct btrfs_inode_info info; u64 clone_src_i_size = 0; /* @@ -5565,12 +5653,11 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ - ret = __get_inode_info(clone_root->root, path, clone_root->ino, - &clone_src_i_size, NULL, NULL, NULL, NULL, NULL, - NULL); + ret = get_inode_info(clone_root->root, clone_root->ino, &info); btrfs_release_path(path); if (ret < 0) goto out; + clone_src_i_size = info.size; /* * We can't send a clone operation for the entire range if we find @@ -5615,6 +5702,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, u64 ext_len; u64 clone_len; u64 clone_data_offset; + bool crossed_src_i_size = false; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); @@ -5672,8 +5760,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, if (key.offset >= clone_src_i_size) break; - if (key.offset + ext_len > clone_src_i_size) + if (key.offset + ext_len > clone_src_i_size) { ext_len = clone_src_i_size - key.offset; + crossed_src_i_size = true; + } clone_data_offset = btrfs_file_extent_offset(leaf, ei); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { @@ -5734,6 +5824,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_clone(sctx, offset, clone_len, clone_root); } + } else if (crossed_src_i_size && clone_len < len) { + /* + * If we are at i_size of the clone source inode and we + * can not clone from it, terminate the loop. This is + * to avoid sending two write operations, one with a + * length matching clone_len and the final one after + * this loop with a length of len - clone_len. + * + * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED + * was passed to the send ioctl), this helps avoid + * sending an encoded write for an offset that is not + * sector size aligned, in case the i_size of the source + * inode is not sector size aligned. That will make the + * receiver fallback to decompression of the data and + * writing it using regular buffered IO, therefore while + * not incorrect, it's not optimal due decompression and + * possible re-compression at the receiver. + */ + break; } else { ret = send_extent_data(sctx, dst_path, offset, clone_len); @@ -6259,6 +6368,7 @@ out: static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) { int ret = 0; + struct btrfs_inode_info info; u64 left_mode; u64 left_uid; u64 left_gid; @@ -6301,11 +6411,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) goto out; - - ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, - &left_mode, &left_uid, &left_gid, NULL, &left_fileattr); + ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info); if (ret < 0) goto out; + left_mode = info.mode; + left_uid = info.uid; + left_gid = info.gid; + left_fileattr = info.fileattr; if (!sctx->parent_root || sctx->cur_inode_new) { need_chown = 1; @@ -6316,11 +6428,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) } else { u64 old_size; - ret = get_inode_info(sctx->parent_root, sctx->cur_ino, - &old_size, NULL, &right_mode, &right_uid, - &right_gid, NULL, &right_fileattr); + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info); if (ret < 0) goto out; + old_size = info.size; + right_mode = info.mode; + right_uid = info.uid; + right_gid = info.gid; + right_fileattr = info.fileattr; if (left_uid != right_uid || left_gid != right_gid) need_chown = 1; @@ -6378,6 +6493,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; } + if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY) + && sctx->cur_inode_needs_verity) { + ret = process_verity(sctx); + if (ret < 0) + goto out; + } + ret = send_capabilities(sctx); if (ret < 0) goto out; @@ -6407,86 +6529,6 @@ out: return ret; } -struct parent_paths_ctx { - struct list_head *refs; - struct send_ctx *sctx; -}; - -static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name, - void *ctx) -{ - struct parent_paths_ctx *ppctx = ctx; - - /* - * Pass 0 as the generation for the directory, we don't care about it - * here as we have no new references to add, we just want to delete all - * references for an inode. - */ - return record_ref_in_tree(&ppctx->sctx->rbtree_deleted_refs, ppctx->refs, - name, dir, 0, ppctx->sctx); -} - -/* - * Issue unlink operations for all paths of the current inode found in the - * parent snapshot. - */ -static int btrfs_unlink_all_paths(struct send_ctx *sctx) -{ - LIST_HEAD(deleted_refs); - struct btrfs_path *path; - struct btrfs_root *root = sctx->parent_root; - struct btrfs_key key; - struct btrfs_key found_key; - struct parent_paths_ctx ctx; - int iter_ret = 0; - int ret; - - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - - key.objectid = sctx->cur_ino; - key.type = BTRFS_INODE_REF_KEY; - key.offset = 0; - - ctx.refs = &deleted_refs; - ctx.sctx = sctx; - - btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { - if (found_key.objectid != key.objectid) - break; - if (found_key.type != key.type && - found_key.type != BTRFS_INODE_EXTREF_KEY) - break; - - ret = iterate_inode_ref(root, path, &found_key, 1, - record_parent_ref, &ctx); - if (ret < 0) - goto out; - } - /* Catch error found during iteration */ - if (iter_ret < 0) { - ret = iter_ret; - goto out; - } - - while (!list_empty(&deleted_refs)) { - struct recorded_ref *ref; - - ref = list_first_entry(&deleted_refs, struct recorded_ref, list); - ret = send_unlink(sctx, ref->full_path); - if (ret < 0) - goto out; - recorded_ref_free(ref); - } - ret = 0; -out: - btrfs_free_path(path); - if (ret) - __free_recorded_refs(&deleted_refs); - return ret; -} - static void close_current_inode(struct send_ctx *sctx) { u64 i_size; @@ -6577,25 +6619,36 @@ static int changed_inode(struct send_ctx *sctx, * file descriptor against it or turning a RO snapshot into RW mode, * keep an open file descriptor against a file, delete it and then * turn the snapshot back to RO mode before using it for a send - * operation. So if we find such cases, ignore the inode and all its - * items completely if it's a new inode, or if it's a changed inode - * make sure all its previous paths (from the parent snapshot) are all - * unlinked and all other the inode items are ignored. + * operation. The former is what the receiver operation does. + * Therefore, if we want to send these snapshots soon after they're + * received, we need to handle orphan inodes as well. Moreover, orphans + * can appear not only in the send snapshot but also in the parent + * snapshot. Here are several cases: + * + * Case 1: BTRFS_COMPARE_TREE_NEW + * | send snapshot | action + * -------------------------------- + * nlink | 0 | ignore + * + * Case 2: BTRFS_COMPARE_TREE_DELETED + * | parent snapshot | action + * ---------------------------------- + * nlink | 0 | as usual + * Note: No unlinks will be sent because there're no paths for it. + * + * Case 3: BTRFS_COMPARE_TREE_CHANGED + * | | parent snapshot | send snapshot | action + * ----------------------------------------------------------------------- + * subcase 1 | nlink | 0 | 0 | ignore + * subcase 2 | nlink | >0 | 0 | new_gen(deletion) + * subcase 3 | nlink | 0 | >0 | new_gen(creation) + * */ - if (result == BTRFS_COMPARE_TREE_NEW || - result == BTRFS_COMPARE_TREE_CHANGED) { - u32 nlinks; - - nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); - if (nlinks == 0) { + if (result == BTRFS_COMPARE_TREE_NEW) { + if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) { sctx->ignore_cur_inode = true; - if (result == BTRFS_COMPARE_TREE_CHANGED) - ret = btrfs_unlink_all_paths(sctx); goto out; } - } - - if (result == BTRFS_COMPARE_TREE_NEW) { sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = true; sctx->cur_inode_deleted = false; @@ -6616,6 +6669,16 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_mode = btrfs_inode_mode( sctx->right_path->nodes[0], right_ii); } else if (result == BTRFS_COMPARE_TREE_CHANGED) { + u32 new_nlinks, old_nlinks; + + new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); + old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii); + if (new_nlinks == 0 && old_nlinks == 0) { + sctx->ignore_cur_inode = true; + goto out; + } else if (new_nlinks == 0 || old_nlinks == 0) { + sctx->cur_inode_new_gen = 1; + } /* * We need to do some special handling in case the inode was * reported as changed with a changed generation number. This @@ -6627,53 +6690,61 @@ static int changed_inode(struct send_ctx *sctx, /* * First, process the inode as if it was deleted. */ - sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = false; - sctx->cur_inode_deleted = true; - sctx->cur_inode_size = btrfs_inode_size( - sctx->right_path->nodes[0], right_ii); - sctx->cur_inode_mode = btrfs_inode_mode( - sctx->right_path->nodes[0], right_ii); - ret = process_all_refs(sctx, - BTRFS_COMPARE_TREE_DELETED); - if (ret < 0) - goto out; + if (old_nlinks > 0) { + sctx->cur_inode_gen = right_gen; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; + sctx->cur_inode_size = btrfs_inode_size( + sctx->right_path->nodes[0], right_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->right_path->nodes[0], right_ii); + ret = process_all_refs(sctx, + BTRFS_COMPARE_TREE_DELETED); + if (ret < 0) + goto out; + } /* * Now process the inode as if it was new. */ - sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = true; - sctx->cur_inode_deleted = false; - sctx->cur_inode_size = btrfs_inode_size( - sctx->left_path->nodes[0], left_ii); - sctx->cur_inode_mode = btrfs_inode_mode( - sctx->left_path->nodes[0], left_ii); - sctx->cur_inode_rdev = btrfs_inode_rdev( - sctx->left_path->nodes[0], left_ii); - ret = send_create_inode_if_needed(sctx); - if (ret < 0) - goto out; + if (new_nlinks > 0) { + sctx->cur_inode_gen = left_gen; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; + sctx->cur_inode_size = btrfs_inode_size( + sctx->left_path->nodes[0], + left_ii); + sctx->cur_inode_mode = btrfs_inode_mode( + sctx->left_path->nodes[0], + left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], + left_ii); + ret = send_create_inode_if_needed(sctx); + if (ret < 0) + goto out; - ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); - if (ret < 0) - goto out; - /* - * Advance send_progress now as we did not get into - * process_recorded_refs_if_needed in the new_gen case. - */ - sctx->send_progress = sctx->cur_ino + 1; + ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); + if (ret < 0) + goto out; + /* + * Advance send_progress now as we did not get + * into process_recorded_refs_if_needed in the + * new_gen case. + */ + sctx->send_progress = sctx->cur_ino + 1; - /* - * Now process all extents and xattrs of the inode as if - * they were all new. - */ - ret = process_all_extents(sctx); - if (ret < 0) - goto out; - ret = process_all_new_xattrs(sctx); - if (ret < 0) - goto out; + /* + * Now process all extents and xattrs of the + * inode as if they were all new. + */ + ret = process_all_extents(sctx); + if (ret < 0) + goto out; + ret = process_all_new_xattrs(sctx); + if (ret < 0) + goto out; + } } else { sctx->cur_inode_gen = left_gen; sctx->cur_inode_new = false; @@ -6785,18 +6856,27 @@ static int changed_extent(struct send_ctx *sctx, return ret; } +static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) +{ + int ret = 0; + + if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { + if (result == BTRFS_COMPARE_TREE_NEW) + sctx->cur_inode_needs_verity = true; + } + return ret; +} + static int dir_changed(struct send_ctx *sctx, u64 dir) { u64 orig_gen, new_gen; int ret; - ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, - NULL, NULL, NULL); + ret = get_inode_gen(sctx->send_root, dir, &new_gen); if (ret) return ret; - ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, - NULL, NULL, NULL, NULL); + ret = get_inode_gen(sctx->parent_root, dir, &orig_gen); if (ret) return ret; @@ -6939,6 +7019,9 @@ static int changed_cb(struct btrfs_path *left_path, ret = changed_xattr(sctx, result); else if (key->type == BTRFS_EXTENT_DATA_KEY) ret = changed_extent(sctx, result); + else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY && + key->offset == 0) + ret = changed_verity(sctx, result); } out: @@ -8036,6 +8119,7 @@ out: kvfree(sctx->clone_roots); kfree(sctx->send_buf_pages); kvfree(sctx->send_buf); + kvfree(sctx->verity_descriptor); name_cache_free(sctx); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 4bb4e6a638cb..f7585cfa7e52 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -10,7 +10,12 @@ #include <linux/types.h> #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" +/* Conditional support for the upcoming protocol version. */ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_SEND_STREAM_VERSION 3 +#else #define BTRFS_SEND_STREAM_VERSION 2 +#endif /* * In send stream v1, no command is larger than 64K. In send stream v2, no limit @@ -92,8 +97,11 @@ enum btrfs_send_cmd { BTRFS_SEND_C_ENCODED_WRITE = 25, BTRFS_SEND_C_MAX_V2 = 25, + /* Version 3 */ + BTRFS_SEND_C_ENABLE_VERITY = 26, + BTRFS_SEND_C_MAX_V3 = 26, /* End */ - BTRFS_SEND_C_MAX = 25, + BTRFS_SEND_C_MAX = 26, }; /* attributes in send stream */ @@ -160,8 +168,14 @@ enum { BTRFS_SEND_A_ENCRYPTION = 31, BTRFS_SEND_A_MAX_V2 = 31, - /* End */ - BTRFS_SEND_A_MAX = 31, + /* Version 3 */ + BTRFS_SEND_A_VERITY_ALGORITHM = 32, + BTRFS_SEND_A_VERITY_BLOCK_SIZE = 33, + BTRFS_SEND_A_VERITY_SALT_DATA = 34, + BTRFS_SEND_A_VERITY_SIG_DATA = 35, + BTRFS_SEND_A_MAX_V3 = 35, + + __BTRFS_SEND_A_MAX = 35, }; long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 435559ba94fa..f171bf875633 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -293,32 +293,36 @@ out: return ret; } -void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, u64 bytes_zone_unusable, - bool active, struct btrfs_space_info **space_info) +void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group) { struct btrfs_space_info *found; - int factor; + int factor, index; - factor = btrfs_bg_type_to_factor(flags); + factor = btrfs_bg_type_to_factor(block_group->flags); - found = btrfs_find_space_info(info, flags); + found = btrfs_find_space_info(info, block_group->flags); ASSERT(found); spin_lock(&found->lock); - found->total_bytes += total_bytes; - if (active) - found->active_total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - found->bytes_zone_unusable += bytes_zone_unusable; - if (total_bytes > 0) + found->total_bytes += block_group->length; + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) + found->active_total_bytes += block_group->length; + found->disk_total += block_group->length * factor; + found->bytes_used += block_group->used; + found->disk_used += block_group->used * factor; + found->bytes_readonly += block_group->bytes_super; + found->bytes_zone_unusable += block_group->zone_unusable; + if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); spin_unlock(&found->lock); - *space_info = found; + + block_group->space_info = found; + + index = btrfs_bg_flags_to_raid_index(block_group->flags); + down_write(&found->groups_sem); + list_add_tail(&block_group->list, &found->block_groups[index]); + up_write(&found->groups_sem); } struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -472,28 +476,47 @@ do { \ spin_unlock(&__rsv->lock); \ } while (0) +static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) +{ + switch (space_info->flags) { + case BTRFS_BLOCK_GROUP_SYSTEM: + return "SYSTEM"; + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: + return "DATA+METADATA"; + case BTRFS_BLOCK_GROUP_DATA: + return "DATA"; + case BTRFS_BLOCK_GROUP_METADATA: + return "METADATA"; + default: + return "UNKNOWN"; + } +} + +static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); +} + static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info) { + const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ - btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull", - info->flags, + btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", + flag_str, (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", +"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, info->bytes_readonly, info->bytes_zone_unusable); - - DUMP_BLOCK_RSV(fs_info, global_block_rsv); - DUMP_BLOCK_RSV(fs_info, trans_block_rsv); - DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); - } void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, @@ -505,6 +528,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, spin_lock(&info->lock); __btrfs_dump_space_info(fs_info, info); + dump_global_block_rsv(fs_info); spin_unlock(&info->lock); if (!dump_block_groups) @@ -1662,7 +1686,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, &space_info->priority_tickets); } } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - used += orig_bytes; /* * We will do the space reservation dance during log replay, * which means we won't have fs_info->fs_root set, so don't do @@ -1737,7 +1760,8 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, int ret; ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || - flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE); + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || + flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); @@ -1749,3 +1773,17 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, } return ret; } + +/* Dump all the space infos when we abort a transaction due to ENOSPC. */ +__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + btrfs_info(fs_info, "dumping space info:"); + list_for_each_entry(space_info, &fs_info->space_info, list) { + spin_lock(&space_info->lock); + __btrfs_dump_space_info(fs_info, space_info); + spin_unlock(&space_info->lock); + } + dump_global_block_rsv(fs_info); +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 12fd6147f92d..ce66023a9eb8 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -123,10 +123,8 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); -void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, u64 bytes_zone_unusable, - bool active, struct btrfs_space_info **space_info); +void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, + struct btrfs_block_group *block_group); void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -159,4 +157,7 @@ static inline void btrfs_space_info_free_bytes_may_use( } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); +void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); + #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 6fc2b77ae5c3..9a176af847d7 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -337,7 +337,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * * Even with 0 returned, the page still need extra check to make sure * it's really the correct page, as the caller is using - * find_get_pages_contig(), which can race with page invalidating. + * filemap_get_folios_contig(), which can race with page invalidating. */ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f89beac3c665..5942b9384088 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -346,12 +346,14 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno) + unsigned int line, int errno, bool first_hit) { struct btrfs_fs_info *fs_info = trans->fs_info; WRITE_ONCE(trans->aborted, errno); WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); wake_up(&fs_info->transaction_blocked_wait); @@ -626,6 +628,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, int saved_compress_level; bool saved_compress_force; int no_compress = 0; + const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state); if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); @@ -1137,10 +1140,12 @@ out: } if (!ret) ret = btrfs_check_mountopts_zoned(info); - if (!ret && btrfs_test_opt(info, SPACE_CACHE)) - btrfs_info(info, "disk space caching is enabled"); - if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) - btrfs_info(info, "using free space tree"); + if (!ret && !remounting) { + if (btrfs_test_opt(info, SPACE_CACHE)) + btrfs_info(info, "disk space caching is enabled"); + if (btrfs_test_opt(info, FREE_SPACE_TREE)) + btrfs_info(info, "using free space tree"); + } return ret; } @@ -2009,14 +2014,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; - /* V1 cache is not supported for subpage mount. */ - if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { - btrfs_warn(fs_info, - "v1 space cache is not supported for page size %lu with sectorsize %u", - PAGE_SIZE, fs_info->sectorsize); - ret = -EINVAL; + ret = btrfs_check_features(fs_info, sb); + if (ret < 0) goto restore; - } + btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); @@ -2550,11 +2551,87 @@ static int btrfs_freeze(struct super_block *sb) return btrfs_commit_transaction(trans); } +static int check_dev_super(struct btrfs_device *dev) +{ + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_super_block *sb; + u16 csum_type; + int ret = 0; + + /* This should be called with fs still frozen. */ + ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags)); + + /* Missing dev, no need to check. */ + if (!dev->bdev) + return 0; + + /* Only need to check the primary super block. */ + sb = btrfs_read_dev_one_super(dev->bdev, 0, true); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + /* Verify the checksum. */ + csum_type = btrfs_super_csum_type(sb); + if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) { + btrfs_err(fs_info, "csum type changed, has %u expect %u", + csum_type, btrfs_super_csum_type(fs_info->super_copy)); + ret = -EUCLEAN; + goto out; + } + + if (btrfs_check_super_csum(fs_info, sb)) { + btrfs_err(fs_info, "csum for on-disk super block no longer matches"); + ret = -EUCLEAN; + goto out; + } + + /* Btrfs_validate_super() includes fsid check against super->fsid. */ + ret = btrfs_validate_super(fs_info, sb, 0); + if (ret < 0) + goto out; + + if (btrfs_super_generation(sb) != fs_info->last_trans_committed) { + btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", + btrfs_super_generation(sb), + fs_info->last_trans_committed); + ret = -EUCLEAN; + goto out; + } +out: + btrfs_release_disk_super(sb); + return ret; +} + static int btrfs_unfreeze(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_device *device; + int ret = 0; + /* + * Make sure the fs is not changed by accident (like hibernation then + * modified by other OS). + * If we found anything wrong, we mark the fs error immediately. + * + * And since the fs is frozen, no one can modify the fs yet, thus + * we don't need to hold device_list_mutex. + */ + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + ret = check_dev_super(device); + if (ret < 0) { + btrfs_handle_fs_error(fs_info, ret, + "super block on devid %llu got modified unexpectedly", + device->devid); + break; + } + } clear_bit(BTRFS_FS_FROZEN, &fs_info->flags); + + /* + * We still return 0, to allow VFS layer to unfreeze the fs even the + * above checks failed. Since the fs is either fine or read-only, we're + * safe to continue, without causing further damage. + */ return 0; } @@ -2662,17 +2739,21 @@ static int __init init_btrfs_fs(void) if (err) goto free_compress; - err = extent_io_init(); + err = extent_state_init_cachep(); if (err) goto free_cachep; - err = extent_state_cache_init(); + err = extent_buffer_init_cachep(); + if (err) + goto free_extent_cachep; + + err = btrfs_bioset_init(); if (err) - goto free_extent_io; + goto free_eb_cachep; err = extent_map_init(); if (err) - goto free_extent_state_cache; + goto free_bioset; err = ordered_data_init(); if (err) @@ -2724,10 +2805,12 @@ free_ordered_data: ordered_data_exit(); free_extent_map: extent_map_exit(); -free_extent_state_cache: - extent_state_cache_exit(); -free_extent_io: - extent_io_exit(); +free_bioset: + btrfs_bioset_exit(); +free_eb_cachep: + extent_buffer_free_cachep(); +free_extent_cachep: + extent_state_free_cachep(); free_cachep: btrfs_destroy_cachep(); free_compress: @@ -2746,8 +2829,9 @@ static void __exit exit_btrfs_fs(void) btrfs_prelim_ref_exit(); ordered_data_exit(); extent_map_exit(); - extent_state_cache_exit(); - extent_io_exit(); + btrfs_bioset_exit(); + extent_state_free_cachep(); + extent_buffer_free_cachep(); btrfs_interface_exit(); unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index d5d0717fd09a..74fef1f49c35 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -35,12 +35,12 @@ * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid> * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type> * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile> + * discard_attrs /sys/fs/btrfs/<uuid>/discard * * When built with BTRFS_CONFIG_DEBUG: * * btrfs_debug_feature_attrs /sys/fs/btrfs/debug * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug - * discard_debug_attrs /sys/fs/btrfs/<uuid>/debug/discard */ struct btrfs_feature_attr { @@ -286,6 +286,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); +BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); @@ -317,6 +318,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), + BTRFS_FEAT_ATTR_PTR(block_group_tree), #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif @@ -429,12 +431,10 @@ static const struct attribute_group btrfs_static_feature_attr_group = { .attrs = btrfs_supported_static_feature_attrs, }; -#ifdef CONFIG_BTRFS_DEBUG - /* * Discard statistics and tunables */ -#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent) +#define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj)) static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, struct kobj_attribute *a, @@ -583,11 +583,11 @@ BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, btrfs_discard_max_discard_size_store); /* - * Per-filesystem debugging of discard (when mounted with discard=async). + * Per-filesystem stats for discard (when mounted with discard=async). * - * Path: /sys/fs/btrfs/<uuid>/debug/discard/ + * Path: /sys/fs/btrfs/<uuid>/discard/ */ -static const struct attribute *discard_debug_attrs[] = { +static const struct attribute *discard_attrs[] = { BTRFS_ATTR_PTR(discard, discardable_bytes), BTRFS_ATTR_PTR(discard, discardable_extents), BTRFS_ATTR_PTR(discard, discard_bitmap_bytes), @@ -599,6 +599,8 @@ static const struct attribute *discard_debug_attrs[] = { NULL, }; +#ifdef CONFIG_BTRFS_DEBUG + /* * Per-filesystem runtime debugging exported via sysfs. * @@ -837,11 +839,8 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, char *buf) { struct btrfs_space_info *space_info = to_space_info(kobj); - ssize_t ret; - ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); - - return ret; + return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); } static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, @@ -1150,25 +1149,6 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, } BTRFS_ATTR(, generation, btrfs_generation_show); -/* - * Look for an exact string @string in @buffer with possible leading or - * trailing whitespace - */ -static bool strmatch(const char *buffer, const char *string) -{ - const size_t len = strlen(string); - - /* Skip leading whitespace */ - buffer = skip_spaces(buffer); - - /* Match entire string, check if the rest is whitespace or empty */ - if (strncmp(string, buffer, len) == 0 && - strlen(skip_spaces(buffer + len)) == 0) - return true; - - return false; -} - static const char * const btrfs_read_policy_name[] = { "pid" }; static ssize_t btrfs_read_policy_show(struct kobject *kobj, @@ -1202,7 +1182,7 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (strmatch(buf, btrfs_read_policy_name[i])) { + if (sysfs_streq(buf, btrfs_read_policy_name[i])) { if (i != fs_devices->read_policy) { fs_devices->read_policy = i; btrfs_info(fs_devices->fs_info, @@ -1222,11 +1202,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - ssize_t ret; - - ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); - return ret; + return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); } static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, @@ -1427,13 +1404,12 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } -#ifdef CONFIG_BTRFS_DEBUG - if (fs_info->discard_debug_kobj) { - sysfs_remove_files(fs_info->discard_debug_kobj, - discard_debug_attrs); - kobject_del(fs_info->discard_debug_kobj); - kobject_put(fs_info->discard_debug_kobj); + if (fs_info->discard_kobj) { + sysfs_remove_files(fs_info->discard_kobj, discard_attrs); + kobject_del(fs_info->discard_kobj); + kobject_put(fs_info->discard_kobj); } +#ifdef CONFIG_BTRFS_DEBUG if (fs_info->debug_kobj) { sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); kobject_del(fs_info->debug_kobj); @@ -2001,20 +1977,18 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); if (error) goto failure; +#endif /* Discard directory */ - fs_info->discard_debug_kobj = kobject_create_and_add("discard", - fs_info->debug_kobj); - if (!fs_info->discard_debug_kobj) { + fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj); + if (!fs_info->discard_kobj) { error = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->discard_debug_kobj, - discard_debug_attrs); + error = sysfs_create_files(fs_info->discard_kobj, discard_attrs); if (error) goto failure; -#endif error = addrm_unknown_feature_attrs(fs_info, true); if (error) @@ -2041,6 +2015,98 @@ failure: return error; } +static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + bool enabled; + + spin_lock(&fs_info->qgroup_lock); + enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON; + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", enabled); +} +BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); + +static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + bool inconsistent; + + spin_lock(&fs_info->qgroup_lock); + inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT); + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", inconsistent); +} +BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show); + +static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + u8 result; + + spin_lock(&fs_info->qgroup_lock); + result = fs_info->qgroup_drop_subtree_thres; + spin_unlock(&fs_info->qgroup_lock); + + return sysfs_emit(buf, "%d\n", result); +} + +static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + u8 new_thres; + int ret; + + ret = kstrtou8(buf, 10, &new_thres); + if (ret) + return -EINVAL; + + if (new_thres > BTRFS_MAX_LEVEL) + return -EINVAL; + + spin_lock(&fs_info->qgroup_lock); + fs_info->qgroup_drop_subtree_thres = new_thres; + spin_unlock(&fs_info->qgroup_lock); + + return len; +} +BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show, + qgroup_drop_subtree_thres_store); + +/* + * Qgroups global info + * + * Path: /sys/fs/btrfs/<uuid>/qgroups/ + */ +static struct attribute *qgroups_attrs[] = { + BTRFS_ATTR_PTR(qgroups, enabled), + BTRFS_ATTR_PTR(qgroups, inconsistent), + BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), + NULL +}; +ATTRIBUTE_GROUPS(qgroups); + +static void qgroups_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static struct kobj_type qgroups_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = qgroups_groups, + .release = qgroups_release, +}; + static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj) { return to_fs_info(kobj->parent->parent); @@ -2166,11 +2232,15 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) if (fs_info->qgroups_kobj) return 0; - fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj); - if (!fs_info->qgroups_kobj) { - ret = -ENOMEM; + fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!fs_info->qgroups_kobj) + return -ENOMEM; + + ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype, + fsid_kobj, "qgroups"); + if (ret < 0) goto out; - } + rbtree_postorder_for_each_entry_safe(qgroup, next, &fs_info->qgroup_tree, node) { ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); @@ -2251,8 +2321,11 @@ int __init btrfs_init_sysfs(void) #ifdef CONFIG_BTRFS_DEBUG ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); - if (ret) - goto out2; + if (ret) { + sysfs_unmerge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + goto out_remove_group; + } #endif return 0; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index cc9377cf56a3..d43cb5242fec 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -200,7 +200,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) void btrfs_free_dummy_root(struct btrfs_root *root) { - if (!root) + if (IS_ERR_OR_NULL(root)) return; /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) @@ -243,7 +243,7 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) { if (!cache) return; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); kfree(cache->free_space_ctl); kfree(cache); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index a232b15b8021..350da449db08 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -4,6 +4,7 @@ */ #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/sizes.h> @@ -20,39 +21,40 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) { int ret; - struct page *pages[16]; + struct folio_batch fbatch; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - unsigned long nr_pages = end_index - index + 1; int i; int count = 0; int loops = 0; - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, nr_pages, - ARRAY_SIZE(pages)), pages); + folio_batch_init(&fbatch); + + while (index <= end_index) { + ret = filemap_get_folios_contig(inode->i_mapping, &index, + end_index, &fbatch); for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (flags & PROCESS_TEST_LOCKED && - !PageLocked(pages[i])) + !folio_test_locked(folio)) count++; - if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) - unlock_page(pages[i]); - put_page(pages[i]); + if (flags & PROCESS_UNLOCK && folio_test_locked(folio)) + folio_unlock(folio); if (flags & PROCESS_RELEASE) - put_page(pages[i]); + folio_put(folio); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); cond_resched(); loops++; if (loops > 100000) { printk(KERN_ERR - "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n", - start, end, nr_pages, ret); + "stuck in a loop, start %llu, end %llu, ret %d\n", + start, end, ret); break; } } + return count; } @@ -80,7 +82,6 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest) PRINT_ONE_FLAG(state, dest, cur, NODATASUM); PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV); PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT); - PRINT_ONE_FLAG(state, dest, cur, DAMAGED); PRINT_ONE_FLAG(state, dest, cur, NORESERVE); PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED); PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV); @@ -172,7 +173,7 @@ static int test_find_delalloc(u32 sectorsize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(tmp, start, end); + unlock_extent(tmp, start, end, NULL); unlock_page(locked_page); put_page(locked_page); @@ -208,7 +209,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(tmp, start, end); + unlock_extent(tmp, start, end, NULL); /* locked_page was unlocked above */ put_page(locked_page); @@ -263,7 +264,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("pages in range were not all locked"); goto out_bits; } - unlock_extent(tmp, start, end); + unlock_extent(tmp, start, end, NULL); /* * Now to test where we run into a page that is no longer dirty in the diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 5930cdcae5cb..ebf68fcd2149 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -82,7 +82,7 @@ static int test_extents(struct btrfs_block_group *cache) } /* Cleanup */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -149,7 +149,7 @@ static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize) return -1; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -230,7 +230,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, return -1; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); /* Now with the extent entry offset into the bitmap */ ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1); @@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, * [ bitmap ] * [ del ] */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1); if (ret) { test_err("couldn't add bitmap %d", ret); @@ -291,7 +291,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, return -1; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); /* * This blew up before, we have part of the free space in a bitmap and @@ -317,7 +317,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache, return ret; } - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -629,7 +629,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, if (ret) return ret; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); /* * Now test a similar scenario, but where our extent entry is located @@ -819,7 +819,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, return ret; cache->free_space_ctl->op = orig_free_space_ops; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } @@ -868,7 +868,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) } /* Now validate bitmaps do the correct thing. */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); for (i = 0; i < 2; i++) { offset = i * BITS_PER_BITMAP * sectorsize; bytes = (i + 1) * SZ_1M; @@ -891,7 +891,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) } /* Now validate bitmaps with different ->max_extent_size. */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); orig_free_space_ops = cache->free_space_ctl->op; cache->free_space_ctl->op = &test_free_space_ops; @@ -998,7 +998,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize) } cache->free_space_ctl->op = orig_free_space_ops; - __btrfs_remove_free_space_cache(cache->free_space_ctl); + btrfs_remove_free_space_cache(cache); return 0; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index cac89c388131..625f7d398368 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -267,7 +267,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } free_extent_map(em); - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* * All of the magic numbers are based on the mapping setup in @@ -975,7 +975,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1043,7 +1043,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1076,7 +1076,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1092,7 +1092,7 @@ out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, 0, 0, NULL); + EXTENT_UPTODATE, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index eee1e4459541..63676ea19f29 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -225,20 +225,20 @@ static int test_no_shared_qgroup(struct btrfs_root *root, */ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -250,29 +250,31 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } + /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */ + old_roots = NULL; + new_roots = NULL; + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { test_err("qgroup counts didn't match expected values"); return -EINVAL; } - old_roots = NULL; - new_roots = NULL; ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = remove_extent_item(root, nodesize, nodesize); - if (ret) + if (ret) { + ulist_free(old_roots); return -EINVAL; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -322,20 +324,20 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -355,20 +357,20 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = add_tree_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -394,20 +396,20 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = remove_extent_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0bec10740ad3..d1f1da6820fb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -161,7 +161,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; - struct btrfs_caching_control *caching_ctl, *next; /* * At this point no one can be using this transaction to modify any tree @@ -196,46 +195,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) } spin_unlock(&cur_trans->dropped_roots_lock); - /* - * We have to update the last_byte_to_unpin under the commit_root_sem, - * at the same time we swap out the commit roots. - * - * This is because we must have a real view of the last spot the caching - * kthreads were while caching. Consider the following views of the - * extent tree for a block group - * - * commit root - * +----+----+----+----+----+----+----+ - * |\\\\| |\\\\|\\\\| |\\\\|\\\\| - * +----+----+----+----+----+----+----+ - * 0 1 2 3 4 5 6 7 - * - * new commit root - * +----+----+----+----+----+----+----+ - * | | | |\\\\| | |\\\\| - * +----+----+----+----+----+----+----+ - * 0 1 2 3 4 5 6 7 - * - * If the cache_ctl->progress was at 3, then we are only allowed to - * unpin [0,1) and [2,3], because the caching thread has already - * processed those extents. We are not allowed to unpin [5,6), because - * the caching thread will re-start it's search from 3, and thus find - * the hole from [4,6) to add to the free space cache. - */ - write_lock(&fs_info->block_group_cache_lock); - list_for_each_entry_safe(caching_ctl, next, - &fs_info->caching_block_groups, list) { - struct btrfs_block_group *cache = caching_ctl->block_group; - - if (btrfs_block_group_done(cache)) { - cache->last_byte_to_unpin = (u64)-1; - list_del_init(&caching_ctl->list); - btrfs_put_caching_control(caching_ctl); - } else { - cache->last_byte_to_unpin = caching_ctl->progress; - } - } - write_unlock(&fs_info->block_group_cache_lock); up_write(&fs_info->commit_root_sem); } @@ -313,6 +272,8 @@ loop: atomic_inc(&cur_trans->num_writers); extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); return 0; } spin_unlock(&fs_info->trans_lock); @@ -334,16 +295,23 @@ loop: if (!cur_trans) return -ENOMEM; + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); + btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); + spin_lock(&fs_info->trans_lock); if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. Make sure * to redo the checks above */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); goto loop; } else if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); return -EROFS; } @@ -397,7 +365,7 @@ loop: spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); + IO_TREE_TRANS_DIRTY_PAGES, NULL); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, IO_TREE_FS_PINNED_EXTENTS, NULL); fs_info->generation++; @@ -541,6 +509,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) refcount_inc(&cur_trans->use_count); spin_unlock(&fs_info->trans_lock); + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); wait_event(fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || TRANS_ABORTED(cur_trans)); @@ -625,7 +594,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, */ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); if (flush == BTRFS_RESERVE_FLUSH_ALL && - delayed_refs_rsv->full == 0) { + btrfs_block_rsv_full(delayed_refs_rsv) == 0) { delayed_refs_bytes = num_bytes; num_bytes <<= 1; } @@ -650,7 +619,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, if (rsv->space_info->force_alloc) do_chunk_alloc = true; } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && - !delayed_refs_rsv->full) { + !btrfs_block_rsv_full(delayed_refs_rsv)) { /* * Some people call with btrfs_start_transaction(root, 0) * because they can be throttled, but have some other mechanism @@ -859,6 +828,15 @@ static noinline void wait_for_commit(struct btrfs_transaction *commit, u64 transid = commit->transid; bool put = false; + /* + * At the moment this function is called with min_state either being + * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED. + */ + if (min_state == TRANS_STATE_COMPLETED) + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); + else + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + while (1) { wait_event(commit->commit_wait, commit->state >= min_state); if (put) @@ -1022,6 +1000,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, extwriter_counter_dec(cur_trans, trans->type); cond_wake_up(&cur_trans->writer_wait); + + btrfs_lockdep_release(info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(info, btrfs_trans_num_writers); + btrfs_put_transaction(cur_trans); if (current->journal_info == trans) @@ -1134,7 +1116,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * it's safe to do it (through extent_io_tree_release()). */ err = clear_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, 0, 0, &cached_state); + EXTENT_NEED_WAIT, &cached_state); if (err == -ENOMEM) err = 0; if (!err) @@ -1912,14 +1894,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; - - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - root_item = &fs_info->block_group_root->root_item; - - super->block_group_root = root_item->bytenr; - super->block_group_root_generation = root_item->generation; - super->block_group_root_level = root_item->level; - } } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) @@ -1967,6 +1941,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) * Wait for the current transaction commit to start and block * subsequent transaction joins */ + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); @@ -1994,6 +1969,12 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) if (cur_trans == fs_info->running_transaction) { cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); + + /* + * The thread has already released the lockdep map as reader + * already in btrfs_commit_transaction(). + */ + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); @@ -2118,12 +2099,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; - btrfs_end_transaction(trans); - return ret; + goto lockdep_trans_commit_start_release; } btrfs_trans_release_metadata(trans); @@ -2140,10 +2121,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * Any running threads may add more while we are here. */ ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } + if (ret) + goto lockdep_trans_commit_start_release; } btrfs_create_pending_block_groups(trans); @@ -2172,10 +2151,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (run_it) { ret = btrfs_start_dirty_block_groups(trans); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } + if (ret) + goto lockdep_trans_commit_start_release; } } @@ -2190,6 +2167,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; + + btrfs_trans_state_lockdep_release(fs_info, + BTRFS_LOCKDEP_TRANS_COMMIT_START); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans, want_state); @@ -2203,6 +2183,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); if (cur_trans->list.prev != &fs_info->trans_list) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; @@ -2222,7 +2203,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_put_transaction(prev_trans); if (ret) - goto cleanup_transaction; + goto lockdep_release; } else { spin_unlock(&fs_info->trans_lock); } @@ -2236,7 +2217,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ if (BTRFS_FS_ERROR(fs_info)) { ret = -EROFS; - goto cleanup_transaction; + goto lockdep_release; } } @@ -2250,19 +2231,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ret = btrfs_start_delalloc_flush(fs_info); if (ret) - goto cleanup_transaction; + goto lockdep_release; ret = btrfs_run_delayed_items(trans); if (ret) - goto cleanup_transaction; + goto lockdep_release; + /* + * The thread has started/joined the transaction thus it holds the + * lockdep map as a reader. It has to release it before acquiring the + * lockdep map as a writer. + */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters); wait_event(cur_trans->writer_wait, extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ ret = btrfs_run_delayed_items(trans); - if (ret) + if (ret) { + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); goto cleanup_transaction; + } btrfs_wait_delalloc_flush(fs_info); @@ -2271,6 +2261,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * transaction. Otherwise if this transaction commits before the ordered * extents complete we lose logged data after a power failure. */ + btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered); wait_event(cur_trans->pending_wait, atomic_read(&cur_trans->pending_ordered) == 0); @@ -2284,10 +2275,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) add_pending_snapshot(trans); cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); + + /* + * The thread has started/joined the transaction thus it holds the + * lockdep map as a reader. It has to release it before acquiring the + * lockdep map as a writer. + */ + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); /* + * Make lockdep happy by acquiring the state locks after + * btrfs_trans_num_writers is released. If we acquired the state locks + * before releasing the btrfs_trans_num_writers lock then lockdep would + * complain because we did not follow the reverse order unlocking rule. + */ + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); + + /* * We've started the commit, clear the flag in case we were triggered to * do an async commit but somebody else started before the transaction * kthread could do the work. @@ -2296,6 +2305,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); goto scrub_continue; } /* @@ -2430,6 +2440,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->reloc_mutex); wake_up(&fs_info->transaction_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); ret = btrfs_write_and_wait_transaction(trans); if (ret) { @@ -2461,6 +2472,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_SUPER_COMMITTED; wake_up(&cur_trans->commit_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); btrfs_finish_extent_commit(trans); @@ -2474,6 +2486,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); @@ -2502,7 +2515,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) unlock_reloc: mutex_unlock(&fs_info->reloc_mutex); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); scrub_continue: + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); btrfs_scrub_continue(fs_info); cleanup_transaction: btrfs_trans_release_metadata(trans); @@ -2515,6 +2531,16 @@ cleanup_transaction: cleanup_transaction(trans, ret); return ret; + +lockdep_release: + btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); + btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); + goto cleanup_transaction; + +lockdep_trans_commit_start_release: + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_end_transaction(trans); + return ret; } /* diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9205c4a5ca81..c3cf3dabe0b1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -22,6 +22,8 @@ #include "zoned.h" #include "inode-item.h" +#define MAX_CONFLICT_INODES 10 + /* magic values for the inode_only field in btrfs_log_inode: * * LOG_INODE_ALL means to log everything @@ -31,8 +33,6 @@ enum { LOG_INODE_ALL, LOG_INODE_EXISTS, - LOG_OTHER_INODE, - LOG_OTHER_INODE_ALL, }; /* @@ -801,7 +801,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_range(root->log_root, csum_start, csum_end - 1, - &ordered_sums, 0); + &ordered_sums, 0, false); if (ret) goto out; /* @@ -1063,8 +1063,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, - u64 ref_index, char *name, int namelen, - int *search_done) + u64 ref_index, char *name, int namelen) { int ret; char *victim_name; @@ -1126,19 +1125,12 @@ again: kfree(victim_name); if (ret) return ret; - *search_done = 1; goto again; } kfree(victim_name); ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } - - /* - * NOTE: we have searched root tree and checked the - * corresponding ref, it does not need to check again. - */ - *search_done = 1; } btrfs_release_path(path); @@ -1202,14 +1194,12 @@ again: kfree(victim_name); if (ret) return ret; - *search_done = 1; goto again; } kfree(victim_name); next: cur_offset += victim_name_len + sizeof(*extref); } - *search_done = 1; } btrfs_release_path(path); @@ -1373,103 +1363,6 @@ again: return ret; } -static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, - const u8 ref_type, const char *name, - const int namelen) -{ - struct btrfs_key key; - struct btrfs_path *path; - const u64 parent_id = btrfs_ino(BTRFS_I(dir)); - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = ref_type; - if (key.type == BTRFS_INODE_REF_KEY) - key.offset = parent_id; - else - key.offset = btrfs_extref_hash(parent_id, name, namelen); - - ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = 0; - goto out; - } - if (key.type == BTRFS_INODE_EXTREF_KEY) - ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], - path->slots[0], parent_id, name, namelen); - else - ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, namelen); - -out: - btrfs_free_path(path); - return ret; -} - -static int add_link(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, const char *name, - int namelen, u64 ref_index) -{ - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_dir_item *dir_item; - struct btrfs_key key; - struct btrfs_path *path; - struct inode *other_inode = NULL; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - dir_item = btrfs_lookup_dir_item(NULL, root, path, - btrfs_ino(BTRFS_I(dir)), - name, namelen, 0); - if (!dir_item) { - btrfs_release_path(path); - goto add_link; - } else if (IS_ERR(dir_item)) { - ret = PTR_ERR(dir_item); - goto out; - } - - /* - * Our inode's dentry collides with the dentry of another inode which is - * in the log but not yet processed since it has a higher inode number. - * So delete that other dentry. - */ - btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key); - btrfs_release_path(path); - other_inode = read_one_inode(root, key.objectid); - if (!other_inode) { - ret = -ENOENT; - goto out; - } - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode), - name, namelen); - if (ret) - goto out; - /* - * If we dropped the link count to 0, bump it so that later the iput() - * on the inode will not free it. We will fixup the link count later. - */ - if (other_inode->i_nlink == 0) - set_nlink(other_inode, 1); -add_link: - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - name, namelen, 0, ref_index); -out: - iput(other_inode); - btrfs_free_path(path); - - return ret; -} - /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. @@ -1490,7 +1383,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, char *name = NULL; int namelen; int ret; - int search_done = 0; int log_ref_ver = 0; u64 parent_objectid; u64 inode_objectid; @@ -1565,51 +1457,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - - if (!search_done) { - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), - BTRFS_I(inode), - inode_objectid, - parent_objectid, - ref_index, name, namelen, - &search_done); - if (ret) { - if (ret == 1) - ret = 0; - goto out; - } - } - - /* - * If a reference item already exists for this inode - * with the same parent and name, but different index, - * drop it and the corresponding directory index entries - * from the parent before adding the new reference item - * and dir index entries, otherwise we would fail with - * -EEXIST returned from btrfs_add_link() below. - */ - ret = btrfs_inode_ref_exists(inode, dir, key->type, - name, namelen); - if (ret > 0) { - ret = unlink_inode_for_log_replay(trans, - BTRFS_I(dir), - BTRFS_I(inode), - name, namelen); - /* - * If we dropped the link count to 0, bump it so - * that later the iput() on the inode will not - * free it. We will fixup the link count later. - */ - if (!ret && inode->i_nlink == 0) - set_nlink(inode, 1); - } - if (ret < 0) + ret = __add_inode_ref(trans, root, path, log, + BTRFS_I(dir), BTRFS_I(inode), + inode_objectid, parent_objectid, + ref_index, name, namelen); + if (ret) { + if (ret == 1) + ret = 0; goto out; + } /* insert our name */ - ret = add_link(trans, dir, inode, name, namelen, - ref_index); + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + name, namelen, 0, ref_index); if (ret) goto out; @@ -3834,15 +3694,29 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, u64 *last_old_dentry_offset) { struct btrfs_root *log = inode->root->log_root; - struct extent_buffer *src = path->nodes[0]; - const int nritems = btrfs_header_nritems(src); + struct extent_buffer *src; + const int nritems = btrfs_header_nritems(path->nodes[0]); const u64 ino = btrfs_ino(inode); bool last_found = false; int batch_start = 0; int batch_size = 0; int i; - for (i = path->slots[0]; i < nritems; i++) { + /* + * We need to clone the leaf, release the read lock on it, and use the + * clone before modifying the log tree. See the comment at copy_items() + * about why we need to do this. + */ + src = btrfs_clone_extent_buffer(path->nodes[0]); + if (!src) + return -ENOMEM; + + i = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = src; + path->slots[0] = i; + + for (; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -3875,6 +3749,11 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, *last_old_dentry_offset = key.offset; continue; } + + /* If we logged this dir index item before, we can skip it. */ + if (key.offset <= inode->last_dir_index_offset) + continue; + /* * We must make sure that when we log a directory entry, the * corresponding inode, after log replay, has a matching link @@ -3905,51 +3784,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, ctx->log_new_dentries = true; } - if (!ctx->logged_before) - goto add_to_batch; - - /* - * If we were logged before and have logged dir items, we can skip - * checking if any item with a key offset larger than the last one - * we logged is in the log tree, saving time and avoiding adding - * contention on the log tree. We can only rely on the value of - * last_dir_index_offset when we know for sure that the inode was - * previously logged in the current transaction. - */ - if (key.offset > inode->last_dir_index_offset) - goto add_to_batch; - /* - * Check if the key was already logged before. If not we can add - * it to a batch for bulk insertion. - */ - ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); - if (ret < 0) { - return ret; - } else if (ret > 0) { - btrfs_release_path(dst_path); - goto add_to_batch; - } - - /* - * Item exists in the log. Overwrite the item in the log if it - * has different content or do nothing if it has exactly the same - * content. And then flush the current batch if any - do it after - * overwriting the current item, or we would deadlock otherwise, - * since we are holding a path for the existing item. - */ - ret = do_overwrite_item(trans, log, dst_path, src, i, &key); - if (ret < 0) - return ret; - - if (batch_size > 0) { - ret = flush_dir_items_batch(trans, log, src, dst_path, - batch_start, batch_size); - if (ret < 0) - return ret; - batch_size = 0; - } - continue; -add_to_batch: if (batch_size == 0) batch_start = i; batch_size++; @@ -4136,6 +3970,71 @@ done: } /* + * If the inode was logged before and it was evicted, then its + * last_dir_index_offset is (u64)-1, so we don't the value of the last index + * key offset. If that's the case, search for it and update the inode. This + * is to avoid lookups in the log tree every time we try to insert a dir index + * key from a leaf changed in the current transaction, and to allow us to always + * do batch insertions of dir index keys. + */ +static int update_last_dir_index_offset(struct btrfs_inode *inode, + struct btrfs_path *path, + const struct btrfs_log_ctx *ctx) +{ + const u64 ino = btrfs_ino(inode); + struct btrfs_key key; + int ret; + + lockdep_assert_held(&inode->log_mutex); + + if (inode->last_dir_index_offset != (u64)-1) + return 0; + + if (!ctx->logged_before) { + inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; + return 0; + } + + key.objectid = ino; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + /* + * An error happened or we actually have an index key with an offset + * value of (u64)-1. Bail out, we're done. + */ + if (ret <= 0) + goto out; + + ret = 0; + inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; + + /* + * No dir index items, bail out and leave last_dir_index_offset with + * the value right before the first valid index value. + */ + if (path->slots[0] == 0) + goto out; + + /* + * btrfs_search_slot() left us at one slot beyond the slot with the last + * index key, or beyond the last key of the directory that is not an + * index key. If we have an index key before, set last_dir_index_offset + * to its offset value, otherwise leave it with a value right before the + * first valid index value, as it means we have an empty directory. + */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY) + inode->last_dir_index_offset = key.offset; + +out: + btrfs_release_path(path); + + return ret; +} + +/* * logging directories is very similar to logging inodes, We find all the items * from the current transaction and write them to the log. * @@ -4157,6 +4056,10 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, u64 max_key; int ret; + ret = update_last_dir_index_offset(inode, path, ctx); + if (ret) + return ret; + min_key = BTRFS_DIR_START_INDEX; max_key = 0; ctx->last_dir_item_offset = inode->last_dir_index_offset; @@ -4382,8 +4285,8 @@ static int log_csums(struct btrfs_trans_handle *trans, * file which happens to refer to the same extent as well. Such races * can leave checksum items in the log with overlapping ranges. */ - ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, - lock_end, &cached_state); + ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, + &cached_state); if (ret) return ret; /* @@ -4399,8 +4302,8 @@ static int log_csums(struct btrfs_trans_handle *trans, if (!ret) ret = btrfs_csum_file_blocks(trans, log_root, sums); - unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, - &cached_state); + unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, + &cached_state); return ret; } @@ -4414,7 +4317,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, { struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; - struct extent_buffer *src = src_path->nodes[0]; + struct extent_buffer *src; int ret = 0; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -4425,6 +4328,43 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); + /* + * To keep lockdep happy and avoid deadlocks, clone the source leaf and + * use the clone. This is because otherwise we would be changing the log + * tree, to insert items from the subvolume tree or insert csum items, + * while holding a read lock on a leaf from the subvolume tree, which + * creates a nasty lock dependency when COWing log tree nodes/leaves: + * + * 1) Modifying the log tree triggers an extent buffer allocation while + * holding a write lock on a parent extent buffer from the log tree. + * Allocating the pages for an extent buffer, or the extent buffer + * struct, can trigger inode eviction and finally the inode eviction + * will trigger a release/remove of a delayed node, which requires + * taking the delayed node's mutex; + * + * 2) Allocating a metadata extent for a log tree can trigger the async + * reclaim thread and make us wait for it to release enough space and + * unblock our reservation ticket. The reclaim thread can start + * flushing delayed items, and that in turn results in the need to + * lock delayed node mutexes and in the need to write lock extent + * buffers of a subvolume tree - all this while holding a write lock + * on the parent extent buffer in the log tree. + * + * So one task in scenario 1) running in parallel with another task in + * scenario 2) could lead to a deadlock, one wanting to lock a delayed + * node mutex while having a read lock on a leaf from the subvolume, + * while the other is holding the delayed node's mutex and wants to + * write lock the same subvolume leaf for flushing delayed items. + */ + src = btrfs_clone_extent_buffer(src_path->nodes[0]); + if (!src) + return -ENOMEM; + + i = src_path->slots[0]; + btrfs_release_path(src_path); + src_path->nodes[0] = src; + src_path->slots[0] = i; + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); if (!ins_data) @@ -4513,7 +4453,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, disk_bytenr += extent_offset; ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, disk_bytenr + extent_num_bytes - 1, - &ordered_sums, 0); + &ordered_sums, 0, false); if (ret) goto out; @@ -4709,7 +4649,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_range(csum_root, em->block_start + csum_offset, em->block_start + csum_offset + - csum_len - 1, &ordered_sums, 0); + csum_len - 1, &ordered_sums, 0, false); if (ret) return ret; @@ -5221,10 +5161,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, * leafs from the log root. */ btrfs_release_path(path); - ret = btrfs_insert_file_extent(trans, root->log_root, - ino, prev_extent_end, 0, - 0, hole_len, 0, hole_len, - 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root->log_root, + ino, prev_extent_end, + hole_len); if (ret < 0) return ret; @@ -5253,10 +5192,8 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans, btrfs_release_path(path); hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); - ret = btrfs_insert_file_extent(trans, root->log_root, - ino, prev_extent_end, 0, 0, - hole_len, 0, hole_len, - 0, 0, 0); + ret = btrfs_insert_hole_extent(trans, root->log_root, ino, + prev_extent_end, hole_len); if (ret < 0) return ret; } @@ -5399,111 +5336,461 @@ out: return ret; } +/* + * Check if we need to log an inode. This is used in contexts where while + * logging an inode we need to log another inode (either that it exists or in + * full mode). This is used instead of btrfs_inode_in_log() because the later + * requires the inode to be in the log and have the log transaction committed, + * while here we do not care if the log transaction was already committed - our + * caller will commit the log later - and we want to avoid logging an inode + * multiple times when multiple tasks have joined the same log transaction. + */ +static bool need_log_inode(const struct btrfs_trans_handle *trans, + const struct btrfs_inode *inode) +{ + /* + * If a directory was not modified, no dentries added or removed, we can + * and should avoid logging it. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) + return false; + + /* + * If this inode does not have new/updated/deleted xattrs since the last + * time it was logged and is flagged as logged in the current transaction, + * we can skip logging it. As for new/deleted names, those are updated in + * the log by link/unlink/rename operations. + * In case the inode was logged and then evicted and reloaded, its + * logged_trans will be 0, in which case we have to fully log it since + * logged_trans is a transient field, not persisted. + */ + if (inode->logged_trans == trans->transid && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return false; + + return true; +} + +struct btrfs_dir_list { + u64 ino; + struct list_head list; +}; + +/* + * Log the inodes of the new dentries of a directory. + * See process_dir_items_leaf() for details about why it is needed. + * This is a recursive operation - if an existing dentry corresponds to a + * directory, that directory's new entries are logged too (same behaviour as + * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * the dentries point to we do not acquire their VFS lock, otherwise lockdep + * complains about the following circular lock dependency / possible deadlock: + * + * CPU0 CPU1 + * ---- ---- + * lock(&type->i_mutex_dir_key#3/2); + * lock(sb_internal#2); + * lock(&type->i_mutex_dir_key#3/2); + * lock(&sb->s_type->i_mutex_key#14); + * + * Where sb_internal is the lock (a counter that works as a lock) acquired by + * sb_start_intwrite() in btrfs_start_transaction(). + * Not acquiring the VFS lock of the inodes is still safe because: + * + * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible + * that while logging the inode new references (names) are added or removed + * from the inode, leaving the logged inode item with a link count that does + * not match the number of logged inode reference items. This is fine because + * at log replay time we compute the real number of links and correct the + * link count in the inode item (see replay_one_buffer() and + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that + * while logging the inode's items new index items (key type + * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged + * names - this is ok, not a problem, because at log replay time we set the + * directory's i_size to the correct value (see replay_one_name() and + * do_overwrite_item()). + */ +static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_inode *start_inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = start_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_path *path; + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + u64 ino = btrfs_ino(start_inode); + int ret = 0; + + /* + * If we are logging a new name, as part of a link or rename operation, + * don't bother logging new dentries, as we just want to log the names + * of an inode and that any new parents exist. + */ + if (ctx->logging_new_name) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (true) { + struct extent_buffer *leaf; + struct btrfs_key min_key; + bool continue_curr_inode = true; + int nritems; + int i; + + min_key.objectid = ino; + min_key.type = BTRFS_DIR_INDEX_KEY; + min_key.offset = 0; +again: + btrfs_release_path(path); + ret = btrfs_search_forward(root, &min_key, path, trans->transid); + if (ret < 0) { + break; + } else if (ret > 0) { + ret = 0; + goto next; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + int log_mode = LOG_INODE_EXISTS; + int type; + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != ino || + min_key.type != BTRFS_DIR_INDEX_KEY) { + continue_curr_inode = false; + break; + } + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + type = btrfs_dir_type(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid) + continue; + btrfs_dir_item_key_to_cpu(leaf, di, &di_key); + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + btrfs_release_path(path); + di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + goto out; + } + + if (!need_log_inode(trans, BTRFS_I(di_inode))) { + btrfs_add_delayed_iput(di_inode); + break; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), + log_mode, ctx); + btrfs_add_delayed_iput(di_inode); + if (ret) + goto out; + if (ctx->log_new_dentries) { + dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); + if (!dir_elem) { + ret = -ENOMEM; + goto out; + } + dir_elem->ino = di_key.objectid; + list_add_tail(&dir_elem->list, &dir_list); + } + break; + } + + if (continue_curr_inode && min_key.offset < (u64)-1) { + min_key.offset++; + goto again; + } + +next: + if (list_empty(&dir_list)) + break; + + dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list); + ino = dir_elem->ino; + list_del(&dir_elem->list); + kfree(dir_elem); + } +out: + btrfs_free_path(path); + if (ret) { + struct btrfs_dir_list *next; + + list_for_each_entry_safe(dir_elem, next, &dir_list, list) + kfree(dir_elem); + } + + return ret; +} + struct btrfs_ino_list { u64 ino; u64 parent; struct list_head list; }; -static int log_conflicting_inodes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_log_ctx *ctx, - u64 ino, u64 parent) +static void free_conflicting_inodes(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ino_list *curr; + struct btrfs_ino_list *next; + + list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) { + list_del(&curr->list); + kfree(curr); + } +} + +static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, + struct btrfs_path *path) +{ + struct btrfs_key key; + int ret; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (WARN_ON_ONCE(ret > 0)) { + /* + * We have previously found the inode through the commit root + * so this should not happen. If it does, just error out and + * fallback to a transaction commit. + */ + ret = -ENOENT; + } else if (ret == 0) { + struct btrfs_inode_item *item; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item))) + ret = 1; + } + + btrfs_release_path(path); + path->search_commit_root = 0; + path->skip_locking = 0; + + return ret; +} + +static int add_conflicting_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 ino, u64 parent, + struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - LIST_HEAD(inode_list); - int ret = 0; + struct inode *inode; + + /* + * It's rare to have a lot of conflicting inodes, in practice it is not + * common to have more than 1 or 2. We don't want to collect too many, + * as we could end up logging too many inodes (even if only in + * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction + * commits. + */ + if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) + return BTRFS_LOG_FORCE_COMMIT; + + inode = btrfs_iget(root->fs_info->sb, ino, root); + /* + * If the other inode that had a conflicting dir entry was deleted in + * the current transaction then we either: + * + * 1) Log the parent directory (later after adding it to the list) if + * the inode is a directory. This is because it may be a deleted + * subvolume/snapshot or it may be a regular directory that had + * deleted subvolumes/snapshots (or subdirectories that had them), + * and at the moment we can't deal with dropping subvolumes/snapshots + * during log replay. So we just log the parent, which will result in + * a fallback to a transaction commit if we are dealing with those + * cases (last_unlink_trans will match the current transaction); + * + * 2) Do nothing if it's not a directory. During log replay we simply + * unlink the conflicting dentry from the parent directory and then + * add the dentry for our inode. Like this we can avoid logging the + * parent directory (and maybe fallback to a transaction commit in + * case it has a last_unlink_trans == trans->transid, due to moving + * some inode from it to some other directory). + */ + if (IS_ERR(inode)) { + int ret = PTR_ERR(inode); + + if (ret != -ENOENT) + return ret; + + ret = conflicting_inode_is_dir(root, ino, path); + /* Not a directory or we got an error. */ + if (ret <= 0) + return ret; + + /* Conflicting inode is a directory, so we'll log its parent. */ + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) + return -ENOMEM; + ino_elem->ino = ino; + ino_elem->parent = parent; + list_add_tail(&ino_elem->list, &ctx->conflict_inodes); + ctx->num_conflict_inodes++; + + return 0; + } + + /* + * If the inode was already logged skip it - otherwise we can hit an + * infinite loop. Example: + * + * From the commit root (previous transaction) we have the following + * inodes: + * + * inode 257 a directory + * inode 258 with references "zz" and "zz_link" on inode 257 + * inode 259 with reference "a" on inode 257 + * + * And in the current (uncommitted) transaction we have: + * + * inode 257 a directory, unchanged + * inode 258 with references "a" and "a2" on inode 257 + * inode 259 with reference "zz_link" on inode 257 + * inode 261 with reference "zz" on inode 257 + * + * When logging inode 261 the following infinite loop could + * happen if we don't skip already logged inodes: + * + * - we detect inode 258 as a conflicting inode, with inode 261 + * on reference "zz", and log it; + * + * - we detect inode 259 as a conflicting inode, with inode 258 + * on reference "a", and log it; + * + * - we detect inode 258 as a conflicting inode, with inode 259 + * on reference "zz_link", and log it - again! After this we + * repeat the above steps forever. + * + * Here we can use need_log_inode() because we only need to log the + * inode in LOG_INODE_EXISTS mode and rename operations update the log, + * so that the log ends up with the new name and without the old name. + */ + if (!need_log_inode(trans, BTRFS_I(inode))) { + btrfs_add_delayed_iput(inode); + return 0; + } + + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) return -ENOMEM; ino_elem->ino = ino; ino_elem->parent = parent; - list_add_tail(&ino_elem->list, &inode_list); + list_add_tail(&ino_elem->list, &ctx->conflict_inodes); + ctx->num_conflict_inodes++; - while (!list_empty(&inode_list)) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_key key; - struct inode *inode; + return 0; +} - ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, - list); - ino = ino_elem->ino; - parent = ino_elem->parent; - list_del(&ino_elem->list); - kfree(ino_elem); - if (ret) - continue; +static int log_conflicting_inodes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; - btrfs_release_path(path); + /* + * Conflicting inodes are logged by the first call to btrfs_log_inode(), + * otherwise we could have unbounded recursion of btrfs_log_inode() + * calls. This check guarantees we can have only 1 level of recursion. + */ + if (ctx->logging_conflict_inodes) + return 0; + + ctx->logging_conflict_inodes = true; + + /* + * New conflicting inodes may be found and added to the list while we + * are logging a conflicting inode, so keep iterating while the list is + * not empty. + */ + while (!list_empty(&ctx->conflict_inodes)) { + struct btrfs_ino_list *curr; + struct inode *inode; + u64 ino; + u64 parent; + + curr = list_first_entry(&ctx->conflict_inodes, + struct btrfs_ino_list, list); + ino = curr->ino; + parent = curr->parent; + list_del(&curr->list); + kfree(curr); inode = btrfs_iget(fs_info->sb, ino, root); /* * If the other inode that had a conflicting dir entry was * deleted in the current transaction, we need to log its parent - * directory. + * directory. See the comment at add_conflicting_inode(). */ if (IS_ERR(inode)) { ret = PTR_ERR(inode); - if (ret == -ENOENT) { - inode = btrfs_iget(fs_info->sb, parent, root); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - } else { - ret = btrfs_log_inode(trans, - BTRFS_I(inode), - LOG_OTHER_INODE_ALL, - ctx); - btrfs_add_delayed_iput(inode); - } + if (ret != -ENOENT) + break; + + inode = btrfs_iget(fs_info->sb, parent, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + break; } + + /* + * Always log the directory, we cannot make this + * conditional on need_log_inode() because the directory + * might have been logged in LOG_INODE_EXISTS mode or + * the dir index of the conflicting inode is not in a + * dir index key range logged for the directory. So we + * must make sure the deletion is recorded. + */ + ret = btrfs_log_inode(trans, BTRFS_I(inode), + LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(inode); + if (ret) + break; continue; } + /* - * If the inode was already logged skip it - otherwise we can - * hit an infinite loop. Example: - * - * From the commit root (previous transaction) we have the - * following inodes: - * - * inode 257 a directory - * inode 258 with references "zz" and "zz_link" on inode 257 - * inode 259 with reference "a" on inode 257 - * - * And in the current (uncommitted) transaction we have: - * - * inode 257 a directory, unchanged - * inode 258 with references "a" and "a2" on inode 257 - * inode 259 with reference "zz_link" on inode 257 - * inode 261 with reference "zz" on inode 257 - * - * When logging inode 261 the following infinite loop could - * happen if we don't skip already logged inodes: - * - * - we detect inode 258 as a conflicting inode, with inode 261 - * on reference "zz", and log it; - * - * - we detect inode 259 as a conflicting inode, with inode 258 - * on reference "a", and log it; + * Here we can use need_log_inode() because we only need to log + * the inode in LOG_INODE_EXISTS mode and rename operations + * update the log, so that the log ends up with the new name and + * without the old name. * - * - we detect inode 258 as a conflicting inode, with inode 259 - * on reference "zz_link", and log it - again! After this we - * repeat the above steps forever. + * We did this check at add_conflicting_inode(), but here we do + * it again because if some other task logged the inode after + * that, we can avoid doing it again. */ - spin_lock(&BTRFS_I(inode)->lock); - /* - * Check the inode's logged_trans only instead of - * btrfs_inode_in_log(). This is because the last_log_commit of - * the inode is not updated when we only log that it exists (see - * btrfs_log_inode()). - */ - if (BTRFS_I(inode)->logged_trans == trans->transid) { - spin_unlock(&BTRFS_I(inode)->lock); + if (!need_log_inode(trans, BTRFS_I(inode))) { btrfs_add_delayed_iput(inode); continue; } - spin_unlock(&BTRFS_I(inode)->lock); + /* * We are safe logging the other inode without acquiring its * lock as long as we log with the LOG_INODE_EXISTS mode. We @@ -5511,67 +5798,16 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); - if (ret) { - btrfs_add_delayed_iput(inode); - continue; - } - - key.objectid = ino; - key.type = BTRFS_INODE_REF_KEY; - key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - btrfs_add_delayed_iput(inode); - continue; - } - - while (true) { - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - u64 other_ino = 0; - u64 other_parent = 0; - - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - break; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != ino || - (key.type != BTRFS_INODE_REF_KEY && - key.type != BTRFS_INODE_EXTREF_KEY)) { - ret = 0; - break; - } - - ret = btrfs_check_ref_name_override(leaf, slot, &key, - BTRFS_I(inode), &other_ino, - &other_parent); - if (ret < 0) - break; - if (ret > 0) { - ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); - if (!ino_elem) { - ret = -ENOMEM; - break; - } - ino_elem->ino = other_ino; - ino_elem->parent = other_parent; - list_add_tail(&ino_elem->list, &inode_list); - ret = 0; - } - path->slots[0]++; - } + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); + if (ret) + break; } + ctx->logging_conflict_inodes = false; + if (ret) + free_conflicting_inodes(ctx); + return ret; } @@ -5582,7 +5818,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_path *dst_path, const u64 logged_isize, - const bool recursive_logging, const int inode_only, struct btrfs_log_ctx *ctx, bool *need_log_inode_item) @@ -5621,8 +5856,8 @@ again: break; } else if ((min_key->type == BTRFS_INODE_REF_KEY || min_key->type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid && - !recursive_logging) { + (inode->generation == trans->transid || + ctx->logging_conflict_inodes)) { u64 other_ino = 0; u64 other_parent = 0; @@ -5646,11 +5881,12 @@ again: return ret; ins_nr = 0; - ret = log_conflicting_inodes(trans, root, path, - ctx, other_ino, other_parent); + btrfs_release_path(path); + ret = add_conflicting_inode(trans, root, path, + other_ino, + other_parent, ctx); if (ret) return ret; - btrfs_release_path(path); goto next_key; } } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { @@ -5733,6 +5969,371 @@ next_key: return ret; } +static int insert_delayed_items_batch(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + const struct btrfs_item_batch *batch, + const struct btrfs_delayed_item *first_item) +{ + const struct btrfs_delayed_item *curr = first_item; + int ret; + + ret = btrfs_insert_empty_items(trans, log, path, batch); + if (ret) + return ret; + + for (int i = 0; i < batch->nr; i++) { + char *data_ptr; + + data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); + write_extent_buffer(path->nodes[0], &curr->data, + (unsigned long)data_ptr, curr->data_len); + curr = list_next_entry(curr, log_list); + path->slots[0]++; + } + + btrfs_release_path(path); + + return 0; +} + +static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_ins_list, + struct btrfs_log_ctx *ctx) +{ + /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */ + const int max_batch_size = 195; + const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info); + const u64 ino = btrfs_ino(inode); + struct btrfs_root *log = inode->root->log_root; + struct btrfs_item_batch batch = { + .nr = 0, + .total_data_size = 0, + }; + const struct btrfs_delayed_item *first = NULL; + const struct btrfs_delayed_item *curr; + char *ins_data; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + u64 curr_batch_size = 0; + int batch_idx = 0; + int ret; + + /* We are adding dir index items to the log tree. */ + lockdep_assert_held(&inode->log_mutex); + + /* + * We collect delayed items before copying index keys from the subvolume + * to the log tree. However just after we collected them, they may have + * been flushed (all of them or just some of them), and therefore we + * could have copied them from the subvolume tree to the log tree. + * So find the first delayed item that was not yet logged (they are + * sorted by index number). + */ + list_for_each_entry(curr, delayed_ins_list, log_list) { + if (curr->index > inode->last_dir_index_offset) { + first = curr; + break; + } + } + + /* Empty list or all delayed items were already logged. */ + if (!first) + return 0; + + ins_data = kmalloc(max_batch_size * sizeof(u32) + + max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + ins_sizes = (u32 *)ins_data; + batch.data_sizes = ins_sizes; + ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32)); + batch.keys = ins_keys; + + curr = first; + while (!list_entry_is_head(curr, delayed_ins_list, log_list)) { + const u32 curr_size = curr->data_len + sizeof(struct btrfs_item); + + if (curr_batch_size + curr_size > leaf_data_size || + batch.nr == max_batch_size) { + ret = insert_delayed_items_batch(trans, log, path, + &batch, first); + if (ret) + goto out; + batch_idx = 0; + batch.nr = 0; + batch.total_data_size = 0; + curr_batch_size = 0; + first = curr; + } + + ins_sizes[batch_idx] = curr->data_len; + ins_keys[batch_idx].objectid = ino; + ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY; + ins_keys[batch_idx].offset = curr->index; + curr_batch_size += curr_size; + batch.total_data_size += curr->data_len; + batch.nr++; + batch_idx++; + curr = list_next_entry(curr, log_list); + } + + ASSERT(batch.nr >= 1); + ret = insert_delayed_items_batch(trans, log, path, &batch, first); + + curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, + log_list); + inode->last_dir_index_offset = curr->index; +out: + kfree(ins_data); + + return ret; +} + +static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + const u64 ino = btrfs_ino(inode); + const struct btrfs_delayed_item *curr; + + curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, + log_list); + + while (!list_entry_is_head(curr, delayed_del_list, log_list)) { + u64 first_dir_index = curr->index; + u64 last_dir_index; + const struct btrfs_delayed_item *next; + int ret; + + /* + * Find a range of consecutive dir index items to delete. Like + * this we log a single dir range item spanning several contiguous + * dir items instead of logging one range item per dir index item. + */ + next = list_next_entry(curr, log_list); + while (!list_entry_is_head(next, delayed_del_list, log_list)) { + if (next->index != curr->index + 1) + break; + curr = next; + next = list_next_entry(next, log_list); + } + + last_dir_index = curr->index; + ASSERT(last_dir_index >= first_dir_index); + + ret = insert_dir_log_key(trans, inode->root->log_root, path, + ino, first_dir_index, last_dir_index); + if (ret) + return ret; + curr = list_next_entry(curr, log_list); + } + + return 0; +} + +static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx, + const struct list_head *delayed_del_list, + const struct btrfs_delayed_item *first, + const struct btrfs_delayed_item **last_ret) +{ + const struct btrfs_delayed_item *next; + struct extent_buffer *leaf = path->nodes[0]; + const int last_slot = btrfs_header_nritems(leaf) - 1; + int slot = path->slots[0] + 1; + const u64 ino = btrfs_ino(inode); + + next = list_next_entry(first, log_list); + + while (slot < last_slot && + !list_entry_is_head(next, delayed_del_list, log_list)) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || + key.type != BTRFS_DIR_INDEX_KEY || + key.offset != next->index) + break; + + slot++; + *last_ret = next; + next = list_next_entry(next, log_list); + } + + return btrfs_del_items(trans, inode->root->log_root, path, + path->slots[0], slot - path->slots[0]); +} + +static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = inode->root->log_root; + const struct btrfs_delayed_item *curr; + u64 last_range_start; + u64 last_range_end = 0; + struct btrfs_key key; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_DIR_INDEX_KEY; + curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, + log_list); + + while (!list_entry_is_head(curr, delayed_del_list, log_list)) { + const struct btrfs_delayed_item *last = curr; + u64 first_dir_index = curr->index; + u64 last_dir_index; + bool deleted_items = false; + int ret; + + key.offset = curr->index; + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + if (ret < 0) { + return ret; + } else if (ret == 0) { + ret = batch_delete_dir_index_items(trans, inode, path, ctx, + delayed_del_list, curr, + &last); + if (ret) + return ret; + deleted_items = true; + } + + btrfs_release_path(path); + + /* + * If we deleted items from the leaf, it means we have a range + * item logging their range, so no need to add one or update an + * existing one. Otherwise we have to log a dir range item. + */ + if (deleted_items) + goto next_batch; + + last_dir_index = last->index; + ASSERT(last_dir_index >= first_dir_index); + /* + * If this range starts right after where the previous one ends, + * then we want to reuse the previous range item and change its + * end offset to the end of this range. This is just to minimize + * leaf space usage, by avoiding adding a new range item. + */ + if (last_range_end != 0 && first_dir_index == last_range_end + 1) + first_dir_index = last_range_start; + + ret = insert_dir_log_key(trans, log, path, key.objectid, + first_dir_index, last_dir_index); + if (ret) + return ret; + + last_range_start = first_dir_index; + last_range_end = last_dir_index; +next_batch: + curr = list_next_entry(last, log_list); + } + + return 0; +} + +static int log_delayed_deletion_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + const struct list_head *delayed_del_list, + struct btrfs_log_ctx *ctx) +{ + /* + * We are deleting dir index items from the log tree or adding range + * items to it. + */ + lockdep_assert_held(&inode->log_mutex); + + if (list_empty(delayed_del_list)) + return 0; + + if (ctx->logged_before) + return log_delayed_deletions_incremental(trans, inode, path, + delayed_del_list, ctx); + + return log_delayed_deletions_full(trans, inode, path, delayed_del_list, + ctx); +} + +/* + * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed + * items instead of the subvolume tree. + */ +static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + const struct list_head *delayed_ins_list, + struct btrfs_log_ctx *ctx) +{ + const bool orig_log_new_dentries = ctx->log_new_dentries; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_item *item; + int ret = 0; + + /* + * No need for the log mutex, plus to avoid potential deadlocks or + * lockdep annotations due to nesting of delayed inode mutexes and log + * mutexes. + */ + lockdep_assert_not_held(&inode->log_mutex); + + ASSERT(!ctx->logging_new_delayed_dentries); + ctx->logging_new_delayed_dentries = true; + + list_for_each_entry(item, delayed_ins_list, log_list) { + struct btrfs_dir_item *dir_item; + struct inode *di_inode; + struct btrfs_key key; + int log_mode = LOG_INODE_EXISTS; + + dir_item = (struct btrfs_dir_item *)item->data; + btrfs_disk_key_to_cpu(&key, &dir_item->location); + + if (key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + break; + } + + if (!need_log_inode(trans, BTRFS_I(di_inode))) { + btrfs_add_delayed_iput(di_inode); + continue; + } + + if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + + ctx->log_new_dentries = false; + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + + if (!ret && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + + btrfs_add_delayed_iput(di_inode); + + if (ret) + break; + } + + ctx->log_new_dentries = orig_log_new_dentries; + ctx->logging_new_delayed_dentries = false; + + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -5764,9 +6365,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, u64 logged_isize = 0; bool need_log_inode_item = true; bool xattrs_logged = false; - bool recursive_logging = false; bool inode_item_dropped = true; - const bool orig_logged_before = ctx->logged_before; + bool full_dir_logging = false; + LIST_HEAD(delayed_ins_list); + LIST_HEAD(delayed_del_list); path = btrfs_alloc_path(); if (!path) @@ -5794,27 +6396,46 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.type = (u8)-1; max_key.offset = (u64)-1; + if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL) + full_dir_logging = true; + /* - * Only run delayed items if we are a directory. We want to make sure - * all directory indexes hit the fs/subvolume tree so we can find them - * and figure out which index ranges have to be logged. + * If we are logging a directory while we are logging dentries of the + * delayed items of some other inode, then we need to flush the delayed + * items of this directory and not log the delayed items directly. This + * is to prevent more than one level of recursion into btrfs_log_inode() + * by having something like this: + * + * $ mkdir -p a/b/c/d/e/f/g/h/... + * $ xfs_io -c "fsync" a + * + * Where all directories in the path did not exist before and are + * created in the current transaction. + * So in such a case we directly log the delayed items of the main + * directory ("a") without flushing them first, while for each of its + * subdirectories we flush their delayed items before logging them. + * This prevents a potential unbounded recursion like this: + * + * btrfs_log_inode() + * log_new_delayed_dentries() + * btrfs_log_inode() + * log_new_delayed_dentries() + * btrfs_log_inode() + * log_new_delayed_dentries() + * (...) + * + * We have thresholds for the maximum number of delayed items to have in + * memory, and once they are hit, the items are flushed asynchronously. + * However the limit is quite high, so lets prevent deep levels of + * recursion to happen by limiting the maximum depth to be 1. */ - if (S_ISDIR(inode->vfs_inode.i_mode)) { + if (full_dir_logging && ctx->logging_new_delayed_dentries) { ret = btrfs_commit_inode_delayed_items(trans, inode); if (ret) goto out; } - if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { - recursive_logging = true; - if (inode_only == LOG_OTHER_INODE) - inode_only = LOG_INODE_EXISTS; - else - inode_only = LOG_INODE_ALL; - mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); - } else { - mutex_lock(&inode->log_mutex); - } + mutex_lock(&inode->log_mutex); /* * For symlinks, we must always log their content, which is stored in an @@ -5846,9 +6467,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * to known the file was moved from A to B, so logging just A would * result in losing the file after a log replay. */ - if (S_ISDIR(inode->vfs_inode.i_mode) && - inode_only == LOG_INODE_ALL && - inode->last_unlink_trans >= trans->transid) { + if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans); ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; @@ -5859,14 +6478,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * copies of everything. */ if (S_ISDIR(inode->vfs_inode.i_mode)) { - int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - if (inode_only == LOG_INODE_EXISTS) - max_key_type = BTRFS_XATTR_ITEM_KEY; if (ctx->logged_before) ret = drop_inode_items(trans, log, path, inode, - max_key_type); + BTRFS_XATTR_ITEM_KEY); } else { if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { /* @@ -5922,9 +6537,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (ret) goto out_unlock; + /* + * If we are logging a directory in full mode, collect the delayed items + * before iterating the subvolume tree, so that we don't miss any new + * dir index items in case they get flushed while or right after we are + * iterating the subvolume tree. + */ + if (full_dir_logging && !ctx->logging_new_delayed_dentries) + btrfs_log_get_delayed_items(inode, &delayed_ins_list, + &delayed_del_list); + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, - recursive_logging, inode_only, ctx, + inode_only, ctx, &need_log_inode_item); if (ret) goto out_unlock; @@ -5977,10 +6602,18 @@ log_extents: write_unlock(&em_tree->lock); } - if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { + if (full_dir_logging) { ret = log_directory_changes(trans, inode, path, dst_path, ctx); if (ret) goto out_unlock; + ret = log_delayed_insertion_items(trans, inode, path, + &delayed_ins_list, ctx); + if (ret) + goto out_unlock; + ret = log_delayed_deletion_items(trans, inode, path, + &delayed_del_list, ctx); + if (ret) + goto out_unlock; } spin_lock(&inode->lock); @@ -6033,208 +6666,20 @@ out: btrfs_free_path(path); btrfs_free_path(dst_path); - if (recursive_logging) - ctx->logged_before = orig_logged_before; - - return ret; -} - -/* - * Check if we need to log an inode. This is used in contexts where while - * logging an inode we need to log another inode (either that it exists or in - * full mode). This is used instead of btrfs_inode_in_log() because the later - * requires the inode to be in the log and have the log transaction committed, - * while here we do not care if the log transaction was already committed - our - * caller will commit the log later - and we want to avoid logging an inode - * multiple times when multiple tasks have joined the same log transaction. - */ -static bool need_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) -{ - /* - * If a directory was not modified, no dentries added or removed, we can - * and should avoid logging it. - */ - if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) - return false; - - /* - * If this inode does not have new/updated/deleted xattrs since the last - * time it was logged and is flagged as logged in the current transaction, - * we can skip logging it. As for new/deleted names, those are updated in - * the log by link/unlink/rename operations. - * In case the inode was logged and then evicted and reloaded, its - * logged_trans will be 0, in which case we have to fully log it since - * logged_trans is a transient field, not persisted. - */ - if (inode->logged_trans == trans->transid && - !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) - return false; - - return true; -} - -struct btrfs_dir_list { - u64 ino; - struct list_head list; -}; - -/* - * Log the inodes of the new dentries of a directory. See log_dir_items() for - * details about the why it is needed. - * This is a recursive operation - if an existing dentry corresponds to a - * directory, that directory's new entries are logged too (same behaviour as - * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes - * the dentries point to we do not lock their i_mutex, otherwise lockdep - * complains about the following circular lock dependency / possible deadlock: - * - * CPU0 CPU1 - * ---- ---- - * lock(&type->i_mutex_dir_key#3/2); - * lock(sb_internal#2); - * lock(&type->i_mutex_dir_key#3/2); - * lock(&sb->s_type->i_mutex_key#14); - * - * Where sb_internal is the lock (a counter that works as a lock) acquired by - * sb_start_intwrite() in btrfs_start_transaction(). - * Not locking i_mutex of the inodes is still safe because: - * - * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible - * that while logging the inode new references (names) are added or removed - * from the inode, leaving the logged inode item with a link count that does - * not match the number of logged inode reference items. This is fine because - * at log replay time we compute the real number of links and correct the - * link count in the inode item (see replay_one_buffer() and - * link_to_fixup_dir()); - * - * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that - * while logging the inode's items new index items (key type - * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item - * has a size that doesn't match the sum of the lengths of all the logged - * names - this is ok, not a problem, because at log replay time we set the - * directory's i_size to the correct value (see replay_one_name() and - * do_overwrite_item()). - */ -static int log_new_dir_dentries(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *start_inode, - struct btrfs_log_ctx *ctx) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; - LIST_HEAD(dir_list); - struct btrfs_dir_list *dir_elem; - int ret = 0; - - /* - * If we are logging a new name, as part of a link or rename operation, - * don't bother logging new dentries, as we just want to log the names - * of an inode and that any new parents exist. - */ - if (ctx->logging_new_name) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); - if (!dir_elem) { - btrfs_free_path(path); - return -ENOMEM; - } - dir_elem->ino = btrfs_ino(start_inode); - list_add_tail(&dir_elem->list, &dir_list); - - while (!list_empty(&dir_list)) { - struct extent_buffer *leaf; - struct btrfs_key min_key; - int nritems; - int i; - - dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, - list); - if (ret) - goto next_dir_inode; - - min_key.objectid = dir_elem->ino; - min_key.type = BTRFS_DIR_INDEX_KEY; - min_key.offset = 0; -again: - btrfs_release_path(path); - ret = btrfs_search_forward(root, &min_key, path, trans->transid); - if (ret < 0) { - goto next_dir_inode; - } else if (ret > 0) { - ret = 0; - goto next_dir_inode; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - for (i = path->slots[0]; i < nritems; i++) { - struct btrfs_dir_item *di; - struct btrfs_key di_key; - struct inode *di_inode; - struct btrfs_dir_list *new_dir_elem; - int log_mode = LOG_INODE_EXISTS; - int type; - - btrfs_item_key_to_cpu(leaf, &min_key, i); - if (min_key.objectid != dir_elem->ino || - min_key.type != BTRFS_DIR_INDEX_KEY) - goto next_dir_inode; - - di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); - type = btrfs_dir_type(leaf, di); - if (btrfs_dir_transid(leaf, di) < trans->transid) - continue; - btrfs_dir_item_key_to_cpu(leaf, di, &di_key); - if (di_key.type == BTRFS_ROOT_ITEM_KEY) - continue; - - btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); - if (IS_ERR(di_inode)) { - ret = PTR_ERR(di_inode); - goto next_dir_inode; - } + if (ret) + free_conflicting_inodes(ctx); + else + ret = log_conflicting_inodes(trans, inode->root, ctx); - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); - break; - } + if (full_dir_logging && !ctx->logging_new_delayed_dentries) { + if (!ret) + ret = log_new_delayed_dentries(trans, inode, + &delayed_ins_list, ctx); - ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR) - log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(di_inode); - if (ret) - goto next_dir_inode; - if (ctx->log_new_dentries) { - new_dir_elem = kmalloc(sizeof(*new_dir_elem), - GFP_NOFS); - if (!new_dir_elem) { - ret = -ENOMEM; - goto next_dir_inode; - } - new_dir_elem->ino = di_key.objectid; - list_add_tail(&new_dir_elem->list, &dir_list); - } - break; - } - if (min_key.offset < (u64)-1) { - min_key.offset++; - goto again; - } -next_dir_inode: - list_del(&dir_elem->list); - kfree(dir_elem); + btrfs_log_put_delayed_items(inode, &delayed_ins_list, + &delayed_del_list); } - btrfs_free_path(path); return ret; } @@ -6346,7 +6791,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, root, + ret = log_new_dir_dentries(trans, BTRFS_I(dir_inode), ctx); btrfs_add_delayed_iput(dir_inode); if (ret) @@ -6661,7 +7106,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; if (log_dentries) - ret = log_new_dir_dentries(trans, root, inode, ctx); + ret = log_new_dir_dentries(trans, inode, ctx); else ret = 0; end_trans: @@ -7088,6 +7533,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); + ASSERT(list_empty(&ctx.conflict_inodes)); out: /* * If an error happened mark the log for a full commit because it's not diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 57ab5f3b8dc7..aed1e05e9879 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -20,6 +20,7 @@ struct btrfs_log_ctx { int log_transid; bool log_new_dentries; bool logging_new_name; + bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; /* Tracks the last logged dir item/index key offset. */ @@ -28,6 +29,9 @@ struct btrfs_log_ctx { struct list_head list; /* Only used for fast fsyncs. */ struct list_head ordered_extents; + struct list_head conflict_inodes; + int num_conflict_inodes; + bool logging_conflict_inodes; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, @@ -37,10 +41,14 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_transid = 0; ctx->log_new_dentries = false; ctx->logging_new_name = false; + ctx->logging_new_delayed_dentries = false; ctx->logged_before = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->ordered_extents); + INIT_LIST_HEAD(&ctx->conflict_inodes); + ctx->num_conflict_inodes = 0; + ctx->logging_conflict_inodes = false; } static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 90eb5c2830a9..ee00e33c309e 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -659,8 +659,7 @@ rollback: * * Returns the size on success or a negative error code on failure. */ -static int btrfs_get_verity_descriptor(struct inode *inode, void *buf, - size_t buf_size) +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size) { u64 true_size; int ret = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f63ff91e2883..635f45f1a2ef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -34,6 +34,8 @@ #include "discard.h" #include "zoned.h" +static struct bio_set btrfs_bioset; + #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -247,10 +249,10 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, - u64 logical, u64 *length, + enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - int mirror_num, int need_raid_map); + struct btrfs_io_stripe *smap, + int *mirror_num_ret, int need_raid_map); /* * Device locking @@ -1009,6 +1011,18 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) rcu_assign_pointer(device->name, name); } + if (orig_dev->zone_info) { + struct btrfs_zoned_device_info *zone_info; + + zone_info = btrfs_clone_dev_zone_info(orig_dev); + if (!zone_info) { + btrfs_free_device(device); + ret = -ENOMEM; + goto error; + } + device->zone_info = zone_info; + } + list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; @@ -2017,7 +2031,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct page *page; int ret; - disk_super = btrfs_read_dev_one_super(bdev, copy_num); + disk_super = btrfs_read_dev_one_super(bdev, copy_num, false); if (IS_ERR(disk_super)) continue; @@ -5595,7 +5609,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, if (ret) goto out; - bg->chunk_item_inserted = 1; + set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); @@ -5896,7 +5910,6 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ sizeof(u64) * (total_stripes), GFP_NOFS|__GFP_NOFAIL); - atomic_set(&bioc->error, 0); refcount_set(&bioc->refs, 1); bioc->fs_info = fs_info; @@ -6092,7 +6105,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, int ret = 0; ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &length, &bioc, 0, 0); + logical, &length, &bioc, NULL, NULL, 0); if (ret) { ASSERT(bioc == NULL); return ret; @@ -6153,9 +6166,7 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) cache = btrfs_lookup_block_group(fs_info, logical); - spin_lock(&cache->lock); - ret = cache->to_copy; - spin_unlock(&cache->lock); + ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); btrfs_put_block_group(cache); return ret; @@ -6351,11 +6362,19 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, return 0; } +static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, + u32 stripe_index, u64 stripe_offset, u64 stripe_nr) +{ + dst->dev = map->stripes[stripe_index].dev; + dst->physical = map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, - u64 logical, u64 *length, + enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - int mirror_num, int need_raid_map) + struct btrfs_io_stripe *smap, + int *mirror_num_ret, int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -6366,6 +6385,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int data_stripes; int i; int ret = 0; + int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); int num_stripes; int max_errors = 0; int tgtdev_indexes = 0; @@ -6526,6 +6546,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, tgtdev_indexes = num_stripes; } + /* + * If this I/O maps to a single device, try to return the device and + * physical block information on the stack instead of allocating an + * I/O context structure. + */ + if (smap && num_alloc_stripes == 1 && + !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && + (!need_full_stripe(op) || !dev_replace_is_ongoing || + !dev_replace->tgtdev)) { + if (patch_the_first_stripe_for_dev_replace) { + smap->dev = dev_replace->tgtdev; + smap->physical = physical_to_patch_in_first_stripe; + *mirror_num_ret = map->num_stripes + 1; + } else { + set_io_stripe(smap, map, stripe_index, stripe_offset, + stripe_nr); + *mirror_num_ret = mirror_num; + } + *bioc_ret = NULL; + ret = 0; + goto out; + } + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); if (!bioc) { ret = -ENOMEM; @@ -6533,9 +6576,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } for (i = 0; i < num_stripes; i++) { - bioc->stripes[i].physical = map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; - bioc->stripes[i].dev = map->stripes[stripe_index].dev; + set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, + stripe_nr); stripe_index++; } @@ -6603,7 +6645,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_io_context **bioc_ret, int mirror_num) { return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, - mirror_num, 0); + NULL, &mirror_num, 0); } /* For Scrub/replace */ @@ -6611,14 +6653,77 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, + NULL, NULL, 1); } -static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc) +/* + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. + */ +static inline void btrfs_bio_init(struct btrfs_bio *bbio, + btrfs_bio_end_io_t end_io, void *private) { - if (bioc->orig_bio->bi_opf & REQ_META) - return bioc->fs_info->endio_meta_workers; - return bioc->fs_info->endio_workers; + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->end_io = end_io; + bbio->private = private; +} + +/* + * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for + * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * + * Just like the underlying bio_alloc_bioset it will not fail as it is backed by + * a mempool. + */ +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio), end_io, private); + return bio; +} + +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + struct btrfs_bio *bbio; + + ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, end_io, private); + + bio_trim(bio, offset >> 9, size >> 9); + bbio->iter = bio->bi_iter; + return bio; +} + +static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +{ + if (!dev || !dev->bdev) + return; + if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) + return; + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); +} + +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + if (bio->bi_opf & REQ_META) + return fs_info->endio_meta_workers; + return fs_info->endio_workers; } static void btrfs_end_bio_work(struct work_struct *work) @@ -6626,103 +6731,101 @@ static void btrfs_end_bio_work(struct work_struct *work) struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); - bio_endio(&bbio->bio); + bbio->end_io(bbio); } -static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async) +static void btrfs_simple_end_io(struct bio *bio) { - struct bio *orig_bio = bioc->orig_bio; - struct btrfs_bio *bbio = btrfs_bio(orig_bio); + struct btrfs_fs_info *fs_info = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); - bbio->mirror_num = bioc->mirror_num; - orig_bio->bi_private = bioc->private; - orig_bio->bi_end_io = bioc->end_io; + btrfs_bio_counter_dec(fs_info); - /* - * Only send an error to the higher layers if it is beyond the tolerance - * threshold. - */ - if (atomic_read(&bioc->error) > bioc->max_errors) - orig_bio->bi_status = BLK_STS_IOERR; - else - orig_bio->bi_status = BLK_STS_OK; + if (bio->bi_status) + btrfs_log_dev_io_error(bio, bbio->device); - if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) { + if (bio_op(bio) == REQ_OP_READ) { INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - bio_endio(orig_bio); + bbio->end_io(bbio); } +} + +static void btrfs_raid56_end_io(struct bio *bio) +{ + struct btrfs_io_context *bioc = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; + bbio->end_io(bbio); btrfs_put_bioc(bioc); } -static void btrfs_end_bio(struct bio *bio) +static void btrfs_orig_write_end_io(struct bio *bio) { struct btrfs_io_stripe *stripe = bio->bi_private; struct btrfs_io_context *bioc = stripe->bioc; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); if (bio->bi_status) { atomic_inc(&bioc->error); - if (bio->bi_status == BLK_STS_IOERR || - bio->bi_status == BLK_STS_TARGET) { - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(stripe->dev, - BTRFS_DEV_STAT_WRITE_ERRS); - else if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(stripe->dev, - BTRFS_DEV_STAT_READ_ERRS); - if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(stripe->dev, - BTRFS_DEV_STAT_FLUSH_ERRS); - } + btrfs_log_dev_io_error(bio, stripe->dev); } - if (bio != bioc->orig_bio) - bio_put(bio); + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. + */ + if (atomic_read(&bioc->error) > bioc->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; - btrfs_bio_counter_dec(bioc->fs_info); - if (atomic_dec_and_test(&bioc->stripes_pending)) - btrfs_end_bioc(bioc, true); + bbio->end_io(bbio); + btrfs_put_bioc(bioc); } -static void submit_stripe_bio(struct btrfs_io_context *bioc, - struct bio *orig_bio, int dev_nr, bool clone) +static void btrfs_clone_write_end_io(struct bio *bio) { - struct btrfs_fs_info *fs_info = bioc->fs_info; - struct btrfs_device *dev = bioc->stripes[dev_nr].dev; - u64 physical = bioc->stripes[dev_nr].physical; - struct bio *bio; + struct btrfs_io_stripe *stripe = bio->bi_private; + + if (bio->bi_status) { + atomic_inc(&stripe->bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); + } + /* Pass on control to the original bio this one was cloned from */ + bio_endio(stripe->bioc->orig_bio); + bio_put(bio); +} + +static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) +{ if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (btrfs_op(orig_bio) == BTRFS_MAP_WRITE && + (btrfs_op(bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - atomic_inc(&bioc->error); - if (atomic_dec_and_test(&bioc->stripes_pending)) - btrfs_end_bioc(bioc, false); + bio_io_error(bio); return; } - if (clone) { - bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set); - } else { - bio = orig_bio; - bio_set_dev(bio, dev->bdev); - btrfs_bio(bio)->device = dev; - } + bio_set_dev(bio, dev->bdev); - bioc->stripes[dev_nr].bioc = bioc; - bio->bi_private = &bioc->stripes[dev_nr]; - bio->bi_end_io = btrfs_end_bio; - bio->bi_iter.bi_sector = physical >> 9; /* * For zone append writing, bi_sector must point the beginning of the * zone */ if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, fs_info->zone_size); + u64 zone_start = round_down(physical, + dev->fs_info->zone_size); bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } else { @@ -6730,50 +6833,53 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, bio->bi_opf |= REQ_OP_WRITE; } } - btrfs_debug_in_rcu(fs_info, + btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); - btrfs_bio_counter_inc_noblocked(fs_info); - btrfsic_check_bio(bio); submit_bio(bio); } +static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) +{ + struct bio *orig_bio = bioc->orig_bio, *bio; + + ASSERT(bio_op(orig_bio) != REQ_OP_READ); + + /* Reuse the bio embedded into the btrfs_bio for the last mirror */ + if (dev_nr == bioc->num_stripes - 1) { + bio = orig_bio; + bio->bi_end_io = btrfs_orig_write_end_io; + } else { + bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); + bio_inc_remaining(orig_bio); + bio->bi_end_io = btrfs_clone_write_end_io; + } + + bio->bi_private = &bioc->stripes[dev_nr]; + bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; + bioc->stripes[dev_nr].bioc = bioc; + btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); +} + void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) { u64 logical = bio->bi_iter.bi_sector << 9; u64 length = bio->bi_iter.bi_size; u64 map_length = length; - int ret; - int dev_nr; - int total_devs; struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + int ret; btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, - &map_length, &bioc, mirror_num, 1); + ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - return; - } - - total_devs = bioc->num_stripes; - bioc->orig_bio = bio; - bioc->private = bio->bi_private; - bioc->end_io = bio->bi_end_io; - atomic_set(&bioc->stripes_pending, total_devs); - - if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - raid56_parity_write(bio, bioc); - else - raid56_parity_recover(bio, bioc, mirror_num, true); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); return; } @@ -6784,12 +6890,31 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror BUG(); } - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - const bool should_clone = (dev_nr < total_devs - 1); + if (!bioc) { + /* Single mirror read/write fast path */ + btrfs_bio(bio)->mirror_num = mirror_num; + btrfs_bio(bio)->device = smap.dev; + bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + bio->bi_private = fs_info; + bio->bi_end_io = btrfs_simple_end_io; + btrfs_submit_dev_bio(smap.dev, bio); + } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* Parity RAID write or read recovery */ + bio->bi_private = bioc; + bio->bi_end_io = btrfs_raid56_end_io; + if (bio_op(bio) == REQ_OP_READ) + raid56_parity_recover(bio, bioc, mirror_num); + else + raid56_parity_write(bio, bioc); + } else { + /* Write to multiple mirrors */ + int total_devs = bioc->num_stripes; + int dev_nr; - submit_stripe_bio(bioc, bio, dev_nr, should_clone); + bioc->orig_bio = bio; + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + btrfs_submit_mirrored_bio(bioc, dev_nr); } - btrfs_bio_counter_dec(fs_info); } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, @@ -6805,18 +6930,18 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, const struct btrfs_device *device) { - ASSERT((args->devid != (u64)-1) || args->missing); + if (args->missing) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && + !device->bdev) + return true; + return false; + } - if ((args->devid != (u64)-1) && device->devid != args->devid) + if (device->devid != args->devid) return false; if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) return false; - if (!args->missing) - return true; - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && - !device->bdev) - return true; - return false; + return true; } /* @@ -7029,6 +7154,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, u64 devid; u64 type; u8 uuid[BTRFS_UUID_SIZE]; + int index; int num_stripes; int ret; int i; @@ -7036,6 +7162,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); + index = btrfs_bg_flags_to_raid_index(type); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); #if BITS_PER_LONG == 32 @@ -7089,7 +7216,15 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->io_align = btrfs_chunk_io_align(leaf, chunk); map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); map->type = type; - map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + /* + * We can't use the sub_stripes value, as for profiles other than + * RAID10, they may have 0 as sub_stripes for filesystems created by + * older mkfs (<v5.4). + * In that case, it can cause divide-by-zero errors later. + * Since currently sub_stripes is fixed for each profile, let's + * use the trusted value instead. + */ + map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; em->orig_block_len = btrfs_calc_stripe_length(em); for (i = 0; i < num_stripes; i++) { @@ -7621,10 +7756,11 @@ error: return ret; } -void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) +int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; + int ret = 0; fs_devices->fs_info = fs_info; @@ -7633,12 +7769,18 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) device->fs_info = fs_info; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - list_for_each_entry(device, &seed_devs->devices, dev_list) + list_for_each_entry(device, &seed_devs->devices, dev_list) { device->fs_info = fs_info; + ret = btrfs_get_dev_zone_info(device, false); + if (ret) + break; + } seed_devs->fs_info = fs_info; } mutex_unlock(&fs_devices->device_list_mutex); + + return ret; } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, @@ -8244,7 +8386,7 @@ static int relocating_repair_kthread(void *data) if (!cache) goto out; - if (!cache->relocating_repair) + if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) goto out; ret = btrfs_may_alloc_data_chunk(fs_info, target); @@ -8281,17 +8423,27 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) if (!cache) return true; - spin_lock(&cache->lock); - if (cache->relocating_repair) { - spin_unlock(&cache->lock); + if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) { btrfs_put_block_group(cache); return true; } - cache->relocating_repair = 1; - spin_unlock(&cache->lock); kthread_run(relocating_repair_kthread, cache, "btrfs-relocating-repair"); return true; } + +int __init btrfs_bioset_init(void) +{ + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; + return 0; +} + +void __cold btrfs_bioset_exit(void) +{ + bioset_exit(&btrfs_bioset); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5639961b3626..099def5613b8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -181,6 +181,31 @@ struct btrfs_device { }; /* + * Block group or device which contains an active swapfile. Used for preventing + * unsafe operations while a swapfile is active. + * + * These are sorted on (ptr, inode) (note that a block group or device can + * contain more than one swapfile). We compare the pointer values because we + * don't actually care what the object is, we just need a quick check whether + * the object exists in the rbtree. + */ +struct btrfs_swapfile_pin { + struct rb_node node; + void *ptr; + struct inode *inode; + /* + * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr + * points to a struct btrfs_device. + */ + bool is_block_group; + /* + * Only used when 'is_block_group' is true and it is the number of + * extents used by a swapfile for this block group ('ptr' field). + */ + int bg_extent_count; +}; + +/* * If we read those variants at the context of their own lock, we needn't * use the following helpers, reading them directly is safe. */ @@ -361,6 +386,8 @@ struct btrfs_fs_devices { */ #define BTRFS_MAX_BIO_SECTORS (256) +typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + /* * Additional info to pass along bio. * @@ -368,6 +395,7 @@ struct btrfs_fs_devices { */ struct btrfs_bio { unsigned int mirror_num; + struct bvec_iter iter; /* for direct I/O */ u64 file_offset; @@ -376,7 +404,10 @@ struct btrfs_bio { struct btrfs_device *device; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - struct bvec_iter iter; + + /* End I/O information supplied to btrfs_bio_alloc */ + btrfs_bio_end_io_t end_io; + void *private; /* For read end I/O handling */ struct work_struct end_io_work; @@ -393,6 +424,20 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) return container_of(bio, struct btrfs_bio, bio); } +int __init btrfs_bioset_init(void); +void __cold btrfs_bioset_exit(void); + +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private); +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private); + +static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + bbio->end_io(bbio); +} + static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) { if (bbio->csum != bbio->csum_inline) { @@ -451,12 +496,9 @@ struct btrfs_discard_stripe { */ struct btrfs_io_context { refcount_t refs; - atomic_t stripes_pending; struct btrfs_fs_info *fs_info; u64 map_type; /* get from map_lookup->type */ - bio_end_io_t *end_io; struct bio *orig_bio; - void *private; atomic_t error; int max_errors; int num_stripes; @@ -629,7 +671,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats); -void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); +int btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); @@ -714,4 +756,6 @@ const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); + #endif diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 62e7007a7e46..c9e2b0c85309 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -134,7 +134,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, super[i] = page_address(page[i]); } - if (super[0]->generation > super[1]->generation) + if (btrfs_super_generation(super[0]) > + btrfs_super_generation(super[1])) sector = zones[1].start; else sector = zones[0].start; @@ -466,7 +467,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; } - zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); + zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; goto out; @@ -585,7 +586,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } - kfree(zones); + kvfree(zones); switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: @@ -617,7 +618,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) return 0; out: - kfree(zones); + kvfree(zones); out_free_zone_info: btrfs_destroy_dev_zone_info(device); @@ -639,6 +640,46 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) device->zone_info = NULL; } +struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev) +{ + struct btrfs_zoned_device_info *zone_info; + + zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL); + if (!zone_info) + return NULL; + + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->seq_zones) + goto out; + + bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones, + zone_info->nr_zones); + + zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->empty_zones) + goto out; + + bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones, + zone_info->nr_zones); + + zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->active_zones) + goto out; + + bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones, + zone_info->nr_zones); + zone_info->zone_cache = NULL; + + return zone_info; + +out: + bitmap_free(zone_info->seq_zones); + bitmap_free(zone_info->empty_zones); + bitmap_free(zone_info->active_zones); + kfree(zone_info); + return NULL; +} + int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) { @@ -652,80 +693,55 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, return 0; } +static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + if (device->bdev && + bdev_zoned_model(device->bdev) == BLK_ZONED_HM) { + btrfs_err(fs_info, + "zoned: mode not enabled but zoned device found: %pg", + device->bdev); + return -EINVAL; + } + } + + return 0; +} + int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - u64 zoned_devices = 0; - u64 nr_devices = 0; u64 zone_size = 0; u64 max_zone_append_size = 0; - const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); - int ret = 0; + int ret; - /* Count zoned devices */ - list_for_each_entry(device, &fs_devices->devices, dev_list) { - enum blk_zoned_model model; + /* + * Host-Managed devices can't be used without the ZONED flag. With the + * ZONED all devices can be used, using zone emulation if required. + */ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return btrfs_check_for_zoned_device(fs_info); + + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + struct btrfs_zoned_device_info *zone_info = device->zone_info; if (!device->bdev) continue; - model = bdev_zoned_model(device->bdev); - /* - * A Host-Managed zoned device must be used as a zoned device. - * A Host-Aware zoned device and a non-zoned devices can be - * treated as a zoned device, if ZONED flag is enabled in the - * superblock. - */ - if (model == BLK_ZONED_HM || - (model == BLK_ZONED_HA && incompat_zoned) || - (model == BLK_ZONED_NONE && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info; - - zone_info = device->zone_info; - zoned_devices++; - if (!zone_size) { - zone_size = zone_info->zone_size; - } else if (zone_info->zone_size != zone_size) { - btrfs_err(fs_info, + if (!zone_size) { + zone_size = zone_info->zone_size; + } else if (zone_info->zone_size != zone_size) { + btrfs_err(fs_info, "zoned: unequal block device zone sizes: have %llu found %llu", - device->zone_info->zone_size, - zone_size); - ret = -EINVAL; - goto out; - } - if (!max_zone_append_size || - (zone_info->max_zone_append_size && - zone_info->max_zone_append_size < max_zone_append_size)) - max_zone_append_size = - zone_info->max_zone_append_size; + zone_info->zone_size, zone_size); + return -EINVAL; } - nr_devices++; - } - - if (!zoned_devices && !incompat_zoned) - goto out; - - if (!zoned_devices && incompat_zoned) { - /* No zoned block device found on ZONED filesystem */ - btrfs_err(fs_info, - "zoned: no zoned devices found on a zoned filesystem"); - ret = -EINVAL; - goto out; - } - - if (zoned_devices && !incompat_zoned) { - btrfs_err(fs_info, - "zoned: mode not enabled but zoned device found"); - ret = -EINVAL; - goto out; - } - - if (zoned_devices != nr_devices) { - btrfs_err(fs_info, - "zoned: cannot mix zoned and regular devices"); - ret = -EINVAL; - goto out; + if (!max_zone_append_size || + (zone_info->max_zone_append_size && + zone_info->max_zone_append_size < max_zone_append_size)) + max_zone_append_size = zone_info->max_zone_append_size; } /* @@ -737,14 +753,12 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "zoned: zone size %llu not aligned to stripe %u", zone_size, BTRFS_STRIPE_LEN); - ret = -EINVAL; - goto out; + return -EINVAL; } if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { btrfs_err(fs_info, "zoned: mixed block groups not supported"); - ret = -EINVAL; - goto out; + return -EINVAL; } fs_info->zone_size = zone_size; @@ -760,11 +774,10 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) */ ret = btrfs_check_mountopts_zoned(fs_info); if (ret) - goto out; + return ret; btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); -out: - return ret; + return 0; } int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) @@ -1436,7 +1449,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } else if (map->num_stripes == num_conventional) { cache->alloc_offset = last_alloc; - cache->zone_is_active = 1; + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); goto out; } } @@ -1452,7 +1465,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } cache->alloc_offset = alloc_offsets[0]; cache->zone_capacity = caps[0]; - cache->zone_is_active = test_bit(0, active); + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); break; case BTRFS_BLOCK_GROUP_DUP: if (map->type & BTRFS_BLOCK_GROUP_DATA) { @@ -1486,7 +1500,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } } else { - cache->zone_is_active = test_bit(0, active); + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &cache->runtime_flags); } cache->alloc_offset = alloc_offsets[0]; cache->zone_capacity = min(caps[0], caps[1]); @@ -1530,7 +1546,7 @@ out: if (!ret) { cache->meta_write_pointer = cache->alloc_offset + cache->start; - if (cache->zone_is_active) { + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) { btrfs_get_block_group(cache); spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&cache->active_bg_list, @@ -1563,7 +1579,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ - cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->free_space_ctl->free_space = free; cache->zone_unusable = unusable; @@ -1871,7 +1886,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (block_group->zone_is_active) { + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; goto out_unlock; } @@ -1897,7 +1912,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) } /* Successfully activated all the zones */ - block_group->zone_is_active = 1; + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); space_info->active_total_bytes += block_group->length; spin_unlock(&block_group->lock); btrfs_try_granting_tickets(fs_info, space_info); @@ -1918,22 +1933,55 @@ out_unlock: return ret; } +static void wait_eb_writebacks(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + const u64 end = block_group->start + block_group->length; + struct radix_tree_iter iter; + struct extent_buffer *eb; + void __rcu **slot; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, + block_group->start >> fs_info->sectorsize_bits) { + eb = radix_tree_deref_slot(slot); + if (!eb) + continue; + if (radix_tree_deref_retry(eb)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + if (eb->start < block_group->start) + continue; + if (eb->start >= end) + break; + + slot = radix_tree_iter_resume(slot, &iter); + rcu_read_unlock(); + wait_on_extent_buffer_writeback(eb); + rcu_read_lock(); + } + rcu_read_unlock(); +} + static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct map_lookup *map; + const bool is_metadata = (block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); int ret = 0; int i; spin_lock(&block_group->lock); - if (!block_group->zone_is_active) { + if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } /* Check if we have unwritten allocated space */ - if ((block_group->flags & - (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && + if (is_metadata && block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { spin_unlock(&block_group->lock); return -EAGAIN; @@ -1958,6 +2006,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* No need to wait for NOCOW writers. Zoned mode does not allow that */ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, block_group->length); + /* Wait for extent buffers to be written. */ + if (is_metadata) + wait_eb_writebacks(block_group); spin_lock(&block_group->lock); @@ -1965,7 +2016,8 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ * Bail out if someone already deactivated the block group, or * allocated space is left in the block group. */ - if (!block_group->zone_is_active) { + if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &block_group->runtime_flags)) { spin_unlock(&block_group->lock); btrfs_dec_block_group_ro(block_group); return 0; @@ -1978,7 +2030,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ } } - block_group->zone_is_active = 0; + clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); block_group->alloc_offset = block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); @@ -2186,13 +2238,14 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); spin_lock(&block_group->lock); - if (!block_group->zoned_data_reloc_ongoing) + if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) goto out; /* All relocation extents are written. */ if (block_group->start + block_group->alloc_offset == logical + length) { /* Now, release this block group for further allocations. */ - block_group->zoned_data_reloc_ongoing = 0; + clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags); } out: @@ -2264,7 +2317,9 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, list) { if (!spin_trylock(&bg->lock)) continue; - if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) { + if (btrfs_zoned_bg_is_full(bg) || + test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, + &bg->runtime_flags)) { spin_unlock(&bg->lock); continue; } diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index e17462db3a84..8bd16d40b7c6 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -36,6 +36,7 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); +struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, @@ -103,6 +104,16 @@ static inline int btrfs_get_dev_zone_info(struct btrfs_device *device, static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { } +/* + * In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call + * into btrfs_clone_dev_zone_info() so it's safe to return NULL here. + */ +static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info( + struct btrfs_device *orig_dev) +{ + return NULL; +} + static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) { if (!btrfs_is_zoned(fs_info)) diff --git a/fs/buffer.c b/fs/buffer.c index 55e762a58eb6..d9c6d1fbb6dd 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -52,8 +52,8 @@ #include "internal.h" static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); -static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, - struct writeback_control *wbc); +static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -152,7 +152,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) /* * Default synchronous end-of-IO handler.. Just mark it up-to-date and - * unlock the buffer. This is what ll_rw_block uses too. + * unlock the buffer. */ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) { @@ -491,8 +491,8 @@ int inode_has_buffers(struct inode *inode) * all already-submitted IO to complete, but does not queue any new * writes to the disk. * - * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as - * you dirty the buffers, and then use osync_inode_buffers to wait for + * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer + * as you dirty the buffers, and then use osync_inode_buffers to wait for * completion. Any other dirty buffers which are not yet queued for * write will not be flushed to disk by the osync. */ @@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev, struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) - ll_rw_block(REQ_OP_WRITE, 1, &bh); + write_dirty_buffer(bh, 0); put_bh(bh); } } @@ -1342,23 +1342,12 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh); + bh_readahead(bh, REQ_RAHEAD); brelse(bh); } } EXPORT_SYMBOL(__breadahead); -void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size, - gfp_t gfp) -{ - struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); - if (likely(bh)) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh); - brelse(bh); - } -} -EXPORT_SYMBOL(__breadahead_gfp); - /** * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from @@ -1464,19 +1453,15 @@ EXPORT_SYMBOL(set_bh_page); static void discard_buffer(struct buffer_head * bh) { - unsigned long b_state, b_state_old; + unsigned long b_state; lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; - b_state = bh->b_state; - for (;;) { - b_state_old = cmpxchg(&bh->b_state, b_state, - (b_state & ~BUFFER_FLAGS_DISCARD)); - if (b_state_old == b_state) - break; - b_state = b_state_old; - } + b_state = READ_ONCE(bh->b_state); + do { + } while (!try_cmpxchg(&bh->b_state, &b_state, + b_state & ~BUFFER_FLAGS_DISCARD)); unlock_buffer(bh); } @@ -1817,7 +1802,7 @@ done: /* * The page was marked dirty, but the buffers were * clean. Someone wrote them back by hand with - * ll_rw_block/submit_bh. A rare case. + * write_dirty_buffer/submit_bh. A rare case. */ end_page_writeback(page); @@ -2033,7 +2018,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 1, &bh); + bh_read_nowait(bh, 0); *wait_bh++=bh; } } @@ -2352,7 +2337,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; struct page *page; - void *fsdata; + void *fsdata = NULL; int err; err = inode_newsize_ok(inode, size); @@ -2378,7 +2363,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); struct page *page; - void *fsdata; + void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; unsigned zerofrom, offset, len; @@ -2593,11 +2578,9 @@ int block_truncate_page(struct address_space *mapping, set_buffer_uptodate(bh); if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { - err = -EIO; - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); + err = bh_read(bh, 0); /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) + if (err < 0) goto unlock; } @@ -2673,8 +2656,8 @@ static void end_bio_bh_io_sync(struct bio *bio) bio_put(bio); } -static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, - struct writeback_control *wbc) +static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, + struct writeback_control *wbc) { const enum req_op op = opf & REQ_OP_MASK; struct bio *bio; @@ -2717,70 +2700,14 @@ static int submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, } submit_bio(bio); - return 0; } -int submit_bh(blk_opf_t opf, struct buffer_head *bh) +void submit_bh(blk_opf_t opf, struct buffer_head *bh) { - return submit_bh_wbc(opf, bh, NULL); + submit_bh_wbc(opf, bh, NULL); } EXPORT_SYMBOL(submit_bh); -/** - * ll_rw_block: low-level access to block devices (DEPRECATED) - * @opf: block layer request operation and flags. - * @nr: number of &struct buffer_heads in the array - * @bhs: array of pointers to &struct buffer_head - * - * ll_rw_block() takes an array of pointers to &struct buffer_heads, and - * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE. - * @opf contains flags modifying the detailed I/O behavior, most notably - * %REQ_RAHEAD. - * - * This function drops any buffer that it cannot get a lock on (with the - * BH_Lock state bit), any buffer that appears to be clean when doing a write - * request, and any buffer that appears to be up-to-date when doing read - * request. Further it marks as clean buffers that are processed for - * writing (the buffer cache won't assume that they are actually clean - * until the buffer gets unlocked). - * - * ll_rw_block sets b_end_io to simple completion handler that marks - * the buffer up-to-date (if appropriate), unlocks the buffer and wakes - * any waiters. - * - * All of the buffers must be for the same device, and must also be a - * multiple of the current approved size for the device. - */ -void ll_rw_block(const blk_opf_t opf, int nr, struct buffer_head *bhs[]) -{ - const enum req_op op = opf & REQ_OP_MASK; - int i; - - for (i = 0; i < nr; i++) { - struct buffer_head *bh = bhs[i]; - - if (!trylock_buffer(bh)) - continue; - if (op == REQ_OP_WRITE) { - if (test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - submit_bh(opf, bh); - continue; - } - } else { - if (!buffer_uptodate(bh)) { - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(opf, bh); - continue; - } - } - unlock_buffer(bh); - } -} -EXPORT_SYMBOL(ll_rw_block); - void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { lock_buffer(bh); @@ -2801,8 +2728,6 @@ EXPORT_SYMBOL(write_dirty_buffer); */ int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { - int ret = 0; - WARN_ON(atomic_read(&bh->b_count) < 1); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { @@ -2817,14 +2742,14 @@ int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(REQ_OP_WRITE | op_flags, bh); + submit_bh(REQ_OP_WRITE | op_flags, bh); wait_on_buffer(bh); - if (!ret && !buffer_uptodate(bh)) - ret = -EIO; + if (!buffer_uptodate(bh)) + return -EIO; } else { unlock_buffer(bh); } - return ret; + return 0; } EXPORT_SYMBOL(__sync_dirty_buffer); @@ -3029,29 +2954,69 @@ int bh_uptodate_or_lock(struct buffer_head *bh) EXPORT_SYMBOL(bh_uptodate_or_lock); /** - * bh_submit_read - Submit a locked buffer for reading + * __bh_read - Submit read for a locked buffer * @bh: struct buffer_head + * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ + * @wait: wait until reading finish * - * Returns zero on success and -EIO on error. + * Returns zero on success or don't wait, and -EIO on error. */ -int bh_submit_read(struct buffer_head *bh) +int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { - BUG_ON(!buffer_locked(bh)); + int ret = 0; - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - return 0; - } + BUG_ON(!buffer_locked(bh)); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return 0; - return -EIO; + submit_bh(REQ_OP_READ | op_flags, bh); + if (wait) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + ret = -EIO; + } + return ret; +} +EXPORT_SYMBOL(__bh_read); + +/** + * __bh_read_batch - Submit read for a batch of unlocked buffers + * @nr: entry number of the buffer batch + * @bhs: a batch of struct buffer_head + * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ + * @force_lock: force to get a lock on the buffer if set, otherwise drops any + * buffer that cannot lock. + * + * Returns zero on success or don't wait, and -EIO on error. + */ +void __bh_read_batch(int nr, struct buffer_head *bhs[], + blk_opf_t op_flags, bool force_lock) +{ + int i; + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + if (buffer_uptodate(bh)) + continue; + + if (force_lock) + lock_buffer(bh); + else + if (!trylock_buffer(bh)) + continue; + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(REQ_OP_READ | op_flags, bh); + } } -EXPORT_SYMBOL(bh_submit_read); +EXPORT_SYMBOL(__bh_read_batch); void __init buffer_init(void) { diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index facf2ebe464b..03ca8f2f657a 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -15,9 +15,8 @@ * file or directory. The caller must hold the inode lock. */ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); bool can_use = false; if (!(inode->i_flags & S_KERNEL_FILE)) { @@ -26,21 +25,18 @@ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, can_use = true; } else { trace_cachefiles_mark_failed(object, inode); - pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", - dentry, inode->i_ino); } return can_use; } static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); bool can_use; inode_lock(inode); - can_use = __cachefiles_mark_inode_in_use(object, dentry); + can_use = __cachefiles_mark_inode_in_use(object, inode); inode_unlock(inode); return can_use; } @@ -49,21 +45,17 @@ static bool cachefiles_mark_inode_in_use(struct cachefiles_object *object, * Unmark a backing inode. The caller must hold the inode lock. */ static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); - inode->i_flags &= ~S_KERNEL_FILE; trace_cachefiles_mark_inactive(object, inode); } static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object, - struct dentry *dentry) + struct inode *inode) { - struct inode *inode = d_backing_inode(dentry); - inode_lock(inode); - __cachefiles_unmark_inode_in_use(object, dentry); + __cachefiles_unmark_inode_in_use(object, inode); inode_unlock(inode); } @@ -77,14 +69,12 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, struct cachefiles_cache *cache = object->volume->cache; struct inode *inode = file_inode(file); - if (inode) { - cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry); + cachefiles_do_unmark_inode_in_use(object, inode); - if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { - atomic_long_add(inode->i_blocks, &cache->b_released); - if (atomic_inc_return(&cache->f_released)) - cachefiles_state_changed(cache); - } + if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { + atomic_long_add(inode->i_blocks, &cache->b_released); + if (atomic_inc_return(&cache->f_released)) + cachefiles_state_changed(cache); } } @@ -164,8 +154,11 @@ retry: inode_lock(d_inode(subdir)); inode_unlock(d_inode(dir)); - if (!__cachefiles_mark_inode_in_use(NULL, subdir)) + if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + subdir, d_inode(subdir)->i_ino); goto mark_error; + } inode_unlock(d_inode(subdir)); @@ -224,9 +217,7 @@ nomem_d_alloc: void cachefiles_put_directory(struct dentry *dir) { if (dir) { - inode_lock(dir->d_inode); - __cachefiles_unmark_inode_in_use(NULL, dir); - inode_unlock(dir->d_inode); + cachefiles_do_unmark_inode_in_use(NULL, d_inode(dir)); dput(dir); } } @@ -410,7 +401,7 @@ try_again: "Rename failed with error %d", ret); } - __cachefiles_unmark_inode_in_use(object, rep); + __cachefiles_unmark_inode_in_use(object, d_inode(rep)); unlock_rename(cache->graveyard, dir); dput(grave); _leave(" = 0"); @@ -451,84 +442,72 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) const struct cred *saved_cred; struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash]; struct file *file; - struct path path; + const struct path parentpath = { .mnt = cache->mnt, .dentry = fan }; uint64_t ni_size; long ret; cachefiles_begin_secure(cache, &saved_cred); - path.mnt = cache->mnt; ret = cachefiles_inject_write_error(); - if (ret == 0) - path.dentry = vfs_tmpfile(&init_user_ns, fan, S_IFREG, O_RDWR); - else - path.dentry = ERR_PTR(ret); - if (IS_ERR(path.dentry)) { - trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(path.dentry), + if (ret == 0) { + file = vfs_tmpfile_open(&init_user_ns, &parentpath, S_IFREG, + O_RDWR | O_LARGEFILE | O_DIRECT, + cache->cache_cred); + ret = PTR_ERR_OR_ZERO(file); + } + if (ret) { + trace_cachefiles_vfs_error(object, d_inode(fan), ret, cachefiles_trace_tmpfile_error); - if (PTR_ERR(path.dentry) == -EIO) + if (ret == -EIO) cachefiles_io_error_obj(object, "Failed to create tmpfile"); - file = ERR_CAST(path.dentry); - goto out; + goto err; } - trace_cachefiles_tmpfile(object, d_backing_inode(path.dentry)); + trace_cachefiles_tmpfile(object, file_inode(file)); - if (!cachefiles_mark_inode_in_use(object, path.dentry)) { - file = ERR_PTR(-EBUSY); - goto out_dput; - } + /* This is a newly created file with no other possible user */ + if (!cachefiles_mark_inode_in_use(object, file_inode(file))) + WARN_ON(1); ret = cachefiles_ondemand_init_object(object); - if (ret < 0) { - file = ERR_PTR(ret); - goto out_unuse; - } + if (ret < 0) + goto err_unuse; ni_size = object->cookie->object_size; ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE); if (ni_size > 0) { - trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size, + trace_cachefiles_trunc(object, file_inode(file), 0, ni_size, cachefiles_trunc_expand_tmpfile); ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_truncate(&path, ni_size); + ret = vfs_truncate(&file->f_path, ni_size); if (ret < 0) { trace_cachefiles_vfs_error( - object, d_backing_inode(path.dentry), ret, + object, file_inode(file), ret, cachefiles_trace_trunc_error); - file = ERR_PTR(ret); - goto out_unuse; + goto err_unuse; } } - file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_backing_inode(path.dentry), cache->cache_cred); - if (IS_ERR(file)) { - trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry), - PTR_ERR(file), - cachefiles_trace_open_error); - goto out_unuse; - } + ret = -EINVAL; if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { fput(file); pr_notice("Cache does not support read_iter and write_iter\n"); - file = ERR_PTR(-EINVAL); - goto out_unuse; + goto err_unuse; } - - goto out_dput; - -out_unuse: - cachefiles_do_unmark_inode_in_use(object, path.dentry); -out_dput: - dput(path.dentry); out: cachefiles_end_secure(cache, saved_cred); return file; + +err_unuse: + cachefiles_do_unmark_inode_in_use(object, file_inode(file)); + fput(file); +err: + file = ERR_PTR(ret); + goto out; } /* @@ -569,8 +548,11 @@ static bool cachefiles_open_file(struct cachefiles_object *object, _enter("%pd", dentry); - if (!cachefiles_mark_inode_in_use(object, dentry)) + if (!cachefiles_mark_inode_in_use(object, d_inode(dentry))) { + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + dentry, d_inode(dentry)->i_ino); return false; + } /* We need to open a file interface onto a data file now as we can't do * it on demand because writeback called from do_exit() sees @@ -624,7 +606,7 @@ check_failed: error_fput: fput(file); error: - cachefiles_do_unmark_inode_in_use(object, dentry); + cachefiles_do_unmark_inode_in_use(object, d_inode(dentry)); dput(dentry); return false; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 53cfe026b3ea..e54814d0c2f7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -754,6 +754,7 @@ void ceph_add_cap(struct inode *inode, cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; + wake_up_all(&ci->i_cap_wq); } /* @@ -2247,7 +2248,6 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; - unsigned int max_sessions; int ret, err = 0; spin_lock(&ci->i_unsafe_lock); @@ -2266,27 +2266,23 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) spin_unlock(&ci->i_unsafe_lock); /* - * The mdsc->max_sessions is unlikely to be changed - * mostly, here we will retry it by reallocating the - * sessions array memory to get rid of the mdsc->mutex - * lock. - */ -retry: - max_sessions = mdsc->max_sessions; - - /* * Trigger to flush the journal logs in all the relevant MDSes * manually, or in the worst case we must wait at most 5 seconds * to wait the journal logs to be flushed by the MDSes periodically. */ - if ((req1 || req2) && likely(max_sessions)) { - struct ceph_mds_session **sessions = NULL; - struct ceph_mds_session *s; + if (req1 || req2) { struct ceph_mds_request *req; + struct ceph_mds_session **sessions; + struct ceph_mds_session *s; + unsigned int max_sessions; int i; - sessions = kzalloc(max_sessions * sizeof(s), GFP_KERNEL); + mutex_lock(&mdsc->mutex); + max_sessions = mdsc->max_sessions; + + sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); if (!sessions) { + mutex_unlock(&mdsc->mutex); err = -ENOMEM; goto out; } @@ -2298,16 +2294,6 @@ retry: s = req->r_session; if (!s) continue; - if (unlikely(s->s_mds >= max_sessions)) { - spin_unlock(&ci->i_unsafe_lock); - for (i = 0; i < max_sessions; i++) { - s = sessions[i]; - if (s) - ceph_put_mds_session(s); - } - kfree(sessions); - goto retry; - } if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; @@ -2320,16 +2306,6 @@ retry: s = req->r_session; if (!s) continue; - if (unlikely(s->s_mds >= max_sessions)) { - spin_unlock(&ci->i_unsafe_lock); - for (i = 0; i < max_sessions; i++) { - s = sessions[i]; - if (s) - ceph_put_mds_session(s); - } - kfree(sessions); - goto retry; - } if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; @@ -2341,11 +2317,12 @@ retry: /* the auth MDS */ spin_lock(&ci->i_ceph_lock); if (ci->i_auth_cap) { - s = ci->i_auth_cap->session; - if (!sessions[s->s_mds]) - sessions[s->s_mds] = ceph_get_mds_session(s); + s = ci->i_auth_cap->session; + if (!sessions[s->s_mds]) + sessions[s->s_mds] = ceph_get_mds_session(s); } spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&mdsc->mutex); /* send flush mdlog request to MDSes */ for (i = 0; i < max_sessions; i++) { @@ -2759,13 +2736,17 @@ again: * on transition from wanted -> needed caps. This is needed * for WRBUFFER|WR -> WR to avoid a new WR sync write from * going before a prior buffered writeback happens. + * + * For RDCACHE|RD -> RD, there is not need to wait and we can + * just exclude the revoking caps and force to sync read. */ int not = want & ~(have & need); int revoking = implemented & ~have; + int exclude = revoking & not; dout("get_cap_refs %p have %s but not %s (revoking %s)\n", inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); - if ((revoking & not) == 0) { + if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && (need & CEPH_CAP_FILE_WR)) { @@ -2787,7 +2768,7 @@ again: snap_rwsem_locked = true; } if ((have & want) == want) - *got = need | want; + *got = need | (want & ~exclude); else *got = need; ceph_take_cap_refs(ci, *got, true); @@ -3550,6 +3531,9 @@ static void handle_cap_grant(struct inode *inode, check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ + /* If there is new caps, try to wake up the waiters */ + if (~cap->issued & newcaps) + wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e0fa66ac8b9f..f780e4e0d062 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -181,6 +181,7 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct inode *inode = __lookup_inode(sb, ino); + struct ceph_inode_info *ci = ceph_inode(inode); int err; if (IS_ERR(inode)) @@ -192,7 +193,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_PTR(err); } /* -ESTALE if inode as been unlinked and no file is open */ - if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { + if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) { iput(inode); return ERR_PTR(-ESTALE); } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 42351d7a0dd6..bad9eeb6a1a5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (nsplits != ci->i_fragtree_nsplits) { update = true; } else if (nsplits) { - i = prandom_u32() % nsplits; + i = prandom_u32_max(nsplits); id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; @@ -2192,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); inode->i_ctime = attr->ia_ctime; + inode_inc_iversion_raw(inode); } release &= issued; @@ -2356,6 +2357,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, goto out; } + req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR; req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { err = -ENOMEM; @@ -2447,6 +2449,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); + struct super_block *sb = inode->i_sb; struct ceph_inode_info *ci = ceph_inode(inode); u32 valid_mask = STATX_BASIC_STATS; int err = 0; @@ -2476,16 +2479,34 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, } if (ceph_snap(inode) == CEPH_NOSNAP) - stat->dev = inode->i_sb->s_dev; + stat->dev = sb->s_dev; else stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), - RBYTES)) + if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) { stat->size = ci->i_rbytes; - else + } else if (ceph_snap(inode) == CEPH_SNAPDIR) { + struct ceph_inode_info *pci; + struct ceph_snap_realm *realm; + struct inode *parent; + + parent = ceph_lookup_inode(sb, ceph_ino(inode)); + if (IS_ERR(parent)) + return PTR_ERR(parent); + + pci = ceph_inode(parent); + spin_lock(&pci->i_ceph_lock); + realm = pci->i_snap_realm; + if (realm) + stat->size = realm->num_snaps; + else + stat->size = 0; + spin_unlock(&pci->i_ceph_lock); + iput(parent); + } else { stat->size = ci->i_files + ci->i_subdirs; + } stat->blocks = 0; stat->blksize = 65536; /* diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 80f8b9ec1a31..26a0a8b9975e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2318,6 +2318,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_dir_item); INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; + req->r_feature_needed = -1; kref_init(&req->r_kref); RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); @@ -2916,6 +2917,16 @@ static void __do_request(struct ceph_mds_client *mdsc, dout("do_request mds%d session %p state %s\n", mds, session, ceph_session_state_name(session->s_state)); + + /* + * The old ceph will crash the MDSs when see unknown OPs + */ + if (req->r_feature_needed > 0 && + !test_bit(req->r_feature_needed, &session->s_features)) { + err = -EOPNOTSUPP; + goto out_session; + } + if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 256e3eada6c1..0598faa50e2e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -31,8 +31,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_METRIC_COLLECT, CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_OP_GETVXATTR, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -44,6 +45,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ + CEPHFS_FEATURE_OP_GETVXATTR, \ } /* @@ -336,6 +338,8 @@ struct ceph_mds_request { long long r_dir_ordered_cnt; int r_readdir_cache_idx; + int r_feature_needed; + struct ceph_cap_reservation r_caps_reservation; }; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 8d0a6d2c2da4..3fbabc98e1f7 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) return -1; /* pick */ - n = prandom_u32() % n; + n = prandom_u32_max(n); for (j = 0, i = 0; i < m->possible_max_rank; i++) { if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 864cdaa0d2bd..e4151852184e 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -763,7 +763,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_mds_snap_realm *ri; /* encoded */ __le64 *snaps; /* encoded */ __le64 *prior_parent_snaps; /* encoded */ - struct ceph_snap_realm *realm = NULL; + struct ceph_snap_realm *realm; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; int rebuild_snapcs; @@ -774,6 +774,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, dout("%s deletion=%d\n", __func__, deletion); more: + realm = NULL; rebuild_snapcs = 0; ceph_decode_need(&p, e, sizeof(*ri), bad); ri = p; diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index b401339f6e73..60399081046a 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -5,12 +5,99 @@ * Copyright (c) 2022, Ronnie Sahlberg <lsahlber@redhat.com> */ +#include <linux/namei.h> #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" #include "smb2proto.h" #include "cached_dir.h" +static struct cached_fid *init_cached_dir(const char *path); +static void free_cached_dir(struct cached_fid *cfid); + +static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, + const char *path, + bool lookup_only) +{ + struct cached_fid *cfid; + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (!strcmp(cfid->path, path)) { + /* + * If it doesn't have a lease it is either not yet + * fully cached or it may be in the process of + * being deleted due to a lease break. + */ + if (!cfid->has_lease) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + kref_get(&cfid->refcount); + spin_unlock(&cfids->cfid_list_lock); + return cfid; + } + } + if (lookup_only) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + if (cfids->num_entries >= MAX_CACHED_FIDS) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + cfid = init_cached_dir(path); + if (cfid == NULL) { + spin_unlock(&cfids->cfid_list_lock); + return NULL; + } + cfid->cfids = cfids; + cfids->num_entries++; + list_add(&cfid->entry, &cfids->entries); + cfid->on_list = true; + kref_get(&cfid->refcount); + spin_unlock(&cfids->cfid_list_lock); + return cfid; +} + +static struct dentry * +path_to_dentry(struct cifs_sb_info *cifs_sb, const char *path) +{ + struct dentry *dentry; + const char *s, *p; + char sep; + + sep = CIFS_DIR_SEP(cifs_sb); + dentry = dget(cifs_sb->root); + s = path; + + do { + struct inode *dir = d_inode(dentry); + struct dentry *child; + + if (!S_ISDIR(dir->i_mode)) { + dput(dentry); + dentry = ERR_PTR(-ENOTDIR); + break; + } + + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + p = s++; + /* next separator */ + while (*s && *s != sep) + s++; + + child = lookup_positive_unlocked(p, dentry, s - p); + dput(dentry); + dentry = child; + } while (!IS_ERR(dentry)); + return dentry; +} + /* * Open the and cache a directory handle. * If error then *cfid is not initialized. @@ -31,54 +118,57 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; struct kvec qi_iov[1]; int rc, flags = 0; - __le16 utf16_path = 0; /* Null - since an open of top of share */ + __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_II; struct cifs_fid *pfid; - struct dentry *dentry; + struct dentry *dentry = NULL; struct cached_fid *cfid; + struct cached_fids *cfids; - if (tcon == NULL || tcon->nohandlecache || + if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || is_smb1_server(tcon->ses->server)) return -EOPNOTSUPP; ses = tcon->ses; server = ses->server; + cfids = tcon->cfids; - if (cifs_sb->root == NULL) - return -ENOENT; + if (!server->ops->new_lease_key) + return -EIO; - if (strlen(path)) + if (cifs_sb->root == NULL) return -ENOENT; - dentry = cifs_sb->root; + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; - cfid = tcon->cfid; - mutex_lock(&cfid->fid_mutex); - if (cfid->is_valid) { - cifs_dbg(FYI, "found a cached root file handle\n"); + cfid = find_or_create_cached_dir(cfids, path, lookup_only); + if (cfid == NULL) { + kfree(utf16_path); + return -ENOENT; + } + /* + * At this point we either have a lease already and we can just + * return it. If not we are guaranteed to be the only thread accessing + * this cfid. + */ + if (cfid->has_lease) { *ret_cfid = cfid; - kref_get(&cfid->refcount); - mutex_unlock(&cfid->fid_mutex); + kfree(utf16_path); return 0; } /* * We do not hold the lock for the open because in case - * SMB2_open needs to reconnect, it will end up calling - * cifs_mark_open_files_invalid() which takes the lock again - * thus causing a deadlock + * SMB2_open needs to reconnect. + * This is safe because no other thread will be able to get a ref + * to the cfid until we have finished opening the file and (possibly) + * acquired a lease. */ - mutex_unlock(&cfid->fid_mutex); - - if (lookup_only) - return -ENOENT; - if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - if (!server->ops->new_lease_key) - return -EIO; - pfid = &cfid->fid; server->ops->new_lease_key(pfid); @@ -99,7 +189,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, oparms.reconnect = false; rc = SMB2_open_init(tcon, server, - &rqst[0], &oplock, &oparms, &utf16_path); + &rqst[0], &oplock, &oparms, utf16_path); if (rc) goto oshr_free; smb2_set_next_command(tcon, &rqst[0]); @@ -122,47 +212,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, rc = compound_send_recv(xid, ses, server, flags, 2, rqst, resp_buftype, rsp_iov); - mutex_lock(&cfid->fid_mutex); - - /* - * Now we need to check again as the cached root might have - * been successfully re-opened from a concurrent process - */ - - if (cfid->is_valid) { - /* work was already done */ - - /* stash fids for close() later */ - struct cifs_fid fid = { - .persistent_fid = pfid->persistent_fid, - .volatile_fid = pfid->volatile_fid, - }; - - /* - * caller expects this func to set the fid in cfid to valid - * cached root, so increment the refcount. - */ - kref_get(&cfid->refcount); - - mutex_unlock(&cfid->fid_mutex); - - if (rc == 0) { - /* close extra handle outside of crit sec */ - SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); - } - rc = 0; - goto oshr_free; - } - - /* Cached root is still invalid, continue normaly */ - if (rc) { if (rc == -EREMCHG) { tcon->need_reconnect = true; pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); } - goto oshr_exit; + goto oshr_free; } atomic_inc(&tcon->num_remote_opens); @@ -174,30 +230,18 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ - cfid->tcon = tcon; - cfid->is_valid = true; - cfid->dentry = dentry; - dget(dentry); - kref_init(&cfid->refcount); + if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) + goto oshr_free; - /* BB TBD check to see if oplock level check can be removed below */ - if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { - /* - * See commit 2f94a3125b87. Increment the refcount when we - * get a lease for root, release it if lease break occurs - */ - kref_get(&cfid->refcount); - cfid->has_lease = true; - smb2_parse_contexts(server, o_rsp, - &oparms.fid->epoch, - oparms.fid->lease_key, &oplock, - NULL, NULL); - } else - goto oshr_exit; + + smb2_parse_contexts(server, o_rsp, + &oparms.fid->epoch, + oparms.fid->lease_key, &oplock, + NULL, NULL); qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) - goto oshr_exit; + goto oshr_free; if (!smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), sizeof(struct smb2_file_all_info), @@ -205,15 +249,42 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, (char *)&cfid->file_all_info)) cfid->file_all_info_is_valid = true; + if (!path[0]) + dentry = dget(cifs_sb->root); + else { + dentry = path_to_dentry(cifs_sb, path); + if (IS_ERR(dentry)) { + rc = -ENOENT; + goto oshr_free; + } + } + cfid->dentry = dentry; + cfid->tcon = tcon; cfid->time = jiffies; + cfid->is_open = true; + cfid->has_lease = true; -oshr_exit: - mutex_unlock(&cfid->fid_mutex); oshr_free: + kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + spin_lock(&cfids->cfid_list_lock); + if (!cfid->has_lease) { + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + } + rc = -ENOENT; + } + spin_unlock(&cfids->cfid_list_lock); + if (rc) { + free_cached_dir(cfid); + cfid = NULL; + } + if (rc == 0) *ret_cfid = cfid; @@ -225,18 +296,22 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, struct cached_fid **ret_cfid) { struct cached_fid *cfid; + struct cached_fids *cfids = tcon->cfids; - cfid = tcon->cfid; + if (cfids == NULL) + return -ENOENT; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry == dentry) { - cifs_dbg(FYI, "found a cached root file handle by dentry\n"); - *ret_cfid = cfid; - kref_get(&cfid->refcount); - mutex_unlock(&cfid->fid_mutex); - return 0; + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (dentry && cfid->dentry == dentry) { + cifs_dbg(FYI, "found a cached root file handle by dentry\n"); + kref_get(&cfid->refcount); + *ret_cfid = cfid; + spin_unlock(&cfids->cfid_list_lock); + return 0; + } } - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfids->cfid_list_lock); return -ENOENT; } @@ -245,63 +320,50 @@ smb2_close_cached_fid(struct kref *ref) { struct cached_fid *cfid = container_of(ref, struct cached_fid, refcount); - struct cached_dirent *dirent, *q; - if (cfid->is_valid) { - cifs_dbg(FYI, "clear cached root file handle\n"); - SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, - cfid->fid.volatile_fid); + spin_lock(&cfid->cfids->cfid_list_lock); + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfid->cfids->num_entries--; } + spin_unlock(&cfid->cfids->cfid_list_lock); - /* - * We only check validity above to send SMB2_close, - * but we still need to invalidate these entries - * when this function is called - */ - cfid->is_valid = false; - cfid->file_all_info_is_valid = false; - cfid->has_lease = false; - if (cfid->dentry) { - dput(cfid->dentry); - cfid->dentry = NULL; - } - /* - * Delete all cached dirent names - */ - mutex_lock(&cfid->dirents.de_mutex); - list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) { - list_del(&dirent->entry); - kfree(dirent->name); - kfree(dirent); + dput(cfid->dentry); + cfid->dentry = NULL; + + if (cfid->is_open) { + SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid); } - cfid->dirents.is_valid = 0; - cfid->dirents.is_failed = 0; - cfid->dirents.ctx = NULL; - cfid->dirents.pos = 0; - mutex_unlock(&cfid->dirents.de_mutex); + free_cached_dir(cfid); } -void close_cached_dir(struct cached_fid *cfid) +void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon, + const char *name, struct cifs_sb_info *cifs_sb) { - mutex_lock(&cfid->fid_mutex); - kref_put(&cfid->refcount, smb2_close_cached_fid); - mutex_unlock(&cfid->fid_mutex); -} + struct cached_fid *cfid = NULL; + int rc; -void close_cached_dir_lease_locked(struct cached_fid *cfid) -{ + rc = open_cached_dir(xid, tcon, name, cifs_sb, true, &cfid); + if (rc) { + cifs_dbg(FYI, "no cached dir found for rmdir(%s)\n", name); + return; + } + spin_lock(&cfid->cfids->cfid_list_lock); if (cfid->has_lease) { cfid->has_lease = false; kref_put(&cfid->refcount, smb2_close_cached_fid); } + spin_unlock(&cfid->cfids->cfid_list_lock); + close_cached_dir(cfid); } -void close_cached_dir_lease(struct cached_fid *cfid) + +void close_cached_dir(struct cached_fid *cfid) { - mutex_lock(&cfid->fid_mutex); - close_cached_dir_lease_locked(cfid); - mutex_unlock(&cfid->fid_mutex); + kref_put(&cfid->refcount, smb2_close_cached_fid); } /* @@ -314,34 +376,60 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) struct cached_fid *cfid; struct cifs_tcon *tcon; struct tcon_link *tlink; + struct cached_fids *cfids; for (node = rb_first(root); node; node = rb_next(node)) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); tcon = tlink_tcon(tlink); if (IS_ERR(tcon)) continue; - cfid = tcon->cfid; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry) { + cfids = tcon->cfids; + if (cfids == NULL) + continue; + list_for_each_entry(cfid, &cfids->entries, entry) { dput(cfid->dentry); cfid->dentry = NULL; } - mutex_unlock(&cfid->fid_mutex); } } /* - * Invalidate and close all cached dirs when a TCON has been reset + * Invalidate all cached dirs when a TCON has been reset * due to a session loss. */ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) { - mutex_lock(&tcon->cfid->fid_mutex); - tcon->cfid->is_valid = false; - /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ - close_cached_dir_lease_locked(tcon->cfid); - memset(&tcon->cfid->fid, 0, sizeof(struct cifs_fid)); - mutex_unlock(&tcon->cfid->fid_mutex); + struct cached_fids *cfids = tcon->cfids; + struct cached_fid *cfid, *q; + LIST_HEAD(entry); + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + list_move(&cfid->entry, &entry); + cfids->num_entries--; + cfid->is_open = false; + cfid->on_list = false; + /* To prevent race with smb2_cached_lease_break() */ + kref_get(&cfid->refcount); + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + cancel_work_sync(&cfid->lease_break); + if (cfid->has_lease) { + /* + * We lease was never cancelled from the server so we + * need to drop the reference. + */ + spin_lock(&cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); + } + /* Drop the extra reference opened above*/ + kref_put(&cfid->refcount, smb2_close_cached_fid); + } } static void @@ -350,39 +438,121 @@ smb2_cached_lease_break(struct work_struct *work) struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_cached_dir_lease(cfid); + spin_lock(&cfid->cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfid->cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); } int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) { - if (tcon->cfid->is_valid && - !memcmp(lease_key, - tcon->cfid->fid.lease_key, - SMB2_LEASE_KEY_SIZE)) { - tcon->cfid->time = 0; - INIT_WORK(&tcon->cfid->lease_break, - smb2_cached_lease_break); - queue_work(cifsiod_wq, - &tcon->cfid->lease_break); - return true; + struct cached_fids *cfids = tcon->cfids; + struct cached_fid *cfid; + + if (cfids == NULL) + return false; + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(cfid, &cfids->entries, entry) { + if (cfid->has_lease && + !memcmp(lease_key, + cfid->fid.lease_key, + SMB2_LEASE_KEY_SIZE)) { + cfid->time = 0; + /* + * We found a lease remove it from the list + * so no threads can access it. + */ + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + + queue_work(cifsiod_wq, + &cfid->lease_break); + spin_unlock(&cfids->cfid_list_lock); + return true; + } } + spin_unlock(&cfids->cfid_list_lock); return false; } -struct cached_fid *init_cached_dir(void) +static struct cached_fid *init_cached_dir(const char *path) { struct cached_fid *cfid; - cfid = kzalloc(sizeof(*cfid), GFP_KERNEL); + cfid = kzalloc(sizeof(*cfid), GFP_ATOMIC); if (!cfid) return NULL; + cfid->path = kstrdup(path, GFP_ATOMIC); + if (!cfid->path) { + kfree(cfid); + return NULL; + } + + INIT_WORK(&cfid->lease_break, smb2_cached_lease_break); + INIT_LIST_HEAD(&cfid->entry); INIT_LIST_HEAD(&cfid->dirents.entries); mutex_init(&cfid->dirents.de_mutex); - mutex_init(&cfid->fid_mutex); + spin_lock_init(&cfid->fid_lock); + kref_init(&cfid->refcount); return cfid; } -void free_cached_dir(struct cifs_tcon *tcon) +static void free_cached_dir(struct cached_fid *cfid) +{ + struct cached_dirent *dirent, *q; + + dput(cfid->dentry); + cfid->dentry = NULL; + + /* + * Delete all cached dirent names + */ + list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) { + list_del(&dirent->entry); + kfree(dirent->name); + kfree(dirent); + } + + kfree(cfid->path); + cfid->path = NULL; + kfree(cfid); +} + +struct cached_fids *init_cached_dirs(void) +{ + struct cached_fids *cfids; + + cfids = kzalloc(sizeof(*cfids), GFP_KERNEL); + if (!cfids) + return NULL; + spin_lock_init(&cfids->cfid_list_lock); + INIT_LIST_HEAD(&cfids->entries); + return cfids; +} + +/* + * Called from tconInfoFree when we are tearing down the tcon. + * There are no active users or open files/directories at this point. + */ +void free_cached_dirs(struct cached_fids *cfids) { - kfree(tcon->cfid); + struct cached_fid *cfid, *q; + LIST_HEAD(entry); + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + cfid->on_list = false; + cfid->is_open = false; + list_move(&cfid->entry, &entry); + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + free_cached_dir(cfid); + } + + kfree(cfids); } diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h index bd262dc8b179..2f4e764c9ca9 100644 --- a/fs/cifs/cached_dir.h +++ b/fs/cifs/cached_dir.h @@ -31,13 +31,17 @@ struct cached_dirents { }; struct cached_fid { - bool is_valid:1; /* Do we have a useable root fid */ - bool file_all_info_is_valid:1; + struct list_head entry; + struct cached_fids *cfids; + const char *path; bool has_lease:1; + bool is_open:1; + bool on_list:1; + bool file_all_info_is_valid:1; unsigned long time; /* jiffies of when lease was taken */ struct kref refcount; struct cifs_fid fid; - struct mutex fid_mutex; + spinlock_t fid_lock; struct cifs_tcon *tcon; struct dentry *dentry; struct work_struct lease_break; @@ -45,8 +49,18 @@ struct cached_fid { struct cached_dirents dirents; }; -extern struct cached_fid *init_cached_dir(void); -extern void free_cached_dir(struct cifs_tcon *tcon); +#define MAX_CACHED_FIDS 16 +struct cached_fids { + /* Must be held when: + * - accessing the cfids->entries list + */ + spinlock_t cfid_list_lock; + int num_entries; + struct list_head entries; +}; + +extern struct cached_fids *init_cached_dirs(void); +extern void free_cached_dirs(struct cached_fids *cfids); extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, @@ -55,8 +69,10 @@ extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon, struct dentry *dentry, struct cached_fid **cfid); extern void close_cached_dir(struct cached_fid *cfid); -extern void close_cached_dir_lease(struct cached_fid *cfid); -extern void close_cached_dir_lease_locked(struct cached_fid *cfid); +extern void drop_cached_dir_by_name(const unsigned int xid, + struct cifs_tcon *tcon, + const char *name, + struct cifs_sb_info *cifs_sb); extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb); extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon); extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index c05477e28cff..90850da390ae 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -87,7 +87,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) { __u32 dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); - seq_printf(m, "%s Mounts: %d ", tcon->treeName, tcon->tc_count); + seq_printf(m, "%s Mounts: %d ", tcon->tree_name, tcon->tc_count); if (tcon->nativeFileSystem) seq_printf(m, "Type: %s ", tcon->nativeFileSystem); seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x\n\tPathComponentMax: %d Status: %d", @@ -601,7 +601,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { i++; - seq_printf(m, "\n%d) %s", i, tcon->treeName); + seq_printf(m, "\n%d) %s", i, tcon->tree_name); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); seq_printf(m, "\nSMBs: %d", diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index ee4ea2b60c0f..d44808263cfb 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -108,8 +108,8 @@ do { \ #define cifs_tcon_dbg_func(ratefunc, type, fmt, ...) \ do { \ const char *tn = ""; \ - if (tcon && tcon->treeName) \ - tn = tcon->treeName; \ + if (tcon && tcon->tree_name) \ + tn = tcon->tree_name; \ if ((type) & FYI && cifsFYI & CIFS_INFO) { \ pr_debug_ ## ratefunc("%s: %s " fmt, \ __FILE__, tn, ##__VA_ARGS__); \ @@ -150,7 +150,7 @@ do { \ #define cifs_tcon_dbg(type, fmt, ...) \ do { \ if (0) \ - pr_debug("%s " fmt, tcon->treeName, ##__VA_ARGS__); \ + pr_debug("%s " fmt, tcon->tree_name, ##__VA_ARGS__); \ } while (0) #define cifs_info(fmt, ...) \ diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index b87cbbe6d2d4..d86d78d5bfdc 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -91,6 +91,13 @@ struct smb3_notify { bool watch_tree; } __packed; +struct smb3_notify_info { + __u32 completion_filter; + bool watch_tree; + __u32 data_len; /* size of notify data below */ + __u8 notify_data[]; +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) @@ -100,6 +107,7 @@ struct smb3_notify { #define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) #define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) +#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info) #define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) /* diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 1e4c7cc5287f..7233c6a7e6d7 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -256,23 +256,23 @@ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon) const char *share_name; const char *net_name; - net_name = extract_hostname(tcon->treeName); + net_name = extract_hostname(tcon->tree_name); if (IS_ERR(net_name)) { int ret; ret = PTR_ERR(net_name); cifs_dbg(VFS, "%s: failed to extract host name from target '%s': %d\n", - __func__, tcon->treeName, ret); + __func__, tcon->tree_name, ret); return ERR_PTR(-EINVAL); } - share_name = extract_sharename(tcon->treeName); + share_name = extract_sharename(tcon->tree_name); if (IS_ERR(share_name)) { int ret; ret = PTR_ERR(share_name); cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n", - __func__, tcon->treeName, ret); + __func__, tcon->tree_name, ret); kfree(net_name); return ERR_PTR(-EINVAL); } @@ -335,14 +335,14 @@ static struct cifs_swn_reg *cifs_get_swn_reg(struct cifs_tcon *tcon) goto fail; } - reg->net_name = extract_hostname(tcon->treeName); + reg->net_name = extract_hostname(tcon->tree_name); if (IS_ERR(reg->net_name)) { ret = PTR_ERR(reg->net_name); cifs_dbg(VFS, "%s: failed to extract host name from target: %d\n", __func__, ret); goto fail_idr; } - reg->share_name = extract_sharename(tcon->treeName); + reg->share_name = extract_sharename(tcon->tree_name); if (IS_ERR(reg->share_name)) { ret = PTR_ERR(reg->share_name); cifs_dbg(VFS, "%s: failed to extract share name from target: %d\n", __func__, ret); diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 46f5718754f9..5db73c0f792a 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -103,26 +103,24 @@ static int cifs_calc_signature(struct smb_rqst *rqst, if (!rqst->rq_iov || !signature || !server) return -EINVAL; - rc = cifs_alloc_hash("md5", &server->secmech.md5, - &server->secmech.sdescmd5); + rc = cifs_alloc_hash("md5", &server->secmech.md5); if (rc) return -1; - rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + rc = crypto_shash_init(server->secmech.md5); if (rc) { cifs_dbg(VFS, "%s: Could not init md5\n", __func__); return rc; } - rc = crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(server->secmech.md5, server->session_key.response, server->session_key.len); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } - return __cifs_calc_signature(rqst, server, signature, - &server->secmech.sdescmd5->shash); + return __cifs_calc_signature(rqst, server, signature, server->secmech.md5); } /* must be called with server->srv_mutex held */ @@ -412,7 +410,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, wchar_t *domain; wchar_t *server; - if (!ses->server->secmech.sdeschmacmd5) { + if (!ses->server->secmech.hmacmd5) { cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } @@ -420,14 +418,14 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash, nls_cp); - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__); return rc; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); return rc; @@ -448,7 +446,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, memset(user, '\0', 2); } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, (char *)user, 2 * len); kfree(user); if (rc) { @@ -468,7 +466,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len, nls_cp); rc = - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + crypto_shash_update(ses->server->secmech.hmacmd5, (char *)domain, 2 * len); kfree(domain); if (rc) { @@ -488,7 +486,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp); rc = - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + crypto_shash_update(ses->server->secmech.hmacmd5, (char *)server, 2 * len); kfree(server); if (rc) { @@ -498,7 +496,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } } - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ntlmv2_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -518,12 +516,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE + offsetof(struct ntlmv2_resp, challenge.key[0])); - if (!ses->server->secmech.sdeschmacmd5) { + if (!ses->server->secmech.hmacmd5) { cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", @@ -531,7 +529,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) return rc; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); return rc; @@ -543,7 +541,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) else memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, ntlmv2->challenge.key, hash_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); @@ -551,7 +549,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) } /* Note that the MD5 digest over writes anon.challenge_key.key */ - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ntlmv2->ntlmv2_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -627,9 +625,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) cifs_server_lock(ses->server); - rc = cifs_alloc_hash("hmac(md5)", - &ses->server->secmech.hmacmd5, - &ses->server->secmech.sdeschmacmd5); + rc = cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5); if (rc) { goto unlock; } @@ -649,7 +645,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } /* now calculate the session key for NTLMv2 */ - rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", @@ -657,13 +653,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto unlock; } - rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + rc = crypto_shash_init(ses->server->secmech.hmacmd5); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); goto unlock; } - rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(ses->server->secmech.hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { @@ -671,7 +667,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto unlock; } - rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_final(ses->server->secmech.hmacmd5, ses->auth_key.response); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); @@ -679,7 +675,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) unlock: cifs_server_unlock(ses->server); setup_ntlmv2_rsp_ret: - kfree(tiblob); + kfree_sensitive(tiblob); return rc; } @@ -718,49 +714,19 @@ calc_seckey(struct cifs_ses *ses) void cifs_crypto_secmech_release(struct TCP_Server_Info *server) { - if (server->secmech.cmacaes) { - crypto_free_shash(server->secmech.cmacaes); - server->secmech.cmacaes = NULL; - } - - if (server->secmech.hmacsha256) { - crypto_free_shash(server->secmech.hmacsha256); - server->secmech.hmacsha256 = NULL; - } - - if (server->secmech.md5) { - crypto_free_shash(server->secmech.md5); - server->secmech.md5 = NULL; - } + cifs_free_hash(&server->secmech.aes_cmac); + cifs_free_hash(&server->secmech.hmacsha256); + cifs_free_hash(&server->secmech.md5); + cifs_free_hash(&server->secmech.sha512); + cifs_free_hash(&server->secmech.hmacmd5); - if (server->secmech.sha512) { - crypto_free_shash(server->secmech.sha512); - server->secmech.sha512 = NULL; + if (server->secmech.enc) { + crypto_free_aead(server->secmech.enc); + server->secmech.enc = NULL; } - if (server->secmech.hmacmd5) { - crypto_free_shash(server->secmech.hmacmd5); - server->secmech.hmacmd5 = NULL; + if (server->secmech.dec) { + crypto_free_aead(server->secmech.dec); + server->secmech.dec = NULL; } - - if (server->secmech.ccmaesencrypt) { - crypto_free_aead(server->secmech.ccmaesencrypt); - server->secmech.ccmaesencrypt = NULL; - } - - if (server->secmech.ccmaesdecrypt) { - crypto_free_aead(server->secmech.ccmaesdecrypt); - server->secmech.ccmaesdecrypt = NULL; - } - - kfree(server->secmech.sdesccmacaes); - server->secmech.sdesccmacaes = NULL; - kfree(server->secmech.sdeschmacsha256); - server->secmech.sdeschmacsha256 = NULL; - kfree(server->secmech.sdeschmacmd5); - server->secmech.sdeschmacmd5 = NULL; - kfree(server->secmech.sdescmd5); - server->secmech.sdescmd5 = NULL; - kfree(server->secmech.sdescsha512); - server->secmech.sdescsha512 = NULL; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8042d7280dec..fe220686bba4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -396,6 +396,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->epoch = 0; spin_lock_init(&cifs_inode->open_file_lock); generate_random_uuid(cifs_inode->lease_key); + cifs_inode->symlink_target = NULL; /* * Can not set i_flags here - they get immediately overwritten to zero @@ -412,7 +413,11 @@ cifs_alloc_inode(struct super_block *sb) static void cifs_free_inode(struct inode *inode) { - kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); + struct cifsInodeInfo *cinode = CIFS_I(inode); + + if (S_ISLNK(inode->i_mode)) + kfree(cinode->symlink_target); + kmem_cache_free(cifs_inode_cachep, cinode); } static void @@ -1138,6 +1143,30 @@ const struct inode_operations cifs_file_inode_ops = { .fiemap = cifs_fiemap, }; +const char *cifs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) +{ + char *target_path; + + target_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!target_path) + return ERR_PTR(-ENOMEM); + + spin_lock(&inode->i_lock); + if (likely(CIFS_I(inode)->symlink_target)) { + strscpy(target_path, CIFS_I(inode)->symlink_target, PATH_MAX); + } else { + kfree(target_path); + target_path = ERR_PTR(-EOPNOTSUPP); + } + spin_unlock(&inode->i_lock); + + if (!IS_ERR(target_path)) + set_delayed_call(done, kfree_link, target_path); + + return target_path; +} + const struct inode_operations cifs_symlink_inode_ops = { .get_link = cifs_get_link, .permission = cifs_permission, @@ -1297,8 +1326,11 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off, ssize_t rc; struct cifsFileInfo *cfile = dst_file->private_data; - if (cfile->swapfile) - return -EOPNOTSUPP; + if (cfile->swapfile) { + rc = -EOPNOTSUPP; + free_xid(xid); + return rc; + } rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff, len, flags); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 5b4a7a32bdc5..388b745a978e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 39 -#define CIFS_VERSION "2.39" +#define SMB3_PRODUCT_BUILD 40 +#define CIFS_VERSION "2.40" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ae7f571a7dba..1420acf987f0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -153,26 +153,16 @@ struct session_key { char *response; }; -/* crypto security descriptor definition */ -struct sdesc { - struct shash_desc shash; - char ctx[]; -}; - /* crypto hashing related structure/fields, not specific to a sec mech */ struct cifs_secmech { - struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ - struct crypto_shash *md5; /* md5 hash function */ - struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */ - struct crypto_shash *cmacaes; /* block-cipher based MAC function */ - struct crypto_shash *sha512; /* sha512 hash function */ - struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ - struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ - struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ - struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */ - struct sdesc *sdescsha512; /* ctxt to generate smb3.11 signing key */ - struct crypto_aead *ccmaesencrypt; /* smb3 encryption aead */ - struct crypto_aead *ccmaesdecrypt; /* smb3 decryption aead */ + struct shash_desc *hmacmd5; /* hmacmd5 hash function, for NTLMv2/CR1 hashes */ + struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */ + struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */ + struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */ + struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */ + + struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */ + struct crypto_aead *dec; /* smb3 decryption AEAD TFM (AES-CCM and AES-GCM) */ }; /* per smb session structure/fields */ @@ -195,6 +185,19 @@ struct cifs_cred { struct cifs_ace *aces; }; +struct cifs_open_info_data { + char *symlink_target; + union { + struct smb2_file_all_info fi; + struct smb311_posix_qinfo posix_fi; + }; +}; + +static inline void cifs_free_open_info(struct cifs_open_info_data *data) +{ + kfree(data->symlink_target); +} + /* ***************************************************************** * Except the CIFS PDUs themselves all the @@ -317,20 +320,20 @@ struct smb_version_operations { int (*is_path_accessible)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *); /* query path data from the server */ - int (*query_path_info)(const unsigned int, struct cifs_tcon *, - struct cifs_sb_info *, const char *, - FILE_ALL_INFO *, bool *, bool *); + int (*query_path_info)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); /* query file data from the server */ - int (*query_file_info)(const unsigned int, struct cifs_tcon *, - struct cifs_fid *, FILE_ALL_INFO *); + int (*query_file_info)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data); /* query reparse tag from srv to determine which type of special file */ int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); /* get server index number */ - int (*get_srv_inum)(const unsigned int, struct cifs_tcon *, - struct cifs_sb_info *, const char *, - u64 *uniqueid, FILE_ALL_INFO *); + int (*get_srv_inum)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, u64 *uniqueid, + struct cifs_open_info_data *data); /* set size by path */ int (*set_path_size)(const unsigned int, struct cifs_tcon *, const char *, __u64, struct cifs_sb_info *, bool); @@ -379,8 +382,8 @@ struct smb_version_operations { struct cifs_sb_info *, const char *, char **, bool); /* open a file for non-posix mounts */ - int (*open)(const unsigned int, struct cifs_open_parms *, - __u32 *, FILE_ALL_INFO *); + int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf); /* set fid protocol-specific info */ void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32); /* close a file */ @@ -451,7 +454,7 @@ struct smb_version_operations { int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *src_file, void __user *); int (*notify)(const unsigned int xid, struct file *pfile, - void __user *pbuf); + void __user *pbuf, bool return_changes); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -1133,6 +1136,7 @@ struct cifs_fattr { struct timespec64 cf_mtime; struct timespec64 cf_ctime; u32 cf_cifstag; + char *cf_symlink_target; }; /* @@ -1149,7 +1153,7 @@ struct cifs_tcon { struct list_head openFileList; spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ - char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ + char tree_name[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ char *nativeFileSystem; char *password; /* for share-level security */ __u32 tid; /* The 4 byte tree id */ @@ -1228,7 +1232,7 @@ struct cifs_tcon { struct fscache_volume *fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ - struct cached_fid *cfid; /* Cached root fid */ + struct cached_fids *cfids; /* BB add field for back pointer to sb struct(s)? */ #ifdef CONFIG_CIFS_DFS_UPCALL struct list_head ulist; /* cache update list */ @@ -1395,6 +1399,7 @@ struct cifsFileInfo { struct work_struct put; /* work for the final part of _put */ struct delayed_work deferred; bool deferred_close_scheduled; /* Flag to indicate close is scheduled */ + char *symlink_target; }; struct cifs_io_parms { @@ -1553,6 +1558,7 @@ struct cifsInodeInfo { struct list_head deferred_closes; /* list of deferred closes */ spinlock_t deferred_lock; /* protection on deferred list */ bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */ + char *symlink_target; }; static inline struct cifsInodeInfo * @@ -2121,4 +2127,14 @@ static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses) return sizeof(ses->workstation_name); } +static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const FILE_ALL_INFO *src) +{ + memcpy(dst, src, (size_t)((u8 *)&src->AccessFlags - (u8 *)src)); + dst->AccessFlags = src->AccessFlags; + dst->CurrentByteOffset = src->CurrentByteOffset; + dst->Mode = src->Mode; + dst->AlignmentRequirement = src->AlignmentRequirement; + dst->FileNameLength = src->FileNameLength; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index aeba371c4c70..d1abaeea974a 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -483,7 +483,7 @@ put_bcc(__u16 count, struct smb_hdr *hdr) typedef struct negotiate_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; - unsigned char DialectsArray[1]; + unsigned char DialectsArray[]; } __attribute__((packed)) NEGOTIATE_REQ; #define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */ @@ -508,13 +508,14 @@ typedef struct negotiate_rsp { __u8 EncryptionKeyLength; __u16 ByteCount; union { - unsigned char EncryptionKey[1]; /* cap extended security off */ + /* cap extended security off */ + DECLARE_FLEX_ARRAY(unsigned char, EncryptionKey); /* followed by Domain name - if extended security is off */ /* followed by 16 bytes of server GUID */ /* then security blob if cap_extended_security negotiated */ struct { unsigned char GUID[SMB1_CLIENT_GUID_SIZE]; - unsigned char SecurityBlob[1]; + unsigned char SecurityBlob[]; } __attribute__((packed)) extended_response; } __attribute__((packed)) u; } __attribute__((packed)) NEGOTIATE_RSP; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3bc94bcc7177..83e83d8beabb 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -182,10 +182,9 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile, extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); extern void cifs_down_write(struct rw_semaphore *sem); -extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, - struct file *file, - struct tcon_link *tlink, - __u32 oplock); +struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, + struct tcon_link *tlink, __u32 oplock, + const char *symlink_target); extern int cifs_posix_open(const char *full_path, struct inode **inode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *oplock, __u16 *netfid, @@ -200,9 +199,9 @@ extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); extern struct inode *cifs_iget(struct super_block *sb, struct cifs_fattr *fattr); -extern int cifs_get_inode_info(struct inode **inode, const char *full_path, - FILE_ALL_INFO *data, struct super_block *sb, - int xid, const struct cifs_fid *fid); +int cifs_get_inode_info(struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct super_block *sb, int xid, + const struct cifs_fid *fid); extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path, struct super_block *sb, unsigned int xid); extern int cifs_get_inode_info_unix(struct inode **pinode, @@ -598,9 +597,8 @@ struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); -int cifs_alloc_hash(const char *name, struct crypto_shash **shash, - struct sdesc **sdesc); -void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc); +int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); +void cifs_free_hash(struct shash_desc **sdesc); extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, unsigned int *len, unsigned int *offset); @@ -639,7 +637,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses, int cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); int -SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon); +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7aa91e272027..1724066c1536 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -465,7 +465,7 @@ CIFSSMBNegotiate(const unsigned int xid, for (i = 0; i < CIFS_NUM_PROT; i++) { size_t len = strlen(protocols[i].name) + 1; - memcpy(pSMB->DialectsArray+count, protocols[i].name, len); + memcpy(&pSMB->DialectsArray[count], protocols[i].name, len); count += len; } inc_rfc1001_len(pSMB, count); @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, remap); } rename_info->target_name_len = cpu_to_le32(2 * len_of_str); - count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str); + count = sizeof(struct set_file_rename) + (2 * len_of_str); byte_count += count; pSMB->DataCount = cpu_to_le16(count); pSMB->TotalDataCount = pSMB->DataCount; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7ae6f2c08153..9db9527c61cf 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -155,7 +155,7 @@ static void smb2_query_server_interfaces(struct work_struct *work) /* * query server network interfaces, in case they change */ - rc = SMB3_request_interfaces(0, tcon); + rc = SMB3_request_interfaces(0, tcon, false); if (rc) { cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); @@ -311,7 +311,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) } server->sequence_number = 0; server->session_estab = false; - kfree(server->session_key.response); + kfree_sensitive(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; server->lstrp = jiffies; @@ -1580,10 +1580,11 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) cifs_crypto_secmech_release(server); - kfree(server->session_key.response); + kfree_sensitive(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; kfree(server->hostname); + server->hostname = NULL; task = xchg(&server->tsk, NULL); if (task) @@ -1940,7 +1941,8 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&ses->ses_lock); cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); - cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE"); + cifs_dbg(FYI, + "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->tree_name : "NONE"); spin_lock(&cifs_tcp_ses_lock); if (--ses->ses_count > 0) { @@ -2293,7 +2295,7 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { if (tcon->status == TID_EXITING) return 0; - if (strncmp(tcon->treeName, ctx->UNC, MAX_TREE_SIZE)) + if (strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE)) return 0; if (tcon->seal != ctx->seal) return 0; @@ -2831,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * sessinit is sent but no second negprot */ struct rfc1002_session_packet *ses_init_buf; + unsigned int req_noscope_len; struct smb_hdr *smb_buf; + ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), GFP_KERNEL); + if (ses_init_buf) { ses_init_buf->trailer.session_req.called_len = 32; @@ -2869,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) ses_init_buf->trailer.session_req.scope2 = 0; smb_buf = (struct smb_hdr *)ses_init_buf; - /* sizeof RFC1002_SESSION_REQUEST with no scope */ - smb_buf->smb_buf_length = cpu_to_be32(0x81000044); + /* sizeof RFC1002_SESSION_REQUEST with no scopes */ + req_noscope_len = sizeof(struct rfc1002_session_packet) - 2; + + /* == cpu_to_be32(0x81000044) */ + smb_buf->smb_buf_length = + cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len); rc = smb_send(server, smb_buf, 0x44); kfree(ses_init_buf); /* @@ -3846,9 +3855,13 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) uuid_copy(&cifs_sb->dfs_mount_id, &mnt_ctx.mount_id); out: - free_xid(mnt_ctx.xid); cifs_try_adding_channels(cifs_sb, mnt_ctx.ses); - return mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon); + rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon); + if (rc) + goto error; + + free_xid(mnt_ctx.xid); + return rc; error: dfs_cache_put_refsrv_sessions(&mnt_ctx.mount_id); @@ -3875,8 +3888,12 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; } + rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon); + if (rc) + goto error; + free_xid(mnt_ctx.xid); - return mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon); + return rc; error: mount_put_conns(&mnt_ctx); @@ -3921,12 +3938,11 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, pSMB->AndXCommand = 0xFF; pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); bcc_ptr = &pSMB->Password[0]; - if (tcon->pipe || (ses->server->sec_mode & SECMODE_USER)) { - pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ - *bcc_ptr = 0; /* password is null byte */ - bcc_ptr++; /* skip password */ - /* already aligned so no need to do it below */ - } + + pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ + *bcc_ptr = 0; /* password is null byte */ + bcc_ptr++; /* skip password */ + /* already aligned so no need to do it below */ if (ses->server->sign) smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; @@ -3989,7 +4005,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, } bcc_ptr += length + 1; bytes_left -= (length + 1); - strscpy(tcon->treeName, tree, sizeof(tcon->treeName)); + strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name)); /* mostly informational -- no need to fail on error here */ kfree(tcon->nativeFileSystem); @@ -4134,7 +4150,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (ses->auth_key.response) { cifs_dbg(FYI, "Free previous auth_key.response = %p\n", ses->auth_key.response); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; ses->auth_key.len = 0; } @@ -4197,7 +4213,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->local_nls = cifs_sb->local_nls; ctx->linux_uid = fsuid; ctx->cred_uid = fsuid; - ctx->UNC = master_tcon->treeName; + ctx->UNC = master_tcon->tree_name; ctx->retry = master_tcon->retry; ctx->nocase = master_tcon->nocase; ctx->nohandlecache = master_tcon->nohandlecache; @@ -4663,7 +4679,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */ if (!server->current_fullpath || dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) { - rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, cifs_sb->local_nls); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls); goto out; } @@ -4707,7 +4723,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru tcon->status = TID_IN_TCON; spin_unlock(&tcon->tc_lock); - rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, nlsc); if (rc) { spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index a9b6c3eba6de..e70915ad7541 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -98,7 +98,7 @@ static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const c get_ipc_unc(path, unc, sizeof(unc)); for (; *ses; ses++) { - if (!strcasecmp(unc, (*ses)->tcon_ipc->treeName)) + if (!strcasecmp(unc, (*ses)->tcon_ipc->tree_name)) return *ses; } return ERR_PTR(-ENOENT); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 08f7392716e2..8b1c37158556 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -50,7 +50,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s } if (add_treename) - dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1); else dfsplen = 0; @@ -59,7 +59,7 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s return full_path; if (dfsplen) - memcpy(full_path, tcon->treeName, dfsplen); + memcpy(full_path, tcon->tree_name, dfsplen); full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb); memcpy(full_path + dfsplen + 1, ctx->prepath, pplen); convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); @@ -93,7 +93,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, return ERR_PTR(-ENOMEM); if (prefix) - dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); + dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1); else dfsplen = 0; @@ -123,7 +123,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, } if (dfsplen) { s -= dfsplen; - memcpy(s, tcon->treeName, dfsplen); + memcpy(s, tcon->tree_name, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { int i; for (i = 0; i < dfsplen; i++) { @@ -165,10 +165,9 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon) /* Inode operations in similar order to how they appear in Linux file fs.h */ -static int -cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, - struct tcon_link *tlink, unsigned oflags, umode_t mode, - __u32 *oplock, struct cifs_fid *fid) +static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, + struct tcon_link *tlink, unsigned int oflags, umode_t mode, __u32 *oplock, + struct cifs_fid *fid, struct cifs_open_info_data *buf) { int rc = -ENOENT; int create_options = CREATE_NOT_DIR; @@ -177,7 +176,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, struct cifs_tcon *tcon = tlink_tcon(tlink); const char *full_path; void *page = alloc_dentry_path(); - FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; int disposition; struct TCP_Server_Info *server = tcon->ses->server; @@ -290,12 +288,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, goto out; } - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto out; - } - /* * if we're not using unix extensions, see if we need to set * ATTR_READONLY on the create call @@ -364,8 +356,7 @@ cifs_create_get_file_info: { #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* TODO: Add support for calling POSIX query info here, but passing in fid */ - rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, - xid, fid); + rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb, xid, fid); if (newinode) { if (server->ops->set_lease_key) server->ops->set_lease_key(newinode, fid); @@ -402,7 +393,6 @@ cifs_create_set_dentry: d_add(direntry, newinode); out: - kfree(buf); free_dentry_path(page); return rc; @@ -423,10 +413,11 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; __u32 oplock; struct cifsFileInfo *file_info; + struct cifs_open_info_data buf = {}; if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) return -EIO; @@ -484,8 +475,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, cifs_add_pending_open(&fid, tlink, &open); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid); - + &oplock, &fid, &buf); if (rc) { cifs_del_pending_open(&open); goto out; @@ -510,7 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, file->f_op = &cifs_file_direct_ops; } - file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); + file_info = cifs_new_fileinfo(&fid, file, tlink, oplock, buf.symlink_target); if (file_info == NULL) { if (server->ops->close) server->ops->close(xid, tcon, &fid); @@ -526,6 +516,7 @@ out: cifs_put_tlink(tlink); out_free_xid: free_xid(xid); + cifs_free_open_info(&buf); return rc; } @@ -547,12 +538,15 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, struct TCP_Server_Info *server; struct cifs_fid fid; __u32 oplock; + struct cifs_open_info_data buf = {}; cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n", inode, direntry, direntry); - if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) - return -EIO; + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) { + rc = -EIO; + goto out_free_xid; + } tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); rc = PTR_ERR(tlink); @@ -565,11 +559,11 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, if (server->ops->new_lease_key) server->ops->new_lease_key(&fid); - rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid); + rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, &oplock, &fid, &buf); if (!rc && server->ops->close) server->ops->close(xid, tcon, &fid); + cifs_free_open_info(&buf); cifs_put_tlink(tlink); out_free_xid: free_xid(xid); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6f38b134a346..cd9698209930 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -209,16 +209,14 @@ posix_open_ret: } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -static int -cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, - struct cifs_fid *fid, unsigned int xid) +static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, + struct cifs_fid *fid, unsigned int xid, struct cifs_open_info_data *buf) { int rc; int desired_access; int disposition; int create_options = CREATE_NOT_DIR; - FILE_ALL_INFO *buf; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; @@ -255,10 +253,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci /* BB pass O_SYNC flag through on file attributes .. BB */ - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (!buf) - return -ENOMEM; - /* O_SYNC also has bit for O_DSYNC so following check picks up either */ if (f_flags & O_SYNC) create_options |= CREATE_WRITE_THROUGH; @@ -276,9 +270,8 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci oparms.reconnect = false; rc = server->ops->open(xid, &oparms, oplock, buf); - if (rc) - goto out; + return rc; /* TODO: Add support for calling posix query info but with passing in fid */ if (tcon->unix_ext) @@ -294,8 +287,6 @@ cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *ci rc = -EOPENSTALE; } -out: - kfree(buf); return rc; } @@ -325,9 +316,9 @@ cifs_down_write(struct rw_semaphore *sem) static void cifsFileInfo_put_work(struct work_struct *work); -struct cifsFileInfo * -cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, - struct tcon_link *tlink, __u32 oplock) +struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, + struct tcon_link *tlink, __u32 oplock, + const char *symlink_target) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); @@ -347,6 +338,15 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, return NULL; } + if (symlink_target) { + cfile->symlink_target = kstrdup(symlink_target, GFP_KERNEL); + if (!cfile->symlink_target) { + kfree(fdlocks); + kfree(cfile); + return NULL; + } + } + INIT_LIST_HEAD(&fdlocks->locks); fdlocks->cfile = cfile; cfile->llist = fdlocks; @@ -440,6 +440,7 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); cifs_sb_deactive(sb); + kfree(cifs_file->symlink_target); kfree(cifs_file); } @@ -488,7 +489,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, struct cifsInodeInfo *cifsi = CIFS_I(inode); struct super_block *sb = inode->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; bool oplock_break_cancelled; @@ -570,8 +571,9 @@ int cifs_open(struct inode *inode, struct file *file) void *page; const char *full_path; bool posix_open_ok = false; - struct cifs_fid fid; + struct cifs_fid fid = {}; struct cifs_pending_open open; + struct cifs_open_info_data data = {}; xid = get_xid(); @@ -662,15 +664,15 @@ int cifs_open(struct inode *inode, struct file *file) if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &fid); - rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, - file->f_flags, &oplock, &fid, xid); + rc = cifs_nt_open(full_path, inode, cifs_sb, tcon, file->f_flags, &oplock, &fid, + xid, &data); if (rc) { cifs_del_pending_open(&open); goto out; } } - cfile = cifs_new_fileinfo(&fid, file, tlink, oplock); + cfile = cifs_new_fileinfo(&fid, file, tlink, oplock, data.symlink_target); if (cfile == NULL) { if (server->ops->close) server->ops->close(xid, tcon, &fid); @@ -712,6 +714,7 @@ out: free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); + cifs_free_open_info(&data); return rc; } @@ -1882,11 +1885,13 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) struct cifsFileInfo *cfile; __u32 type; - rc = -EACCES; xid = get_xid(); - if (!(fl->fl_flags & FL_FLOCK)) - return -ENOLCK; + if (!(fl->fl_flags & FL_FLOCK)) { + rc = -ENOLCK; + free_xid(xid); + return rc; + } cfile = (struct cifsFileInfo *)file->private_data; tcon = tlink_tcon(cfile->tlink); @@ -1905,8 +1910,9 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl) * if no lock or unlock then nothing to do since we do not * know what it is */ + rc = -EOPNOTSUPP; free_xid(xid); - return -EOPNOTSUPP; + return rc; } rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock, @@ -2428,12 +2434,16 @@ cifs_writev_complete(struct work_struct *work) struct cifs_writedata * cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) { + struct cifs_writedata *writedata = NULL; struct page **pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (pages) - return cifs_writedata_direct_alloc(pages, complete); + if (pages) { + writedata = cifs_writedata_direct_alloc(pages, complete); + if (!writedata) + kvfree(pages); + } - return NULL; + return writedata; } struct cifs_writedata * @@ -3293,6 +3303,9 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, cifs_uncached_writev_complete); if (!wdata) { rc = -ENOMEM; + for (i = 0; i < nr_pages; i++) + put_page(pagevec[i]); + kvfree(pagevec); add_credits_and_wake_if(server, credits, 0); break; } @@ -4271,6 +4284,15 @@ static ssize_t __cifs_readv( len = ctx->len; } + if (direct) { + rc = filemap_write_and_wait_range(file->f_inode->i_mapping, + offset, offset + len - 1); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -EAGAIN; + } + } + /* grab a lock here due to read response handlers can access ctx */ mutex_lock(&ctx->aio_mutex); diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 0e13dec86b25..45119597c765 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -791,6 +791,13 @@ do { \ cifs_sb->ctx->field = NULL; \ } while (0) +#define STEAL_STRING_SENSITIVE(cifs_sb, ctx, field) \ +do { \ + kfree_sensitive(ctx->field); \ + ctx->field = cifs_sb->ctx->field; \ + cifs_sb->ctx->field = NULL; \ +} while (0) + static int smb3_reconfigure(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); @@ -811,7 +818,7 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); - STEAL_STRING(cifs_sb, ctx, password); + STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); STEAL_STRING(cifs_sb, ctx, iocharset); @@ -1162,7 +1169,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } break; case Opt_pass: - kfree(ctx->password); + kfree_sensitive(ctx->password); ctx->password = NULL; if (strlen(param->string) == 0) break; @@ -1470,6 +1477,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, return 0; cifs_parse_mount_err: + kfree_sensitive(ctx->password); return -EINVAL; } diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 23ef56f55ce5..a1751b956318 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -45,7 +45,7 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) memset(&key, 0, sizeof(key)); - sharename = extract_sharename(tcon->treeName); + sharename = extract_sharename(tcon->tree_name); if (IS_ERR(sharename)) { cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); return -EINVAL; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index bac08c20f559..4e2ca3c6e5c0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -210,6 +210,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) */ inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9; } + + if (S_ISLNK(fattr->cf_mode)) { + kfree(cifs_i->symlink_target); + cifs_i->symlink_target = fattr->cf_symlink_target; + fattr->cf_symlink_target = NULL; + } spin_unlock(&inode->i_lock); if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) @@ -347,13 +353,22 @@ cifs_get_file_info_unix(struct file *filp) int rc; unsigned int xid; FILE_UNIX_BASIC_INFO find_data; - struct cifs_fattr fattr; + struct cifs_fattr fattr = {}; struct inode *inode = file_inode(filp); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); xid = get_xid(); + + if (cfile->symlink_target) { + fattr.cf_symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!fattr.cf_symlink_target) { + rc = -ENOMEM; + goto cifs_gfiunix_out; + } + } + rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->fid.netfid, &find_data); if (!rc) { cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); @@ -378,6 +393,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, FILE_UNIX_BASIC_INFO find_data; struct cifs_fattr fattr; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -387,10 +403,12 @@ int cifs_get_inode_info_unix(struct inode **pinode, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; /* could have done a find first instead but this returns more info */ rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, cifs_sb->local_nls, cifs_remap(cifs_sb)); + cifs_dbg(FYI, "%s: query path info: rc = %d\n", __func__, rc); cifs_put_tlink(tlink); if (!rc) { @@ -410,6 +428,17 @@ int cifs_get_inode_info_unix(struct inode **pinode, cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } + if (S_ISLNK(fattr.cf_mode) && !fattr.cf_symlink_target) { + if (!server->ops->query_symlink) + return -EOPNOTSUPP; + rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, + &fattr.cf_symlink_target, false); + if (rc) { + cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc); + goto cgiiu_exit; + } + } + if (*pinode == NULL) { /* get new inode */ cifs_fill_uniqueid(sb, &fattr); @@ -432,6 +461,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, } cgiiu_exit: + kfree(fattr.cf_symlink_target); return rc; } #else @@ -601,10 +631,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, } /* Fill a cifs_fattr struct with info from POSIX info struct */ -static void -smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo *info, - struct super_block *sb, bool adjust_tz, bool symlink) +static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct super_block *sb, bool adjust_tz, bool symlink) { + struct smb311_posix_qinfo *info = &data->posix_fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -639,6 +669,8 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo * if (symlink) { fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; + fattr->cf_symlink_target = data->symlink_target; + data->symlink_target = NULL; } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode |= S_IFDIR; fattr->cf_dtype = DT_DIR; @@ -655,13 +687,11 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo * fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); } - -/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ -static void -cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, - struct super_block *sb, bool adjust_tz, - bool symlink, u32 reparse_tag) +static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, + struct super_block *sb, bool adjust_tz, bool symlink, + u32 reparse_tag) { + struct smb2_file_all_info *info = &data->fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -703,7 +733,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, } else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) { fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; fattr->cf_dtype = DT_BLK; - } else if (symlink) { /* TODO add more reparse tag checks */ + } else if (symlink || reparse_tag == IO_REPARSE_TAG_SYMLINK || + reparse_tag == IO_REPARSE_TAG_NFS) { fattr->cf_mode = S_IFLNK; fattr->cf_dtype = DT_LNK; } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { @@ -735,6 +766,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, } } + if (S_ISLNK(fattr->cf_mode)) { + fattr->cf_symlink_target = data->symlink_target; + data->symlink_target = NULL; + } + fattr->cf_uid = cifs_sb->ctx->linux_uid; fattr->cf_gid = cifs_sb->ctx->linux_gid; } @@ -744,23 +780,28 @@ cifs_get_file_info(struct file *filp) { int rc; unsigned int xid; - FILE_ALL_INFO find_data; + struct cifs_open_info_data data = {}; struct cifs_fattr fattr; struct inode *inode = file_inode(filp); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; + bool symlink = false; + u32 tag = 0; if (!server->ops->query_file_info) return -ENOSYS; xid = get_xid(); - rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); + rc = server->ops->query_file_info(xid, tcon, cfile, &data); switch (rc) { case 0: /* TODO: add support to query reparse tag */ - cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false, - false, 0 /* no reparse tag */); + if (data.symlink_target) { + symlink = true; + tag = IO_REPARSE_TAG_SYMLINK; + } + cifs_open_info_to_fattr(&fattr, &data, inode->i_sb, false, symlink, tag); break; case -EREMOTE: cifs_create_dfs_fattr(&fattr, inode->i_sb); @@ -789,6 +830,7 @@ cifs_get_file_info(struct file *filp) /* if filetype is different, return error */ rc = cifs_fattr_to_inode(inode, &fattr); cgfi_exit: + cifs_free_open_info(&data); free_xid(xid); return rc; } @@ -860,14 +902,9 @@ cifs_backup_query_path_info(int xid, } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -static void -cifs_set_fattr_ino(int xid, - struct cifs_tcon *tcon, - struct super_block *sb, - struct inode **inode, - const char *full_path, - FILE_ALL_INFO *data, - struct cifs_fattr *fattr) +static void cifs_set_fattr_ino(int xid, struct cifs_tcon *tcon, struct super_block *sb, + struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct cifs_fattr *fattr) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct TCP_Server_Info *server = tcon->ses->server; @@ -885,11 +922,8 @@ cifs_set_fattr_ino(int xid, * If we have an inode pass a NULL tcon to ensure we don't * make a round trip to the server. This only works for SMB2+. */ - rc = server->ops->get_srv_inum(xid, - *inode ? NULL : tcon, - cifs_sb, full_path, - &fattr->cf_uniqueid, - data); + rc = server->ops->get_srv_inum(xid, *inode ? NULL : tcon, cifs_sb, full_path, + &fattr->cf_uniqueid, data); if (rc) { /* * If that fails reuse existing ino or generate one @@ -913,7 +947,7 @@ cifs_set_fattr_ino(int xid, } else { /* make an ino by hashing the UNC */ fattr->cf_flags |= CIFS_FATTR_FAKE_ROOT_INO; - fattr->cf_uniqueid = simple_hashstr(tcon->treeName); + fattr->cf_uniqueid = simple_hashstr(tcon->tree_name); } } } @@ -923,14 +957,10 @@ static inline bool is_inode_cache_good(struct inode *ino) return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0; } -int -cifs_get_inode_info(struct inode **inode, - const char *full_path, - FILE_ALL_INFO *in_data, - struct super_block *sb, int xid, - const struct cifs_fid *fid) +int cifs_get_inode_info(struct inode **inode, const char *full_path, + struct cifs_open_info_data *data, struct super_block *sb, int xid, + const struct cifs_fid *fid) { - struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct tcon_link *tlink; @@ -938,8 +968,7 @@ cifs_get_inode_info(struct inode **inode, bool adjust_tz = false; struct cifs_fattr fattr = {0}; bool is_reparse_point = false; - FILE_ALL_INFO *data = in_data; - FILE_ALL_INFO *tmp_data = NULL; + struct cifs_open_info_data tmp_data = {}; void *smb1_backup_rsp_buf = NULL; int rc = 0; int tmprc = 0; @@ -960,21 +989,15 @@ cifs_get_inode_info(struct inode **inode, cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto out; } - tmp_data = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (!tmp_data) { - rc = -ENOMEM; - goto out; - } - rc = server->ops->query_path_info(xid, tcon, cifs_sb, - full_path, tmp_data, - &adjust_tz, &is_reparse_point); + rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data, + &adjust_tz, &is_reparse_point); #ifdef CONFIG_CIFS_DFS_UPCALL if (rc == -ENOENT && is_tcon_dfs(tcon)) rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb, full_path); #endif - data = tmp_data; + data = &tmp_data; } /* @@ -988,14 +1011,24 @@ cifs_get_inode_info(struct inode **inode, * since we have to check if its reparse tag matches a known * special file type e.g. symlink or fifo or char etc. */ - if ((le32_to_cpu(data->Attributes) & ATTR_REPARSE) && - server->ops->query_reparse_tag) { - rc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, - full_path, &reparse_tag); - cifs_dbg(FYI, "reparse tag 0x%x\n", reparse_tag); + if (is_reparse_point && data->symlink_target) { + reparse_tag = IO_REPARSE_TAG_SYMLINK; + } else if ((le32_to_cpu(data->fi.Attributes) & ATTR_REPARSE) && + server->ops->query_reparse_tag) { + tmprc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, full_path, + &reparse_tag); + if (tmprc) + cifs_dbg(FYI, "%s: query_reparse_tag: rc = %d\n", __func__, tmprc); + if (server->ops->query_symlink) { + tmprc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, + &data->symlink_target, + is_reparse_point); + if (tmprc) + cifs_dbg(FYI, "%s: query_symlink: rc = %d\n", __func__, + tmprc); + } } - cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, - is_reparse_point, reparse_tag); + cifs_open_info_to_fattr(&fattr, data, sb, adjust_tz, is_reparse_point, reparse_tag); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1014,18 +1047,20 @@ cifs_get_inode_info(struct inode **inode, */ if (backup_cred(cifs_sb) && is_smb1_server(server)) { /* for easier reading */ + FILE_ALL_INFO *fi; FILE_DIRECTORY_INFO *fdi; SEARCH_ID_FULL_DIR_INFO *si; rc = cifs_backup_query_path_info(xid, tcon, sb, full_path, &smb1_backup_rsp_buf, - &data); + &fi); if (rc) goto out; - fdi = (FILE_DIRECTORY_INFO *)data; - si = (SEARCH_ID_FULL_DIR_INFO *)data; + move_cifs_info_to_smb2(&data->fi, fi); + fdi = (FILE_DIRECTORY_INFO *)fi; + si = (SEARCH_ID_FULL_DIR_INFO *)fi; cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb); fattr.cf_uniqueid = le64_to_cpu(si->UniqueId); @@ -1123,7 +1158,8 @@ handle_mnt_opt: out: cifs_buf_release(smb1_backup_rsp_buf); cifs_put_tlink(tlink); - kfree(tmp_data); + cifs_free_open_info(&tmp_data); + kfree(fattr.cf_symlink_target); return rc; } @@ -1138,7 +1174,7 @@ smb311_posix_get_inode_info(struct inode **inode, bool adjust_tz = false; struct cifs_fattr fattr = {0}; bool symlink = false; - struct smb311_posix_qinfo *data = NULL; + struct cifs_open_info_data data = {}; int rc = 0; int tmprc = 0; @@ -1155,15 +1191,9 @@ smb311_posix_get_inode_info(struct inode **inode, cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto out; } - data = kmalloc(sizeof(struct smb311_posix_qinfo), GFP_KERNEL); - if (!data) { - rc = -ENOMEM; - goto out; - } - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, - full_path, data, - &adjust_tz, &symlink); + rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, &adjust_tz, + &symlink); /* * 2. Convert it to internal cifs metadata (fattr) @@ -1171,7 +1201,7 @@ smb311_posix_get_inode_info(struct inode **inode, switch (rc) { case 0: - smb311_posix_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); + smb311_posix_info_to_fattr(&fattr, &data, sb, adjust_tz, symlink); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1228,7 +1258,8 @@ smb311_posix_get_inode_info(struct inode **inode, } out: cifs_put_tlink(tlink); - kfree(data); + cifs_free_open_info(&data); + kfree(fattr.cf_symlink_target); return rc; } @@ -2265,13 +2296,13 @@ cifs_dentry_needs_reval(struct dentry *dentry) return true; if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) { - mutex_lock(&cfid->fid_mutex); + spin_lock(&cfid->fid_lock); if (cfid->time && cifs_i->time > cfid->time) { - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfid->fid_lock); close_cached_dir(cfid); return false; } - mutex_unlock(&cfid->fid_mutex); + spin_unlock(&cfid->fid_lock); close_cached_dir(cfid); } /* @@ -2327,7 +2358,7 @@ cifs_invalidate_mapping(struct inode *inode) static int cifs_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; @@ -2345,7 +2376,7 @@ cifs_revalidate_mapping(struct inode *inode) return 0; rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (rc) return rc; diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index b6e6e5d6c8dd..6419ec47c2a8 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -343,7 +343,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE, (int __user *)arg); - if (rc != EOPNOTSUPP) + if (rc != -EOPNOTSUPP) break; } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ @@ -373,7 +373,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) * pSMBFile->fid.netfid, * extAttrBits, * &ExtAttrMask); - * if (rc != EOPNOTSUPP) + * if (rc != -EOPNOTSUPP) * break; */ @@ -484,12 +484,35 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(tlink); if (tcon && tcon->ses->server->ops->notify) { rc = tcon->ses->server->ops->notify(xid, - filep, (void __user *)arg); + filep, (void __user *)arg, + false /* no ret data */); cifs_dbg(FYI, "ioctl notify rc %d\n", rc); } else rc = -EOPNOTSUPP; cifs_put_tlink(tlink); break; + case CIFS_IOC_NOTIFY_INFO: + if (!S_ISDIR(inode->i_mode)) { + /* Notify can only be done on directories */ + rc = -EOPNOTSUPP; + break; + } + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + break; + } + tcon = tlink_tcon(tlink); + if (tcon && tcon->ses->server->ops->notify) { + rc = tcon->ses->server->ops->notify(xid, + filep, (void __user *)arg, + true /* return details */); + cifs_dbg(FYI, "ioctl notify info rc %d\n", rc); + } else + rc = -EOPNOTSUPP; + cifs_put_tlink(tlink); + break; case CIFS_IOC_SHUTDOWN: rc = cifs_shutdown(inode->i_sb, arg); break; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 6803cb27eecc..bd374feeccaa 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -38,29 +38,28 @@ static int symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) { int rc; - struct crypto_shash *md5 = NULL; - struct sdesc *sdescmd5 = NULL; + struct shash_desc *md5 = NULL; - rc = cifs_alloc_hash("md5", &md5, &sdescmd5); + rc = cifs_alloc_hash("md5", &md5); if (rc) goto symlink_hash_err; - rc = crypto_shash_init(&sdescmd5->shash); + rc = crypto_shash_init(md5); if (rc) { cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__); goto symlink_hash_err; } - rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_update(md5, link_str, link_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); goto symlink_hash_err; } - rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + rc = crypto_shash_final(md5, md5_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: - cifs_free_hash(&md5, &sdescmd5); + cifs_free_hash(&md5); return rc; } @@ -202,40 +201,6 @@ out: return rc; } -static int -query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const unsigned char *path, - char **symlinkinfo) -{ - int rc; - u8 *buf = NULL; - unsigned int link_len = 0; - unsigned int bytes_read = 0; - - buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (tcon->ses->server->ops->query_mf_symlink) - rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon, - cifs_sb, path, buf, &bytes_read); - else - rc = -ENOSYS; - - if (rc) - goto out; - - if (bytes_read == 0) { /* not a symlink */ - rc = -EINVAL; - goto out; - } - - rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo); -out: - kfree(buf); - return rc; -} - int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, @@ -245,6 +210,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, u8 *buf = NULL; unsigned int link_len = 0; unsigned int bytes_read = 0; + char *symlink = NULL; if (!couldbe_mf_symlink(fattr)) /* it's not a symlink */ @@ -266,7 +232,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, if (bytes_read == 0) /* not a symlink */ goto out; - rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL); + rc = parse_mf_symlink(buf, bytes_read, &link_len, &symlink); if (rc == -EINVAL) { /* it's not a symlink */ rc = 0; @@ -281,6 +247,7 @@ check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, fattr->cf_mode &= ~S_IFMT; fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO; fattr->cf_dtype = DT_LNK; + fattr->cf_symlink_target = symlink; out: kfree(buf); return rc; @@ -600,75 +567,6 @@ cifs_hl_exit: return rc; } -const char * -cifs_get_link(struct dentry *direntry, struct inode *inode, - struct delayed_call *done) -{ - int rc = -ENOMEM; - unsigned int xid; - const char *full_path; - void *page; - char *target_path = NULL; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct tcon_link *tlink = NULL; - struct cifs_tcon *tcon; - struct TCP_Server_Info *server; - - if (!direntry) - return ERR_PTR(-ECHILD); - - xid = get_xid(); - - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) { - free_xid(xid); - return ERR_CAST(tlink); - } - tcon = tlink_tcon(tlink); - server = tcon->ses->server; - - page = alloc_dentry_path(); - full_path = build_path_from_dentry(direntry, page); - if (IS_ERR(full_path)) { - free_xid(xid); - cifs_put_tlink(tlink); - free_dentry_path(page); - return ERR_CAST(full_path); - } - - cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); - - rc = -EACCES; - /* - * First try Minshall+French Symlinks, if configured - * and fallback to UNIX Extensions Symlinks. - */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - rc = query_mf_symlink(xid, tcon, cifs_sb, full_path, - &target_path); - - if (rc != 0 && server->ops->query_symlink) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); - bool reparse_point = false; - - if (cifsi->cifsAttrs & ATTR_REPARSE) - reparse_point = true; - - rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, - &target_path, reparse_point); - } - - free_dentry_path(page); - free_xid(xid); - cifs_put_tlink(tlink); - if (rc != 0) { - kfree(target_path); - return ERR_PTR(rc); - } - set_delayed_call(done, kfree_link, target_path); - return target_path; -} - int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, struct dentry *direntry, const char *symname) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 87f60f736731..3e68d8208cf5 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -117,8 +117,8 @@ tconInfoAlloc(void) ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL); if (!ret_buf) return NULL; - ret_buf->cfid = init_cached_dir(); - if (!ret_buf->cfid) { + ret_buf->cfids = init_cached_dirs(); + if (!ret_buf->cfids) { kfree(ret_buf); return NULL; } @@ -144,7 +144,7 @@ tconInfoFree(struct cifs_tcon *tcon) cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n"); return; } - free_cached_dir(tcon); + free_cached_dirs(tcon->cfids); atomic_dec(&tconInfoAllocCount); kfree(tcon->nativeFileSystem); kfree_sensitive(tcon->password); @@ -400,6 +400,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) { struct smb_hdr *buf = (struct smb_hdr *)buffer; struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsInodeInfo *pCifsInode; @@ -464,9 +465,12 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) return false; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(srv) ? srv->primary_server : srv; + /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &srv->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid != buf->Tid) continue; @@ -525,7 +529,7 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; cifs_sb->mnt_cifs_serverino_autodisabled = true; cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s\n", - tcon ? tcon->treeName : "new server"); + tcon ? tcon->tree_name : "new server"); cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS)\n"); cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n"); @@ -824,7 +828,7 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) free_dentry_path(page); } -/* parses DFS refferal V3 structure +/* parses DFS referral V3 structure * caller is responsible for freeing target_nodes * returns: * - on success - 0 @@ -1071,59 +1075,58 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) /** * cifs_alloc_hash - allocate hash and hash context together * @name: The name of the crypto hash algo - * @shash: Where to put the pointer to the hash algo - * @sdesc: Where to put the pointer to the hash descriptor + * @sdesc: SHASH descriptor where to put the pointer to the hash TFM * * The caller has to make sure @sdesc is initialized to either NULL or - * a valid context. Both can be freed via cifs_free_hash(). + * a valid context. It can be freed via cifs_free_hash(). */ int -cifs_alloc_hash(const char *name, - struct crypto_shash **shash, struct sdesc **sdesc) +cifs_alloc_hash(const char *name, struct shash_desc **sdesc) { int rc = 0; - size_t size; + struct crypto_shash *alg = NULL; - if (*sdesc != NULL) + if (*sdesc) return 0; - *shash = crypto_alloc_shash(name, 0, 0); - if (IS_ERR(*shash)) { - cifs_dbg(VFS, "Could not allocate crypto %s\n", name); - rc = PTR_ERR(*shash); - *shash = NULL; + alg = crypto_alloc_shash(name, 0, 0); + if (IS_ERR(alg)) { + cifs_dbg(VFS, "Could not allocate shash TFM '%s'\n", name); + rc = PTR_ERR(alg); *sdesc = NULL; return rc; } - size = sizeof(struct shash_desc) + crypto_shash_descsize(*shash); - *sdesc = kmalloc(size, GFP_KERNEL); + *sdesc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(alg), GFP_KERNEL); if (*sdesc == NULL) { - cifs_dbg(VFS, "no memory left to allocate crypto %s\n", name); - crypto_free_shash(*shash); - *shash = NULL; + cifs_dbg(VFS, "no memory left to allocate shash TFM '%s'\n", name); + crypto_free_shash(alg); return -ENOMEM; } - (*sdesc)->shash.tfm = *shash; + (*sdesc)->tfm = alg; return 0; } /** * cifs_free_hash - free hash and hash context together - * @shash: Where to find the pointer to the hash algo - * @sdesc: Where to find the pointer to the hash descriptor + * @sdesc: Where to find the pointer to the hash TFM * - * Freeing a NULL hash or context is safe. + * Freeing a NULL descriptor is safe. */ void -cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc) +cifs_free_hash(struct shash_desc **sdesc) { - kfree(*sdesc); + if (unlikely(!sdesc) || !*sdesc) + return; + + if ((*sdesc)->tfm) { + crypto_free_shash((*sdesc)->tfm); + (*sdesc)->tfm = NULL; + } + + kfree_sensitive(*sdesc); *sdesc = NULL; - if (*shash) - crypto_free_shash(*shash); - *shash = NULL; } /** @@ -1328,7 +1331,7 @@ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, char *treename, *dfspath, sep; int treenamelen, linkpathlen, rc; - treename = tcon->treeName; + treename = tcon->tree_name; /* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL * messages MUST be encoded with exactly one leading backslash, not two * leading backslashes. diff --git a/fs/cifs/netlink.c b/fs/cifs/netlink.c index 291cb606f149..147d9409252c 100644 --- a/fs/cifs/netlink.c +++ b/fs/cifs/netlink.c @@ -51,6 +51,7 @@ struct genl_family cifs_genl_family = { .policy = cifs_genl_policy, .ops = cifs_genl_ops, .n_ops = ARRAY_SIZE(cifs_genl_ops), + .resv_start_op = CIFS_GENL_CMD_SWN_NOTIFY + 1, .mcgrps = cifs_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(cifs_genl_mcgrps), }; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 8e060c00c969..2d75ba5aaa8a 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -844,17 +844,34 @@ static bool emit_cached_dirents(struct cached_dirents *cde, struct dir_context *ctx) { struct cached_dirent *dirent; - int rc; + bool rc; list_for_each_entry(dirent, &cde->entries, entry) { - if (ctx->pos >= dirent->pos) + /* + * Skip all early entries prior to the current lseek() + * position. + */ + if (ctx->pos > dirent->pos) continue; + /* + * We recorded the current ->pos value for the dirent + * when we stored it in the cache. + * However, this sequence of ->pos values may have holes + * in it, for example dot-dirs returned from the server + * are suppressed. + * Handle this bu forcing ctx->pos to be the same as the + * ->pos of the current dirent we emit from the cache. + * This means that when we emit these entries from the cache + * we now emit them with the same ->pos value as in the + * initial scan. + */ ctx->pos = dirent->pos; rc = dir_emit(ctx, dirent->name, dirent->namelen, dirent->fattr.cf_uniqueid, dirent->fattr.cf_dtype); if (!rc) return rc; + ctx->pos++; } return true; } @@ -994,6 +1011,8 @@ static int cifs_filldir(char *find_entry, struct file *file, cifs_unix_basic_to_fattr(&fattr, &((FILE_UNIX_INFO *)find_entry)->basic, cifs_sb); + if (S_ISLNK(fattr.cf_mode)) + fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; break; case SMB_FIND_FILE_INFO_STANDARD: cifs_std_info_to_fattr(&fattr, @@ -1202,10 +1221,10 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) ctx->pos, tmp_buf); cifs_save_resume_key(current_entry, cifsFile); break; - } else - current_entry = - nxt_dir_entry(current_entry, end_of_smb, - cifsFile->srch_inf.info_level); + } + current_entry = + nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); } kfree(tmp_buf); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3af3b05b6c74..92e4278ec35d 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -496,6 +496,7 @@ out: cifs_put_tcp_session(chan->server, 0); } + free_xid(xid); return rc; } @@ -601,11 +602,6 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* BB FIXME add check that strings total less than 335 or will need to send them as arrays */ - /* unicode strings, must be word aligned before the call */ -/* if ((long) bcc_ptr % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } */ /* copy user */ if (ses->user_name == NULL) { /* null user mount */ @@ -1213,10 +1209,18 @@ out_free_smb_buf: static void sess_free_buffer(struct sess_data *sess_data) { + struct kvec *iov = sess_data->iov; + + /* + * Zero the session data before freeing, as it might contain sensitive info (keys, etc). + * Note that iov[1] is already freed by caller. + */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base) + memzero_explicit(iov[0].iov_base, iov[0].iov_len); - free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + free_rsp_buf(sess_data->buf0_type, iov[0].iov_base); sess_data->buf0_type = CIFS_NO_BUFFER; - kfree(sess_data->iov[2].iov_base); + kfree_sensitive(iov[2].iov_base); } static int @@ -1318,7 +1322,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) } if (ses->capabilities & CAP_UNICODE) { - if (sess_data->iov[0].iov_len % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1358,7 +1362,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1374,7 +1378,7 @@ out: sess_data->result = rc; sess_data->func = NULL; sess_free_buffer(sess_data); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; } @@ -1442,8 +1446,7 @@ sess_auth_kerberos(struct sess_data *sess_data) if (ses->capabilities & CAP_UNICODE) { /* unicode strings must be word aligned */ - if ((sess_data->iov[0].iov_len - + sess_data->iov[1].iov_len) % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1494,7 +1497,7 @@ sess_auth_kerberos(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1513,7 +1516,7 @@ out: sess_data->result = rc; sess_data->func = NULL; sess_free_buffer(sess_data); - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; } @@ -1546,7 +1549,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) bcc_ptr = sess_data->iov[2].iov_base; /* unicode strings must be word aligned */ - if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; bcc_ptr++; } @@ -1648,7 +1651,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); out_free_ntlmsspblob: - kfree(ntlmsspblob); + kfree_sensitive(ntlmsspblob); out: sess_free_buffer(sess_data); @@ -1658,9 +1661,9 @@ out: } /* Else error. Cleanup */ - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->func = NULL; @@ -1747,7 +1750,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) /* no string area to decode, do nothing */ } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { ++bcc_ptr; --bytes_remaining; } @@ -1759,7 +1762,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) } out_free_ntlmsspblob: - kfree(ntlmsspblob); + kfree_sensitive(ntlmsspblob); out: sess_free_buffer(sess_data); @@ -1767,9 +1770,9 @@ out: rc = sess_establish_session(sess_data); /* Cleanup */ - kfree(ses->auth_key.response); + kfree_sensitive(ses->auth_key.response); ses->auth_key.response = NULL; - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->func = NULL; @@ -1845,7 +1848,7 @@ int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, rc = sess_data->result; out: - kfree(sess_data); + kfree_sensitive(sess_data); return rc; } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index f36b2d2d40ca..50480751e521 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -542,31 +542,32 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink) +static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjustTZ, bool *symlink) { int rc; + FILE_ALL_INFO fi = {}; *symlink = false; /* could do find first instead but this returns more info */ - rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls, + cifs_remap(cifs_sb)); /* * BB optimize code so we do not make the above call when server claims * no NT SMB support and the above call failed at least once - set flag * in tcon or mount. */ if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { - rc = SMBQueryInformation(xid, tcon, full_path, data, - cifs_sb->local_nls, + rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (!rc) + move_cifs_info_to_smb2(&data->fi, &fi); *adjustTZ = true; } - if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) { + if (!rc && (le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) { int tmprc; int oplock = 0; struct cifs_fid fid; @@ -592,10 +593,9 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - u64 *uniqueid, FILE_ALL_INFO *data) +static int cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + u64 *uniqueid, struct cifs_open_info_data *unused) { /* * We can not use the IndexNumber field by default from Windows or @@ -613,11 +613,22 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, cifs_remap(cifs_sb)); } -static int -cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_fid *fid, FILE_ALL_INFO *data) +static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data) { - return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); + int rc; + FILE_ALL_INFO fi = {}; + + if (cfile->symlink_target) { + data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!data->symlink_target) + return -ENOMEM; + } + + rc = CIFSSMBQFileInfo(xid, tcon, cfile->fid.netfid, &fi); + if (!rc) + move_cifs_info_to_smb2(&data->fi, &fi); + return rc; } static void @@ -702,19 +713,20 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path, cifsInode->cifsAttrs = dosattrs; } -static int -cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf) +static int cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf) { + FILE_ALL_INFO *fi = buf; + if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS)) return SMBLegacyOpen(xid, oparms->tcon, oparms->path, oparms->disposition, oparms->desired_access, oparms->create_options, - &oparms->fid->netfid, oplock, buf, + &oparms->fid->netfid, oplock, fi, oparms->cifs_sb->local_nls, cifs_remap(oparms->cifs_sb)); - return CIFS_open(xid, oparms, oplock, buf); + return CIFS_open(xid, oparms, oplock, fi); } static void diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 9dfd2dd612c2..ffbd9a99fc12 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -20,40 +20,125 @@ #include "cifs_unicode.h" #include "fscache.h" #include "smb2proto.h" +#include "smb2status.h" -int -smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf) +static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) +{ + struct smb2_err_rsp *err = iov->iov_base; + struct smb2_symlink_err_rsp *sym = ERR_PTR(-EINVAL); + u32 len; + + if (err->ErrorContextCount) { + struct smb2_error_context_rsp *p, *end; + + len = (u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp, + ErrorContextData) + + sizeof(struct smb2_symlink_err_rsp)); + if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err)) + return ERR_PTR(-EINVAL); + + p = (struct smb2_error_context_rsp *)err->ErrorData; + end = (struct smb2_error_context_rsp *)((u8 *)err + iov->iov_len); + do { + if (le32_to_cpu(p->ErrorId) == SMB2_ERROR_ID_DEFAULT) { + sym = (struct smb2_symlink_err_rsp *)&p->ErrorContextData; + break; + } + cifs_dbg(FYI, "%s: skipping unhandled error context: 0x%x\n", + __func__, le32_to_cpu(p->ErrorId)); + + len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8); + p = (struct smb2_error_context_rsp *)((u8 *)&p->ErrorContextData + len); + } while (p < end); + } else if (le32_to_cpu(err->ByteCount) >= sizeof(*sym) && + iov->iov_len >= SMB2_SYMLINK_STRUCT_SIZE) { + sym = (struct smb2_symlink_err_rsp *)err->ErrorData; + } + + if (!IS_ERR(sym) && (le32_to_cpu(sym->SymLinkErrorTag) != SYMLINK_ERROR_TAG || + le32_to_cpu(sym->ReparseTag) != IO_REPARSE_TAG_SYMLINK)) + sym = ERR_PTR(-EINVAL); + + return sym; +} + +int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path) +{ + struct smb2_symlink_err_rsp *sym; + unsigned int sub_offs, sub_len; + unsigned int print_offs, print_len; + char *s; + + if (!cifs_sb || !iov || !iov->iov_base || !iov->iov_len || !path) + return -EINVAL; + + sym = symlink_data(iov); + if (IS_ERR(sym)) + return PTR_ERR(sym); + + sub_len = le16_to_cpu(sym->SubstituteNameLength); + sub_offs = le16_to_cpu(sym->SubstituteNameOffset); + print_len = le16_to_cpu(sym->PrintNameLength); + print_offs = le16_to_cpu(sym->PrintNameOffset); + + if (iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offs + sub_len || + iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + print_offs + print_len) + return -EINVAL; + + s = cifs_strndup_from_utf16((char *)sym->PathBuffer + sub_offs, sub_len, true, + cifs_sb->local_nls); + if (!s) + return -ENOMEM; + convert_delimiter(s, '/'); + cifs_dbg(FYI, "%s: symlink target: %s\n", __func__, s); + + *path = s; + return 0; +} + +int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf) { int rc; __le16 *smb2_path; - struct smb2_file_all_info *smb2_data = NULL; __u8 smb2_oplock; + struct cifs_open_info_data *data = buf; + struct smb2_file_all_info file_info = {}; + struct smb2_file_all_info *smb2_data = data ? &file_info : NULL; + struct kvec err_iov = {}; + int err_buftype = CIFS_NO_BUFFER; struct cifs_fid *fid = oparms->fid; struct network_resiliency_req nr_ioctl_req; smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb); - if (smb2_path == NULL) { - rc = -ENOMEM; - goto out; - } - - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) { - rc = -ENOMEM; - goto out; - } + if (smb2_path == NULL) + return -ENOMEM; oparms->desired_access |= FILE_READ_ATTRIBUTES; smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; - rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, - NULL, NULL); + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, + &err_buftype); + if (rc && data) { + struct smb2_hdr *hdr = err_iov.iov_base; + + if (unlikely(!err_iov.iov_base || err_buftype == CIFS_NO_BUFFER)) + rc = -ENOMEM; + else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov, + &data->symlink_target); + if (!rc) { + memset(smb2_data, 0, sizeof(*smb2_data)); + oparms->create_options |= OPEN_REPARSE_POINT; + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, + NULL, NULL, NULL); + oparms->create_options &= ~OPEN_REPARSE_POINT; + } + } + } + if (rc) goto out; - if (oparms->tcon->use_resilient) { /* default timeout is 0, servers pick default (120 seconds) */ nr_ioctl_req.Timeout = @@ -73,7 +158,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, rc = 0; } - if (buf) { + if (smb2_data) { /* if open response does not have IndexNumber field - get it */ if (smb2_data->IndexNumber == 0) { rc = SMB2_get_srv_num(xid, oparms->tcon, @@ -89,12 +174,12 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, rc = 0; } } - move_smb2_info_to_cifs(buf, smb2_data); + memcpy(&data->fi, smb2_data, sizeof(data->fi)); } *oplock = smb2_oplock; out: - kfree(smb2_data); + free_rsp_buf(err_buftype, err_iov.iov_base); kfree(smb2_path); return rc; } diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index b83f59051b26..68e08c85fbb8 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -24,6 +24,7 @@ #include "smb2pdu.h" #include "smb2proto.h" #include "cached_dir.h" +#include "smb2status.h" static void free_set_inf_compound(struct smb_rqst *rqst) @@ -50,13 +51,15 @@ struct cop_vars { /* * note: If cfile is passed, the reference to it is dropped here. * So make sure that you do not reuse cfile after return from this func. + * + * If passing @err_iov and @err_buftype, ensure to make them both large enough (>= 3) to hold all + * error responses. Caller is also responsible for freeing them up. */ -static int -smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 desired_access, __u32 create_disposition, - __u32 create_options, umode_t mode, void *ptr, int command, - struct cifsFileInfo *cfile) +static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + __u32 desired_access, __u32 create_disposition, __u32 create_options, + umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, + struct kvec *err_iov, int *err_buftype) { struct cop_vars *vars = NULL; struct kvec *rsp_iov; @@ -70,6 +73,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, int num_rqst = 0; int resp_buftype[3]; struct smb2_query_info_rsp *qi_rsp = NULL; + struct cifs_open_info_data *idata; int flags = 0; __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0}; unsigned int size[2]; @@ -379,20 +383,25 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_open_free(&rqst[0]); if (rc == -EREMCHG) { - pr_warn_once("server share %s deleted\n", tcon->treeName); + pr_warn_once("server share %s deleted\n", tcon->tree_name); tcon->need_reconnect = true; } switch (command) { case SMB2_OP_QUERY_INFO: + idata = ptr; + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) + rc = -ENOMEM; + } if (rc == 0) { qi_rsp = (struct smb2_query_info_rsp *) rsp_iov[1].iov_base; rc = smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(struct smb2_file_all_info), - ptr); + &rsp_iov[1], sizeof(idata->fi), (char *)&idata->fi); } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); @@ -406,13 +415,20 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, tcon->tid); break; case SMB2_OP_POSIX_QUERY_INFO: + idata = ptr; + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) + rc = -ENOMEM; + } if (rc == 0) { qi_rsp = (struct smb2_query_info_rsp *) rsp_iov[1].iov_base; rc = smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(struct smb311_posix_qinfo) /* add SIDs */, ptr); + &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */, + (char *)&idata->posix_fi); } if (rqst[1].rq_iov) SMB2_query_info_free(&rqst[1]); @@ -477,42 +493,33 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, free_set_inf_compound(rqst); break; } - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + + if (rc && err_iov && err_buftype) { + memcpy(err_iov, rsp_iov, 3 * sizeof(*err_iov)); + memcpy(err_buftype, resp_buftype, 3 * sizeof(*err_buftype)); + } else { + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + } kfree(vars); return rc; } -void -move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src) -{ - memcpy(dst, src, (size_t)(&src->CurrentByteOffset) - (size_t)src); - dst->CurrentByteOffset = src->CurrentByteOffset; - dst->Mode = src->Mode; - dst->AlignmentRequirement = src->AlignmentRequirement; - dst->IndexNumber1 = 0; /* we don't use it */ -} - -int -smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjust_tz, bool *reparse) +int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) { int rc; - struct smb2_file_all_info *smb2_data; __u32 create_options = 0; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; + struct kvec err_iov[3] = {}; + int err_buftype[3] = {}; *adjust_tz = false; *reparse = false; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - if (strcmp(full_path, "")) rc = -ENOENT; else @@ -520,63 +527,58 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* If it is a root and its handle is cached then use it */ if (!rc) { if (cfid->file_all_info_is_valid) { - move_smb2_info_to_cifs(data, - &cfid->file_all_info); + memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi)); } else { - rc = SMB2_query_info(xid, tcon, - cfid->fid.persistent_fid, - cfid->fid.volatile_fid, smb2_data); - if (!rc) - move_smb2_info_to_cifs(data, smb2_data); + rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid, &data->fi); } close_cached_dir(cfid); - goto out; + return rc; } cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile, + err_iov, err_buftype); if (rc == -EOPNOTSUPP) { + if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && + ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && + ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target); + if (rc) + goto out; + } *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - smb2_data, SMB2_OP_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, + FILE_OPEN, create_options, ACL_NO_MODE, data, + SMB2_OP_QUERY_INFO, cfile, NULL, NULL); } - if (rc) - goto out; - move_smb2_info_to_cifs(data, smb2_data); out: - kfree(smb2_data); + free_rsp_buf(err_buftype[0], err_iov[0].iov_base); + free_rsp_buf(err_buftype[1], err_iov[1].iov_base); + free_rsp_buf(err_buftype[2], err_iov[2].iov_base); return rc; } -int -smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct smb311_posix_qinfo *data, bool *adjust_tz, bool *reparse) +int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) { int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; - struct smb311_posix_qinfo *smb2_data; + struct kvec err_iov[3] = {}; + int err_buftype[3] = {}; *adjust_tz = false; *reparse = false; - /* BB TODO: Make struct larger when add support for parsing owner SIDs */ - smb2_data = kzalloc(sizeof(struct smb311_posix_qinfo), - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - /* * BB TODO: Add support for using the cached root handle. * Create SMB2_query_posix_info worker function to do non-compounded query @@ -585,29 +587,32 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - ACL_NO_MODE, smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, + err_iov, err_buftype); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ + if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && + ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && + ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) { + rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target); + if (rc) + goto out; + } *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, - smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, + FILE_OPEN, create_options, ACL_NO_MODE, data, + SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL); } - if (rc) - goto out; - - /* TODO: will need to allow for the 2 SIDs when add support for getting owner UID/GID */ - memcpy(data, smb2_data, sizeof(struct smb311_posix_qinfo)); out: - kfree(smb2_data); + free_rsp_buf(err_buftype[0], err_iov[0].iov_base); + free_rsp_buf(err_buftype[1], err_iov[1].iov_base); + free_rsp_buf(err_buftype[2], err_iov[2].iov_base); return rc; } @@ -619,7 +624,7 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, - NULL); + NULL, NULL, NULL); } void @@ -641,7 +646,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, - &data, SMB2_OP_SET_INFO, cfile); + &data, SMB2_OP_SET_INFO, cfile, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -650,9 +655,10 @@ int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { + drop_cached_dir_by_name(xid, tcon, name, cifs_sb); return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE, - NULL, SMB2_OP_RMDIR, NULL); + NULL, SMB2_OP_RMDIR, NULL, NULL, NULL); } int @@ -661,7 +667,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL); } static int @@ -680,7 +686,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, - command, cfile); + command, cfile, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -693,6 +699,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, { struct cifsFileInfo *cfile; + drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb); cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile); return smb2_set_path_attr(xid, tcon, from_name, to_name, @@ -720,7 +727,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); return smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, - &eof, SMB2_OP_SET_EOF, cfile); + &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL); } int @@ -746,7 +753,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, - 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile); + 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile, + NULL, NULL); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index d73e5672aac4..572293c18e16 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -135,6 +135,7 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, int smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) { + struct TCP_Server_Info *pserver; struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)shdr; int hdr_size = sizeof(struct smb2_hdr); @@ -143,6 +144,9 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) __u32 calc_len; /* calculated length */ __u64 mid; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + /* * Add function to do table lookup of StructureSize by command * ie Validate the wct via smb2_struct_sizes table above @@ -155,7 +159,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(iter, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(iter, &pserver->smb_ses_list, smb_ses_list) { if (iter->Suid == le64_to_cpu(thdr->SessionId)) { ses = iter; break; @@ -248,7 +252,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) * Some windows servers (win2016) will pad also the final * PDU in a compound to 8 bytes. */ - if (((calc_len + 7) & ~7) == len) + if (ALIGN(calc_len, 8) == len) return 0; /* @@ -608,51 +612,52 @@ smb2_tcon_find_pending_open_lease(struct cifs_tcon *tcon, } static bool -smb2_is_valid_lease_break(char *buffer) +smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) { struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; - struct TCP_Server_Info *server; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifs_pending_open *open; cifs_dbg(FYI, "Checking for lease break\n"); + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { - spin_lock(&tcon->open_file_lock); - cifs_stats_inc( - &tcon->stats.cifs_stats.num_oplock_brks); - if (smb2_tcon_has_lease(tcon, rsp)) { - spin_unlock(&tcon->open_file_lock); - spin_unlock(&cifs_tcp_ses_lock); - return true; - } - open = smb2_tcon_find_pending_open_lease(tcon, - rsp); - if (open) { - __u8 lease_key[SMB2_LEASE_KEY_SIZE]; - struct tcon_link *tlink; - - tlink = cifs_get_tlink(open->tlink); - memcpy(lease_key, open->lease_key, - SMB2_LEASE_KEY_SIZE); - spin_unlock(&tcon->open_file_lock); - spin_unlock(&cifs_tcp_ses_lock); - smb2_queue_pending_open_break(tlink, - lease_key, - rsp->NewLeaseState); - return true; - } + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + spin_lock(&tcon->open_file_lock); + cifs_stats_inc( + &tcon->stats.cifs_stats.num_oplock_brks); + if (smb2_tcon_has_lease(tcon, rsp)) { spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } + open = smb2_tcon_find_pending_open_lease(tcon, + rsp); + if (open) { + __u8 lease_key[SMB2_LEASE_KEY_SIZE]; + struct tcon_link *tlink; + + tlink = cifs_get_tlink(open->tlink); + memcpy(lease_key, open->lease_key, + SMB2_LEASE_KEY_SIZE); + spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_tcp_ses_lock); + smb2_queue_pending_open_break(tlink, + lease_key, + rsp->NewLeaseState); + return true; + } + spin_unlock(&tcon->open_file_lock); - if (cached_dir_lease_break(tcon, rsp->LeaseKey)) { - spin_unlock(&cifs_tcp_ses_lock); - return true; - } + if (cached_dir_lease_break(tcon, rsp->LeaseKey)) { + spin_unlock(&cifs_tcp_ses_lock); + return true; } } } @@ -671,6 +676,7 @@ bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) { struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsInodeInfo *cinode; @@ -684,16 +690,19 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) - return smb2_is_valid_lease_break(buffer); + return smb2_is_valid_lease_break(buffer, server); else return false; } cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel); + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); @@ -870,8 +879,8 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, struct kvec *iov, int nvec) { int i, rc; - struct sdesc *d; struct smb2_hdr *hdr; + struct shash_desc *sha512 = NULL; hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ @@ -901,14 +910,14 @@ ok: if (rc) return rc; - d = server->secmech.sdescsha512; - rc = crypto_shash_init(&d->shash); + sha512 = server->secmech.sha512; + rc = crypto_shash_init(sha512); if (rc) { cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__); return rc; } - rc = crypto_shash_update(&d->shash, ses->preauth_sha_hash, + rc = crypto_shash_update(sha512, ses->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); @@ -916,8 +925,7 @@ ok: } for (i = 0; i < nvec; i++) { - rc = crypto_shash_update(&d->shash, - iov[i].iov_base, iov[i].iov_len); + rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); @@ -925,7 +933,7 @@ ok: } } - rc = crypto_shash_final(&d->shash, ses->preauth_sha_hash); + rc = crypto_shash_final(sha512, ses->preauth_sha_hash); if (rc) { cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n", __func__); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 421be43af425..bfaafd02fb1f 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -512,8 +512,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, - size_t buf_len, - struct cifs_ses *ses) + size_t buf_len, struct cifs_ses *ses, bool in_mount) { struct network_interface_info_ioctl_rsp *p; struct sockaddr_in *addr4; @@ -531,6 +530,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, p = buf; spin_lock(&ses->iface_lock); + ses->iface_count = 0; /* * Go through iface_list and do kref_put to remove * any unused ifaces. ifaces in use will be removed @@ -543,6 +543,21 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, } spin_unlock(&ses->iface_lock); + /* + * Samba server e.g. can return an empty interface list in some cases, + * which would only be a problem if we were requesting multichannel + */ + if (bytes_left == 0) { + /* avoid spamming logs every 10 minutes, so log only in mount */ + if ((ses->chan_max > 1) && in_mount) + cifs_dbg(VFS, + "multichannel not available\n" + "Empty network interface list returned by server %s\n", + ses->server->hostname); + rc = -EINVAL; + goto out; + } + while (bytes_left >= sizeof(*p)) { memset(&tmp_iface, 0, sizeof(tmp_iface)); tmp_iface.speed = le64_to_cpu(p->LinkSpeed); @@ -637,9 +652,9 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, kref_put(&iface->refcount, release_iface); } else list_add_tail(&info->iface_head, &ses->iface_list); - spin_unlock(&ses->iface_lock); ses->iface_count++; + spin_unlock(&ses->iface_lock); ses->iface_last_update = jiffies; next_iface: nb_iface++; @@ -673,7 +688,7 @@ out: } int -SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) +SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount) { int rc; unsigned int ret_data_len = 0; @@ -693,7 +708,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) goto out; } - rc = parse_server_interfaces(out_buf, ret_data_len, ses); + rc = parse_server_interfaces(out_buf, ret_data_len, ses, in_mount); if (rc) goto out; @@ -729,7 +744,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, if (rc) return; - SMB3_request_interfaces(xid, tcon); + SMB3_request_interfaces(xid, tcon, true /* called during mount */); SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_ATTRIBUTE_INFORMATION); @@ -787,7 +802,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid); if (!rc) { - if (cfid->is_valid) { + if (cfid->has_lease) { close_cached_dir(cfid); return 0; } @@ -817,33 +832,25 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int -smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - u64 *uniqueid, FILE_ALL_INFO *data) +static int smb2_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + u64 *uniqueid, struct cifs_open_info_data *data) { - *uniqueid = le64_to_cpu(data->IndexNumber); + *uniqueid = le64_to_cpu(data->fi.IndexNumber); return 0; } -static int -smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_fid *fid, FILE_ALL_INFO *data) +static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct cifs_open_info_data *data) { - int rc; - struct smb2_file_all_info *smb2_data; + struct cifs_fid *fid = &cfile->fid; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, - GFP_KERNEL); - if (smb2_data == NULL) - return -ENOMEM; - - rc = SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, - smb2_data); - if (!rc) - move_smb2_info_to_cifs(data, smb2_data); - kfree(smb2_data); - return rc; + if (cfile->symlink_target) { + data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!data->symlink_target) + return -ENOMEM; + } + return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi); } #ifdef CONFIG_CIFS_XATTR @@ -1109,6 +1116,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto sea_exit; smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); @@ -1119,6 +1128,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, server, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); + if (rc) + goto sea_exit; smb2_set_related(&rqst[2]); rc = compound_send_recv(xid, ses, server, @@ -1327,7 +1338,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, CIFSMaxBufSize, (char **)&res_key, &ret_data_len); if (rc == -EOPNOTSUPP) { - pr_warn_once("Server share %s does not support copy range\n", tcon->treeName); + pr_warn_once("Server share %s does not support copy range\n", tcon->tree_name); goto req_res_key_exit; } else if (rc) { cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); @@ -2012,9 +2023,10 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, static int smb3_notify(const unsigned int xid, struct file *pfile, - void __user *ioc_buf) + void __user *ioc_buf, bool return_changes) { - struct smb3_notify notify; + struct smb3_notify_info notify; + struct smb3_notify_info __user *pnotify_buf; struct dentry *dentry = pfile->f_path.dentry; struct inode *inode = file_inode(pfile); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -2022,10 +2034,12 @@ smb3_notify(const unsigned int xid, struct file *pfile, struct cifs_fid fid; struct cifs_tcon *tcon; const unsigned char *path; + char *returned_ioctl_info = NULL; void *page = alloc_dentry_path(); __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; int rc = 0; + __u32 ret_len = 0; path = build_path_from_dentry(dentry, page); if (IS_ERR(path)) { @@ -2039,9 +2053,17 @@ smb3_notify(const unsigned int xid, struct file *pfile, goto notify_exit; } - if (copy_from_user(¬ify, ioc_buf, sizeof(struct smb3_notify))) { - rc = -EFAULT; - goto notify_exit; + if (return_changes) { + if (copy_from_user(¬ify, ioc_buf, sizeof(struct smb3_notify_info))) { + rc = -EFAULT; + goto notify_exit; + } + } else { + if (copy_from_user(¬ify, ioc_buf, sizeof(struct smb3_notify))) { + rc = -EFAULT; + goto notify_exit; + } + notify.data_len = 0; } tcon = cifs_sb_master_tcon(cifs_sb); @@ -2058,12 +2080,22 @@ smb3_notify(const unsigned int xid, struct file *pfile, goto notify_exit; rc = SMB2_change_notify(xid, tcon, fid.persistent_fid, fid.volatile_fid, - notify.watch_tree, notify.completion_filter); + notify.watch_tree, notify.completion_filter, + notify.data_len, &returned_ioctl_info, &ret_len); SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc); - + if (return_changes && (ret_len > 0) && (notify.data_len > 0)) { + if (ret_len > notify.data_len) + ret_len = notify.data_len; + pnotify_buf = (struct smb3_notify_info __user *)ioc_buf; + if (copy_to_user(pnotify_buf->notify_data, returned_ioctl_info, ret_len)) + rc = -EFAULT; + else if (copy_to_user(&pnotify_buf->data_len, &ret_len, sizeof(ret_len))) + rc = -EFAULT; + } + kfree(returned_ioctl_info); notify_exit: free_dentry_path(page); kfree(utf16_path); @@ -2274,14 +2306,18 @@ static void smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) { struct smb2_hdr *shdr = (struct smb2_hdr *)buf; + struct TCP_Server_Info *pserver; struct cifs_ses *ses; struct cifs_tcon *tcon; if (shdr->Status != STATUS_NETWORK_NAME_DELETED) return; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) { spin_lock(&tcon->tc_lock); @@ -2289,7 +2325,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", - tcon->treeName); + tcon->tree_name); return; } } @@ -2498,7 +2534,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, if (rc == -EREMCHG) { tcon->need_reconnect = true; pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); } goto qic_exit; } @@ -2814,9 +2850,6 @@ parse_reparse_point(struct reparse_data_buffer *buf, } } -#define SMB2_SYMLINK_STRUCT_SIZE \ - (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) - static int smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, @@ -2828,13 +2861,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; struct kvec err_iov = {NULL, 0}; - struct smb2_err_rsp *err_buf = NULL; - struct smb2_symlink_err_rsp *symlink; struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - unsigned int sub_len; - unsigned int sub_offset; - unsigned int print_len; - unsigned int print_offset; int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst rqst[3]; int resp_buftype[3]; @@ -2951,47 +2978,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, goto querty_exit; } - err_buf = err_iov.iov_base; - if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || - err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE) { - rc = -EINVAL; - goto querty_exit; - } - - symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; - if (le32_to_cpu(symlink->SymLinkErrorTag) != SYMLINK_ERROR_TAG || - le32_to_cpu(symlink->ReparseTag) != IO_REPARSE_TAG_SYMLINK) { - rc = -EINVAL; - goto querty_exit; - } - - /* open must fail on symlink - reset rc */ - rc = 0; - sub_len = le16_to_cpu(symlink->SubstituteNameLength); - sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); - print_len = le16_to_cpu(symlink->PrintNameLength); - print_offset = le16_to_cpu(symlink->PrintNameOffset); - - if (err_iov.iov_len < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { - rc = -EINVAL; - goto querty_exit; - } - - if (err_iov.iov_len < - SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { - rc = -EINVAL; - goto querty_exit; - } - - *target_path = cifs_strndup_from_utf16( - (char *)symlink->PathBuffer + sub_offset, - sub_len, true, cifs_sb->local_nls); - if (!(*target_path)) { - rc = -ENOMEM; - goto querty_exit; - } - convert_delimiter(*target_path, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + rc = smb2_parse_symlink_response(cifs_sb, &err_iov, target_path); querty_exit: cifs_dbg(FYI, "query symlink rc %d\n", rc); @@ -4285,21 +4272,23 @@ init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign) static int smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) { + struct TCP_Server_Info *pserver; struct cifs_ses *ses; u8 *ses_enc_key; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->Suid == ses_id) { - spin_lock(&ses->ses_lock); - ses_enc_key = enc ? ses->smb3encryptionkey : - ses->smb3decryptionkey; - memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); - spin_unlock(&ses->ses_lock); - spin_unlock(&cifs_tcp_ses_lock); - return 0; - } + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (ses->Suid == ses_id) { + spin_lock(&ses->ses_lock); + ses_enc_key = enc ? ses->smb3encryptionkey : + ses->smb3decryptionkey; + memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); + spin_unlock(&ses->ses_lock); + spin_unlock(&cifs_tcp_ses_lock); + return 0; } } spin_unlock(&cifs_tcp_ses_lock); @@ -4344,8 +4333,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - tfm = enc ? server->secmech.ccmaesencrypt : - server->secmech.ccmaesdecrypt; + tfm = enc ? server->secmech.enc : server->secmech.dec; if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) @@ -4410,11 +4398,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree(iv); + kfree_sensitive(iv); free_sg: - kfree(sg); + kfree_sensitive(sg); free_req: - kfree(req); + kfree_sensitive(req); return rc; } @@ -5102,7 +5090,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); int rc = -EPERM; - FILE_ALL_INFO *buf = NULL; + struct cifs_open_info_data buf = {}; struct cifs_io_parms io_parms = {0}; __u32 oplock = 0; struct cifs_fid fid; @@ -5118,7 +5106,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, * and was used by default in earlier versions of Windows */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - goto out; + return rc; /* * TODO: Add ability to create instead via reparse point. Windows (e.g. @@ -5127,16 +5115,10 @@ smb2_make_node(unsigned int xid, struct inode *inode, */ if (!S_ISCHR(mode) && !S_ISBLK(mode)) - goto out; + return rc; cifs_dbg(FYI, "sfu compat create special file\n"); - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto out; - } - oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; @@ -5151,21 +5133,21 @@ smb2_make_node(unsigned int xid, struct inode *inode, oplock = REQ_OPLOCK; else oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf); if (rc) - goto out; + return rc; /* * BB Do not bother to decode buf since no local inode yet to put * timestamps in, but we can reuse it safely. */ - pdev = (struct win_dev *)buf; + pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = buf; + iov[1].iov_base = &buf.fi; iov[1].iov_len = sizeof(struct win_dev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); @@ -5184,8 +5166,8 @@ smb2_make_node(unsigned int xid, struct inode *inode, d_drop(dentry); /* FIXME: add code here to set EAs */ -out: - kfree(buf); + + cifs_free_open_info(&buf); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 6352ab32c7e7..a5695748a89b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt) /* * Context Data length must be rounded to multiple of 8 for some servers */ - pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP( - sizeof(struct smb2_signing_capabilities) - - sizeof(struct smb2_neg_context) + - (num_algs * 2 /* sizeof u16 */), 8) * 8); + pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct smb2_signing_capabilities) - + sizeof(struct smb2_neg_context) + + (num_algs * sizeof(u16)), 8)); pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs); pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC); - ctxt_len += 2 /* sizeof le16 */ * num_algs; - ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8; + ctxt_len += sizeof(__le16) * num_algs; + ctxt_len = ALIGN(ctxt_len, 8); return ctxt_len; /* TBD add SIGNING_ALG_AES_GMAC and/or SIGNING_ALG_HMAC_SHA256 */ } @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname) /* copy up to max of first 100 bytes of server name to NetName field */ pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp)); /* context size is DataLength + minimal smb2_neg_context */ - return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) + - sizeof(struct smb2_neg_context), 8) * 8; + return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8); } static void @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, * round up total_len of fixed part of SMB3 negotiate request to 8 * byte boundary before adding negotiate contexts */ - *total_len = roundup(*total_len, 8); + *total_len = ALIGN(*total_len, 8); pneg_ctxt = (*total_len) + (char *)req; req->NegotiateContextOffset = cpu_to_le32(*total_len); build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); - ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); - ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, if (server->compress_algorithm) { build_compression_ctxt((struct smb2_compression_capabilities_context *) pneg_ctxt); - ctxt_len = DIV_ROUND_UP( - sizeof(struct smb2_compression_capabilities_context), - 8) * 8; + ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8); *total_len += ctxt_len; pneg_ctxt += ctxt_len; neg_context_count++; @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, if (rc) break; /* offsets must be 8 byte aligned */ - clen = (clen + 7) & ~0x7; + clen = ALIGN(clen, 8); offset += clen + sizeof(struct smb2_neg_context); len_of_ctxts -= clen; } @@ -873,7 +869,7 @@ SMB2_negotiate(const unsigned int xid, struct smb2_negotiate_rsp *rsp; struct kvec iov[1]; struct kvec rsp_iov; - int rc = 0; + int rc; int resp_buftype; int blob_offset, blob_length; char *security_blob; @@ -1169,9 +1165,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->Dialects[0] = cpu_to_le16(server->vals->protocol_id); pneg_inbuf->DialectCount = cpu_to_le16(1); - /* structure is big enough for 3 dialects, sending only 1 */ + /* structure is big enough for 4 dialects, sending only 1 */ inbuflen = sizeof(*pneg_inbuf) - - sizeof(pneg_inbuf->Dialects[0]) * 2; + sizeof(pneg_inbuf->Dialects[0]) * 3; } rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, @@ -1345,7 +1341,13 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) static void SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data) { - free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + struct kvec *iov = sess_data->iov; + + /* iov[1] is already freed by caller */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && iov[0].iov_base) + memzero_explicit(iov[0].iov_base, iov[0].iov_len); + + free_rsp_buf(sess_data->buf0_type, iov[0].iov_base); sess_data->buf0_type = CIFS_NO_BUFFER; } @@ -1477,6 +1479,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) out_put_spnego_key: key_invalidate(spnego_key); key_put(spnego_key); + if (rc) + kfree_sensitive(ses->auth_key.response); out: sess_data->result = rc; sess_data->func = NULL; @@ -1526,7 +1530,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) &blob_length, ses, server, sess_data->nls_cp); if (rc) - goto out_err; + goto out; if (use_spnego) { /* BB eventually need to add this */ @@ -1573,7 +1577,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) } out: - kfree(ntlmssp_blob); + kfree_sensitive(ntlmssp_blob); SMB2_sess_free_buffer(sess_data); if (!rc) { sess_data->result = 0; @@ -1581,7 +1585,7 @@ out: return; } out_err: - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->result = rc; sess_data->func = NULL; @@ -1657,9 +1661,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) } #endif out: - kfree(ntlmssp_blob); + kfree_sensitive(ntlmssp_blob); SMB2_sess_free_buffer(sess_data); - kfree(ses->ntlmssp); + kfree_sensitive(ses->ntlmssp); ses->ntlmssp = NULL; sess_data->result = rc; sess_data->func = NULL; @@ -1737,7 +1741,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, cifs_server_dbg(VFS, "signing requested but authenticated as guest\n"); rc = sess_data->result; out: - kfree(sess_data); + kfree_sensitive(sess_data); return rc; } @@ -1930,7 +1934,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); - strscpy(tcon->treeName, tree, sizeof(tcon->treeName)); + strscpy(tcon->tree_name, tree, sizeof(tcon->tree_name)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) @@ -1973,6 +1977,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if (!ses || !(ses->server)) return -EIO; + trace_smb3_tdis_enter(xid, tcon->tid, ses->Suid, tcon->tree_name); spin_lock(&ses->chan_lock); if ((tcon->need_reconnect) || (CIFS_ALL_CHANS_NEED_RECONNECT(tcon->ses))) { @@ -2004,8 +2009,11 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) rc = cifs_send_recv(xid, ses, ses->server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - if (rc) + if (rc) { cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); + trace_smb3_tdis_err(xid, tcon->tid, ses->Suid, rc); + } + trace_smb3_tdis_done(xid, tcon->tid, ses->Suid); return rc; } @@ -2411,9 +2419,9 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) unsigned int acelen, acl_size, ace_count; unsigned int owner_offset = 0; unsigned int group_offset = 0; - struct smb3_acl acl; + struct smb3_acl acl = {}; - *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); + *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); if (set_owner) { /* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */ @@ -2484,10 +2492,11 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */ acl.AclSize = cpu_to_le16(acl_size); acl.AceCount = cpu_to_le16(ace_count); + /* acl.Sbz1 and Sbz2 MBZ so are not set here, but initialized above */ memcpy(aclptr, &acl, sizeof(struct smb3_acl)); buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); - *len = roundup(ptr - (__u8 *)buf, 8); + *len = round_up((unsigned int)(ptr - (__u8 *)buf), 8); return buf; } @@ -2581,7 +2590,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, * final path needs to be 8-byte aligned as specified in * MS-SMB2 2.2.13 SMB2 CREATE Request. */ - *out_size = roundup(*out_len * sizeof(__le16), 8); + *out_size = round_up(*out_len * sizeof(__le16), 8); *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL); if (!*out_path) return -ENOMEM; @@ -2674,7 +2683,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, - tcon->treeName, utf16_path); + tcon->tree_name, utf16_path); if (rc) goto err_free_req; @@ -2816,7 +2825,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, - tcon->treeName, path); + tcon->tree_name, path); if (rc) return rc; req->NameLength = cpu_to_le16(name_len * 2); @@ -2826,9 +2835,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; /* MUST set path len (NameLength) to 0 opening root of share */ req->NameLength = cpu_to_le16(uni_path_len - 2); - copy_size = uni_path_len; - if (copy_size % 8 != 0) - copy_size = roundup(copy_size, 8); + copy_size = round_up(uni_path_len, 8); copy_path = kzalloc(copy_size, GFP_KERNEL); if (!copy_path) return -ENOMEM; @@ -3011,7 +3018,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, oparms->create_options, oparms->desired_access, rc); if (rc == -EREMCHG) { pr_warn_once("server share %s deleted\n", - tcon->treeName); + tcon->tree_name); tcon->need_reconnect = true; } goto creat_exit; @@ -3472,7 +3479,7 @@ smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, if (rc) return rc; - memcpy(data, begin_of_buf, buffer_length); + memcpy(data, begin_of_buf, minbufsize); return 0; } @@ -3596,7 +3603,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), - &rsp_iov, min_len, *data); + &rsp_iov, dlen ? *dlen : min_len, *data); if (rc && allocated) { kfree(*data); *data = NULL; @@ -3702,11 +3709,13 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, bool watch_tree, - u32 completion_filter) + u32 completion_filter, u32 max_out_data_len, char **out_data, + u32 *plen /* returned data len */) { struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = cifs_pick_channel(ses); struct smb_rqst rqst; + struct smb2_change_notify_rsp *smb_rsp; struct kvec iov[1]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; @@ -3722,6 +3731,9 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, memset(&rqst, 0, sizeof(struct smb_rqst)); memset(&iov, 0, sizeof(iov)); + if (plen) + *plen = 0; + rqst.rq_iov = iov; rqst.rq_nvec = 1; @@ -3740,9 +3752,28 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, cifs_stats_fail_inc(tcon, SMB2_CHANGE_NOTIFY_HE); trace_smb3_notify_err(xid, persistent_fid, tcon->tid, ses->Suid, (u8)watch_tree, completion_filter, rc); - } else + } else { trace_smb3_notify_done(xid, persistent_fid, tcon->tid, - ses->Suid, (u8)watch_tree, completion_filter); + ses->Suid, (u8)watch_tree, completion_filter); + /* validate that notify information is plausible */ + if ((rsp_iov.iov_base == NULL) || + (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp))) + goto cnotify_exit; + + smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base; + + smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset), + le32_to_cpu(smb_rsp->OutputBufferLength), &rsp_iov, + sizeof(struct file_notify_information)); + + *out_data = kmemdup((char *)smb_rsp + le16_to_cpu(smb_rsp->OutputBufferOffset), + le32_to_cpu(smb_rsp->OutputBufferLength), GFP_KERNEL); + if (*out_data == NULL) { + rc = -ENOMEM; + goto cnotify_exit; + } else + *plen = le32_to_cpu(smb_rsp->OutputBufferLength); + } cnotify_exit: if (rqst.rq_iov) @@ -4090,7 +4121,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, if (request_type & CHAINED_REQUEST) { if (!(request_type & END_OF_CHAIN)) { /* next 8-byte aligned request */ - *total_len = DIV_ROUND_UP(*total_len, 8) * 8; + *total_len = ALIGN(*total_len, 8); shdr->NextCommand = cpu_to_le32(*total_len); } else /* END_OF_CHAIN */ shdr->NextCommand = 0; @@ -4429,7 +4460,7 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->bytes, wdata->result); if (wdata->result == -ENOSPC) pr_warn_once("Out of space writing to %s\n", - tcon->treeName); + tcon->tree_name); } else trace_smb3_write_done(0 /* no xid */, wdata->cfile->fid.persistent_fid, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index f57881b8464f..1237bb86e93a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -56,6 +56,9 @@ struct smb2_rdma_crypto_transform { #define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL +#define SMB2_SYMLINK_STRUCT_SIZE \ + (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) + #define SYMLINK_ERROR_TAG 0x4c4d5953 struct smb2_symlink_err_rsp { diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 3f740f24b96a..be21b5d26f67 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -53,16 +53,12 @@ extern bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv); extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); - -extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, - struct smb2_file_all_info *src); extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); -extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, FILE_ALL_INFO *data, - bool *adjust_tz, bool *symlink); +int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, __u64 size, struct cifs_sb_info *cifs_sb, bool set_alloc); @@ -95,9 +91,9 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_read); -extern int smb2_open_file(const unsigned int xid, - struct cifs_open_parms *oparms, - __u32 *oplock, FILE_ALL_INFO *buf); +int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path); +int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, + void *buf); extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); @@ -148,7 +144,8 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon, extern void SMB2_ioctl_free(struct smb_rqst *rqst); extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, bool watch_tree, - u32 completion_filter); + u32 completion_filter, u32 max_out_data_len, + char **out_data, u32 *plen /* returned data len */); extern int __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, @@ -278,9 +275,9 @@ extern int smb2_query_info_compound(const unsigned int xid, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb); /* query path info from the server using SMB311 POSIX extensions*/ -extern int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *sb, const char *path, struct smb311_posix_qinfo *qinf, - bool *adjust_tx, bool *symlink); +int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 1a5fc3314dbf..381babc1212c 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -32,19 +32,17 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) struct cifs_secmech *p = &server->secmech; int rc; - rc = cifs_alloc_hash("hmac(sha256)", - &p->hmacsha256, - &p->sdeschmacsha256); + rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); if (rc) goto err; - rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes); + rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); if (rc) goto err; return 0; err: - cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256); + cifs_free_hash(&p->hmacsha256); return rc; } @@ -54,25 +52,23 @@ smb311_crypto_shash_allocate(struct TCP_Server_Info *server) struct cifs_secmech *p = &server->secmech; int rc = 0; - rc = cifs_alloc_hash("hmac(sha256)", - &p->hmacsha256, - &p->sdeschmacsha256); + rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); if (rc) return rc; - rc = cifs_alloc_hash("cmac(aes)", &p->cmacaes, &p->sdesccmacaes); + rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); if (rc) goto err; - rc = cifs_alloc_hash("sha512", &p->sha512, &p->sdescsha512); + rc = cifs_alloc_hash("sha512", &p->sha512); if (rc) goto err; return 0; err: - cifs_free_hash(&p->cmacaes, &p->sdesccmacaes); - cifs_free_hash(&p->hmacsha256, &p->sdeschmacsha256); + cifs_free_hash(&p->aes_cmac); + cifs_free_hash(&p->hmacsha256); return rc; } @@ -81,18 +77,19 @@ static int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) { struct cifs_chan *chan; + struct TCP_Server_Info *pserver; struct cifs_ses *ses = NULL; - struct TCP_Server_Info *it = NULL; int i; int rc = 0; spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(it, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each_entry(ses, &it->smb_ses_list, smb_ses_list) { - if (ses->Suid == ses_id) - goto found; - } + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + if (ses->Suid == ses_id) + goto found; } cifs_server_dbg(VFS, "%s: Could not find session 0x%llx\n", __func__, ses_id); @@ -140,9 +137,13 @@ out: static struct cifs_ses * smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) { + struct TCP_Server_Info *pserver; struct cifs_ses *ses; - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { if (ses->Suid != ses_id) continue; ++ses->ses_count; @@ -219,34 +220,30 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; - struct shash_desc *shash; - struct crypto_shash *hash; - struct sdesc *sdesc = NULL; + struct shash_desc *shash = NULL; struct smb_rqst drqst; ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); - if (!ses) { + if (unlikely(!ses)) { cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); - return 0; + return -ENOENT; } memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); if (allocate_crypto) { - rc = cifs_alloc_hash("hmac(sha256)", &hash, &sdesc); + rc = cifs_alloc_hash("hmac(sha256)", &shash); if (rc) { cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__); goto out; } - shash = &sdesc->shash; } else { - hash = server->secmech.hmacsha256; - shash = &server->secmech.sdeschmacsha256->shash; + shash = server->secmech.hmacsha256; } - rc = crypto_shash_setkey(hash, ses->auth_key.response, + rc = crypto_shash_setkey(shash->tfm, ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_server_dbg(VFS, @@ -288,7 +285,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) - cifs_free_hash(&hash, &sdesc); + cifs_free_hash(&shash); if (ses) cifs_put_smb_ses(ses); return rc; @@ -315,42 +312,38 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, goto smb3signkey_ret; } - rc = crypto_shash_setkey(server->secmech.hmacsha256, + rc = crypto_shash_setkey(server->secmech.hmacsha256->tfm, ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + rc = crypto_shash_init(server->secmech.hmacsha256); if (rc) { cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - i, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, i, 4); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - label.iov_base, label.iov_len); + rc = crypto_shash_update(server->secmech.hmacsha256, label.iov_base, label.iov_len); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - &zero, 1); + rc = crypto_shash_update(server->secmech.hmacsha256, &zero, 1); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - context.iov_base, context.iov_len); + rc = crypto_shash_update(server->secmech.hmacsha256, context.iov_base, context.iov_len); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; @@ -358,19 +351,16 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - L256, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, L256, 4); } else { - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - L128, 4); + rc = crypto_shash_update(server->secmech.hmacsha256, L128, 4); } if (rc) { cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, - hashptr); + rc = crypto_shash_final(server->secmech.hmacsha256, hashptr); if (rc) { cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); goto smb3signkey_ret; @@ -550,38 +540,35 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; - struct shash_desc *shash; - struct crypto_shash *hash; - struct sdesc *sdesc = NULL; + struct shash_desc *shash = NULL; struct smb_rqst drqst; u8 key[SMB3_SIGN_KEY_SIZE]; rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); - if (rc) - return 0; + if (unlikely(rc)) { + cifs_server_dbg(VFS, "%s: Could not get signing key\n", __func__); + return rc; + } if (allocate_crypto) { - rc = cifs_alloc_hash("cmac(aes)", &hash, &sdesc); + rc = cifs_alloc_hash("cmac(aes)", &shash); if (rc) return rc; - - shash = &sdesc->shash; } else { - hash = server->secmech.cmacaes; - shash = &server->secmech.sdesccmacaes->shash; + shash = server->secmech.aes_cmac; } memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); - rc = crypto_shash_setkey(hash, key, SMB2_CMACAES_SIZE); + rc = crypto_shash_setkey(shash->tfm, key, SMB2_CMACAES_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); goto out; } /* - * we already allocate sdesccmacaes when we init smb3 signing key, + * we already allocate aes_cmac when we init smb3 signing key, * so unlike smb2 case we do not have to check here if secmech are * initialized */ @@ -617,7 +604,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) - cifs_free_hash(&hash, &sdesc); + cifs_free_hash(&shash); return rc; } @@ -902,7 +889,7 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) { struct crypto_aead *tfm; - if (!server->secmech.ccmaesencrypt) { + if (!server->secmech.enc) { if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); @@ -913,23 +900,23 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) __func__); return PTR_ERR(tfm); } - server->secmech.ccmaesencrypt = tfm; + server->secmech.enc = tfm; } - if (!server->secmech.ccmaesdecrypt) { + if (!server->secmech.dec) { if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); if (IS_ERR(tfm)) { - crypto_free_aead(server->secmech.ccmaesencrypt); - server->secmech.ccmaesencrypt = NULL; + crypto_free_aead(server->secmech.enc); + server->secmech.enc = NULL; cifs_server_dbg(VFS, "%s: Failed to alloc decrypt aead\n", __func__); return PTR_ERR(tfm); } - server->secmech.ccmaesdecrypt = tfm; + server->secmech.dec = tfm; } return 0; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5fbbec22bcc8..90789aaa6567 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -90,7 +90,7 @@ int smbd_max_send_size = 1364; int smbd_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ -int smbd_max_receive_size = 8192; +int smbd_max_receive_size = 1364; /* The timeout to initiate send of a keepalive message on idle */ int smbd_keep_alive_interval = 120; @@ -99,7 +99,7 @@ int smbd_keep_alive_interval = 120; * User configurable initial values for RDMA transport * The actual values used may be lower and are limited to hardware capabilities */ -/* Default maximum number of SGEs in a RDMA write/read */ +/* Default maximum number of pages in a single RDMA write/read */ int smbd_max_frmr_depth = 2048; /* If payload is less than this byte, use RDMA send/recv not read/write */ @@ -270,7 +270,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) struct smbd_request *request = container_of(wc->wr_cqe, struct smbd_request, cqe); - log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n", + log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n", request, wc->status); if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { @@ -448,7 +448,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbd_connection *info = response->info; int data_length = 0; - log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%x\n", + log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", response, response->type, wc->status, wc->opcode, wc->byte_len, wc->pkey_index); @@ -723,7 +723,7 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n", + log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n", request->sge[0].addr, request->sge[0].length, request->sge[0].lkey); @@ -792,7 +792,7 @@ static int smbd_post_send(struct smbd_connection *info, for (i = 0; i < request->num_sge; i++) { log_rdma_send(INFO, - "rdma_request sge[%d] addr=%llu length=%u\n", + "rdma_request sge[%d] addr=0x%llx length=%u\n", i, request->sge[i].addr, request->sge[i].length); ib_dma_sync_single_for_device( info->id->device, @@ -1017,9 +1017,9 @@ static int smbd_post_send_data( { int i; u32 data_length = 0; - struct scatterlist sgl[SMBDIRECT_MAX_SGE]; + struct scatterlist sgl[SMBDIRECT_MAX_SEND_SGE - 1]; - if (n_vec > SMBDIRECT_MAX_SGE) { + if (n_vec > SMBDIRECT_MAX_SEND_SGE - 1) { cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); return -EINVAL; } @@ -1079,7 +1079,7 @@ static int smbd_negotiate(struct smbd_connection *info) response->type = SMBD_NEGOTIATE_RESP; rc = smbd_post_recv(info, response); - log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x iov.lkey=%x\n", + log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", rc, response->sge.addr, response->sge.length, response->sge.lkey); if (rc) @@ -1539,7 +1539,7 @@ static struct smbd_connection *_smbd_get_connection( if (smbd_send_credit_target > info->id->device->attrs.max_cqe || smbd_send_credit_target > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", smbd_send_credit_target, info->id->device->attrs.max_cqe, info->id->device->attrs.max_qp_wr); @@ -1548,7 +1548,7 @@ static struct smbd_connection *_smbd_get_connection( if (smbd_receive_credit_max > info->id->device->attrs.max_cqe || smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cpe %d max_qp_wr %d\n", + log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", smbd_receive_credit_max, info->id->device->attrs.max_cqe, info->id->device->attrs.max_qp_wr); @@ -1562,17 +1562,15 @@ static struct smbd_connection *_smbd_get_connection( info->max_receive_size = smbd_max_receive_size; info->keep_alive_interval = smbd_keep_alive_interval; - if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SGE) { + if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE || + info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) { log_rdma_event(ERR, - "warning: device max_send_sge = %d too small\n", - info->id->device->attrs.max_send_sge); - log_rdma_event(ERR, "Queue Pair creation may fail\n"); - } - if (info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_SGE) { - log_rdma_event(ERR, - "warning: device max_recv_sge = %d too small\n", + "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", + IB_DEVICE_NAME_MAX, + info->id->device->name, + info->id->device->attrs.max_send_sge, info->id->device->attrs.max_recv_sge); - log_rdma_event(ERR, "Queue Pair creation may fail\n"); + goto config_failed; } info->send_cq = NULL; @@ -1598,8 +1596,8 @@ static struct smbd_connection *_smbd_get_connection( qp_attr.qp_context = info; qp_attr.cap.max_send_wr = info->send_credit_target; qp_attr.cap.max_recv_wr = info->receive_credit_max; - qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE; - qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE; + qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE; + qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE; qp_attr.cap.max_inline_data = 0; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; @@ -1986,10 +1984,11 @@ int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst_array) { struct smbd_connection *info = server->smbd_conn; - struct kvec vec; + struct kvec vecs[SMBDIRECT_MAX_SEND_SGE - 1]; int nvecs; int size; unsigned int buflen, remaining_data_length; + unsigned int offset, remaining_vec_data_length; int start, i, j; int max_iov_size = info->max_send_size - sizeof(struct smbd_data_transfer); @@ -1998,10 +1997,8 @@ int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst; int rqst_idx; - if (info->transport_status != SMBD_CONNECTED) { - rc = -EAGAIN; - goto done; - } + if (info->transport_status != SMBD_CONNECTED) + return -EAGAIN; /* * Add in the page array if there is one. The caller needs to set @@ -2012,125 +2009,95 @@ int smbd_send(struct TCP_Server_Info *server, for (i = 0; i < num_rqst; i++) remaining_data_length += smb_rqst_len(server, &rqst_array[i]); - if (remaining_data_length > info->max_fragmented_send_size) { + if (unlikely(remaining_data_length > info->max_fragmented_send_size)) { + /* assertion: payload never exceeds negotiated maximum */ log_write(ERR, "payload size %d > max size %d\n", remaining_data_length, info->max_fragmented_send_size); - rc = -EINVAL; - goto done; + return -EINVAL; } log_write(INFO, "num_rqst=%d total length=%u\n", num_rqst, remaining_data_length); rqst_idx = 0; -next_rqst: - rqst = &rqst_array[rqst_idx]; - iov = rqst->rq_iov; - - cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", - rqst_idx, smb_rqst_len(server, rqst)); - for (i = 0; i < rqst->rq_nvec; i++) - dump_smb(iov[i].iov_base, iov[i].iov_len); - - - log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", - rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, - rqst->rq_tailsz, smb_rqst_len(server, rqst)); - - start = i = 0; - buflen = 0; - while (true) { - buflen += iov[i].iov_len; - if (buflen > max_iov_size) { - if (i > start) { - remaining_data_length -= - (buflen-iov[i].iov_len); - log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", - start, i, i - start, - remaining_data_length); - rc = smbd_post_send_data( - info, &iov[start], i-start, - remaining_data_length); - if (rc) - goto done; - } else { - /* iov[start] is too big, break it */ - nvecs = (buflen+max_iov_size-1)/max_iov_size; - log_write(INFO, "iov[%d] iov_base=%p buflen=%d break to %d vectors\n", - start, iov[start].iov_base, - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - vec.iov_base = - (char *)iov[start].iov_base + - j*max_iov_size; - vec.iov_len = max_iov_size; - if (j == nvecs-1) - vec.iov_len = - buflen - - max_iov_size*(nvecs-1); - remaining_data_length -= vec.iov_len; - log_write(INFO, - "sending vec j=%d iov_base=%p iov_len=%zu remaining_data_length=%d\n", - j, vec.iov_base, vec.iov_len, - remaining_data_length); - rc = smbd_post_send_data( - info, &vec, 1, - remaining_data_length); - if (rc) - goto done; + do { + rqst = &rqst_array[rqst_idx]; + iov = rqst->rq_iov; + + cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", + rqst_idx, smb_rqst_len(server, rqst)); + remaining_vec_data_length = 0; + for (i = 0; i < rqst->rq_nvec; i++) { + remaining_vec_data_length += iov[i].iov_len; + dump_smb(iov[i].iov_base, iov[i].iov_len); + } + + log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", + rqst_idx, rqst->rq_nvec, + rqst->rq_npages, rqst->rq_pagesz, + rqst->rq_tailsz, smb_rqst_len(server, rqst)); + + start = 0; + offset = 0; + do { + buflen = 0; + i = start; + j = 0; + while (i < rqst->rq_nvec && + j < SMBDIRECT_MAX_SEND_SGE - 1 && + buflen < max_iov_size) { + + vecs[j].iov_base = iov[i].iov_base + offset; + if (buflen + iov[i].iov_len > max_iov_size) { + vecs[j].iov_len = + max_iov_size - iov[i].iov_len; + buflen = max_iov_size; + offset = vecs[j].iov_len; + } else { + vecs[j].iov_len = + iov[i].iov_len - offset; + buflen += vecs[j].iov_len; + offset = 0; + ++i; } - i++; - if (i == rqst->rq_nvec) - break; + ++j; } + + remaining_vec_data_length -= buflen; + remaining_data_length -= buflen; + log_write(INFO, "sending %s iov[%d] from start=%d nvecs=%d remaining_data_length=%d\n", + remaining_vec_data_length > 0 ? + "partial" : "complete", + rqst->rq_nvec, start, j, + remaining_data_length); + start = i; - buflen = 0; - } else { - i++; - if (i == rqst->rq_nvec) { - /* send out all remaining vecs */ - remaining_data_length -= buflen; - log_write(INFO, "sending iov[] from start=%d i=%d nvecs=%d remaining_data_length=%d\n", - start, i, i - start, + rc = smbd_post_send_data(info, vecs, j, remaining_data_length); + if (rc) + goto done; + } while (remaining_vec_data_length > 0); + + /* now sending pages if there are any */ + for (i = 0; i < rqst->rq_npages; i++) { + rqst_page_get_length(rqst, i, &buflen, &offset); + nvecs = (buflen + max_iov_size - 1) / max_iov_size; + log_write(INFO, "sending pages buflen=%d nvecs=%d\n", + buflen, nvecs); + for (j = 0; j < nvecs; j++) { + size = min_t(unsigned int, max_iov_size, remaining_data_length); + remaining_data_length -= size; + log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", + i, j * max_iov_size + offset, size, remaining_data_length); - rc = smbd_post_send_data(info, &iov[start], - i-start, remaining_data_length); + rc = smbd_post_send_page( + info, rqst->rq_pages[i], + j*max_iov_size + offset, + size, remaining_data_length); if (rc) goto done; - break; } } - log_write(INFO, "looping i=%d buflen=%d\n", i, buflen); - } - - /* now sending pages if there are any */ - for (i = 0; i < rqst->rq_npages; i++) { - unsigned int offset; - - rqst_page_get_length(rqst, i, &buflen, &offset); - nvecs = (buflen + max_iov_size - 1) / max_iov_size; - log_write(INFO, "sending pages buflen=%d nvecs=%d\n", - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - size = max_iov_size; - if (j == nvecs-1) - size = buflen - j*max_iov_size; - remaining_data_length -= size; - log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", - i, j * max_iov_size + offset, size, - remaining_data_length); - rc = smbd_post_send_page( - info, rqst->rq_pages[i], - j*max_iov_size + offset, - size, remaining_data_length); - if (rc) - goto done; - } - } - - rqst_idx++; - if (rqst_idx < num_rqst) - goto next_rqst; + } while (++rqst_idx < num_rqst); done: /* diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h index a87fca82a796..207ef979cd51 100644 --- a/fs/cifs/smbdirect.h +++ b/fs/cifs/smbdirect.h @@ -91,7 +91,7 @@ struct smbd_connection { /* Memory registrations */ /* Maximum number of RDMA read/write outstanding on this connection */ int responder_resources; - /* Maximum number of SGEs in a RDMA write/read */ + /* Maximum number of pages in a single RDMA write/read on this connection */ int max_frmr_depth; /* * If payload is less than or equal to the threshold, @@ -225,21 +225,25 @@ struct smbd_buffer_descriptor_v1 { __le32 length; } __packed; -/* Default maximum number of SGEs in a RDMA send/recv */ -#define SMBDIRECT_MAX_SGE 16 +/* Maximum number of SGEs used by smbdirect.c in any send work request */ +#define SMBDIRECT_MAX_SEND_SGE 6 + /* The context for a SMBD request */ struct smbd_request { struct smbd_connection *info; struct ib_cqe cqe; - /* the SGE entries for this packet */ - struct ib_sge sge[SMBDIRECT_MAX_SGE]; + /* the SGE entries for this work request */ + struct ib_sge sge[SMBDIRECT_MAX_SEND_SGE]; int num_sge; /* SMBD packet header follows this structure */ u8 packet[]; }; +/* Maximum number of SGEs used by smbdirect.c in any receive work request */ +#define SMBDIRECT_MAX_RECV_SGE 1 + /* The context for a SMBD response */ struct smbd_response { struct smbd_connection *info; diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 6b88dc2e364f..110070ba8b04 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -372,6 +372,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, @@ -409,6 +410,7 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); DECLARE_EVENT_CLASS(smb3_inf_compound_err_class, @@ -451,6 +453,7 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); /* * For logging SMB3 Status code and Command for responses which return errors diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 9a2753e21170..575fa8f58342 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -753,8 +753,9 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_freezekillable_unsafe(server->response_q, - midQ->mid_state != MID_REQUEST_SUBMITTED); + error = wait_event_state(server->response_q, + midQ->mid_state != MID_REQUEST_SUBMITTED, + (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE)); if (error < 0) return -ERESTARTSYS; diff --git a/fs/coredump.c b/fs/coredump.c index 9f4aae202109..7bad7785e8e6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -354,7 +354,7 @@ static int zap_process(struct task_struct *start, int exit_code) struct task_struct *t; int nr = 0; - /* ignore all signals except SIGKILL, see prepare_signal() */ + /* Allow SIGKILL, see prepare_signal() */ start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -402,9 +402,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) if (core_waiters > 0) { struct core_thread *ptr; - freezer_do_not_count(); - wait_for_completion(&core_state->startup); - freezer_count(); + wait_for_completion_state(&core_state->startup, + TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); /* * Wait for all the threads to become inactive, so that * all the thread context (extended register state, like @@ -412,7 +411,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state) */ ptr = core_state->dumper.next; while (ptr != NULL) { - wait_task_inactive(ptr->task, 0); + wait_task_inactive(ptr->task, TASK_ANY); ptr = ptr->next; } } @@ -832,6 +831,39 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) } } +static int dump_emit_page(struct coredump_params *cprm, struct page *page) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = 0, + .bv_len = PAGE_SIZE, + }; + struct iov_iter iter; + struct file *file = cprm->file; + loff_t pos; + ssize_t n; + + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + if (cprm->written + PAGE_SIZE > cprm->limit) + return 0; + if (dump_interrupted()) + return 0; + pos = file->f_pos; + iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE); + n = __kernel_write_iter(cprm->file, &iter, &pos); + if (n != PAGE_SIZE) + return 0; + file->f_pos = pos; + cprm->written += PAGE_SIZE; + cprm->pos += PAGE_SIZE; + + return 1; +} + int dump_emit(struct coredump_params *cprm, const void *addr, int nr) { if (cprm->to_skip) { @@ -863,7 +895,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page; - int stop; /* * To avoid having to allocate page tables for virtual address @@ -874,10 +905,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - void *kaddr = kmap_local_page(page); - - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap_local(kaddr); + int stop = !dump_emit_page(cprm, page); put_page(page); if (stop) return 0; @@ -1072,30 +1100,20 @@ whole: return vma->vm_end - vma->vm_start; } -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} - /* * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. */ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, +static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, + struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) + if (gate_vma && (vma == gate_vma)) return NULL; + + vma = mas_next(mas, ULONG_MAX); + if (vma) + return vma; return gate_vma; } @@ -1119,9 +1137,10 @@ static void free_vma_snapshot(struct coredump_params *cprm) */ static bool dump_vma_snapshot(struct coredump_params *cprm) { - struct vm_area_struct *vma, *gate_vma; + struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - int i; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int i = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1141,8 +1160,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma), i++) { + while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; @@ -1150,10 +1168,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); m->pgoff = vma->vm_pgoff; - m->file = vma->vm_file; if (m->file) get_file(m->file); + i++; } mmap_write_unlock(mm); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 2217fe5ece6f..1b4403136d05 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -25,21 +25,25 @@ * then this function isn't applicable. This function may sleep, so it must be * called from a workqueue rather than from the bio's bi_end_io callback. * - * This function sets PG_error on any pages that contain any blocks that failed - * to be decrypted. The filesystem must not mark such pages uptodate. + * Return: %true on success; %false on failure. On failure, bio->bi_status is + * also set to an error status. */ -void fscrypt_decrypt_bio(struct bio *bio) +bool fscrypt_decrypt_bio(struct bio *bio) { struct bio_vec *bv; struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, + int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, bv->bv_offset); - if (ret) - SetPageError(page); + + if (err) { + bio->bi_status = errno_to_blk_status(err); + return false; + } } + return true; } EXPORT_SYMBOL(fscrypt_decrypt_bio); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 3afdaa084773..d5f68a0c5d15 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -184,7 +184,7 @@ struct fscrypt_symlink_data { struct fscrypt_prepared_key { struct crypto_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - struct fscrypt_blk_crypto_key *blk_key; + struct blk_crypto_key *blk_key; #endif }; @@ -225,7 +225,7 @@ struct fscrypt_info { * will be NULL if the master key was found in a process-subscribed * keyring rather than in the filesystem-level keyring. */ - struct key *ci_master_key; + struct fscrypt_master_key *ci_master_key; /* * Link in list of inodes that were unlocked with the master key. @@ -344,7 +344,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci); -void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key); +void fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key); /* * Check whether the crypto transform or blk-crypto key has been allocated in @@ -390,7 +391,8 @@ fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, } static inline void -fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key) +fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { } @@ -437,6 +439,40 @@ struct fscrypt_master_key_secret { struct fscrypt_master_key { /* + * Back-pointer to the super_block of the filesystem to which this + * master key has been added. Only valid if ->mk_active_refs > 0. + */ + struct super_block *mk_sb; + + /* + * Link in ->mk_sb->s_master_keys->key_hashtable. + * Only valid if ->mk_active_refs > 0. + */ + struct hlist_node mk_node; + + /* Semaphore that protects ->mk_secret and ->mk_users */ + struct rw_semaphore mk_sem; + + /* + * Active and structural reference counts. An active ref guarantees + * that the struct continues to exist, continues to be in the keyring + * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g. + * ->mk_direct_keys) that have been prepared continue to exist. + * A structural ref only guarantees that the struct continues to exist. + * + * There is one active ref associated with ->mk_secret being present, + * and one active ref for each inode in ->mk_decrypted_inodes. + * + * There is one structural ref associated with the active refcount being + * nonzero. Finding a key in the keyring also takes a structural ref, + * which is then held temporarily while the key is operated on. + */ + refcount_t mk_active_refs; + refcount_t mk_struct_refs; + + struct rcu_head mk_rcu_head; + + /* * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is * executed, this is wiped and no new inodes can be unlocked with this * key; however, there may still be inodes in ->mk_decrypted_inodes @@ -444,7 +480,10 @@ struct fscrypt_master_key { * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again. * - * Locking: protected by this master key's key->sem. + * While ->mk_secret is present, one ref in ->mk_active_refs is held. + * + * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs + * associated with this field is protected by ->mk_sem as well. */ struct fscrypt_master_key_secret mk_secret; @@ -465,23 +504,13 @@ struct fscrypt_master_key { * * This is NULL for v1 policy keys; those can only be added by root. * - * Locking: in addition to this keyring's own semaphore, this is - * protected by this master key's key->sem, so we can do atomic - * search+insert. It can also be searched without taking any locks, but - * in that case the returned key may have already been removed. + * Locking: protected by ->mk_sem. (We don't just rely on the keyrings + * subsystem semaphore ->mk_users->sem, as we need support for atomic + * search+insert along with proper synchronization with ->mk_secret.) */ struct key *mk_users; /* - * Length of ->mk_decrypted_inodes, plus one if mk_secret is present. - * Once this goes to 0, the master key is removed from ->s_master_keys. - * The 'struct fscrypt_master_key' will continue to live as long as the - * 'struct key' whose payload it is, but we won't let this reference - * count rise again. - */ - refcount_t mk_refcount; - - /* * List of inodes that were unlocked using this key. This allows the * inodes to be evicted efficiently if the key is removed. */ @@ -506,10 +535,10 @@ static inline bool is_master_key_secret_present(const struct fscrypt_master_key_secret *secret) { /* - * The READ_ONCE() is only necessary for fscrypt_drop_inode() and - * fscrypt_key_describe(). These run in atomic context, so they can't - * take the key semaphore and thus 'secret' can change concurrently - * which would be a data race. But they only need to know whether the + * The READ_ONCE() is only necessary for fscrypt_drop_inode(). + * fscrypt_drop_inode() runs in atomic context, so it can't take the key + * semaphore and thus 'secret' can change concurrently which would be a + * data race. But fscrypt_drop_inode() only need to know whether the * secret *was* present at the time of check, so READ_ONCE() suffices. */ return READ_ONCE(secret->size) != 0; @@ -538,7 +567,11 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) return 0; } -struct key * +void fscrypt_put_master_key(struct fscrypt_master_key *mk); + +void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk); + +struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); @@ -569,7 +602,8 @@ extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci); -void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key); +void fscrypt_destroy_prepared_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 7c01025879b3..7b8c5a1104b5 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -5,8 +5,6 @@ * Encryption hooks for higher-level filesystem operations. */ -#include <linux/key.h> - #include "fscrypt_private.h" /** @@ -142,7 +140,6 @@ int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { struct fscrypt_info *ci; - struct key *key; struct fscrypt_master_key *mk; int err; @@ -158,14 +155,13 @@ int fscrypt_prepare_setflags(struct inode *inode, ci = inode->i_crypt_info; if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; - key = ci->ci_master_key; - mk = key->payload.data[0]; - down_read(&key->sem); + mk = ci->ci_master_key; + down_read(&mk->mk_sem); if (is_master_key_secret_present(&mk->mk_secret)) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; - up_read(&key->sem); + up_read(&mk->mk_sem); return err; } return 0; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 90f3e68f166e..cea8b14007e6 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -21,26 +21,22 @@ #include "fscrypt_private.h" -struct fscrypt_blk_crypto_key { - struct blk_crypto_key base; - int num_devs; - struct request_queue *devs[]; -}; - -static int fscrypt_get_num_devices(struct super_block *sb) +static struct block_device **fscrypt_get_devices(struct super_block *sb, + unsigned int *num_devs) { - if (sb->s_cop->get_num_devices) - return sb->s_cop->get_num_devices(sb); - return 1; -} + struct block_device **devs; -static void fscrypt_get_devices(struct super_block *sb, int num_devs, - struct request_queue **devs) -{ - if (num_devs == 1) - devs[0] = bdev_get_queue(sb->s_bdev); - else - sb->s_cop->get_devices(sb, devs); + if (sb->s_cop->get_devices) { + devs = sb->s_cop->get_devices(sb, num_devs); + if (devs) + return devs; + } + devs = kmalloc(sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); + devs[0] = sb->s_bdev; + *num_devs = 1; + return devs; } static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) @@ -74,15 +70,17 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) * helpful for debugging problems where the "wrong" implementation is used. */ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, - struct request_queue **devs, - int num_devs, + struct block_device **devs, + unsigned int num_devs, const struct blk_crypto_config *cfg) { - int i; + unsigned int i; for (i = 0; i < num_devs; i++) { + struct request_queue *q = bdev_get_queue(devs[i]); + if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(devs[i]->crypto_profile, cfg)) { + __blk_crypto_cfg_supported(q->crypto_profile, cfg)) { if (!xchg(&mode->logged_blk_crypto_native, 1)) pr_info("fscrypt: %s using blk-crypto (native)\n", mode->friendly_name); @@ -99,9 +97,9 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; struct blk_crypto_config crypto_cfg; - int num_devs; - struct request_queue **devs; - int i; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; /* The file must need contents encryption, not filenames encryption */ if (!S_ISREG(inode->i_mode)) @@ -129,20 +127,20 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) return 0; /* - * On all the filesystem's devices, blk-crypto must support the crypto - * configuration that the file would use. + * On all the filesystem's block devices, blk-crypto must support the + * crypto configuration that the file would use. */ crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; crypto_cfg.data_unit_size = sb->s_blocksize; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); - num_devs = fscrypt_get_num_devices(sb); - devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL); - if (!devs) - return -ENOMEM; - fscrypt_get_devices(sb, num_devs, devs); + + devs = fscrypt_get_devices(sb, &num_devs); + if (IS_ERR(devs)) + return PTR_ERR(devs); for (i = 0; i < num_devs; i++) { - if (!blk_crypto_config_supported(devs[i], &crypto_cfg)) + if (!blk_crypto_config_supported(bdev_get_queue(devs[i]), + &crypto_cfg)) goto out_free_devs; } @@ -162,49 +160,41 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode; - int num_devs = fscrypt_get_num_devices(sb); - int queue_refs = 0; - struct fscrypt_blk_crypto_key *blk_key; + struct blk_crypto_key *blk_key; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; int err; - int i; - blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL); + blk_key = kmalloc(sizeof(*blk_key), GFP_KERNEL); if (!blk_key) return -ENOMEM; - blk_key->num_devs = num_devs; - fscrypt_get_devices(sb, num_devs, blk_key->devs); - - err = blk_crypto_init_key(&blk_key->base, raw_key, crypto_mode, + err = blk_crypto_init_key(blk_key, raw_key, crypto_mode, fscrypt_get_dun_bytes(ci), sb->s_blocksize); if (err) { fscrypt_err(inode, "error %d initializing blk-crypto key", err); goto fail; } - /* - * We have to start using blk-crypto on all the filesystem's devices. - * We also have to save all the request_queue's for later so that the - * key can be evicted from them. This is needed because some keys - * aren't destroyed until after the filesystem was already unmounted - * (namely, the per-mode keys in struct fscrypt_master_key). - */ + /* Start using blk-crypto on all the filesystem's block devices. */ + devs = fscrypt_get_devices(sb, &num_devs); + if (IS_ERR(devs)) { + err = PTR_ERR(devs); + goto fail; + } for (i = 0; i < num_devs; i++) { - if (!blk_get_queue(blk_key->devs[i])) { - fscrypt_err(inode, "couldn't get request_queue"); - err = -EAGAIN; - goto fail; - } - queue_refs++; - - err = blk_crypto_start_using_key(&blk_key->base, - blk_key->devs[i]); - if (err) { - fscrypt_err(inode, - "error %d starting to use blk-crypto", err); - goto fail; - } + err = blk_crypto_start_using_key(blk_key, + bdev_get_queue(devs[i])); + if (err) + break; } + kfree(devs); + if (err) { + fscrypt_err(inode, "error %d starting to use blk-crypto", err); + goto fail; + } + /* * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). * I.e., here we publish ->blk_key with a RELEASE barrier so that @@ -215,24 +205,29 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, return 0; fail: - for (i = 0; i < queue_refs; i++) - blk_put_queue(blk_key->devs[i]); kfree_sensitive(blk_key); return err; } -void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key) +void fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { - struct fscrypt_blk_crypto_key *blk_key = prep_key->blk_key; - int i; + struct blk_crypto_key *blk_key = prep_key->blk_key; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; - if (blk_key) { - for (i = 0; i < blk_key->num_devs; i++) { - blk_crypto_evict_key(blk_key->devs[i], &blk_key->base); - blk_put_queue(blk_key->devs[i]); - } - kfree_sensitive(blk_key); + if (!blk_key) + return; + + /* Evict the key from all the filesystem's block devices. */ + devs = fscrypt_get_devices(sb, &num_devs); + if (!IS_ERR(devs)) { + for (i = 0; i < num_devs; i++) + blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key); + kfree(devs); } + kfree_sensitive(blk_key); } bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) @@ -282,7 +277,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, ci = inode->i_crypt_info; fscrypt_generate_dun(ci, first_lblk, dun); - bio_crypt_set_ctx(bio, &ci->ci_enc_key.blk_key->base, dun, gfp_mask); + bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); } EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); @@ -369,7 +364,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. */ - if (bc->bc_key != &inode->i_crypt_info->ci_enc_key.blk_key->base) + if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) return false; fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); @@ -401,46 +396,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio, EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh); /** - * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is - * supported as far as encryption is concerned - * @iocb: the file and position the I/O is targeting - * @iter: the I/O data segment(s) + * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an + * inode, as far as encryption is concerned + * @inode: the inode in question * * Return: %true if there are no encryption constraints that prevent DIO from * being supported; %false if DIO is unsupported. (Note that in the * %true case, the filesystem might have other, non-encryption-related - * constraints that prevent DIO from actually being supported.) + * constraints that prevent DIO from actually being supported. Also, on + * encrypted files the filesystem is still responsible for only allowing + * DIO when requests are filesystem-block-aligned.) */ -bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter) +bool fscrypt_dio_supported(struct inode *inode) { - const struct inode *inode = file_inode(iocb->ki_filp); - const unsigned int blocksize = i_blocksize(inode); + int err; /* If the file is unencrypted, no veto from us. */ if (!fscrypt_needs_contents_encryption(inode)) return true; - /* We only support DIO with inline crypto, not fs-layer crypto. */ - if (!fscrypt_inode_uses_inline_crypto(inode)) - return false; - /* - * Since the granularity of encryption is filesystem blocks, the file - * position and total I/O length must be aligned to the filesystem block - * size -- not just to the block device's logical block size as is - * traditionally the case for DIO on many filesystems. + * We only support DIO with inline crypto, not fs-layer crypto. * - * We require that the user-provided memory buffers be filesystem block - * aligned too. It is simpler to have a single alignment value required - * for all properties of the I/O, as is normally the case for DIO. - * Also, allowing less aligned buffers would imply that data units could - * cross bvecs, which would greatly complicate the I/O stack, which - * assumes that bios can be split at any bvec boundary. + * To determine whether the inode is using inline crypto, we have to set + * up the key if it wasn't already done. This is because in the current + * design of fscrypt, the decision of whether to use inline crypto or + * not isn't made until the inode's encryption key is being set up. In + * the DIO read/write case, the key will always be set up already, since + * the file will be open. But in the case of statx(), the key might not + * be set up yet, as the file might not have been opened yet. */ - if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize)) + err = fscrypt_require_key(inode); + if (err) { + /* + * Key unavailable or couldn't be set up. This edge case isn't + * worth worrying about; just report that DIO is unsupported. + */ return false; - - return true; + } + return fscrypt_inode_uses_inline_crypto(inode); } EXPORT_SYMBOL_GPL(fscrypt_dio_supported); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index caee9f8620dd..2a24b1f0ae68 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -18,6 +18,7 @@ * information about these ioctls. */ +#include <asm/unaligned.h> #include <crypto/skcipher.h> #include <linux/key-type.h> #include <linux/random.h> @@ -25,6 +26,18 @@ #include "fscrypt_private.h" +/* The master encryption keys for a filesystem (->s_master_keys) */ +struct fscrypt_keyring { + /* + * Lock that protects ->key_hashtable. It does *not* protect the + * fscrypt_master_key structs themselves. + */ + spinlock_t lock; + + /* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */ + struct hlist_head key_hashtable[128]; +}; + static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) { fscrypt_destroy_hkdf(&secret->hkdf); @@ -38,66 +51,81 @@ static void move_master_key_secret(struct fscrypt_master_key_secret *dst, memzero_explicit(src, sizeof(*src)); } -static void free_master_key(struct fscrypt_master_key *mk) +static void fscrypt_free_master_key(struct rcu_head *head) { - size_t i; - - wipe_master_key_secret(&mk->mk_secret); - - for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { - fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]); - fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]); - fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]); - } - - key_put(mk->mk_users); + struct fscrypt_master_key *mk = + container_of(head, struct fscrypt_master_key, mk_rcu_head); + /* + * The master key secret and any embedded subkeys should have already + * been wiped when the last active reference to the fscrypt_master_key + * struct was dropped; doing it here would be unnecessarily late. + * Nevertheless, use kfree_sensitive() in case anything was missed. + */ kfree_sensitive(mk); } -static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) +void fscrypt_put_master_key(struct fscrypt_master_key *mk) { - if (spec->__reserved) - return false; - return master_key_spec_len(spec) != 0; + if (!refcount_dec_and_test(&mk->mk_struct_refs)) + return; + /* + * No structural references left, so free ->mk_users, and also free the + * fscrypt_master_key struct itself after an RCU grace period ensures + * that concurrent keyring lookups can no longer find it. + */ + WARN_ON(refcount_read(&mk->mk_active_refs) != 0); + key_put(mk->mk_users); + mk->mk_users = NULL; + call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } -static int fscrypt_key_instantiate(struct key *key, - struct key_preparsed_payload *prep) +void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) { - key->payload.data[0] = (struct fscrypt_master_key *)prep->data; - return 0; -} + struct super_block *sb = mk->mk_sb; + struct fscrypt_keyring *keyring = sb->s_master_keys; + size_t i; -static void fscrypt_key_destroy(struct key *key) -{ - free_master_key(key->payload.data[0]); -} + if (!refcount_dec_and_test(&mk->mk_active_refs)) + return; + /* + * No active references left, so complete the full removal of this + * fscrypt_master_key struct by removing it from the keyring and + * destroying any subkeys embedded in it. + */ -static void fscrypt_key_describe(const struct key *key, struct seq_file *m) -{ - seq_puts(m, key->description); + spin_lock(&keyring->lock); + hlist_del_rcu(&mk->mk_node); + spin_unlock(&keyring->lock); - if (key_is_positive(key)) { - const struct fscrypt_master_key *mk = key->payload.data[0]; + /* + * ->mk_active_refs == 0 implies that ->mk_secret is not present and + * that ->mk_decrypted_inodes is empty. + */ + WARN_ON(is_master_key_secret_present(&mk->mk_secret)); + WARN_ON(!list_empty(&mk->mk_decrypted_inodes)); - if (!is_master_key_secret_present(&mk->mk_secret)) - seq_puts(m, ": secret removed"); + for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { + fscrypt_destroy_prepared_key( + sb, &mk->mk_direct_keys[i]); + fscrypt_destroy_prepared_key( + sb, &mk->mk_iv_ino_lblk_64_keys[i]); + fscrypt_destroy_prepared_key( + sb, &mk->mk_iv_ino_lblk_32_keys[i]); } + memzero_explicit(&mk->mk_ino_hash_key, + sizeof(mk->mk_ino_hash_key)); + mk->mk_ino_hash_key_initialized = false; + + /* Drop the structural ref associated with the active refs. */ + fscrypt_put_master_key(mk); } -/* - * Type of key in ->s_master_keys. Each key of this type represents a master - * key which has been added to the filesystem. Its payload is a - * 'struct fscrypt_master_key'. The "." prefix in the key type name prevents - * users from adding keys of this type via the keyrings syscalls rather than via - * the intended method of FS_IOC_ADD_ENCRYPTION_KEY. - */ -static struct key_type key_type_fscrypt = { - .name = "._fscrypt", - .instantiate = fscrypt_key_instantiate, - .destroy = fscrypt_key_destroy, - .describe = fscrypt_key_describe, -}; +static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) +{ + if (spec->__reserved) + return false; + return master_key_spec_len(spec) != 0; +} static int fscrypt_user_key_instantiate(struct key *key, struct key_preparsed_payload *prep) @@ -131,32 +159,6 @@ static struct key_type key_type_fscrypt_user = { .describe = fscrypt_user_key_describe, }; -/* Search ->s_master_keys or ->mk_users */ -static struct key *search_fscrypt_keyring(struct key *keyring, - struct key_type *type, - const char *description) -{ - /* - * We need to mark the keyring reference as "possessed" so that we - * acquire permission to search it, via the KEY_POS_SEARCH permission. - */ - key_ref_t keyref = make_key_ref(keyring, true /* possessed */); - - keyref = keyring_search(keyref, type, description, false); - if (IS_ERR(keyref)) { - if (PTR_ERR(keyref) == -EAGAIN || /* not found */ - PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ - keyref = ERR_PTR(-ENOKEY); - return ERR_CAST(keyref); - } - return key_ref_to_ptr(keyref); -} - -#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ - (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id)) - -#define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1) - #define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \ (CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \ CONST_STRLEN("-users") + 1) @@ -164,21 +166,6 @@ static struct key *search_fscrypt_keyring(struct key *keyring, #define FSCRYPT_MK_USER_DESCRIPTION_SIZE \ (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1) -static void format_fs_keyring_description( - char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE], - const struct super_block *sb) -{ - sprintf(description, "fscrypt-%s", sb->s_id); -} - -static void format_mk_description( - char description[FSCRYPT_MK_DESCRIPTION_SIZE], - const struct fscrypt_key_specifier *mk_spec) -{ - sprintf(description, "%*phN", - master_key_spec_len(mk_spec), (u8 *)&mk_spec->u); -} - static void format_mk_users_keyring_description( char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE], const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) @@ -199,20 +186,15 @@ static void format_mk_user_description( /* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */ static int allocate_filesystem_keyring(struct super_block *sb) { - char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE]; - struct key *keyring; + struct fscrypt_keyring *keyring; if (sb->s_master_keys) return 0; - format_fs_keyring_description(description, sb); - keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, - current_cred(), KEY_POS_SEARCH | - KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); - if (IS_ERR(keyring)) - return PTR_ERR(keyring); - + keyring = kzalloc(sizeof(*keyring), GFP_KERNEL); + if (!keyring) + return -ENOMEM; + spin_lock_init(&keyring->lock); /* * Pairs with the smp_load_acquire() in fscrypt_find_master_key(). * I.e., here we publish ->s_master_keys with a RELEASE barrier so that @@ -222,21 +204,80 @@ static int allocate_filesystem_keyring(struct super_block *sb) return 0; } -void fscrypt_sb_free(struct super_block *sb) +/* + * Release all encryption keys that have been added to the filesystem, along + * with the keyring that contains them. + * + * This is called at unmount time. The filesystem's underlying block device(s) + * are still available at this time; this is important because after user file + * accesses have been allowed, this function may need to evict keys from the + * keyslots of an inline crypto engine, which requires the block device(s). + * + * This is also called when the super_block is being freed. This is needed to + * avoid a memory leak if mounting fails after the "test_dummy_encryption" + * option was processed, as in that case the unmount-time call isn't made. + */ +void fscrypt_destroy_keyring(struct super_block *sb) { - key_put(sb->s_master_keys); + struct fscrypt_keyring *keyring = sb->s_master_keys; + size_t i; + + if (!keyring) + return; + + for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) { + struct hlist_head *bucket = &keyring->key_hashtable[i]; + struct fscrypt_master_key *mk; + struct hlist_node *tmp; + + hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) { + /* + * Since all inodes were already evicted, every key + * remaining in the keyring should have an empty inode + * list, and should only still be in the keyring due to + * the single active ref associated with ->mk_secret. + * There should be no structural refs beyond the one + * associated with the active ref. + */ + WARN_ON(refcount_read(&mk->mk_active_refs) != 1); + WARN_ON(refcount_read(&mk->mk_struct_refs) != 1); + WARN_ON(!is_master_key_secret_present(&mk->mk_secret)); + wipe_master_key_secret(&mk->mk_secret); + fscrypt_put_master_key_activeref(mk); + } + } + kfree_sensitive(keyring); sb->s_master_keys = NULL; } +static struct hlist_head * +fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring, + const struct fscrypt_key_specifier *mk_spec) +{ + /* + * Since key specifiers should be "random" values, it is sufficient to + * use a trivial hash function that just takes the first several bits of + * the key specifier. + */ + unsigned long i = get_unaligned((unsigned long *)&mk_spec->u); + + return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)]; +} + /* - * Find the specified master key in ->s_master_keys. - * Returns ERR_PTR(-ENOKEY) if not found. + * Find the specified master key struct in ->s_master_keys and take a structural + * ref to it. The structural ref guarantees that the key struct continues to + * exist, but it does *not* guarantee that ->s_master_keys continues to contain + * the key struct. The structural ref needs to be dropped by + * fscrypt_put_master_key(). Returns NULL if the key struct is not found. */ -struct key *fscrypt_find_master_key(struct super_block *sb, - const struct fscrypt_key_specifier *mk_spec) +struct fscrypt_master_key * +fscrypt_find_master_key(struct super_block *sb, + const struct fscrypt_key_specifier *mk_spec) { - struct key *keyring; - char description[FSCRYPT_MK_DESCRIPTION_SIZE]; + struct fscrypt_keyring *keyring; + struct hlist_head *bucket; + struct fscrypt_master_key *mk; /* * Pairs with the smp_store_release() in allocate_filesystem_keyring(). @@ -246,10 +287,38 @@ struct key *fscrypt_find_master_key(struct super_block *sb, */ keyring = smp_load_acquire(&sb->s_master_keys); if (keyring == NULL) - return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */ - - format_mk_description(description, mk_spec); - return search_fscrypt_keyring(keyring, &key_type_fscrypt, description); + return NULL; /* No keyring yet, so no keys yet. */ + + bucket = fscrypt_mk_hash_bucket(keyring, mk_spec); + rcu_read_lock(); + switch (mk_spec->type) { + case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + hlist_for_each_entry_rcu(mk, bucket, mk_node) { + if (mk->mk_spec.type == + FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + memcmp(mk->mk_spec.u.descriptor, + mk_spec->u.descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 && + refcount_inc_not_zero(&mk->mk_struct_refs)) + goto out; + } + break; + case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + hlist_for_each_entry_rcu(mk, bucket, mk_node) { + if (mk->mk_spec.type == + FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && + memcmp(mk->mk_spec.u.identifier, + mk_spec->u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 && + refcount_inc_not_zero(&mk->mk_struct_refs)) + goto out; + } + break; + } + mk = NULL; +out: + rcu_read_unlock(); + return mk; } static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) @@ -277,17 +346,30 @@ static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) static struct key *find_master_key_user(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; + key_ref_t keyref; format_mk_user_description(description, mk->mk_spec.u.identifier); - return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user, - description); + + /* + * We need to mark the keyring reference as "possessed" so that we + * acquire permission to search it, via the KEY_POS_SEARCH permission. + */ + keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/), + &key_type_fscrypt_user, description, false); + if (IS_ERR(keyref)) { + if (PTR_ERR(keyref) == -EAGAIN || /* not found */ + PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ + keyref = ERR_PTR(-ENOKEY); + return ERR_CAST(keyref); + } + return key_ref_to_ptr(keyref); } /* * Give the current user a "key" in ->mk_users. This charges the user's quota * and marks the master key as added by the current user, so that it cannot be - * removed by another user with the key. Either the master key's key->sem must - * be held for write, or the master key must be still undergoing initialization. + * removed by another user with the key. Either ->mk_sem must be held for + * write, or the master key must be still undergoing initialization. */ static int add_master_key_user(struct fscrypt_master_key *mk) { @@ -309,7 +391,7 @@ static int add_master_key_user(struct fscrypt_master_key *mk) /* * Remove the current user's "key" from ->mk_users. - * The master key's key->sem must be held for write. + * ->mk_sem must be held for write. * * Returns 0 if removed, -ENOKEY if not found, or another -errno code. */ @@ -327,63 +409,49 @@ static int remove_master_key_user(struct fscrypt_master_key *mk) } /* - * Allocate a new fscrypt_master_key which contains the given secret, set it as - * the payload of a new 'struct key' of type fscrypt, and link the 'struct key' - * into the given keyring. Synchronized by fscrypt_add_key_mutex. + * Allocate a new fscrypt_master_key, transfer the given secret over to it, and + * insert it into sb->s_master_keys. */ -static int add_new_master_key(struct fscrypt_master_key_secret *secret, - const struct fscrypt_key_specifier *mk_spec, - struct key *keyring) +static int add_new_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec) { + struct fscrypt_keyring *keyring = sb->s_master_keys; struct fscrypt_master_key *mk; - char description[FSCRYPT_MK_DESCRIPTION_SIZE]; - struct key *key; int err; mk = kzalloc(sizeof(*mk), GFP_KERNEL); if (!mk) return -ENOMEM; + mk->mk_sb = sb; + init_rwsem(&mk->mk_sem); + refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; - move_master_key_secret(&mk->mk_secret, secret); - - refcount_set(&mk->mk_refcount, 1); /* secret is present */ INIT_LIST_HEAD(&mk->mk_decrypted_inodes); spin_lock_init(&mk->mk_decrypted_inodes_lock); if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { err = allocate_master_key_users_keyring(mk); if (err) - goto out_free_mk; + goto out_put; err = add_master_key_user(mk); if (err) - goto out_free_mk; + goto out_put; } - /* - * Note that we don't charge this key to anyone's quota, since when - * ->mk_users is in use those keys are charged instead, and otherwise - * (when ->mk_users isn't in use) only root can add these keys. - */ - format_mk_description(description, mk_spec); - key = key_alloc(&key_type_fscrypt, description, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), - KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(key)) { - err = PTR_ERR(key); - goto out_free_mk; - } - err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL); - key_put(key); - if (err) - goto out_free_mk; + move_master_key_secret(&mk->mk_secret, secret); + refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */ + spin_lock(&keyring->lock); + hlist_add_head_rcu(&mk->mk_node, + fscrypt_mk_hash_bucket(keyring, mk_spec)); + spin_unlock(&keyring->lock); return 0; -out_free_mk: - free_master_key(mk); +out_put: + fscrypt_put_master_key(mk); return err; } @@ -392,42 +460,34 @@ out_free_mk: static int add_existing_master_key(struct fscrypt_master_key *mk, struct fscrypt_master_key_secret *secret) { - struct key *mk_user; - bool rekey; int err; /* * If the current user is already in ->mk_users, then there's nothing to - * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.) + * do. Otherwise, we need to add the user to ->mk_users. (Neither is + * applicable for v1 policy keys, which have NULL ->mk_users.) */ if (mk->mk_users) { - mk_user = find_master_key_user(mk); + struct key *mk_user = find_master_key_user(mk); + if (mk_user != ERR_PTR(-ENOKEY)) { if (IS_ERR(mk_user)) return PTR_ERR(mk_user); key_put(mk_user); return 0; } - } - - /* If we'll be re-adding ->mk_secret, try to take the reference. */ - rekey = !is_master_key_secret_present(&mk->mk_secret); - if (rekey && !refcount_inc_not_zero(&mk->mk_refcount)) - return KEY_DEAD; - - /* Add the current user to ->mk_users, if applicable. */ - if (mk->mk_users) { err = add_master_key_user(mk); - if (err) { - if (rekey && refcount_dec_and_test(&mk->mk_refcount)) - return KEY_DEAD; + if (err) return err; - } } /* Re-add the secret if needed. */ - if (rekey) + if (!is_master_key_secret_present(&mk->mk_secret)) { + if (!refcount_inc_not_zero(&mk->mk_active_refs)) + return KEY_DEAD; move_master_key_secret(&mk->mk_secret, secret); + } + return 0; } @@ -436,38 +496,36 @@ static int do_add_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); - struct key *key; + struct fscrypt_master_key *mk; int err; mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */ -retry: - key = fscrypt_find_master_key(sb, mk_spec); - if (IS_ERR(key)) { - err = PTR_ERR(key); - if (err != -ENOKEY) - goto out_unlock; + + mk = fscrypt_find_master_key(sb, mk_spec); + if (!mk) { /* Didn't find the key in ->s_master_keys. Add it. */ err = allocate_filesystem_keyring(sb); - if (err) - goto out_unlock; - err = add_new_master_key(secret, mk_spec, sb->s_master_keys); + if (!err) + err = add_new_master_key(sb, secret, mk_spec); } else { /* * Found the key in ->s_master_keys. Re-add the secret if * needed, and add the user to ->mk_users if needed. */ - down_write(&key->sem); - err = add_existing_master_key(key->payload.data[0], secret); - up_write(&key->sem); + down_write(&mk->mk_sem); + err = add_existing_master_key(mk, secret); + up_write(&mk->mk_sem); if (err == KEY_DEAD) { - /* Key being removed or needs to be removed */ - key_invalidate(key); - key_put(key); - goto retry; + /* + * We found a key struct, but it's already been fully + * removed. Ignore the old struct and add a new one. + * fscrypt_add_key_mutex means we don't need to worry + * about concurrent adds. + */ + err = add_new_master_key(sb, secret, mk_spec); } - key_put(key); + fscrypt_put_master_key(mk); } -out_unlock: mutex_unlock(&fscrypt_add_key_mutex); return err; } @@ -771,19 +829,19 @@ int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_key_specifier mk_spec; - struct key *key, *mk_user; struct fscrypt_master_key *mk; + struct key *mk_user; int err; mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); - key = fscrypt_find_master_key(sb, &mk_spec); - if (IS_ERR(key)) { - err = PTR_ERR(key); + mk = fscrypt_find_master_key(sb, &mk_spec); + if (!mk) { + err = -ENOKEY; goto out; } - mk = key->payload.data[0]; + down_read(&mk->mk_sem); mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) { err = PTR_ERR(mk_user); @@ -791,7 +849,8 @@ int fscrypt_verify_key_added(struct super_block *sb, key_put(mk_user); err = 0; } - key_put(key); + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); out: if (err == -ENOKEY && capable(CAP_FOWNER)) err = 0; @@ -953,11 +1012,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_remove_key_arg __user *uarg = _uarg; struct fscrypt_remove_key_arg arg; - struct key *key; struct fscrypt_master_key *mk; u32 status_flags = 0; int err; - bool dead; + bool inodes_remain; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; @@ -977,12 +1035,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) return -EACCES; /* Find the key being removed. */ - key = fscrypt_find_master_key(sb, &arg.key_spec); - if (IS_ERR(key)) - return PTR_ERR(key); - mk = key->payload.data[0]; - - down_write(&key->sem); + mk = fscrypt_find_master_key(sb, &arg.key_spec); + if (!mk) + return -ENOKEY; + down_write(&mk->mk_sem); /* If relevant, remove current user's (or all users) claim to the key */ if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { @@ -991,7 +1047,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) else err = remove_master_key_user(mk); if (err) { - up_write(&key->sem); + up_write(&mk->mk_sem); goto out_put_key; } if (mk->mk_users->keys.nr_leaves_on_tree != 0) { @@ -1003,26 +1059,22 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; err = 0; - up_write(&key->sem); + up_write(&mk->mk_sem); goto out_put_key; } } /* No user claims remaining. Go ahead and wipe the secret. */ - dead = false; + err = -ENOKEY; if (is_master_key_secret_present(&mk->mk_secret)) { wipe_master_key_secret(&mk->mk_secret); - dead = refcount_dec_and_test(&mk->mk_refcount); - } - up_write(&key->sem); - if (dead) { - /* - * No inodes reference the key, and we wiped the secret, so the - * key object is free to be removed from the keyring. - */ - key_invalidate(key); + fscrypt_put_master_key_activeref(mk); err = 0; - } else { + } + inodes_remain = refcount_read(&mk->mk_active_refs) > 0; + up_write(&mk->mk_sem); + + if (inodes_remain) { /* Some inodes still reference this key; try to evict them. */ err = try_to_lock_encrypted_files(sb, mk); if (err == -EBUSY) { @@ -1038,7 +1090,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) * has been fully removed including all files locked. */ out_put_key: - key_put(key); + fscrypt_put_master_key(mk); if (err == 0) err = put_user(status_flags, &uarg->removal_status_flags); return err; @@ -1085,7 +1137,6 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_get_key_status_arg arg; - struct key *key; struct fscrypt_master_key *mk; int err; @@ -1102,19 +1153,18 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) arg.user_count = 0; memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); - key = fscrypt_find_master_key(sb, &arg.key_spec); - if (IS_ERR(key)) { - if (key != ERR_PTR(-ENOKEY)) - return PTR_ERR(key); + mk = fscrypt_find_master_key(sb, &arg.key_spec); + if (!mk) { arg.status = FSCRYPT_KEY_STATUS_ABSENT; err = 0; goto out; } - mk = key->payload.data[0]; - down_read(&key->sem); + down_read(&mk->mk_sem); if (!is_master_key_secret_present(&mk->mk_secret)) { - arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED; + arg.status = refcount_read(&mk->mk_active_refs) > 0 ? + FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED : + FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */; err = 0; goto out_release_key; } @@ -1136,8 +1186,8 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) } err = 0; out_release_key: - up_read(&key->sem); - key_put(key); + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); out: if (!err && copy_to_user(uarg, &arg, sizeof(arg))) err = -EFAULT; @@ -1149,13 +1199,9 @@ int __init fscrypt_init_keyring(void) { int err; - err = register_key_type(&key_type_fscrypt); - if (err) - return err; - err = register_key_type(&key_type_fscrypt_user); if (err) - goto err_unregister_fscrypt; + return err; err = register_key_type(&key_type_fscrypt_provisioning); if (err) @@ -1165,7 +1211,5 @@ int __init fscrypt_init_keyring(void) err_unregister_fscrypt_user: unregister_key_type(&key_type_fscrypt_user); -err_unregister_fscrypt: - unregister_key_type(&key_type_fscrypt); return err; } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index fbc71abdabe3..f7407071a952 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -9,7 +9,6 @@ */ #include <crypto/skcipher.h> -#include <linux/key.h> #include <linux/random.h> #include "fscrypt_private.h" @@ -155,10 +154,12 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, } /* Destroy a crypto transform object and/or blk-crypto key. */ -void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key) +void fscrypt_destroy_prepared_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { crypto_free_skcipher(prep_key->tfm); - fscrypt_destroy_inline_crypt_key(prep_key); + fscrypt_destroy_inline_crypt_key(sb, prep_key); + memzero_explicit(prep_key, sizeof(*prep_key)); } /* Given a per-file encryption key, set up the file's crypto transform object */ @@ -412,20 +413,18 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, /* * Find the master key, then set up the inode's actual encryption key. * - * If the master key is found in the filesystem-level keyring, then the - * corresponding 'struct key' is returned in *master_key_ret with its semaphore - * read-locked. This is needed to ensure that only one task links the - * fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race to create - * an fscrypt_info for the same inode), and to synchronize the master key being - * removed with a new inode starting to use it. + * If the master key is found in the filesystem-level keyring, then it is + * returned in *mk_ret with its semaphore read-locked. This is needed to ensure + * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as + * multiple tasks may race to create an fscrypt_info for the same inode), and to + * synchronize the master key being removed with a new inode starting to use it. */ static int setup_file_encryption_key(struct fscrypt_info *ci, bool need_dirhash_key, - struct key **master_key_ret) + struct fscrypt_master_key **mk_ret) { - struct key *key; - struct fscrypt_master_key *mk = NULL; struct fscrypt_key_specifier mk_spec; + struct fscrypt_master_key *mk; int err; err = fscrypt_select_encryption_impl(ci); @@ -436,11 +435,10 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) return err; - key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); - if (IS_ERR(key)) { - if (key != ERR_PTR(-ENOKEY) || - ci->ci_policy.version != FSCRYPT_POLICY_V1) - return PTR_ERR(key); + mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); + if (!mk) { + if (ci->ci_policy.version != FSCRYPT_POLICY_V1) + return -ENOKEY; /* * As a legacy fallback for v1 policies, search for the key in @@ -450,9 +448,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, */ return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci); } - - mk = key->payload.data[0]; - down_read(&key->sem); + down_read(&mk->mk_sem); /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */ if (!is_master_key_secret_present(&mk->mk_secret)) { @@ -480,18 +476,18 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) goto out_release_key; - *master_key_ret = key; + *mk_ret = mk; return 0; out_release_key: - up_read(&key->sem); - key_put(key); + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); return err; } static void put_crypt_info(struct fscrypt_info *ci) { - struct key *key; + struct fscrypt_master_key *mk; if (!ci) return; @@ -499,26 +495,21 @@ static void put_crypt_info(struct fscrypt_info *ci) if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); else if (ci->ci_owns_key) - fscrypt_destroy_prepared_key(&ci->ci_enc_key); - - key = ci->ci_master_key; - if (key) { - struct fscrypt_master_key *mk = key->payload.data[0]; + fscrypt_destroy_prepared_key(ci->ci_inode->i_sb, + &ci->ci_enc_key); + mk = ci->ci_master_key; + if (mk) { /* * Remove this inode from the list of inodes that were unlocked - * with the master key. - * - * In addition, if we're removing the last inode from a key that - * already had its secret removed, invalidate the key so that it - * gets removed from ->s_master_keys. + * with the master key. In addition, if we're removing the last + * inode from a master key struct that already had its secret + * removed, then complete the full removal of the struct. */ spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); - if (refcount_dec_and_test(&mk->mk_refcount)) - key_invalidate(key); - key_put(key); + fscrypt_put_master_key_activeref(mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_info_cachep, ci); @@ -532,7 +523,7 @@ fscrypt_setup_encryption_info(struct inode *inode, { struct fscrypt_info *crypt_info; struct fscrypt_mode *mode; - struct key *master_key = NULL; + struct fscrypt_master_key *mk = NULL; int res; res = fscrypt_initialize(inode->i_sb->s_cop->flags); @@ -555,8 +546,7 @@ fscrypt_setup_encryption_info(struct inode *inode, WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; - res = setup_file_encryption_key(crypt_info, need_dirhash_key, - &master_key); + res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); if (res) goto out; @@ -571,12 +561,9 @@ fscrypt_setup_encryption_info(struct inode *inode, * We won the race and set ->i_crypt_info to our crypt_info. * Now link it into the master key's inode list. */ - if (master_key) { - struct fscrypt_master_key *mk = - master_key->payload.data[0]; - - refcount_inc(&mk->mk_refcount); - crypt_info->ci_master_key = key_get(master_key); + if (mk) { + crypt_info->ci_master_key = mk; + refcount_inc(&mk->mk_active_refs); spin_lock(&mk->mk_decrypted_inodes_lock); list_add(&crypt_info->ci_master_key_link, &mk->mk_decrypted_inodes); @@ -586,9 +573,9 @@ fscrypt_setup_encryption_info(struct inode *inode, } res = 0; out: - if (master_key) { - up_read(&master_key->sem); - key_put(master_key); + if (mk) { + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); } put_crypt_info(crypt_info); return res; @@ -753,7 +740,6 @@ EXPORT_SYMBOL(fscrypt_free_inode); int fscrypt_drop_inode(struct inode *inode) { const struct fscrypt_info *ci = fscrypt_get_info(inode); - const struct fscrypt_master_key *mk; /* * If ci is NULL, then the inode doesn't have an encryption key set up @@ -763,7 +749,6 @@ int fscrypt_drop_inode(struct inode *inode) */ if (!ci || !ci->ci_master_key) return 0; - mk = ci->ci_master_key->payload.data[0]; /* * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes @@ -782,6 +767,6 @@ int fscrypt_drop_inode(struct inode *inode) * then the thread removing the key will either evict the inode itself * or will correctly detect that it wasn't evicted due to the race. */ - return !is_master_key_secret_present(&mk->mk_secret); + return !is_master_key_secret_present(&ci->ci_master_key->mk_secret); } EXPORT_SYMBOL_GPL(fscrypt_drop_inode); diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index 2762c5350432..75dabd9b27f9 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -143,6 +143,7 @@ invalid: /* Master key referenced by DIRECT_KEY policy */ struct fscrypt_direct_key { + struct super_block *dk_sb; struct hlist_node dk_node; refcount_t dk_refcount; const struct fscrypt_mode *dk_mode; @@ -154,7 +155,7 @@ struct fscrypt_direct_key { static void free_direct_key(struct fscrypt_direct_key *dk) { if (dk) { - fscrypt_destroy_prepared_key(&dk->dk_key); + fscrypt_destroy_prepared_key(dk->dk_sb, &dk->dk_key); kfree_sensitive(dk); } } @@ -231,6 +232,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) dk = kzalloc(sizeof(*dk), GFP_KERNEL); if (!dk) return ERR_PTR(-ENOMEM); + dk->dk_sb = ci->ci_inode->i_sb; refcount_set(&dk->dk_refcount, 1); dk->dk_mode = ci->ci_mode; err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 80b8ca0f340b..46757c3052ef 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -744,12 +744,8 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) * delayed key setup that requires the inode number. */ if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && - (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { - const struct fscrypt_master_key *mk = - ci->ci_master_key->payload.data[0]; - - fscrypt_hash_inode_number(ci, mk); - } + (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) + fscrypt_hash_inode_number(ci, ci->ci_master_key); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data); } @@ -833,19 +829,6 @@ bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, } EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal); -/* Deprecated, do not use */ -int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, - struct fscrypt_dummy_policy *dummy_policy) -{ - struct fs_parameter param = { - .type = fs_value_is_string, - .string = arg ? (char *)arg : "", - }; - return fscrypt_parse_test_dummy_encryption(¶m, dummy_policy) ?: - fscrypt_add_test_dummy_key(sb, dummy_policy); -} -EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); - /** * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' * @seq: the seq_file to print the option to diff --git a/fs/d_path.c b/fs/d_path.c index e4e0ebad1f15..56a6ee4c6331 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -34,7 +34,7 @@ static bool prepend_char(struct prepend_buffer *p, unsigned char c) } /* - * The source of the prepend data can be an optimistoc load + * The source of the prepend data can be an optimistic load * of a dentry name and length. And because we don't hold any * locks, the length and the pointer to the name may not be * in sync if a concurrent rename happens, and the kernel @@ -297,8 +297,7 @@ EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members */ -char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, - const char *fmt, ...) +char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...) { va_list args; char temp[64]; @@ -1445,6 +1445,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t done = 0; int ret; + if (!iomi.len) + return 0; + if (iov_iter_rw(iter) == WRITE) { lockdep_assert_held_write(&iomi.inode->i_rwsem); iomi.flags |= IOMAP_WRITE; diff --git a/fs/dcache.c b/fs/dcache.c index bb0c4d0038db..52e6d5fdab6b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { - /* - * The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT - * kernels spin_lock() implicitly disables preemption, but not on - * PREEMPT_RT. So for RT it has to be done explicitly to protect - * the sequence count write side critical section against a reader - * or another writer preempting, which would result in a live lock. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) @@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); wake_up_all(d_wait); } @@ -3258,8 +3249,10 @@ void d_genocide(struct dentry *parent) EXPORT_SYMBOL(d_genocide); -void d_tmpfile(struct dentry *dentry, struct inode *inode) +void d_tmpfile(struct file *file, struct inode *inode) { + struct dentry *dentry = file->f_path.dentry; + inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 950c63fa4d0b..ddb3fc258df9 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1121,7 +1121,7 @@ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, } EXPORT_SYMBOL_GPL(debugfs_print_regs32); -static int debugfs_show_regset32(struct seq_file *s, void *data) +static int debugfs_regset32_show(struct seq_file *s, void *data) { struct debugfs_regset32 *regset = s->private; @@ -1136,17 +1136,7 @@ static int debugfs_show_regset32(struct seq_file *s, void *data) return 0; } -static int debugfs_open_regset32(struct inode *inode, struct file *file) -{ - return single_open(file, debugfs_show_regset32, inode->i_private); -} - -static const struct file_operations fops_regset32 = { - .open = debugfs_open_regset32, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(debugfs_regset32); /** * debugfs_create_regset32 - create a debugfs file that returns register values @@ -1167,7 +1157,7 @@ void debugfs_create_regset32(const char *name, umode_t mode, struct dentry *parent, struct debugfs_regset32 *regset) { - debugfs_create_file(name, mode, parent, regset, &fops_regset32); + debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops); } EXPORT_SYMBOL_GPL(debugfs_create_regset32); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 232cfdf095ae..2e8e112b1993 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -82,6 +82,8 @@ struct debugfs_mount_opts { kuid_t uid; kgid_t gid; umode_t mode; + /* Opt_* bitfield. */ + unsigned int opts; }; enum { @@ -111,6 +113,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) kgid_t gid; char *p; + opts->opts = 0; opts->mode = DEBUGFS_DEFAULT_MODE; while ((p = strsep(&data, ",")) != NULL) { @@ -145,24 +148,44 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) * but traditionally debugfs has ignored all mount options */ } + + opts->opts |= BIT(token); } return 0; } -static int debugfs_apply_options(struct super_block *sb) +static void _debugfs_apply_options(struct super_block *sb, bool remount) { struct debugfs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct debugfs_mount_opts *opts = &fsi->mount_opts; - inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= opts->mode; + /* + * On remount, only reset mode/uid/gid if they were provided as mount + * options. + */ - inode->i_uid = opts->uid; - inode->i_gid = opts->gid; + if (!remount || opts->opts & BIT(Opt_mode)) { + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= opts->mode; + } - return 0; + if (!remount || opts->opts & BIT(Opt_uid)) + inode->i_uid = opts->uid; + + if (!remount || opts->opts & BIT(Opt_gid)) + inode->i_gid = opts->gid; +} + +static void debugfs_apply_options(struct super_block *sb) +{ + _debugfs_apply_options(sb, false); +} + +static void debugfs_apply_options_remount(struct super_block *sb) +{ + _debugfs_apply_options(sb, true); } static int debugfs_remount(struct super_block *sb, int *flags, char *data) @@ -175,7 +198,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data) if (err) goto fail; - debugfs_apply_options(sb); + debugfs_apply_options_remount(sb); fail: return err; diff --git a/fs/direct-io.c b/fs/direct-io.c index f669163d5860..03d381377ae1 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -421,8 +421,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) unsigned long flags; bio->bi_private = dio; - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++; diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 19ef136f9e4f..d60a8d8f109d 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -200,13 +200,13 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, if (!prev_seq) { kref_get(&lkb->lkb_ref); + mutex_lock(&ls->ls_cb_mutex); if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) { - mutex_lock(&ls->ls_cb_mutex); list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay); - mutex_unlock(&ls->ls_cb_mutex); } else { queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); } + mutex_unlock(&ls->ls_cb_mutex); } out: mutex_unlock(&lkb->lkb_cb_mutex); @@ -288,10 +288,13 @@ void dlm_callback_stop(struct dlm_ls *ls) void dlm_callback_suspend(struct dlm_ls *ls) { - set_bit(LSFL_CB_DELAY, &ls->ls_flags); + if (ls->ls_callback_wq) { + mutex_lock(&ls->ls_cb_mutex); + set_bit(LSFL_CB_DELAY, &ls->ls_flags); + mutex_unlock(&ls->ls_cb_mutex); - if (ls->ls_callback_wq) flush_workqueue(ls->ls_callback_wq); + } } #define MAX_CB_QUEUE 25 @@ -302,11 +305,11 @@ void dlm_callback_resume(struct dlm_ls *ls) int count = 0, sum = 0; bool empty; - clear_bit(LSFL_CB_DELAY, &ls->ls_flags); - if (!ls->ls_callback_wq) return; + clear_bit(LSFL_CB_DELAY, &ls->ls_flags); + more: mutex_lock(&ls->ls_cb_mutex); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index 181ad7d20c4d..e5e05fcc5813 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -11,7 +11,6 @@ #ifndef __ASTD_DOT_H__ #define __ASTD_DOT_H__ -void dlm_del_ast(struct dlm_lkb *lkb); int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags, uint64_t seq); int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 8aca8085d24e..e34c3d2639a5 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -661,7 +661,7 @@ struct dlm_ls { spinlock_t ls_recover_idr_lock; wait_queue_head_t ls_wait_general; wait_queue_head_t ls_recover_lock_wait; - struct mutex ls_clear_proc_locks; + spinlock_t ls_clear_proc_locks; struct list_head ls_root_list; /* root resources */ struct rw_semaphore ls_root_sem; /* protect root_list */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index dac7eb75dba9..94a72ede5764 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -401,7 +401,7 @@ static int pre_rsb_struct(struct dlm_ls *ls) unlock any spinlocks, go back and call pre_rsb_struct again. Otherwise, take an rsb off the list and return it. */ -static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, +static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len, struct dlm_rsb **r_ret) { struct dlm_rsb *r; @@ -412,7 +412,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, count = ls->ls_new_rsb_count; spin_unlock(&ls->ls_new_rsb_spin); log_debug(ls, "find_rsb retry %d %d %s", - count, dlm_config.ci_new_rsb_count, name); + count, dlm_config.ci_new_rsb_count, + (const char *)name); return -EAGAIN; } @@ -448,7 +449,7 @@ static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen) return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN); } -int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, +int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, struct dlm_rsb **r_ret) { struct rb_node *node = tree->rb_node; @@ -546,7 +547,7 @@ static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) * while that rsb has a potentially stale master.) */ -static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, +static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, uint32_t hash, uint32_t b, int dir_nodeid, int from_nodeid, unsigned int flags, struct dlm_rsb **r_ret) @@ -724,7 +725,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, dlm_recover_locks) before we've made ourself master (in dlm_recover_masters). */ -static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len, +static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len, uint32_t hash, uint32_t b, int dir_nodeid, int from_nodeid, unsigned int flags, struct dlm_rsb **r_ret) @@ -818,8 +819,9 @@ static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len, return error; } -static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid, - unsigned int flags, struct dlm_rsb **r_ret) +static int find_rsb(struct dlm_ls *ls, const void *name, int len, + int from_nodeid, unsigned int flags, + struct dlm_rsb **r_ret) { uint32_t hash, b; int dir_nodeid; @@ -2864,17 +2866,9 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args) static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_args *args) { - int rv = -EINVAL; + int rv = -EBUSY; if (args->flags & DLM_LKF_CONVERT) { - if (lkb->lkb_flags & DLM_IFL_MSTCPY) - goto out; - - if (args->flags & DLM_LKF_QUECVT && - !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) - goto out; - - rv = -EBUSY; if (lkb->lkb_status != DLM_LKSTS_GRANTED) goto out; @@ -2884,6 +2878,14 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, if (is_overlap(lkb)) goto out; + + rv = -EINVAL; + if (lkb->lkb_flags & DLM_IFL_MSTCPY) + goto out; + + if (args->flags & DLM_LKF_QUECVT && + !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) + goto out; } lkb->lkb_exflags = args->flags; @@ -2900,11 +2902,25 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, #endif rv = 0; out: - if (rv) - log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s", + switch (rv) { + case 0: + break; + case -EINVAL: + /* annoy the user because dlm usage is wrong */ + WARN_ON(1); + log_error(ls, "%s %d %x %x %x %d %d %s", __func__, + rv, lkb->lkb_id, lkb->lkb_flags, args->flags, + lkb->lkb_status, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + break; + default: + log_debug(ls, "%s %d %x %x %x %d %d %s", __func__, rv, lkb->lkb_id, lkb->lkb_flags, args->flags, lkb->lkb_status, lkb->lkb_wait_type, lkb->lkb_resource->res_name); + break; + } + return rv; } @@ -2918,23 +2934,12 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - int rv = -EINVAL; - - if (lkb->lkb_flags & DLM_IFL_MSTCPY) { - log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); - dlm_print_lkb(lkb); - goto out; - } - - /* an lkb may still exist even though the lock is EOL'ed due to a - cancel, unlock or failed noqueue request; an app can't use these - locks; return same error as if the lkid had not been found at all */ + int rv = -EBUSY; - if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { - log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); - rv = -ENOENT; + /* normal unlock not allowed if there's any op in progress */ + if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) && + (lkb->lkb_wait_type || lkb->lkb_wait_count)) goto out; - } /* an lkb may be waiting for an rsb lookup to complete where the lookup was initiated by another lock */ @@ -2949,7 +2954,24 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) unhold_lkb(lkb); /* undoes create_lkb() */ } /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ - rv = -EBUSY; + goto out; + } + + rv = -EINVAL; + if (lkb->lkb_flags & DLM_IFL_MSTCPY) { + log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); + dlm_print_lkb(lkb); + goto out; + } + + /* an lkb may still exist even though the lock is EOL'ed due to a + * cancel, unlock or failed noqueue request; an app can't use these + * locks; return same error as if the lkid had not been found at all + */ + + if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); + rv = -ENOENT; goto out; } @@ -3022,14 +3044,8 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) goto out; } /* add_to_waiters() will set OVERLAP_UNLOCK */ - goto out_ok; } - /* normal unlock not allowed if there's any op in progress */ - rv = -EBUSY; - if (lkb->lkb_wait_type || lkb->lkb_wait_count) - goto out; - out_ok: /* an overlapping op shouldn't blow away exflags from other op */ lkb->lkb_exflags |= args->flags; @@ -3037,11 +3053,25 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) lkb->lkb_astparam = args->astparam; rv = 0; out: - if (rv) - log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv, + switch (rv) { + case 0: + break; + case -EINVAL: + /* annoy the user because dlm usage is wrong */ + WARN_ON(1); + log_error(ls, "%s %d %x %x %x %x %d %s", __func__, rv, + lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, + args->flags, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + break; + default: + log_debug(ls, "%s %d %x %x %x %x %d %s", __func__, rv, lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, args->flags, lkb->lkb_wait_type, lkb->lkb_resource->res_name); + break; + } + return rv; } @@ -3292,8 +3322,9 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) * request_lock(), convert_lock(), unlock_lock(), cancel_lock() */ -static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name, - int len, struct dlm_args *args) +static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + const void *name, int len, + struct dlm_args *args) { struct dlm_rsb *r; int error; @@ -3392,7 +3423,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, int mode, struct dlm_lksb *lksb, uint32_t flags, - void *name, + const void *name, unsigned int namelen, uint32_t parent_lkid, void (*ast) (void *astarg), @@ -3438,7 +3469,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: - trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error); + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, true); if (convert || error) __put_lkb(ls, lkb); @@ -3623,7 +3654,7 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, case cpu_to_le32(DLM_MSG_REQUEST_REPLY): case cpu_to_le32(DLM_MSG_CONVERT_REPLY): case cpu_to_le32(DLM_MSG_GRANT): - if (!lkb->lkb_lvbptr) + if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK)) break; memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); break; @@ -5080,8 +5111,11 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) down_read(&ls->ls_recv_active); if (hd->h_cmd == DLM_MSG) dlm_receive_message(ls, &p->message, nodeid); - else + else if (hd->h_cmd == DLM_RCOM) dlm_receive_rcom(ls, &p->rcom, nodeid); + else + log_error(ls, "invalid h_cmd %d from %d lockspace %x", + hd->h_cmd, nodeid, le32_to_cpu(hd->u.h_lockspace)); up_read(&ls->ls_recv_active); dlm_put_lockspace(ls); @@ -5801,6 +5835,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, { struct dlm_lkb *lkb; struct dlm_args args; + bool do_put = true; int error; dlm_lock_recovery(ls); @@ -5811,13 +5846,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } + trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); + if (flags & DLM_LKF_VALBLK) { ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); if (!ua->lksb.sb_lvbptr) { kfree(ua); - __put_lkb(ls, lkb); error = -ENOMEM; - goto out; + goto out_put; } } #ifdef CONFIG_DLM_DEPRECATED_API @@ -5831,8 +5867,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, kfree(ua->lksb.sb_lvbptr); ua->lksb.sb_lvbptr = NULL; kfree(ua); - __put_lkb(ls, lkb); - goto out; + goto out_put; } /* After ua is attached to lkb it will be freed by dlm_free_lkb(). @@ -5851,8 +5886,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, error = 0; fallthrough; default: - __put_lkb(ls, lkb); - goto out; + goto out_put; } /* add this new lkb to the per-process list of locks */ @@ -5860,6 +5894,11 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, hold_lkb(lkb); list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); spin_unlock(&ua->proc->locks_spin); + do_put = false; + out_put: + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false); + if (do_put) + __put_lkb(ls, lkb); out: dlm_unlock_recovery(ls); return error; @@ -5885,6 +5924,8 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; + trace_dlm_lock_start(ls, lkb, NULL, 0, mode, flags); + /* user can change the params on its lock when it converts it, or add an lvb that didn't exist before */ @@ -5922,6 +5963,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK) error = 0; out_put: + trace_dlm_lock_end(ls, lkb, NULL, 0, mode, flags, error, false); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6014,6 +6056,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + ua = lkb->lkb_ua; if (lvb_in && ua->lksb.sb_lvbptr) @@ -6042,6 +6086,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); spin_unlock(&ua->proc->locks_spin); out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6063,6 +6108,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + ua = lkb->lkb_ua; if (ua_tmp->castparam) ua->castparam = ua_tmp->castparam; @@ -6080,6 +6127,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error == -EBUSY) error = 0; out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6101,6 +6149,8 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + ua = lkb->lkb_ua; error = set_unlock_args(flags, ua, &args); @@ -6129,6 +6179,7 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) if (error == -EBUSY) error = 0; out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6184,7 +6235,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, { struct dlm_lkb *lkb = NULL; - mutex_lock(&ls->ls_clear_proc_locks); + spin_lock(&ls->ls_clear_proc_locks); if (list_empty(&proc->locks)) goto out; @@ -6196,7 +6247,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, else lkb->lkb_flags |= DLM_IFL_DEAD; out: - mutex_unlock(&ls->ls_clear_proc_locks); + spin_unlock(&ls->ls_clear_proc_locks); return lkb; } @@ -6233,7 +6284,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } - mutex_lock(&ls->ls_clear_proc_locks); + spin_lock(&ls->ls_clear_proc_locks); /* in-progress unlocks */ list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { @@ -6249,7 +6300,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } - mutex_unlock(&ls->ls_clear_proc_locks); + spin_unlock(&ls->ls_clear_proc_locks); dlm_unlock_recovery(ls); } diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index a7b6474f009d..40c76b5544da 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -36,7 +36,7 @@ static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { } int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, unsigned int flags, int *r_nodeid, int *result); -int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, +int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, struct dlm_rsb **r_ret); void dlm_recover_purge(struct dlm_ls *ls); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 3972f4d86c75..bae050df7abf 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -416,7 +416,7 @@ static int new_lockspace(const char *name, const char *cluster, if (namelen > DLM_LOCKSPACE_LEN || namelen == 0) return -EINVAL; - if (!lvblen || (lvblen % 8)) + if (lvblen % 8) return -EINVAL; if (!try_module_get(THIS_MODULE)) @@ -584,7 +584,7 @@ static int new_lockspace(const char *name, const char *cluster, atomic_set(&ls->ls_requestqueue_cnt, 0); init_waitqueue_head(&ls->ls_requestqueue_wait); mutex_init(&ls->ls_requestqueue_mutex); - mutex_init(&ls->ls_clear_proc_locks); + spin_lock_init(&ls->ls_clear_proc_locks); /* Due backwards compatibility with 3.1 we need to use maximum * possible dlm message size to be sure the message will fit and @@ -703,10 +703,11 @@ static int new_lockspace(const char *name, const char *cluster, return error; } -int dlm_new_lockspace(const char *name, const char *cluster, - uint32_t flags, int lvblen, - const struct dlm_lockspace_ops *ops, void *ops_arg, - int *ops_result, dlm_lockspace_t **lockspace) +static int __dlm_new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) { int error = 0; @@ -732,6 +733,25 @@ int dlm_new_lockspace(const char *name, const char *cluster, return error; } +int dlm_new_lockspace(const char *name, const char *cluster, uint32_t flags, + int lvblen, const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) +{ + return __dlm_new_lockspace(name, cluster, flags | DLM_LSFL_FS, lvblen, + ops, ops_arg, ops_result, lockspace); +} + +int dlm_new_user_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) +{ + return __dlm_new_lockspace(name, cluster, flags, lvblen, ops, + ops_arg, ops_result, lockspace); +} + static int lkb_idr_is_local(int id, void *p, void *data) { struct dlm_lkb *lkb = p; diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h index 306fc4f4ea15..03f4a4a3a871 100644 --- a/fs/dlm/lockspace.h +++ b/fs/dlm/lockspace.h @@ -12,6 +12,14 @@ #ifndef __LOCKSPACE_DOT_H__ #define __LOCKSPACE_DOT_H__ +/* DLM_LSFL_FS + * The lockspace user is in the kernel (i.e. filesystem). Enables + * direct bast/cast callbacks. + * + * internal lockspace flag - will be removed in future + */ +#define DLM_LSFL_FS 0x00000004 + int dlm_lockspace_init(void); void dlm_lockspace_exit(void); struct dlm_ls *dlm_find_lockspace_global(uint32_t id); @@ -20,6 +28,11 @@ struct dlm_ls *dlm_find_lockspace_device(int minor); void dlm_put_lockspace(struct dlm_ls *ls); void dlm_stop_lockspaces(void); void dlm_stop_lockspaces_check(void); +int dlm_new_user_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace); #endif /* __LOCKSPACE_DOT_H__ */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a4e84e8d94c8..59f64c596233 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1336,6 +1336,8 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, return NULL; } + /* for dlm_lowcomms_commit_msg() */ + kref_get(&msg->ref); /* we assume if successful commit must called */ msg->idx = idx; return msg; @@ -1375,6 +1377,8 @@ void dlm_lowcomms_commit_msg(struct dlm_msg *msg) { _dlm_lowcomms_commit_msg(msg); srcu_read_unlock(&connections_srcu, msg->idx); + /* because dlm_lowcomms_new_msg() */ + kref_put(&msg->ref, dlm_msg_release); } #endif diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 67f68d48d60c..4de4b8651c6c 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -75,6 +75,7 @@ static struct genl_family family __ro_after_init = { .version = DLM_GENL_VERSION, .small_ops = dlm_nl_ops, .n_small_ops = ARRAY_SIZE(dlm_nl_ops), + .resv_start_op = DLM_CMD_HELLO + 1, .module = THIS_MODULE, }; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 99e8f0744513..c5d27bccc3dc 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -16,6 +16,8 @@ #include <linux/slab.h> #include <linux/sched/signal.h> +#include <trace/events/dlm.h> + #include "dlm_internal.h" #include "lockspace.h" #include "lock.h" @@ -184,7 +186,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, return; ls = lkb->lkb_resource->res_ls; - mutex_lock(&ls->ls_clear_proc_locks); + spin_lock(&ls->ls_clear_proc_locks); /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed @@ -230,7 +232,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, spin_unlock(&proc->locks_spin); } out: - mutex_unlock(&ls->ls_clear_proc_locks); + spin_unlock(&ls->ls_clear_proc_locks); } static int device_user_lock(struct dlm_user_proc *proc, @@ -421,9 +423,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags, - DLM_USER_LVB_LEN, NULL, NULL, NULL, - &lockspace); + error = dlm_new_user_lockspace(params->name, dlm_config.ci_cluster_name, + params->flags, DLM_USER_LVB_LEN, NULL, + NULL, NULL, &lockspace); if (error) return error; @@ -882,7 +884,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, goto try_another; } - if (cb.flags & DLM_CB_CAST) { + if (cb.flags & DLM_CB_BAST) { + trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb.mode); + } else if (cb.flags & DLM_CB_CAST) { new_mode = cb.mode; if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && @@ -891,6 +895,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, lkb->lkb_lksb->sb_status = cb.sb_status; lkb->lkb_lksb->sb_flags = cb.sb_flags; + trace_dlm_ast(lkb->lkb_resource->res_ls, lkb); } rv = copy_result_to_user(lkb->lkb_ua, diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 5f2b49e13731..f2ed0c0266cb 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -506,7 +506,7 @@ ecryptfs_dentry_to_lower(struct dentry *dentry) return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; } -static inline struct path * +static inline const struct path * ecryptfs_dentry_to_lower_path(struct dentry *dentry) { return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 18d5b91cb573..268b74499c28 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -33,7 +33,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, struct iov_iter *to) { ssize_t rc; - struct path *path; + const struct path *path; struct file *file = iocb->ki_filp; rc = generic_file_read_iter(iocb, to); @@ -53,7 +53,7 @@ struct ecryptfs_getdents_callback { }; /* Inspired by generic filldir in fs/readdir.c */ -static int +static bool ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, int lower_namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -61,18 +61,19 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, container_of(ctx, struct ecryptfs_getdents_callback, ctx); size_t name_size; char *name; - int rc; + int err; + bool res; buf->filldir_called++; - rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, - buf->sb, lower_name, - lower_namelen); - if (rc) { - if (rc != -EINVAL) { + err = ecryptfs_decode_and_decrypt_filename(&name, &name_size, + buf->sb, lower_name, + lower_namelen); + if (err) { + if (err != -EINVAL) { ecryptfs_printk(KERN_DEBUG, "%s: Error attempting to decode and decrypt filename [%s]; rc = [%d]\n", - __func__, lower_name, rc); - return rc; + __func__, lower_name, err); + return false; } /* Mask -EINVAL errors as these are most likely due a plaintext @@ -81,16 +82,15 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, * the "lost+found" dentry in the root directory of an Ext4 * filesystem. */ - return 0; + return true; } buf->caller->pos = buf->ctx.pos; - rc = !dir_emit(buf->caller, name, name_size, ino, d_type); + res = dir_emit(buf->caller, name, name_size, ino, d_type); kfree(name); - if (!rc) + if (res) buf->entries_written++; - - return rc; + return res; } /** @@ -111,14 +111,8 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) lower_file = ecryptfs_file_to_lower(file); rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; - if (rc < 0) - goto out; - if (buf.filldir_called && !buf.entries_written) - goto out; - if (rc >= 0) - fsstack_copy_attr_atime(inode, - file_inode(lower_file)); -out: + if (rc >= 0 && (buf.entries_written || !buf.filldir_called)) + fsstack_copy_attr_atime(inode, file_inode(lower_file)); return rc; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 16d50dface59..c214fe0981bd 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -317,7 +317,7 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry) { - struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); + const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); struct inode *inode, *lower_inode; struct ecryptfs_dentry_info *dentry_info; int rc = 0; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 2dd23a82e0de..2dc927ba067f 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -105,7 +105,7 @@ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); - struct path *path = ecryptfs_dentry_to_lower_path(dentry); + const struct path *path = ecryptfs_dentry_to_lower_path(dentry); int rc; rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index a0ef63cfcecb..9e4f47808bd5 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -651,22 +651,6 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, if (err) return err; - /* - * Ensure that the available space hasn't shrunk below the safe level - */ - status = check_var_size(attributes, *size + ucs2_strsize(name, 1024)); - if (status != EFI_SUCCESS) { - if (status != EFI_UNSUPPORTED) { - err = efi_status_to_err(status); - goto out; - } - - if (*size > 65536) { - err = -ENOSPC; - goto out; - } - } - status = efivar_set_variable_locked(name, vendor, attributes, *size, data, false); if (status != EFI_SUCCESS) { diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2d55569f96ac..51b7ac7166d9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -317,52 +317,61 @@ dstmap_out: return ret; } -static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, - struct page **pagepool) +static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, + struct page **pagepool) { - const unsigned int nrpages_out = + const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + const unsigned int outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; const unsigned int righthalf = min_t(unsigned int, rq->outputsize, PAGE_SIZE - rq->pageofs_out); const unsigned int lefthalf = rq->outputsize - righthalf; + const unsigned int interlaced_offset = + rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; unsigned char *src, *dst; - if (nrpages_out > 2) { + if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { DBG_BUGON(1); - return -EIO; + return -EFSCORRUPTED; } if (rq->out[0] == *rq->in) { - DBG_BUGON(nrpages_out != 1); + DBG_BUGON(rq->pageofs_out); return 0; } - src = kmap_atomic(*rq->in) + rq->pageofs_in; + src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; if (rq->out[0]) { - dst = kmap_atomic(rq->out[0]); - memcpy(dst + rq->pageofs_out, src, righthalf); - kunmap_atomic(dst); + dst = kmap_local_page(rq->out[0]); + memcpy(dst + rq->pageofs_out, src + interlaced_offset, + righthalf); + kunmap_local(dst); } - if (nrpages_out == 2) { - DBG_BUGON(!rq->out[1]); - if (rq->out[1] == *rq->in) { + if (outpages > inpages) { + DBG_BUGON(!rq->out[outpages - 1]); + if (rq->out[outpages - 1] != rq->in[inpages - 1]) { + dst = kmap_local_page(rq->out[outpages - 1]); + memcpy(dst, interlaced_offset ? src : + (src + righthalf), lefthalf); + kunmap_local(dst); + } else if (!interlaced_offset) { memmove(src, src + righthalf, lefthalf); - } else { - dst = kmap_atomic(rq->out[1]); - memcpy(dst, src + righthalf, lefthalf); - kunmap_atomic(dst); } } - kunmap_atomic(src); + kunmap_local(src); return 0; } static struct z_erofs_decompressor decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { - .decompress = z_erofs_shifted_transform, + .decompress = z_erofs_transform_plain, .name = "shifted" }, + [Z_EROFS_COMPRESSION_INTERLACED] = { + .decompress = z_erofs_transform_plain, + .name = "interlaced" + }, [Z_EROFS_COMPRESSION_LZ4] = { .decompress = z_erofs_lz4_decompress, .name = "lz4" diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 5e59b3f523eb..091fd5adf818 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -217,6 +217,9 @@ again: strm->buf.out_size = min_t(u32, outlen, PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; + if (!rq->out[no] && rq->fillgaps) /* deduped */ + rq->out[no] = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); if (rq->out[no]) strm->buf.out = kmap(rq->out[no]) + pageofs; pageofs = 0; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 2b48373f690b..dbcd24371002 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -25,6 +25,8 @@ #define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 #define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 +#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 +#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ @@ -32,7 +34,9 @@ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING) + EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ + EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ + EROFS_FEATURE_INCOMPAT_DEDUPE) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -71,7 +75,9 @@ struct erofs_super_block { } __packed u1; __le16 extra_devices; /* # of devices besides the primary device */ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ - __u8 reserved2[38]; + __u8 reserved[6]; + __le64 packed_nid; /* nid of the special packed inode */ + __u8 reserved2[24]; }; /* @@ -295,16 +301,27 @@ struct z_erofs_lzma_cfgs { * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) + * bit 4 : interlaced plain pcluster (0 - off; 1 - on) + * bit 5 : fragment pcluster (0 - off; 1 - on) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 +#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 +#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020 +#define Z_EROFS_FRAGMENT_INODE_BIT 7 struct z_erofs_map_header { - __le16 h_reserved1; - /* indicates the encoded size of tailpacking data */ - __le16 h_idata_size; + union { + /* fragment data offset in the packed inode */ + __le32 h_fragmentoff; + struct { + __le16 h_reserved1; + /* indicates the encoded size of tailpacking data */ + __le16 h_idata_size; + }; + }; __le16 h_advise; /* * bit 0-3 : algorithm type of head 1 (logical cluster type 01); @@ -313,7 +330,8 @@ struct z_erofs_map_header { __u8 h_algorithmtype; /* * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-7 : reserved. + * bit 3-6 : reserved; + * bit 7 : move the whole file into packed inode or not. */ __u8 h_clusterbits; }; @@ -355,6 +373,9 @@ enum { #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 +/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ +#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15) + /* * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the * compressed block count of a compressed extent (in logical clusters, aka. @@ -402,6 +423,10 @@ struct erofs_dirent { /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) { + const __le64 fmh = *(__le64 *)&(struct z_erofs_map_header) { + .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT + }; + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32); BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); @@ -419,6 +444,9 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); + /* exclude old compiler versions like gcc 7.5.0 */ + BUILD_BUG_ON(__builtin_constant_p(fmh) ? + fmh != cpu_to_le64(1ULL << 63) : 0); } #endif diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index b5fd9d71e67f..af5ed6b9c54d 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -1,10 +1,16 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2022, Alibaba Cloud + * Copyright (C) 2022, Bytedance Inc. All rights reserved. */ #include <linux/fscache.h> #include "internal.h" +static DEFINE_MUTEX(erofs_domain_list_lock); +static DEFINE_MUTEX(erofs_domain_cookies_lock); +static LIST_HEAD(erofs_domain_list); +static struct vfsmount *erofs_pseudo_mnt; + static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping, loff_t start, size_t len) { @@ -69,11 +75,15 @@ static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq) rcu_read_lock(); xas_for_each(&xas, folio, last_page) { - unsigned int pgpos = - (folio_index(folio) - start_page) * PAGE_SIZE; - unsigned int pgend = pgpos + folio_size(folio); + unsigned int pgpos, pgend; bool pg_failed = false; + if (xas_retry(&xas, folio)) + continue; + + pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; + pgend = pgpos + folio_size(folio); + for (;;) { if (!subreq) { pg_failed = true; @@ -234,113 +244,114 @@ out: return ret; } -static int erofs_fscache_read_folio_inline(struct folio *folio, - struct erofs_map_blocks *map) -{ - struct super_block *sb = folio_mapping(folio)->host->i_sb; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - erofs_blk_t blknr; - size_t offset, len; - void *src, *dst; - - /* For tail packing layout, the offset may be non-zero. */ - offset = erofs_blkoff(map->m_pa); - blknr = erofs_blknr(map->m_pa); - len = map->m_llen; - - src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); - if (IS_ERR(src)) - return PTR_ERR(src); - - dst = kmap_local_folio(folio, 0); - memcpy(dst, src + offset, len); - memset(dst + len, 0, PAGE_SIZE - len); - kunmap_local(dst); - - erofs_put_metabuf(&buf); - return 0; -} - -static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +/* + * Read into page cache in the range described by (@pos, @len). + * + * On return, the caller is responsible for page unlocking if the output @unlock + * is true, or the callee will take this responsibility through netfs_io_request + * interface. + * + * The return value is the number of bytes successfully handled, or negative + * error code on failure. The only exception is that, the length of the range + * instead of the error code is returned on failure after netfs_io_request is + * allocated, so that .readahead() could advance rac accordingly. + */ +static int erofs_fscache_data_read(struct address_space *mapping, + loff_t pos, size_t len, bool *unlock) { - struct inode *inode = folio_mapping(folio)->host; + struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; + struct netfs_io_request *rreq; struct erofs_map_blocks map; struct erofs_map_dev mdev; - struct netfs_io_request *rreq; - erofs_off_t pos; - loff_t pstart; + struct iov_iter iter; + size_t count; int ret; - DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + *unlock = true; - pos = folio_pos(folio); map.m_la = pos; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); if (ret) - goto out_unlock; + return ret; - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - folio_zero_range(folio, 0, folio_size(folio)); - goto out_uptodate; + if (map.m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_blk_t blknr; + size_t offset, size; + void *src; + + /* For tail packing layout, the offset may be non-zero. */ + offset = erofs_blkoff(map.m_pa); + blknr = erofs_blknr(map.m_pa); + size = map.m_llen; + + src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); + if (IS_ERR(src)) + return PTR_ERR(src); + + iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE); + if (copy_to_iter(src + offset, size, &iter) != size) { + erofs_put_metabuf(&buf); + return -EFAULT; + } + iov_iter_zero(PAGE_SIZE - size, &iter); + erofs_put_metabuf(&buf); + return PAGE_SIZE; } - if (map.m_flags & EROFS_MAP_META) { - ret = erofs_fscache_read_folio_inline(folio, &map); - goto out_uptodate; + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + count = len; + iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count); + iov_iter_zero(count, &iter); + return count; } + count = min_t(size_t, map.m_llen - (pos - map.m_la), len); + DBG_BUGON(!count || count % PAGE_SIZE); + mdev = (struct erofs_map_dev) { .m_deviceid = map.m_deviceid, .m_pa = map.m_pa, }; - ret = erofs_map_dev(sb, &mdev); if (ret) - goto out_unlock; + return ret; + rreq = erofs_fscache_alloc_request(mapping, pos, count); + if (IS_ERR(rreq)) + return PTR_ERR(rreq); - rreq = erofs_fscache_alloc_request(folio_mapping(folio), - folio_pos(folio), folio_size(folio)); - if (IS_ERR(rreq)) { - ret = PTR_ERR(rreq); - goto out_unlock; - } - - pstart = mdev.m_pa + (pos - map.m_la); - return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, pstart); - -out_uptodate: - if (!ret) - folio_mark_uptodate(folio); -out_unlock: - folio_unlock(folio); - return ret; + *unlock = false; + erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + rreq, mdev.m_pa + (pos - map.m_la)); + return count; } -static void erofs_fscache_advance_folios(struct readahead_control *rac, - size_t len, bool unlock) +static int erofs_fscache_read_folio(struct file *file, struct folio *folio) { - while (len) { - struct folio *folio = readahead_folio(rac); - len -= folio_size(folio); - if (unlock) { + bool unlock; + int ret; + + DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + + ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio), + folio_size(folio), &unlock); + if (unlock) { + if (ret > 0) folio_mark_uptodate(folio); - folio_unlock(folio); - } + folio_unlock(folio); } + return ret < 0 ? ret : 0; } static void erofs_fscache_readahead(struct readahead_control *rac) { - struct inode *inode = rac->mapping->host; - struct super_block *sb = inode->i_sb; - size_t len, count, done = 0; - erofs_off_t pos; - loff_t start, offset; - int ret; + struct folio *folio; + size_t len, done = 0; + loff_t start, pos; + bool unlock; + int ret, size; if (!readahead_count(rac)) return; @@ -349,67 +360,22 @@ static void erofs_fscache_readahead(struct readahead_control *rac) len = readahead_length(rac); do { - struct erofs_map_blocks map; - struct erofs_map_dev mdev; - struct netfs_io_request *rreq; - pos = start + done; - map.m_la = pos; - - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); - if (ret) + ret = erofs_fscache_data_read(rac->mapping, pos, + len - done, &unlock); + if (ret <= 0) return; - offset = start + done; - count = min_t(size_t, map.m_llen - (pos - map.m_la), - len - done); - - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - struct iov_iter iter; - - iov_iter_xarray(&iter, READ, &rac->mapping->i_pages, - offset, count); - iov_iter_zero(count, &iter); - - erofs_fscache_advance_folios(rac, count, true); - ret = count; - continue; - } - - if (map.m_flags & EROFS_MAP_META) { - struct folio *folio = readahead_folio(rac); - - ret = erofs_fscache_read_folio_inline(folio, &map); - if (!ret) { + size = ret; + while (size) { + folio = readahead_folio(rac); + size -= folio_size(folio); + if (unlock) { folio_mark_uptodate(folio); - ret = folio_size(folio); + folio_unlock(folio); } - - folio_unlock(folio); - continue; } - - mdev = (struct erofs_map_dev) { - .m_deviceid = map.m_deviceid, - .m_pa = map.m_pa, - }; - ret = erofs_map_dev(sb, &mdev); - if (ret) - return; - - rreq = erofs_fscache_alloc_request(rac->mapping, offset, count); - if (IS_ERR(rreq)) - return; - /* - * Drop the ref of folios here. Unlock them in - * rreq_unlock_folios() when rreq complete. - */ - erofs_fscache_advance_folios(rac, count, false); - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa + (pos - map.m_la)); - if (!ret) - ret = count; - } while (ret > 0 && ((done += ret) < len)); + } while ((done += ret) < len); } static const struct address_space_operations erofs_fscache_meta_aops = { @@ -421,9 +387,114 @@ const struct address_space_operations erofs_fscache_access_aops = { .readahead = erofs_fscache_readahead, }; -int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode) +static void erofs_fscache_domain_put(struct erofs_domain *domain) +{ + if (!domain) + return; + mutex_lock(&erofs_domain_list_lock); + if (refcount_dec_and_test(&domain->ref)) { + list_del(&domain->list); + if (list_empty(&erofs_domain_list)) { + kern_unmount(erofs_pseudo_mnt); + erofs_pseudo_mnt = NULL; + } + mutex_unlock(&erofs_domain_list_lock); + fscache_relinquish_volume(domain->volume, NULL, false); + kfree(domain->domain_id); + kfree(domain); + return; + } + mutex_unlock(&erofs_domain_list_lock); +} + +static int erofs_fscache_register_volume(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + char *domain_id = sbi->domain_id; + struct fscache_volume *volume; + char *name; + int ret = 0; + + name = kasprintf(GFP_KERNEL, "erofs,%s", + domain_id ? domain_id : sbi->fsid); + if (!name) + return -ENOMEM; + + volume = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR_OR_NULL(volume)) { + erofs_err(sb, "failed to register volume for %s", name); + ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; + volume = NULL; + } + + sbi->volume = volume; + kfree(name); + return ret; +} + +static int erofs_fscache_init_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL); + if (!domain) + return -ENOMEM; + + domain->domain_id = kstrdup(sbi->domain_id, GFP_KERNEL); + if (!domain->domain_id) { + kfree(domain); + return -ENOMEM; + } + + err = erofs_fscache_register_volume(sb); + if (err) + goto out; + + if (!erofs_pseudo_mnt) { + erofs_pseudo_mnt = kern_mount(&erofs_fs_type); + if (IS_ERR(erofs_pseudo_mnt)) { + err = PTR_ERR(erofs_pseudo_mnt); + goto out; + } + } + + domain->volume = sbi->volume; + refcount_set(&domain->ref, 1); + list_add(&domain->list, &erofs_domain_list); + sbi->domain = domain; + return 0; +out: + kfree(domain->domain_id); + kfree(domain); + return err; +} + +static int erofs_fscache_register_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_lock(&erofs_domain_list_lock); + list_for_each_entry(domain, &erofs_domain_list, list) { + if (!strcmp(domain->domain_id, sbi->domain_id)) { + sbi->domain = domain; + sbi->volume = domain->volume; + refcount_inc(&domain->ref); + mutex_unlock(&erofs_domain_list_lock); + return 0; + } + } + err = erofs_fscache_init_domain(sb); + mutex_unlock(&erofs_domain_list_lock); + return err; +} + +static +struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, bool need_inode) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; @@ -432,7 +503,7 @@ int erofs_fscache_register_cookie(struct super_block *sb, ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) - return -ENOMEM; + return ERR_PTR(-ENOMEM); cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, name, strlen(name), NULL, 0, 0); @@ -462,63 +533,149 @@ int erofs_fscache_register_cookie(struct super_block *sb, ctx->inode = inode; } - *fscache = ctx; - return 0; + return ctx; err_cookie: fscache_unuse_cookie(ctx->cookie, NULL, NULL); fscache_relinquish_cookie(ctx->cookie, false); - ctx->cookie = NULL; err: kfree(ctx); - return ret; + return ERR_PTR(ret); } -void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache) +static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) { - struct erofs_fscache *ctx = *fscache; - - if (!ctx) - return; - fscache_unuse_cookie(ctx->cookie, NULL, NULL); fscache_relinquish_cookie(ctx->cookie, false); - ctx->cookie = NULL; - iput(ctx->inode); - ctx->inode = NULL; - + kfree(ctx->name); kfree(ctx); - *fscache = NULL; +} + +static +struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, + char *name, bool need_inode) +{ + int err; + struct inode *inode; + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + + ctx = erofs_fscache_acquire_cookie(sb, name, need_inode); + if (IS_ERR(ctx)) + return ctx; + + ctx->name = kstrdup(name, GFP_KERNEL); + if (!ctx->name) { + err = -ENOMEM; + goto out; + } + + inode = new_inode(erofs_pseudo_mnt->mnt_sb); + if (!inode) { + err = -ENOMEM; + goto out; + } + + ctx->domain = domain; + ctx->anon_inode = inode; + inode->i_private = ctx; + refcount_inc(&domain->ref); + return ctx; +out: + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(err); +} + +static +struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, bool need_inode) +{ + struct inode *inode; + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + struct super_block *psb = erofs_pseudo_mnt->mnt_sb; + + mutex_lock(&erofs_domain_cookies_lock); + spin_lock(&psb->s_inode_list_lock); + list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { + ctx = inode->i_private; + if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) + continue; + igrab(inode); + spin_unlock(&psb->s_inode_list_lock); + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; + } + spin_unlock(&psb->s_inode_list_lock); + ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode); + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; +} + +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, bool need_inode) +{ + if (EROFS_SB(sb)->domain_id) + return erofs_domain_register_cookie(sb, name, need_inode); + return erofs_fscache_acquire_cookie(sb, name, need_inode); +} + +void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) +{ + bool drop; + struct erofs_domain *domain; + + if (!ctx) + return; + domain = ctx->domain; + if (domain) { + mutex_lock(&erofs_domain_cookies_lock); + drop = atomic_read(&ctx->anon_inode->i_count) == 1; + iput(ctx->anon_inode); + mutex_unlock(&erofs_domain_cookies_lock); + if (!drop) + return; + } + + erofs_fscache_relinquish_cookie(ctx); + erofs_fscache_domain_put(domain); } int erofs_fscache_register_fs(struct super_block *sb) { + int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); - struct fscache_volume *volume; - char *name; - int ret = 0; + struct erofs_fscache *fscache; - name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid); - if (!name) - return -ENOMEM; + if (sbi->domain_id) + ret = erofs_fscache_register_domain(sb); + else + ret = erofs_fscache_register_volume(sb); + if (ret) + return ret; - volume = fscache_acquire_volume(name, NULL, NULL, 0); - if (IS_ERR_OR_NULL(volume)) { - erofs_err(sb, "failed to register volume for %s", name); - ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; - volume = NULL; - } + /* acquired domain/volume will be relinquished in kill_sb() on error */ + fscache = erofs_fscache_register_cookie(sb, sbi->fsid, true); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); - sbi->volume = volume; - kfree(name); - return ret; + sbi->s_fscache = fscache; + return 0; } void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - fscache_relinquish_volume(sbi->volume, NULL, false); + erofs_fscache_unregister_cookie(sbi->s_fscache); + + if (sbi->domain) + erofs_fscache_domain_put(sbi->domain); + else + fscache_relinquish_volume(sbi->volume, NULL, false); + + sbi->s_fscache = NULL; sbi->volume = NULL; + sbi->domain = NULL; } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 95a403720e8c..ad2a82f2eb4c 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -214,7 +214,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= EROFS_BLKSIZ) { + inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) { inode->i_op = &erofs_symlink_iops; return 0; } @@ -241,7 +241,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, return 0; } -static int erofs_fill_inode(struct inode *inode, int isdir) +static int erofs_fill_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; @@ -249,7 +249,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) unsigned int ofs; int err = 0; - trace_erofs_fill_inode(inode, isdir); + trace_erofs_fill_inode(inode); /* read inode base data from disk */ kaddr = erofs_read_inode(&buf, inode, &ofs); @@ -324,21 +324,13 @@ static int erofs_iget_set_actor(struct inode *inode, void *opaque) return 0; } -static inline struct inode *erofs_iget_locked(struct super_block *sb, - erofs_nid_t nid) +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) { const unsigned long hashval = erofs_inode_hash(nid); + struct inode *inode; - return iget5_locked(sb, hashval, erofs_ilookup_test_actor, + inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor, erofs_iget_set_actor, &nid); -} - -struct inode *erofs_iget(struct super_block *sb, - erofs_nid_t nid, - bool isdir) -{ - struct inode *inode = erofs_iget_locked(sb, nid); - if (!inode) return ERR_PTR(-ENOMEM); @@ -348,10 +340,10 @@ struct inode *erofs_iget(struct super_block *sb, vi->nid = nid; - err = erofs_fill_inode(inode, isdir); - if (!err) + err = erofs_fill_inode(inode); + if (!err) { unlock_new_inode(inode); - else { + } else { iget_failed(inode); inode = ERR_PTR(err); } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index a01cc82795a2..05dc68627722 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -75,7 +75,6 @@ struct erofs_mount_opts { unsigned int max_sync_decompress_pages; #endif unsigned int mount_opt; - char *fsid; }; struct erofs_dev_context { @@ -88,6 +87,8 @@ struct erofs_dev_context { struct erofs_fs_context { struct erofs_mount_opts opt; struct erofs_dev_context *devs; + char *fsid; + char *domain_id; }; /* all filesystem-wide lz4 configurations */ @@ -98,9 +99,19 @@ struct erofs_sb_lz4_info { u16 max_pclusterblks; }; +struct erofs_domain { + refcount_t ref; + struct list_head list; + struct fscache_volume *volume; + char *domain_id; +}; + struct erofs_fscache { struct fscache_cookie *cookie; struct inode *inode; + struct inode *anon_inode; + struct erofs_domain *domain; + char *name; }; struct erofs_sb_info { @@ -120,6 +131,7 @@ struct erofs_sb_info { struct inode *managed_cache; struct erofs_sb_lz4_info lz4; + struct inode *packed_inode; #endif /* CONFIG_EROFS_FS_ZIP */ struct erofs_dev_context *devs; struct dax_device *dax_dev; @@ -157,6 +169,9 @@ struct erofs_sb_info { /* fscache support */ struct fscache_volume *volume; struct erofs_fscache *s_fscache; + struct erofs_domain *domain; + char *fsid; + char *domain_id; }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -183,7 +198,6 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -#ifdef CONFIG_EROFS_FS_ZIP #define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) /* basic unit of the workstation of a super_block */ @@ -223,7 +237,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) return atomic_cond_read_relaxed(&grp->refcount, VAL != EROFS_LOCKED_MAGIC); } -#endif /* !CONFIG_EROFS_FS_ZIP */ /* we strictly follow PAGE_SIZE and no buffer head yet */ #define LOG_BLOCK_SIZE PAGE_SHIFT @@ -277,6 +290,8 @@ EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) +EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) +EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -312,8 +327,13 @@ struct erofs_inode { unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; unsigned long z_tailextent_headlcn; - erofs_off_t z_idataoff; - unsigned short z_idata_size; + union { + struct { + erofs_off_t z_idataoff; + unsigned short z_idata_size; + }; + erofs_off_t z_fragmentoff; + }; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -364,6 +384,7 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, } extern const struct super_operations erofs_sops; +extern struct file_system_type erofs_fs_type; extern const struct address_space_operations erofs_raw_access_aops; extern const struct address_space_operations z_erofs_aops; @@ -371,6 +392,8 @@ extern const struct address_space_operations z_erofs_aops; enum { BH_Encoded = BH_PrivateStart, BH_FullMapped, + BH_Fragment, + BH_Partialref, }; /* Has a disk mapping */ @@ -381,6 +404,10 @@ enum { #define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +/* Located in the special packed inode */ +#define EROFS_MAP_FRAGMENT (1 << BH_Fragment) +/* The extent refers to partial decompressed data */ +#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref) struct erofs_map_blocks { struct erofs_buf buf; @@ -402,11 +429,12 @@ struct erofs_map_blocks { #define EROFS_GET_BLOCKS_FIEMAP 0x0002 /* Used to map the whole extent if non-negligible data is requested for LZMA */ #define EROFS_GET_BLOCKS_READMORE 0x0004 -/* Used to map tail extent for tailpacking inline pcluster */ +/* Used to map tail extent for tailpacking inline or fragment pcluster */ #define EROFS_GET_BLOCKS_FINDTAIL 0x0008 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_INTERLACED, Z_EROFS_COMPRESSION_RUNTIME_MAX }; @@ -466,7 +494,7 @@ extern const struct inode_operations erofs_generic_iops; extern const struct inode_operations erofs_symlink_iops; extern const struct inode_operations erofs_fast_symlink_iops; -struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); @@ -581,27 +609,26 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); -int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode); -void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache); +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, bool need_inode); +void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); extern const struct address_space_operations erofs_fscache_access_aops; #else static inline int erofs_fscache_register_fs(struct super_block *sb) { - return 0; + return -EOPNOTSUPP; } static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} -static inline int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode) +static inline +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, bool need_inode) { - return -EOPNOTSUPP; + return ERR_PTR(-EOPNOTSUPP); } -static inline void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache) +static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache) { } #endif diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index fd75506799c4..0dc34721080c 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -185,7 +185,6 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, if (IS_ERR(de)) return PTR_ERR(de); - /* the target page has been mapped */ if (ndirents) de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents); @@ -197,9 +196,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, return PTR_ERR_OR_ZERO(de); } -/* NOTE: i_mutex is already held by vfs */ -static struct dentry *erofs_lookup(struct inode *dir, - struct dentry *dentry, +static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int err; @@ -207,17 +204,11 @@ static struct dentry *erofs_lookup(struct inode *dir, unsigned int d_type; struct inode *inode; - DBG_BUGON(!d_really_is_negative(dentry)); - /* dentry must be unhashed in lookup, no need to worry about */ - DBG_BUGON(!d_unhashed(dentry)); - trace_erofs_lookup(dir, dentry, flags); - /* file name exceeds fs limit */ if (dentry->d_name.len > EROFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - /* false uninitialized warnings on gcc 4.8.x */ err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); if (err == -ENOENT) { @@ -228,7 +219,7 @@ static struct dentry *erofs_lookup(struct inode *dir, } else { erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__, dentry, nid, d_type); - inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR); + inode = erofs_iget(dir->i_sb, nid); } return d_splice_alias(inode, dentry); } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 3173debeaa5a..1c7dcca702b3 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -224,10 +224,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_device_info *dif, erofs_off_t *pos) { struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_fscache *fscache; struct erofs_deviceslot *dis; struct block_device *bdev; void *ptr; - int ret; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP); if (IS_ERR(ptr)) @@ -245,10 +245,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, } if (erofs_is_fscache_mode(sb)) { - ret = erofs_fscache_register_cookie(sb, &dif->fscache, - dif->path, false); - if (ret) - return ret; + fscache = erofs_fscache_register_cookie(sb, dif->path, false); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); + dif->fscache = fscache; } else { bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, sb->s_type); @@ -381,6 +381,17 @@ static int erofs_read_superblock(struct super_block *sb) #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); +#ifdef CONFIG_EROFS_FS_ZIP + sbi->packed_inode = NULL; + if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) { + sbi->packed_inode = + erofs_iget(sb, le64_to_cpu(dsb->packed_nid)); + if (IS_ERR(sbi->packed_inode)) { + ret = PTR_ERR(sbi->packed_inode); + goto out; + } + } +#endif sbi->inos = le64_to_cpu(dsb->inos); sbi->build_time = le64_to_cpu(dsb->build_time); @@ -411,6 +422,10 @@ static int erofs_read_superblock(struct super_block *sb) erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); + if (erofs_sb_has_fragments(sbi)) + erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!"); + if (erofs_sb_has_dedupe(sbi)) + erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; @@ -440,6 +455,7 @@ enum { Opt_dax_enum, Opt_device, Opt_fsid, + Opt_domain_id, Opt_err }; @@ -465,6 +481,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), + fsparam_string("domain_id", Opt_domain_id), {} }; @@ -562,14 +579,24 @@ static int erofs_fc_parse_param(struct fs_context *fc, break; case Opt_fsid: #ifdef CONFIG_EROFS_FS_ONDEMAND - kfree(ctx->opt.fsid); - ctx->opt.fsid = kstrdup(param->string, GFP_KERNEL); - if (!ctx->opt.fsid) + kfree(ctx->fsid); + ctx->fsid = kstrdup(param->string, GFP_KERNEL); + if (!ctx->fsid) return -ENOMEM; #else errorfc(fc, "fsid option not supported"); #endif break; + case Opt_domain_id: +#ifdef CONFIG_EROFS_FS_ONDEMAND + kfree(ctx->domain_id); + ctx->domain_id = kstrdup(param->string, GFP_KERNEL); + if (!ctx->domain_id) + return -ENOMEM; +#else + errorfc(fc, "domain_id option not supported"); +#endif + break; default: return -ENOPARAM; } @@ -641,7 +668,7 @@ static int erofs_init_managed_cache(struct super_block *sb) { return 0; } static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { - return erofs_iget(sb, ino, false); + return erofs_iget(sb, ino); } static struct dentry *erofs_fh_to_dentry(struct super_block *sb, @@ -667,7 +694,7 @@ static struct dentry *erofs_get_parent(struct dentry *child) err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type); if (err) return ERR_PTR(err); - return d_obtain_alias(erofs_iget(child->d_sb, nid, d_type == FT_DIR)); + return d_obtain_alias(erofs_iget(child->d_sb, nid)); } static const struct export_operations erofs_export_ops = { @@ -676,6 +703,13 @@ static const struct export_operations erofs_export_ops = { .get_parent = erofs_get_parent, }; +static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc) +{ + static const struct tree_descr empty_descr = {""}; + + return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr); +} + static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; @@ -694,9 +728,12 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_fs_info = sbi; sbi->opt = ctx->opt; - ctx->opt.fsid = NULL; sbi->devs = ctx->devs; ctx->devs = NULL; + sbi->fsid = ctx->fsid; + ctx->fsid = NULL; + sbi->domain_id = ctx->domain_id; + ctx->domain_id = NULL; if (erofs_is_fscache_mode(sb)) { sb->s_blocksize = EROFS_BLKSIZ; @@ -706,11 +743,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - err = erofs_fscache_register_cookie(sb, &sbi->s_fscache, - sbi->opt.fsid, true); - if (err) - return err; - err = super_setup_bdi(sb); if (err) return err; @@ -752,7 +784,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) #endif /* get the root inode */ - inode = erofs_iget(sb, ROOT_NID(sbi), true); + inode = erofs_iget(sb, ROOT_NID(sbi)); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -781,11 +813,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return 0; } +static int erofs_fc_anon_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, erofs_fc_fill_pseudo_super); +} + static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_fs_context *ctx = fc->fs_private; - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->opt.fsid) + if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); return get_tree_bdev(fc, erofs_fc_fill_super); @@ -799,6 +836,9 @@ static int erofs_fc_reconfigure(struct fs_context *fc) DBG_BUGON(!sb_rdonly(sb)); + if (ctx->fsid || ctx->domain_id) + erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); + if (test_opt(&ctx->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else @@ -817,7 +857,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); - erofs_fscache_unregister_cookie(&dif->fscache); + erofs_fscache_unregister_cookie(dif->fscache); + dif->fscache = NULL; kfree(dif->path); kfree(dif); return 0; @@ -837,7 +878,8 @@ static void erofs_fc_free(struct fs_context *fc) struct erofs_fs_context *ctx = fc->fs_private; erofs_free_dev_context(ctx->devs); - kfree(ctx->opt.fsid); + kfree(ctx->fsid); + kfree(ctx->domain_id); kfree(ctx); } @@ -848,10 +890,21 @@ static const struct fs_context_operations erofs_context_ops = { .free = erofs_fc_free, }; +static const struct fs_context_operations erofs_anon_context_ops = { + .get_tree = erofs_fc_anon_get_tree, +}; + static int erofs_init_fs_context(struct fs_context *fc) { - struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + struct erofs_fs_context *ctx; + + /* pseudo mount for anon inodes */ + if (fc->sb_flags & SB_KERNMOUNT) { + fc->ops = &erofs_anon_context_ops; + return 0; + } + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); @@ -878,8 +931,14 @@ static void erofs_kill_sb(struct super_block *sb) WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); + /* pseudo mount for anon inodes */ + if (sb->s_flags & SB_KERNMOUNT) { + kill_anon_super(sb); + return; + } + if (erofs_is_fscache_mode(sb)) - generic_shutdown_super(sb); + kill_anon_super(sb); else kill_block_super(sb); @@ -889,9 +948,9 @@ static void erofs_kill_sb(struct super_block *sb) erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); - erofs_fscache_unregister_cookie(&sbi->s_fscache); erofs_fscache_unregister_fs(sb); - kfree(sbi->opt.fsid); + kfree(sbi->fsid); + kfree(sbi->domain_id); kfree(sbi); sb->s_fs_info = NULL; } @@ -908,11 +967,13 @@ static void erofs_put_super(struct super_block *sb) #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); sbi->managed_cache = NULL; + iput(sbi->packed_inode); + sbi->packed_inode = NULL; #endif - erofs_fscache_unregister_cookie(&sbi->s_fscache); + erofs_fscache_unregister_fs(sb); } -static struct file_system_type erofs_fs_type = { +struct file_system_type erofs_fs_type = { .owner = THIS_MODULE, .name = "erofs", .init_fs_context = erofs_init_fs_context, @@ -1042,8 +1103,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); #ifdef CONFIG_EROFS_FS_ONDEMAND - if (opt->fsid) - seq_printf(seq, ",fsid=%s", opt->fsid); + if (sbi->fsid) + seq_printf(seq, ",fsid=%s", sbi->fsid); + if (sbi->domain_id) + seq_printf(seq, ",domain_id=%s", sbi->domain_id); #endif return 0; } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index c1383e508bbe..fd476961f742 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -76,6 +76,8 @@ EROFS_ATTR_FEATURE(device_table); EROFS_ATTR_FEATURE(compr_head2); EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); +EROFS_ATTR_FEATURE(fragments); +EROFS_ATTR_FEATURE(dedupe); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), @@ -86,6 +88,8 @@ static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(compr_head2), ATTR_LIST(sb_chksum), ATTR_LIST(ztailpacking), + ATTR_LIST(fragments), + ATTR_LIST(dedupe), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); @@ -201,12 +205,27 @@ static struct kobject erofs_feat = { int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); + char *name; + char *str = NULL; int err; + if (erofs_is_fscache_mode(sb)) { + if (sbi->domain_id) { + str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id, + sbi->fsid); + if (!str) + return -ENOMEM; + name = str; + } else { + name = sbi->fsid; + } + } else { + name = sb->s_id; + } sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", - erofs_is_fscache_mode(sb) ? sbi->opt.fsid : sb->s_id); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name); + kfree(str); if (err) goto put_sb_kobj; return 0; diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 332462c59f11..0a43c9ee9f8f 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -39,9 +39,7 @@ static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, #ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; -#ifdef CONFIG_EROFS_FS_SECURITY extern const struct xattr_handler erofs_xattr_security_handler; -#endif static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5792ca9e0d5e..b792d424d774 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -7,6 +7,7 @@ #include "zdata.h" #include "compress.h" #include <linux/prefetch.h> +#include <linux/psi.h> #include <trace/events/erofs.h> @@ -650,6 +651,38 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, la < fe->headoffset; } +static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, + struct page *page, unsigned int pageofs, + unsigned int len) +{ + struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + u8 *src, *dst; + unsigned int i, cnt; + + if (!packed_inode) + return -EFSCORRUPTED; + + pos += EROFS_I(inode)->z_fragmentoff; + for (i = 0; i < len; i += cnt) { + cnt = min_t(unsigned int, len - i, + EROFS_BLKSIZ - erofs_blkoff(pos)); + src = erofs_bread(&buf, packed_inode, + erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(src)) { + erofs_put_metabuf(&buf); + return PTR_ERR(src); + } + + dst = kmap_local_page(page); + memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt); + kunmap_local(dst); + pos += cnt; + } + erofs_put_metabuf(&buf); + return 0; +} + static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct page *page, struct page **pagepool) { @@ -688,7 +721,8 @@ repeat: /* didn't get a valid pcluster previously (very rare) */ } - if (!(map->m_flags & EROFS_MAP_MAPPED)) + if (!(map->m_flags & EROFS_MAP_MAPPED) || + map->m_flags & EROFS_MAP_FRAGMENT) goto hitted; err = z_erofs_collector_begin(fe); @@ -735,6 +769,24 @@ hitted: zero_user_segment(page, cur, end); goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { + unsigned int pageofs, skip, len; + + if (offset > map->m_la) { + pageofs = 0; + skip = offset - map->m_la; + } else { + pageofs = map->m_la & ~PAGE_MASK; + skip = 0; + } + len = min_t(unsigned int, map->m_llen - skip, end - cur); + err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + if (err) + goto out; + ++spiltted; + tight = false; + goto next_part; + } exclusive = (!cur && (!spiltted || tight)); if (cur) @@ -764,14 +816,14 @@ retry: ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; - - if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && - fe->pcl->length == map->m_llen) - fe->pcl->partial = false; if (fe->pcl->length < offset + end - map->m_la) { fe->pcl->length = offset + end - map->m_la; fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; } + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + !(map->m_flags & EROFS_MAP_PARTIAL_REF) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; next_part: /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; @@ -838,15 +890,13 @@ static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { unsigned int pgnr; - struct page *oldpage; pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; DBG_BUGON(pgnr >= be->nr_pages); - oldpage = be->decompressed_pages[pgnr]; - be->decompressed_pages[pgnr] = bvec->page; - - if (!oldpage) + if (!be->decompressed_pages[pgnr]) { + be->decompressed_pages[pgnr] = bvec->page; return; + } } /* (cold path) one pcluster is requested multiple times */ @@ -1365,6 +1415,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; + unsigned long pflags; + int memstall = 0; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; @@ -1415,9 +1467,18 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); + if (memstall) { + psi_memstall_leave(&pflags); + memstall = 0; + } bio = NULL; } + if (unlikely(PageWorkingset(page)) && !memstall) { + psi_memstall_enter(&pflags); + memstall = 1; + } + if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); @@ -1445,8 +1506,11 @@ submit_bio_retry: move_to_bypass_jobqueue(pcl, qtail, owned_head); } while (owned_head != Z_EROFS_PCLUSTER_TAIL); - if (bio) + if (bio) { submit_bio(bio); + if (memstall) + psi_memstall_leave(&pflags); + } /* * although background is preferred, no one is pending for submission. diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index e7f04c4fbb81..d98c95212985 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -126,10 +126,10 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) } /* - * bit 31: I/O error occurred on this page - * bit 0 - 30: remaining parts to complete this page + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page */ -#define Z_EROFS_PAGE_EIO (1 << 31) +#define Z_EROFS_PAGE_EIO (1 << 30) static inline void z_erofs_onlinepage_init(struct page *page) { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index d58549ca1df9..0bb66927e3d0 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -17,7 +17,7 @@ int z_erofs_fill_inode(struct inode *inode) struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!erofs_sb_has_big_pcluster(sbi) && - !erofs_sb_has_ztailpacking(sbi) && + !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) && vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { vi->z_advise = 0; vi->z_algorithmtype[0] = 0; @@ -55,20 +55,25 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; - DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && - !erofs_sb_has_ztailpacking(EROFS_SB(sb)) && - vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), - EROFS_KMAP_ATOMIC); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out_unlock; } h = kaddr + erofs_blkoff(pos); + /* + * if the highest bit of the 8-byte map header is set, the whole file + * is stored in the packed inode. The rest bits keeps z_fragmentoff. + */ + if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { + vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); + vi->z_tailextent_headlcn = 0; + goto done; + } vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; @@ -79,7 +84,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; - goto unmap_done; + goto out_put_metabuf; } vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); @@ -89,7 +94,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", vi->nid); err = -EFSCORRUPTED; - goto unmap_done; + goto out_put_metabuf; } if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ @@ -97,12 +102,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", vi->nid); err = -EFSCORRUPTED; - goto unmap_done; + goto out_put_metabuf; } -unmap_done: - erofs_put_metabuf(&buf); - if (err) - goto out_unlock; if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { struct erofs_map_blocks map = { @@ -121,11 +122,28 @@ unmap_done: err = -EFSCORRUPTED; } if (err < 0) - goto out_unlock; + goto out_put_metabuf; + } + + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && + !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { + struct erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + if (err < 0) + goto out_put_metabuf; } +done: /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +out_put_metabuf: + erofs_put_metabuf(&buf); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); return err; @@ -143,20 +161,9 @@ struct z_erofs_maprecorder { u16 delta[2]; erofs_blk_t pblk, compressedblks; erofs_off_t nextpackoff; + bool partialref; }; -static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, - erofs_blk_t eblk) -{ - struct super_block *const sb = m->inode->i_sb; - - m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk, - EROFS_KMAP_ATOMIC); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - return 0; -} - static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned long lcn) { @@ -169,11 +176,11 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, lcn * sizeof(struct z_erofs_vle_decompressed_index); struct z_erofs_vle_decompressed_index *di; unsigned int advise, type; - int err; - err = z_erofs_reload_indexes(m, erofs_blknr(pos)); - if (err) - return err; + m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, + erofs_blknr(pos), EROFS_KMAP_ATOMIC); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index); m->lcn = lcn; @@ -201,6 +208,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + if (advise & Z_EROFS_VLE_DI_PARTIAL_REF) + m->partialref = true; m->clusterofs = le16_to_cpu(di->di_clusterofs); m->pblk = le32_to_cpu(di->di_u.blkaddr); break; @@ -370,7 +379,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int compacted_4b_initial, compacted_2b; unsigned int amortizedshift; erofs_off_t pos; - int err; if (lclusterbits != 12) return -EOPNOTSUPP; @@ -407,9 +415,10 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, amortizedshift = 2; out: pos += lcn * (1 << amortizedshift); - err = z_erofs_reload_indexes(m, erofs_blknr(pos)); - if (err) - return err; + m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, + erofs_blknr(pos), EROFS_KMAP_ATOMIC); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); } @@ -598,6 +607,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, { struct erofs_inode *const vi = EROFS_I(inode); bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; struct z_erofs_maprecorder m = { .inode = inode, .map = map, @@ -663,15 +673,23 @@ static int z_erofs_do_map_blocks(struct inode *inode, err = -EOPNOTSUPP; goto unmap_out; } - + if (m.partialref) + map->m_flags |= EROFS_MAP_PARTIAL_REF; map->m_llen = end - map->m_la; - if (flags & EROFS_GET_BLOCKS_FINDTAIL) + if (flags & EROFS_GET_BLOCKS_FINDTAIL) { vi->z_tailextent_headlcn = m.lcn; + /* for non-compact indexes, fragmentoff is 64 bits */ + if (fragment && + vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + vi->z_fragmentoff |= (u64)m.pblk << 32; + } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; map->m_pa = vi->z_idataoff; map->m_plen = vi->z_idata_size; + } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { + map->m_flags |= EROFS_MAP_FRAGMENT; } else { map->m_pa = blknr_to_addr(m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); @@ -679,12 +697,18 @@ static int z_erofs_do_map_blocks(struct inode *inode, goto out; } - if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) - map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) + if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) + map->m_algorithmformat = + Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = + Z_EROFS_COMPRESSION_SHIFTED; + } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { map->m_algorithmformat = vi->z_algorithmtype[1]; - else + } else { map->m_algorithmformat = vi->z_algorithmtype[0]; + } if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && @@ -705,10 +729,10 @@ out: return err; } -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { + struct erofs_inode *const vi = EROFS_I(inode); int err = 0; trace_z_erofs_map_blocks_iter_enter(inode, map, flags); @@ -725,6 +749,15 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto out; + if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | + EROFS_MAP_FRAGMENT; + goto out; + } + err = z_erofs_do_map_blocks(inode, map, flags); out: trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); @@ -751,7 +784,8 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, iomap->length = map.m_llen; if (map.m_flags & EROFS_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_pa; + iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ? + IOMAP_NULL_ADDR : map.m_pa; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; diff --git a/fs/eventfd.c b/fs/eventfd.c index 3627dd7d25db..c0ffee99ad23 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -69,17 +69,17 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) * it returns false, the eventfd_signal() call should be deferred to a * safe context. */ - if (WARN_ON_ONCE(current->in_eventfd_signal)) + if (WARN_ON_ONCE(current->in_eventfd)) return 0; spin_lock_irqsave(&ctx->wqh.lock, flags); - current->in_eventfd_signal = 1; + current->in_eventfd = 1; if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); - current->in_eventfd_signal = 0; + current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; @@ -253,8 +253,10 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) __set_current_state(TASK_RUNNING); } eventfd_ctx_do_read(ctx, &ucnt); + current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLOUT); + current->in_eventfd = 0; spin_unlock_irq(&ctx->wqh.lock); if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt))) return -EFAULT; @@ -301,8 +303,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c } if (likely(res > 0)) { ctx->count += ucnt; + current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); + current->in_eventfd = 0; } spin_unlock_irq(&ctx->wqh.lock); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8b56b94e2f56..52954d4637b5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1065,7 +1065,7 @@ static inline bool list_add_tail_lockless(struct list_head *new, * added to the list from another CPU: the winner observes * new->next == new. */ - if (cmpxchg(&new->next, new, head) != new) + if (!try_cmpxchg(&new->next, &new, head)) return false; /* diff --git a/fs/exec.c b/fs/exec.c index 9a5ca7b82bfc..a0b1f0337a62 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -28,7 +28,6 @@ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> -#include <linux/vmacache.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> @@ -65,7 +64,6 @@ #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> -#include <linux/time_namespace.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -684,6 +682,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + struct vm_area_struct *next; struct mmu_gather tlb; BUG_ON(new_start > new_end); @@ -692,7 +692,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * ensure there are no vmas between where we want to go * and where we are */ - if (vma != find_vma(mm, new_start)) + if (vma != vma_next(&vmi)) return -EFAULT; /* @@ -711,12 +711,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) lru_add_drain(); tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch @@ -725,7 +726,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); @@ -958,8 +959,7 @@ struct file *open_exec(const char *name) } EXPORT_SYMBOL(open_exec); -#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ - defined(CONFIG_BINFMT_ELF_FDPIC) +#if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); @@ -979,12 +979,10 @@ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; - bool vfork; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; - vfork = !!tsk->vfork_done; old_mm = current->mm; exec_mm_release(tsk, old_mm); if (old_mm) @@ -1026,13 +1024,9 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; - vmacache_flush(tsk); + lru_gen_add_mm(mm); task_unlock(tsk); - - if (vfork) - timens_on_fork(tsk->nsproxy, tsk); - + lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); @@ -1203,11 +1197,11 @@ static int unshare_sighand(struct task_struct *me) return -ENOMEM; refcount_set(&newsighand->count, 1); - memcpy(newsighand->action, oldsighand->action, - sizeof(newsighand->action)); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); + memcpy(newsighand->action, oldsighand->action, + sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); @@ -1595,7 +1589,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct user_namespace *mnt_userns; - struct inode *inode; + struct inode *inode = file_inode(file); unsigned int mode; kuid_t uid; kgid_t gid; @@ -1606,7 +1600,6 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) if (task_no_new_privs(current)) return; - inode = file->f_path.dentry->d_inode; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; @@ -1888,7 +1881,7 @@ static int do_execveat_common(int fd, struct filename *filename, * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && - is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { + is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index a27b55ec060a..0fc08fdcba73 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -212,9 +212,9 @@ static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb) /* skip iterating emit_dots when dir is empty */ #define ITER_POS_FILLED_DOTS (2) -static int exfat_iterate(struct file *filp, struct dir_context *ctx) +static int exfat_iterate(struct file *file, struct dir_context *ctx) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct inode *tmp; struct exfat_dir_entry de; @@ -228,7 +228,7 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx) mutex_lock(&EXFAT_SB(sb)->s_lock); cpos = ctx->pos; - if (!dir_emit_dots(filp, ctx)) + if (!dir_emit_dots(file, ctx)) goto unlock; if (ctx->pos == ITER_POS_FILLED_DOTS) { diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index ee0b7cf51157..41ae4cce1f42 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -270,8 +270,7 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu) struct super_block *sb = dir->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct buffer_head *bh; - sector_t blknr, last_blknr; - int i; + sector_t blknr, last_blknr, i; blknr = exfat_cluster_to_sector(sbi, clu); last_blknr = blknr + sbi->sect_per_clus; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index a795437b86d0..5590a1e83126 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -552,7 +552,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if (info->attr & ATTR_SUBDIR) { /* directory */ inode->i_generation &= ~1; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 3ef80d000e13..c648a493faf2 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -248,21 +248,20 @@ struct getdents_callback { * A rather strange filldir function to capture * the name matching the specified inode number. */ -static int filldir_one(struct dir_context *ctx, const char *name, int len, +static bool filldir_one(struct dir_context *ctx, const char *name, int len, loff_t pos, u64 ino, unsigned int d_type) { struct getdents_callback *buf = container_of(ctx, struct getdents_callback, ctx); - int result = 0; buf->sequence++; if (buf->ino == ino && len <= NAME_MAX) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; - result = -1; + return false; // no more } - return result; + return true; } /** diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index c17ccc19b938..5dc0a31f4a08 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -126,6 +126,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) struct ext2_group_desc * desc; struct buffer_head * bh = NULL; ext2_fsblk_t bitmap_blk; + int ret; desc = ext2_get_group_desc(sb, block_group, NULL); if (!desc) @@ -139,10 +140,10 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) block_group, le32_to_cpu(desc->bg_block_bitmap)); return NULL; } - if (likely(bh_uptodate_or_lock(bh))) + ret = bh_read(bh, 0); + if (ret > 0) return bh; - - if (bh_submit_read(bh) < 0) { + if (ret < 0) { brelse(bh); ext2_error(sb, __func__, "Cannot read block bitmap - " diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 998dd2ac8008..f4944c4dee60 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -277,8 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - group = prandom_u32(); - parent_group = (unsigned)group % ngroups; + parent_group = prandom_u32_max(ngroups); for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext2_get_group_desc (sb, group, NULL); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 5fd9a22d2b70..9125eab85146 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -120,7 +120,7 @@ static int ext2_create (struct user_namespace * mnt_userns, } static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct inode *inode = ext2_new_inode(dir, mode, NULL); if (IS_ERR(inode)) @@ -128,9 +128,9 @@ static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, ext2_set_file_ops(inode); mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); - return 0; + return finish_open_simple(file, 0); } static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 252c742379cf..03f2af98b1b4 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -163,7 +163,7 @@ static void ext2_put_super (struct super_block * sb) db_count = sbi->s_gdb_count; for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + kvfree(sbi->s_group_desc); kfree(sbi->s_debts); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -1052,6 +1052,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group); goto failed_mount; } + /* At least inode table, bitmaps, and sb have to fit in one group */ + if (sbi->s_blocks_per_group <= sbi->s_itb_per_group + 3) { + ext2_msg(sb, KERN_ERR, + "error: #blocks per group smaller than metadata size: %lu <= %lu", + sbi->s_blocks_per_group, sbi->s_inodes_per_group + 3); + goto failed_mount; + } if (sbi->s_frags_per_group > sb->s_blocksize * 8) { ext2_msg(sb, KERN_ERR, "error: #fragments per group too big: %lu", @@ -1065,9 +1072,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_inodes_per_group); goto failed_mount; } + if (sb_bdev_nr_blocks(sb) < le32_to_cpu(es->s_blocks_count)) { + ext2_msg(sb, KERN_ERR, + "bad geometry: block count %u exceeds size of device (%u blocks)", + le32_to_cpu(es->s_blocks_count), + (unsigned)sb_bdev_nr_blocks(sb)); + goto failed_mount; + } - if (EXT2_BLOCKS_PER_GROUP(sb) == 0) - goto cantfind_ext2; sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - le32_to_cpu(es->s_first_data_block) - 1) / EXT2_BLOCKS_PER_GROUP(sb)) + 1; @@ -1080,7 +1092,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc_array(db_count, + sbi->s_group_desc = kvmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { @@ -1206,7 +1218,7 @@ failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); failed_mount_group_desc: - kfree(sbi->s_group_desc); + kvfree(sbi->s_group_desc); kfree(sbi->s_debts); failed_mount: brelse(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9bca5565547b..8d5453852f98 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -167,8 +167,6 @@ enum SHIFT_DIRECTION { #define EXT4_MB_CR0_OPTIMIZED 0x8000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ #define EXT4_MB_CR1_OPTIMIZED 0x00010000 -/* Perform linear traversal for one group */ -#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; @@ -1600,8 +1598,8 @@ struct ext4_sb_info { struct list_head s_discard_list; struct work_struct s_discard_work; atomic_t s_retry_alloc_pending; - struct rb_root s_mb_avg_fragment_size_root; - rwlock_t s_mb_rb_lock; + struct list_head *s_mb_avg_fragment_size; + rwlock_t *s_mb_avg_fragment_size_locks; struct list_head *s_mb_largest_free_orders; rwlock_t *s_mb_largest_free_orders_locks; @@ -2979,6 +2977,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct user_namespace *, struct dentry *, struct iattr *); +extern u32 ext4_dio_alignment(struct inode *inode); extern int ext4_getattr(struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); @@ -3413,6 +3412,8 @@ struct ext4_group_info { ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + int bb_avg_fragment_size_order; /* order of average + fragment in BG */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; @@ -3420,7 +3421,7 @@ struct ext4_group_info { void *bb_bitmap; #endif struct rw_semaphore alloc_sem; - struct rb_node bb_avg_fragment_size_rb; + struct list_head bb_avg_fragment_size_node; struct list_head bb_largest_free_order_node; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. @@ -3591,9 +3592,6 @@ extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); -extern int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline, __u64 start, __u64 len); extern void *ext4_read_inline_link(struct inode *inode); struct iomap; @@ -3712,7 +3710,7 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *, extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path **, int flags); -extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c148bb97b527..6c399a8b22b3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -106,6 +106,25 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) return 0; } +static void ext4_ext_drop_refs(struct ext4_ext_path *path) +{ + int depth, i; + + if (!path) + return; + depth = path->p_depth; + for (i = 0; i <= depth; i++, path++) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +void ext4_free_ext_path(struct ext4_ext_path *path) +{ + ext4_ext_drop_refs(path); + kfree(path); +} + /* * Make sure 'handle' has at least 'check_cred' credits. If not, restart * transaction with 'restart_cred' credits. The function drops i_data_sem @@ -460,6 +479,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid eh_entries"; goto corrupted; } + if (unlikely((eh->eh_entries == 0) && (depth > 0))) { + error_msg = "eh_entries is 0 but eh_depth is > 0"; + goto corrupted; + } if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; @@ -632,8 +655,7 @@ int ext4_ext_precache(struct inode *inode) ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); out: up_read(&ei->i_data_sem); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return ret; } @@ -720,19 +742,6 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, #define ext4_ext_show_move(inode, path, newblock, level) #endif -void ext4_ext_drop_refs(struct ext4_ext_path *path) -{ - int depth, i; - - if (!path) - return; - depth = path->p_depth; - for (i = 0; i <= depth; i++, path++) { - brelse(path->p_bh); - path->p_bh = NULL; - } -} - /* * ext4_ext_binsearch_idx: * binary search for the closest index of the given block @@ -951,8 +960,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, return path; err: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (orig_path) *orig_path = NULL; return ERR_PTR(ret); @@ -2170,8 +2178,7 @@ merge: err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: - ext4_ext_drop_refs(npath); - kfree(npath); + ext4_free_ext_path(npath); return err; } @@ -3057,8 +3064,7 @@ again: } } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); path = NULL; if (err == -EAGAIN) goto again; @@ -4371,8 +4377,7 @@ got_allocated_blocks: allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); @@ -5179,6 +5184,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * and it is decreased till we reach start. */ again: + ret = 0; if (SHIFT == SHIFT_LEFT) iterator = &start; else @@ -5222,14 +5228,21 @@ again: ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); - if (le32_to_cpu(extent->ee_block) > 0) + if (le32_to_cpu(extent->ee_block) > start) *iterator = le32_to_cpu(extent->ee_block) - 1; - else - /* Beginning is reached, end of the loop */ + else if (le32_to_cpu(extent->ee_block) == start) iterator = NULL; - /* Update path extent in case we need to stop */ - while (le32_to_cpu(extent->ee_block) < start) + else { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + while (le32_to_cpu(extent->ee_block) >= start) + extent--; + + if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) + break; + extent++; + iterator = NULL; + } path[depth].p_ext = extent; } ret = ext4_ext_shift_path_extents(path, shift, inode, @@ -5241,8 +5254,7 @@ again: break; } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return ret; } @@ -5534,15 +5546,13 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) EXT4_GET_BLOCKS_METADATA_NOFAIL); } - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (ret < 0) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } } else { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } ret = ext4_es_remove_extent(inode, offset_lblk, @@ -5762,10 +5772,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, count -= len; repeat: - ext4_ext_drop_refs(path1); - kfree(path1); - ext4_ext_drop_refs(path2); - kfree(path2); + ext4_free_ext_path(path1); + ext4_free_ext_path(path2); path1 = path2 = NULL; } return replaced_count; @@ -5844,8 +5852,7 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return err ? err : mapped; } @@ -5912,8 +5919,7 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return ret; } @@ -5931,8 +5937,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) return; ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return; } @@ -5945,8 +5950,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); ext4_mark_inode_dirty(NULL, inode); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } } @@ -5985,13 +5989,11 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); goto out; } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); /* Count the number of data blocks */ cur = 0; @@ -6021,30 +6023,26 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (IS_ERR(path)) goto out; numblks += path->p_depth; - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); while (cur < end) { path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) break; ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return 0; } cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); ret = skip_hole(inode, &cur); if (ret < 0) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); break; } path2 = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path2)) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); break; } for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { @@ -6058,10 +6056,8 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (cmp1 != cmp2 && cmp2 != 0) numblks++; } - ext4_ext_drop_refs(path); - ext4_ext_drop_refs(path2); - kfree(path); - kfree(path2); + ext4_free_ext_path(path); + ext4_free_ext_path(path2); } out: @@ -6088,13 +6084,11 @@ int ext4_ext_clear_bb(struct inode *inode) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return 0; } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); cur = 0; while (cur < end) { @@ -6113,8 +6107,7 @@ int ext4_ext_clear_bb(struct inode *inode) ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); ext4_fc_record_regions(inode->i_sb, inode->i_ino, diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 23167efda95e..cd0a861853e3 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -667,8 +667,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, } } out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } static void ext4_es_insert_extent_ind_check(struct inode *inode, diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 2af962cbb835..0f6d0a80467d 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -229,6 +229,12 @@ __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) finish_wait(wq, &wait.wq_entry); } +static bool ext4_fc_disabled(struct super_block *sb) +{ + return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || + (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); +} + /* * Inform Ext4's fast about start of an inode update * @@ -240,8 +246,7 @@ void ext4_fc_start_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; restart: @@ -265,8 +270,7 @@ void ext4_fc_stop_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (atomic_dec_and_test(&ei->i_fc_updates)) @@ -283,8 +287,7 @@ void ext4_fc_del(struct inode *inode) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; restart: @@ -337,8 +340,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; - if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || - (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(sb)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); @@ -493,10 +495,8 @@ void __ext4_fc_track_unlink(handle_t *handle, void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -522,10 +522,8 @@ void __ext4_fc_track_link(handle_t *handle, void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -551,10 +549,8 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -576,22 +572,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update) void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; if (S_ISDIR(inode->i_mode)) return; + if (ext4_fc_disabled(inode->i_sb)) + return; + if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) - return; - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; @@ -634,15 +628,13 @@ static int __track_range(struct inode *inode, void *arg, bool update) void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct __track_range_args args; int ret; if (S_ISDIR(inode->i_mode)) return; - if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || - (sbi->s_mount_state & EXT4_FC_REPLAY)) + if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) @@ -710,10 +702,10 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) * After allocating len, we should have space at least for a 0 byte * padding. */ - if (len + sizeof(struct ext4_fc_tl) > bsize) + if (len + EXT4_FC_TAG_BASE_LEN > bsize) return NULL; - if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { + if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) { /* * Only allocate from current buffer if we have enough space for * this request AND we have space to add a zero byte padding. @@ -730,10 +722,10 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) /* Need to add PAD tag */ tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); - pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); + pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN; tl->fc_len = cpu_to_le16(pad_len); if (crc) - *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); + *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN); if (pad_len > 0) ext4_fc_memzero(sb, tl + 1, pad_len, crc); ext4_fc_submit_bh(sb, false); @@ -775,7 +767,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. */ - dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); + dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; @@ -785,8 +777,8 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); - dst += sizeof(tl); + ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc); + dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); dst += sizeof(tail.fc_tid); @@ -808,15 +800,15 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, struct ext4_fc_tl tl; u8 *dst; - dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); + dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); - ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); + ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); + ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc); return true; } @@ -828,8 +820,8 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; int dlen = fc_dentry->fcd_name.len; - u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, - crc); + u8 *dst = ext4_fc_reserve_space(sb, + EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; @@ -838,8 +830,8 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); - ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); - dst += sizeof(tl); + ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc); + dst += EXT4_FC_TAG_BASE_LEN; ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); dst += sizeof(fcd); ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); @@ -874,22 +866,25 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); + ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, - sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); + EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) - return -ECANCELED; + goto err; - if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) - return -ECANCELED; - dst += sizeof(tl); + if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc)) + goto err; + dst += EXT4_FC_TAG_BASE_LEN; if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) - return -ECANCELED; + goto err; dst += sizeof(fc_inode); if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), inode_len, crc)) - return -ECANCELED; - - return 0; + goto err; + ret = 0; +err: + brelse(iloc.bh); + return ret; } /* @@ -1343,7 +1338,7 @@ struct dentry_info_args { }; static inline void tl_to_darg(struct dentry_info_args *darg, - struct ext4_fc_tl *tl, u8 *val) + struct ext4_fc_tl *tl, u8 *val) { struct ext4_fc_dentry_info fcd; @@ -1352,8 +1347,14 @@ static inline void tl_to_darg(struct dentry_info_args *darg, darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); - darg->dname_len = le16_to_cpu(tl->fc_len) - - sizeof(struct ext4_fc_dentry_info); + darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); +} + +static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val) +{ + memcpy(tl, val, EXT4_FC_TAG_BASE_LEN); + tl->fc_len = le16_to_cpu(tl->fc_len); + tl->fc_tag = le16_to_cpu(tl->fc_tag); } /* Unlink replay function */ @@ -1491,13 +1492,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { - state->fc_modified_inodes = krealloc( - state->fc_modified_inodes, + int *fc_modified_inodes; + + fc_modified_inodes = krealloc(state->fc_modified_inodes, sizeof(int) * (state->fc_modified_inodes_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); - if (!state->fc_modified_inodes) + if (!fc_modified_inodes) return -ENOMEM; + state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; } @@ -1516,8 +1519,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; - int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); + int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; + size_t off_gen = offsetof(struct ext4_inode, i_generation); memcpy(&fc_inode, val, sizeof(fc_inode)); @@ -1541,12 +1545,12 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, if (ret) goto out; - inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode); + inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); - memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, - inode_len - offsetof(struct ext4_inode, i_generation)); + memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, + inode_len - off_gen); if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); if (eh->eh_magic != EXT4_EXT_MAGIC) { @@ -1682,15 +1686,18 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, if (replay && state->fc_regions_used != state->fc_regions_valid) state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { + struct ext4_fc_alloc_region *fc_regions; + + fc_regions = krealloc(state->fc_regions, + sizeof(struct ext4_fc_alloc_region) * + (state->fc_regions_size + + EXT4_FC_REPLAY_REALLOC_INCREMENT), + GFP_KERNEL); + if (!fc_regions) + return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; - state->fc_regions = krealloc( - state->fc_regions, - state->fc_regions_size * - sizeof(struct ext4_fc_alloc_region), - GFP_KERNEL); - if (!state->fc_regions) - return -ENOMEM; + state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; @@ -1770,8 +1777,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, ret = ext4_ext_insert_extent( NULL, inode, &path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (ret) goto out; goto next; @@ -1926,8 +1932,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, 1); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, @@ -1972,6 +1977,34 @@ void ext4_fc_replay_cleanup(struct super_block *sb) kfree(sbi->s_fc_replay_state.fc_modified_inodes); } +static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl, + u8 *val, u8 *end) +{ + if (val + tl->fc_len > end) + return false; + + /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do + * journal rescan before do CRC check. Other tags length check will + * rely on CRC check. + */ + switch (tl->fc_tag) { + case EXT4_FC_TAG_ADD_RANGE: + return (sizeof(struct ext4_fc_add_range) == tl->fc_len); + case EXT4_FC_TAG_TAIL: + return (sizeof(struct ext4_fc_tail) <= tl->fc_len); + case EXT4_FC_TAG_HEAD: + return (sizeof(struct ext4_fc_head) == tl->fc_len); + case EXT4_FC_TAG_DEL_RANGE: + case EXT4_FC_TAG_LINK: + case EXT4_FC_TAG_UNLINK: + case EXT4_FC_TAG_CREAT: + case EXT4_FC_TAG_INODE: + case EXT4_FC_TAG_PAD: + default: + return true; + } +} + /* * Recovery Scan phase handler * @@ -2028,12 +2061,18 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_replay_expected_off++; - for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { - memcpy(&tl, cur, sizeof(tl)); - val = cur + sizeof(tl); + for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; + cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { + ext4_fc_get_tl(&tl, cur); + val = cur + EXT4_FC_TAG_BASE_LEN; + if (!ext4_fc_tag_len_isvalid(&tl, val, end)) { + ret = state->fc_replay_num_tags ? + JBD2_FC_REPLAY_STOP : -ECANCELED; + goto out_err; + } ext4_debug("Scan phase, tag:%s, blk %lld\n", - tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr); - switch (le16_to_cpu(tl.fc_tag)) { + tag2str(tl.fc_tag), bh->b_blocknr); + switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; @@ -2053,13 +2092,13 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_PAD: state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + le16_to_cpu(tl.fc_len)); + EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + + EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && @@ -2086,7 +2125,7 @@ static int ext4_fc_replay_scan(journal_t *journal, } state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, - sizeof(tl) + le16_to_cpu(tl.fc_len)); + EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? @@ -2141,19 +2180,20 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, start = (u8 *)bh->b_data; end = (__u8 *)bh->b_data + journal->j_blocksize - 1; - for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { - memcpy(&tl, cur, sizeof(tl)); - val = cur + sizeof(tl); + for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN; + cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { + ext4_fc_get_tl(&tl, cur); + val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } - ext4_debug("Replay phase, tag:%s\n", - tag2str(le16_to_cpu(tl.fc_tag))); + + ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; - switch (le16_to_cpu(tl.fc_tag)) { + switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; @@ -2174,19 +2214,18 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, - le16_to_cpu(tl.fc_len), 0); + tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: - trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, - le16_to_cpu(tl.fc_len), 0); + trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, + 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: - trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0, - le16_to_cpu(tl.fc_len), 0); + trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 1db12847a83b..a6154c3ed135 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -70,6 +70,9 @@ struct ext4_fc_tail { __le32 fc_crc; }; +/* Tag base length */ +#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl)) + /* * Fast commit status codes */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 109d07629f81..a7a597c727e6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -36,19 +36,34 @@ #include "acl.h" #include "truncate.h" -static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter) +/* + * Returns %true if the given DIO request should be attempted with DIO, or + * %false if it should fall back to buffered I/O. + * + * DIO isn't well specified; when it's unsupported (either due to the request + * being misaligned, or due to the file not supporting DIO at all), filesystems + * either fall back to buffered I/O or return EINVAL. For files that don't use + * any special features like encryption or verity, ext4 has traditionally + * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. + * In this case, we should attempt the DIO, *not* fall back to buffered I/O. + * + * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 + * traditionally falls back to buffered I/O. + * + * This function implements the traditional ext4 behavior in all these cases. + */ +static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); + u32 dio_align = ext4_dio_alignment(inode); - if (!fscrypt_dio_supported(iocb, iter)) - return false; - if (fsverity_active(inode)) - return false; - if (ext4_should_journal_data(inode)) + if (dio_align == 0) return false; - if (ext4_has_inline_data(inode)) - return false; - return true; + + if (dio_align == 1) + return true; + + return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -63,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) inode_lock_shared(inode); } - if (!ext4_dio_supported(iocb, to)) { + if (!ext4_should_use_dio(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on @@ -511,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) } /* Fallback to buffered I/O if the inode does not support direct I/O. */ - if (!ext4_dio_supported(iocb, from)) { + if (!ext4_should_use_dio(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else @@ -528,6 +543,12 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = -EAGAIN; goto out; } + /* + * Make sure inline data cannot be created anymore since we are going + * to allocate blocks for DIO. We know the inode does not have any + * inline data now because ext4_dio_supported() checked for that. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); offset = iocb->ki_pos; count = ret; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f73e5eb43eae..e9bc46684106 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -463,10 +463,9 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, hinfo.hash_version = DX_HASH_HALF_MD4; hinfo.seed = sbi->s_hash_seed; ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); - grp = hinfo.hash; + parent_group = hinfo.hash % ngroups; } else - grp = prandom_u32(); - parent_group = (unsigned)grp % ngroups; + parent_group = prandom_u32_max(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); @@ -510,7 +509,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, goto fallback; } - max_dirs = ndirs / ngroups + inodes_per_group / 16; + max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; @@ -1280,7 +1279,7 @@ got: EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 601214453c3a..2b5ef1b64249 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1188,6 +1188,13 @@ retry_grab: page = grab_cache_page_write_begin(mapping, index); if (!page) return -ENOMEM; + /* + * The same as page allocation, we prealloc buffer heads before + * starting the handle. + */ + if (!page_has_buffers(page)) + create_empty_buffers(page, inode->i_sb->s_blocksize, 0); + unlock_page(page); retry_journal: @@ -5342,6 +5349,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; + bool inc_ivers = true; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; @@ -5425,8 +5433,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return -EINVAL; } - if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) - inode_inc_iversion(inode); + if (attr->ia_size == inode->i_size) + inc_ivers = false; if (shrink) { if (ext4_should_order_data(inode)) { @@ -5528,6 +5536,8 @@ out_mmap_sem: } if (!error) { + if (inc_ivers) + inode_inc_iversion(inode); setattr_copy(mnt_userns, inode, attr); mark_inode_dirty(inode); } @@ -5550,6 +5560,22 @@ err_out: return error; } +u32 ext4_dio_alignment(struct inode *inode) +{ + if (fsverity_active(inode)) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + if (ext4_has_inline_data(inode)) + return 0; + if (IS_ENCRYPTED(inode)) { + if (!fscrypt_dio_supported(inode)) + return 0; + return i_blocksize(inode); + } + return 1; /* use the iomap defaults */ +} + int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -5565,6 +5591,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. + */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + u32 dio_align = ext4_dio_alignment(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (dio_align == 1) { + struct block_device *bdev = inode->i_sb->s_bdev; + + /* iomap defaults */ + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + } else { + stat->dio_mem_align = dio_align; + stat->dio_offset_align = dio_align; + } + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; @@ -5731,9 +5778,6 @@ int ext4_mark_iloc_dirty(handle_t *handle, } ext4_fc_track_inode(handle, inode); - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); - /* the do_update_inode consumes one bh->b_count */ get_bh(iloc->bh); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 3cf3ec4b1c21..95dfea28bf4e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -145,9 +145,8 @@ static int ext4_update_backup_sb(struct super_block *sb, if (ext4_has_metadata_csum(sb) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " - "superblock %llu\n", sb_block); + "superblock %llu", sb_block); unlock_buffer(bh); - err = -EFSBADCRC; goto out_bh; } func(es, arg); @@ -452,9 +451,10 @@ static long swap_inode_boot_loader(struct super_block *sb, swap_inode_data(inode, inode_bl); inode->i_ctime = inode_bl->i_ctime = current_time(inode); + inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); - inode_bl->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); + inode_bl->i_generation = get_random_u32(); ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); @@ -665,6 +665,7 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_set_inode_flags(inode, false); inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: @@ -775,6 +776,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) EXT4_I(inode)->i_projid = kprojid; inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) @@ -1060,9 +1062,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) if (!EXT4_SB(sb)->s_journal) return -ENODEV; - if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) - return -EINVAL; - if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) return -EOPNOTSUPP; @@ -1257,6 +1256,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bd8f8b5c3d30..9dad93059945 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -140,13 +140,15 @@ * number of buddy bitmap orders possible) number of lists. Group-infos are * placed in appropriate lists. * - * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) + * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) * - * Locking: sbi->s_mb_rb_lock (rwlock) + * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) * - * This is a red black tree consisting of group infos and the tree is sorted - * by average fragment sizes (which is calculated as ext4_group_info->bb_free - * / ext4_group_info->bb_fragments). + * This is an array of lists where in the i-th list there are groups with + * average fragment size >= 2^i and < 2^(i+1). The average fragment size + * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. + * Note that we don't bother with a special list for completely empty groups + * so we only have MB_NUM_ORDERS(sb) lists. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for @@ -160,7 +162,8 @@ * * At CR = 1, we only consider groups where average fragment size > request * size. So, we lookup a group which has average fragment size just above or - * equal to request size using our rb tree (data structure 2) in O(log N) time. + * equal to request size using our average fragment size group lists (data + * structure 2) in O(1) time. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR 0 and CR 1 phase. @@ -802,65 +805,51 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } -static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, - int (*cmp)(struct rb_node *, struct rb_node *)) +static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) { - struct rb_node **iter = &root->rb_node, *parent = NULL; + int order; - while (*iter) { - parent = *iter; - if (cmp(new, *iter) > 0) - iter = &((*iter)->rb_left); - else - iter = &((*iter)->rb_right); - } - - rb_link_node(new, parent, iter); - rb_insert_color(new, root); -} - -static int -ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) -{ - struct ext4_group_info *grp1 = rb_entry(rb1, - struct ext4_group_info, - bb_avg_fragment_size_rb); - struct ext4_group_info *grp2 = rb_entry(rb2, - struct ext4_group_info, - bb_avg_fragment_size_rb); - int num_frags_1, num_frags_2; - - num_frags_1 = grp1->bb_fragments ? - grp1->bb_free / grp1->bb_fragments : 0; - num_frags_2 = grp2->bb_fragments ? - grp2->bb_free / grp2->bb_fragments : 0; - - return (num_frags_2 - num_frags_1); + /* + * We don't bother with a special lists groups with only 1 block free + * extents and for completely empty groups. + */ + order = fls(len) - 2; + if (order < 0) + return 0; + if (order == MB_NUM_ORDERS(sb)) + order--; + return order; } -/* - * Reinsert grpinfo into the avg_fragment_size tree with new average - * fragment size. - */ +/* Move group to appropriate avg_fragment_size list */ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); + int new_order; if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) return; - write_lock(&sbi->s_mb_rb_lock); - if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { - rb_erase(&grp->bb_avg_fragment_size_rb, - &sbi->s_mb_avg_fragment_size_root); - RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); - } + new_order = mb_avg_fragment_size_order(sb, + grp->bb_free / grp->bb_fragments); + if (new_order == grp->bb_avg_fragment_size_order) + return; - ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, - &grp->bb_avg_fragment_size_rb, - ext4_mb_avg_fragment_size_cmp); - write_unlock(&sbi->s_mb_rb_lock); + if (grp->bb_avg_fragment_size_order != -1) { + write_lock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); + list_del(&grp->bb_avg_fragment_size_node); + write_unlock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); + } + grp->bb_avg_fragment_size_order = new_order; + write_lock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); + list_add_tail(&grp->bb_avg_fragment_size_node, + &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); + write_unlock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); } /* @@ -909,86 +898,55 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, *new_cr = 1; } else { *group = grp->bb_group; - ac->ac_last_optimal_group = *group; ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; } } /* - * Choose next group by traversing average fragment size tree. Updates *new_cr - * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that - * the linear search should continue for one iteration since there's lock - * contention on the rb tree lock. + * Choose next group by traversing average fragment size list of suitable + * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, int *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - int avg_fragment_size, best_so_far; - struct rb_node *node, *found; - struct ext4_group_info *grp; - - /* - * If there is contention on the lock, instead of waiting for the lock - * to become available, just continue searching lineraly. We'll resume - * our rb tree search later starting at ac->ac_last_optimal_group. - */ - if (!read_trylock(&sbi->s_mb_rb_lock)) { - ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; - return; - } + struct ext4_group_info *grp = NULL, *iter; + int i; if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { if (sbi->s_mb_stats) atomic_inc(&sbi->s_bal_cr1_bad_suggestions); - /* We have found something at CR 1 in the past */ - grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); - for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; - found = rb_next(found)) { - grp = rb_entry(found, struct ext4_group_info, - bb_avg_fragment_size_rb); + } + + for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); + i < MB_NUM_ORDERS(ac->ac_sb); i++) { + if (list_empty(&sbi->s_mb_avg_fragment_size[i])) + continue; + read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); + if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { + read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); + continue; + } + list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], + bb_avg_fragment_size_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); - if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) + if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { + grp = iter; break; - } - goto done; - } - - node = sbi->s_mb_avg_fragment_size_root.rb_node; - best_so_far = 0; - found = NULL; - - while (node) { - grp = rb_entry(node, struct ext4_group_info, - bb_avg_fragment_size_rb); - avg_fragment_size = 0; - if (ext4_mb_good_group(ac, grp->bb_group, 1)) { - avg_fragment_size = grp->bb_fragments ? - grp->bb_free / grp->bb_fragments : 0; - if (!best_so_far || avg_fragment_size < best_so_far) { - best_so_far = avg_fragment_size; - found = node; } } - if (avg_fragment_size > ac->ac_g_ex.fe_len) - node = node->rb_right; - else - node = node->rb_left; + read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); + if (grp) + break; } -done: - if (found) { - grp = rb_entry(found, struct ext4_group_info, - bb_avg_fragment_size_rb); + if (grp) { *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; } else { *new_cr = 2; } - - read_unlock(&sbi->s_mb_rb_lock); - ac->ac_last_optimal_group = *group; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) @@ -1017,11 +975,6 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) goto inc_and_return; } - if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { - ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; - goto inc_and_return; - } - return group; inc_and_return: /* @@ -1049,8 +1002,10 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, { *new_cr = ac->ac_criteria; - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) + if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { + *group = next_linear_group(ac, *group, ngroups); return; + } if (*new_cr == 0) { ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); @@ -1075,23 +1030,25 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) struct ext4_sb_info *sbi = EXT4_SB(sb); int i; - if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) + if (grp->bb_counters[i] > 0) + break; + /* No need to move between order lists? */ + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || + i == grp->bb_largest_free_order) { + grp->bb_largest_free_order = i; + return; + } + + if (grp->bb_largest_free_order >= 0) { write_lock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); list_del_init(&grp->bb_largest_free_order_node); write_unlock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); } - grp->bb_largest_free_order = -1; /* uninit */ - - for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { - if (grp->bb_counters[i] > 0) { - grp->bb_largest_free_order = i; - break; - } - } - if (test_opt2(sb, MB_OPTIMIZE_SCAN) && - grp->bb_largest_free_order >= 0 && grp->bb_free) { + grp->bb_largest_free_order = i; + if (grp->bb_largest_free_order >= 0 && grp->bb_free) { write_lock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); list_add_tail(&grp->bb_largest_free_order_node, @@ -1148,13 +1105,13 @@ void ext4_mb_generate_buddy(struct super_block *sb, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); + mb_update_avg_fragment_size(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; atomic_inc(&sbi->s_mb_buddies_generated); atomic64_add(period, &sbi->s_mb_generation_time); - mb_update_avg_fragment_size(sb, grp); } /* The buddy information is attached the buddy cache inode @@ -2636,7 +2593,7 @@ static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t prefetch_grp = 0, ngroups, group, i; - int cr = -1; + int cr = -1, new_cr; int err = 0, first_err = 0; unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; @@ -2707,17 +2664,14 @@ repeat: * from the goal value specified */ group = ac->ac_g_ex.fe_group; - ac->ac_last_optimal_group = group; ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; - for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), - i++) { - int ret = 0, new_cr; + for (i = 0, new_cr = cr; i < ngroups; i++, + ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { + int ret = 0; cond_resched(); - - ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); if (new_cr != cr) { cr = new_cr; goto repeat; @@ -2991,9 +2945,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock) struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; - read_lock(&EXT4_SB(sb)->s_mb_rb_lock); - - if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); @@ -3005,7 +2957,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof unsigned long position; ++*pos; - if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); @@ -3017,29 +2969,22 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; - struct rb_node *n; - unsigned int count, min, max; + unsigned int count; position--; if (position >= MB_NUM_ORDERS(sb)) { - seq_puts(seq, "fragment_size_tree:\n"); - n = rb_first(&sbi->s_mb_avg_fragment_size_root); - if (!n) { - seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); - return 0; - } - grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); - min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; - count = 1; - while (rb_next(n)) { - count++; - n = rb_next(n); - } - grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); - max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + position -= MB_NUM_ORDERS(sb); + if (position == 0) + seq_puts(seq, "avg_fragment_size_lists:\n"); - seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", - min, max, count); + count = 0; + read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); + list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], + bb_avg_fragment_size_node) + count++; + read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); return 0; } @@ -3049,9 +2994,11 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) seq_puts(seq, "max_free_order_lists:\n"); } count = 0; + read_lock(&sbi->s_mb_largest_free_orders_locks[position]); list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], bb_largest_free_order_node) count++; + read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); @@ -3059,11 +3006,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) -__releases(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = pde_data(file_inode(seq->file)); - - read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); } const struct seq_operations ext4_mb_seq_structs_summary_ops = { @@ -3176,8 +3119,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); - RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); + INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); @@ -3426,7 +3370,24 @@ int ext4_mb_init(struct super_block *sb) i++; } while (i < MB_NUM_ORDERS(sb)); - sbi->s_mb_avg_fragment_size_root = RB_ROOT; + sbi->s_mb_avg_fragment_size = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); + if (!sbi->s_mb_avg_fragment_size) { + ret = -ENOMEM; + goto out; + } + sbi->s_mb_avg_fragment_size_locks = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), + GFP_KERNEL); + if (!sbi->s_mb_avg_fragment_size_locks) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) { + INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); + rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); + } sbi->s_mb_largest_free_orders = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), GFP_KERNEL); @@ -3445,7 +3406,6 @@ int ext4_mb_init(struct super_block *sb) INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); } - rwlock_init(&sbi->s_mb_rb_lock); spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; @@ -3516,6 +3476,8 @@ out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out: + kfree(sbi->s_mb_avg_fragment_size); + kfree(sbi->s_mb_avg_fragment_size_locks); kfree(sbi->s_mb_largest_free_orders); kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); @@ -3582,6 +3544,8 @@ int ext4_mb_release(struct super_block *sb) kvfree(group_info); rcu_read_unlock(); } + kfree(sbi->s_mb_avg_fragment_size); + kfree(sbi->s_mb_avg_fragment_size_locks); kfree(sbi->s_mb_largest_free_orders); kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); @@ -5193,6 +5157,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits = ac->ac_sb->s_blocksize_bits; loff_t size, isize; + bool inode_pa_eligible, group_pa_eligible; if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; @@ -5200,25 +5165,27 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; + group_pa_eligible = sbi->s_mb_group_prealloc > 0; + inode_pa_eligible = true; size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; + /* No point in using inode preallocation for closed files */ if ((size == isize) && !ext4_fs_is_busy(sbi) && - !inode_is_open_for_write(ac->ac_inode)) { - ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; - return; - } + !inode_is_open_for_write(ac->ac_inode)) + inode_pa_eligible = false; - if (sbi->s_mb_group_prealloc <= 0) { - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; - return; - } - - /* don't use group allocation for large files */ size = max(size, isize); - if (size > sbi->s_mb_stream_request) { - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + /* Don't use group allocation for large files */ + if (size > sbi->s_mb_stream_request) + group_pa_eligible = false; + + if (!group_pa_eligible) { + if (inode_pa_eligible) + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + else + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } @@ -5565,6 +5532,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + int retries = 0; u64 seq; might_sleep(); @@ -5667,7 +5635,8 @@ repeat: ar->len = ac->ac_b_ex.fe_len; } } else { - if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) + if (++retries < 3 && + ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; /* * If block allocation fails then the pa allocated above diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 39da92ceabf8..dcda2a943cee 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -178,7 +178,6 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; - ext4_group_t ac_last_optimal_group; __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 54e7d3c95fd7..a19a9661646e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -56,8 +56,7 @@ static int finish_range(handle_t *handle, struct inode *inode, retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); lb->first_pblock = 0; return retval; } @@ -425,7 +424,8 @@ int ext4_ext_migrate(struct inode *inode) * already is extent-based, error out. */ if (!ext4_has_feature_extents(inode->i_sb) || - (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + ext4_has_inline_data(inode)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 9af68a7ecdcf..588cb09c5291 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -265,7 +265,7 @@ static unsigned int mmp_new_seq(void) u32 new_seq; do { - new_seq = prandom_u32(); + new_seq = get_random_u32(); } while (new_seq > EXT4_MMP_SEQ_MAX); return new_seq; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 701f1d6a217f..044e34cd835c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,8 +32,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, if (IS_ERR(path)) return PTR_ERR(path); if (path[ext_depth(inode)].p_ext == NULL) { - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); *ppath = NULL; return -ENODATA; } @@ -103,12 +102,10 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, if (unwritten != ext4_ext_is_unwritten(ext)) goto out; from += ext4_ext_get_actual_len(ext); - ext4_ext_drop_refs(path); } ret = 1; out: - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return ret; } @@ -472,19 +469,17 @@ mext_check_arguments(struct inode *orig_inode, if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) return -EPERM; - /* Ext4 move extent does not support swapfile */ + /* Ext4 move extent does not support swap files */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should " - "not be swapfile [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); - return -EBUSY; + return -ETXTBSY; } if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should " - "not be quota files [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); - return -EBUSY; + return -EOPNOTSUPP; } /* Ext4 move extent supports only extent based file */ @@ -631,11 +626,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, if (ret) goto out; ex = path[path->p_depth].p_ext; - next_blk = ext4_ext_next_allocated_block(path); cur_blk = le32_to_cpu(ex->ee_block); cur_len = ext4_ext_get_actual_len(ex); /* Check hole before the start pos */ if (cur_blk + cur_len - 1 < o_start) { + next_blk = ext4_ext_next_allocated_block(path); if (next_blk == EXT_MAX_BLOCKS) { ret = -ENODATA; goto out; @@ -663,7 +658,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, donor_page_index = d_start >> (PAGE_SHIFT - donor_inode->i_blkbits); offset_in_page = o_start % blocks_per_page; - if (cur_len > blocks_per_page- offset_in_page) + if (cur_len > blocks_per_page - offset_in_page) cur_len = blocks_per_page - offset_in_page; /* * Up semaphore to avoid following problems: @@ -694,8 +689,7 @@ out: ext4_discard_preallocations(donor_inode, 0); } - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); unlock_two_nondirectories(orig_inode, donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3a31b662f661..c08c0aba1883 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -85,15 +85,20 @@ static struct buffer_head *ext4_append(handle_t *handle, return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_mark_inode_dirty(handle, inode); + if (err) + goto out; BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); - if (err) { - brelse(bh); - ext4_std_error(inode->i_sb, err); - return ERR_PTR(err); - } + if (err) + goto out; return bh; + +out: + brelse(bh); + ext4_std_error(inode->i_sb, err); + return ERR_PTR(err); } static int ext4_dx_csum_verify(struct inode *inode, @@ -126,7 +131,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, struct ext4_dir_entry *dirent; int is_dx_block = 0; - if (block >= inode->i_size) { + if (block >= inode->i_size >> inode->i_blkbits) { ext4_error_inode(inode, func, line, block, "Attempting to read directory block (%u) that is past i_size (%llu)", block, inode->i_size); @@ -2254,8 +2259,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, memset(de, 0, len); /* wipe old data */ de = (struct ext4_dir_entry_2 *) data2; top = data2 + len; - while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) + while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { + if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, + (data2 + (blocksize - csum_size) - + (char *) de))) { + brelse(bh2); + brelse(bh); + return -EFSCORRUPTED; + } de = de2; + } de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - (char *) de, blocksize); @@ -2849,7 +2862,7 @@ retry: } static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { handle_t *handle; struct inode *inode; @@ -2871,7 +2884,7 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); err = ext4_orphan_add(handle, inode); if (err) goto err_unlock_inode; @@ -2882,7 +2895,7 @@ retry: ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; - return err; + return finish_open_simple(file, err); err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index e02a5f14e021..3d21eae267fc 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -75,7 +75,7 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; - /* PG_error was set if any post_read step failed */ + /* PG_error was set if verity failed. */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -96,10 +96,12 @@ static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - fscrypt_decrypt_bio(ctx->bio); - - bio_post_read_processing(ctx); + if (fscrypt_decrypt_bio(bio)) + bio_post_read_processing(ctx); + else + __read_end_io(bio); } static void verity_work(struct work_struct *work) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index fea2a68d067b..46b87ffeb304 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1158,6 +1158,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, while (group < sbi->s_groups_count) { struct buffer_head *bh; ext4_fsblk_t backup_block; + struct ext4_super_block *es; /* Out of journal space, and can't get more - abort - so sad */ err = ext4_resize_ensure_credits_batch(handle, 1); @@ -1186,6 +1187,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); + es = (struct ext4_super_block *) bh->b_data; + es->s_block_group_nr = cpu_to_le16(group); + if (ext4_has_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(sb, es); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -2122,7 +2127,7 @@ retry: goto out; } - if (ext4_blocks_count(es) == n_blocks_count) + if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0) goto out; err = ext4_alloc_flex_bg_array(sb, n_group + 1); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9a66abcca1a8..7cdd2138c897 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -205,19 +205,12 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { - if (trylock_buffer(bh)) { - if (wait) - return ext4_read_bh(bh, op_flags, NULL); + lock_buffer(bh); + if (!wait) { ext4_read_bh_nowait(bh, op_flags, NULL); return 0; } - if (wait) { - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return 0; - return -EIO; - } - return 0; + return ext4_read_bh(bh, op_flags, NULL); } /* @@ -264,7 +257,8 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) struct buffer_head *bh = sb_getblk_gfp(sb, block, 0); if (likely(bh)) { - ext4_read_bh_lock(bh, REQ_RAHEAD, false); + if (trylock_buffer(bh)) + ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); brelse(bh); } } @@ -1576,7 +1570,7 @@ enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, - Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_user_xattr, Opt_acl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, @@ -1585,7 +1579,7 @@ enum { Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, @@ -1662,9 +1656,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("oldalloc", Opt_removed), fsparam_flag ("orlov", Opt_removed), fsparam_flag ("user_xattr", Opt_user_xattr), - fsparam_flag ("nouser_xattr", Opt_nouser_xattr), fsparam_flag ("acl", Opt_acl), - fsparam_flag ("noacl", Opt_noacl), fsparam_flag ("norecovery", Opt_noload), fsparam_flag ("noload", Opt_noload), fsparam_flag ("bh", Opt_removed), @@ -1694,7 +1686,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("barrier", Opt_barrier), fsparam_u32 ("barrier", Opt_barrier), fsparam_flag ("nobarrier", Opt_nobarrier), - fsparam_flag ("i_version", Opt_i_version), + fsparam_flag ("i_version", Opt_removed), fsparam_flag ("dax", Opt_dax), fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), fsparam_u32 ("stripe", Opt_stripe), @@ -1749,10 +1741,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -static const char deprecated_msg[] = - "Mount option \"%s\" will be removed by %s\n" - "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; - #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 @@ -1814,13 +1802,10 @@ static const struct mount_opts { {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, - {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, - {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, #else {Opt_acl, 0, MOPT_NOSUPPORT}, - {Opt_noacl, 0, MOPT_NOSUPPORT}, #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, @@ -2120,10 +2105,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) else return note_qf_name(fc, GRPQUOTA, param); #endif - case Opt_noacl: - case Opt_nouser_xattr: - ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "3.5"); - break; case Opt_sb: if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { ext4_msg(NULL, KERN_WARNING, @@ -2140,11 +2121,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_abort: ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED); return 0; - case Opt_i_version: - ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20"); - ext4_msg(NULL, KERN_WARNING, "Use iversion instead\n"); - ctx_set_flags(ctx, SB_I_VERSION); - return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); @@ -2814,14 +2790,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; - /* - * i_version differs from common mount option iversion so we have - * to let vfs know that it was set, otherwise it would get cleared - * on remount - */ - if (ctx->mask_s_flags & SB_I_VERSION) - fc->sb_flags |= SB_I_VERSION; - #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) APPLY(s_commit_interval); APPLY(s_stripe); @@ -2970,8 +2938,6 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); - if (sb->s_flags & SB_I_VERSION) - SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & @@ -3767,6 +3733,7 @@ static int ext4_lazyinit_thread(void *arg) unsigned long next_wakeup, cur; BUG_ON(NULL == eli); + set_freezable(); cont_thread: while (true) { @@ -3811,8 +3778,7 @@ cont_thread: } if (!progress) { elr->lr_next_sched = jiffies + - (prandom_u32() - % (EXT4_DEF_LI_MAX_START_DELAY * HZ)); + prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; @@ -3959,8 +3925,8 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, * spread the inode table initialization requests * better. */ - elr->lr_next_sched = jiffies + (prandom_u32() % - (EXT4_DEF_LI_MAX_START_DELAY * HZ)); + elr->lr_next_sched = jiffies + prandom_u32_max( + EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } @@ -3982,9 +3948,9 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && - (first_not_zeroed == ngroups || sb_rdonly(sb) || - !test_opt(sb, INIT_INODE_TABLE))) + if (sb_rdonly(sb) || + (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && + (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); @@ -4311,112 +4277,10 @@ err_out: return NULL; } -static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) +static void ext4_set_def_opts(struct super_block *sb, + struct ext4_super_block *es) { - struct buffer_head *bh, **group_desc; - struct ext4_super_block *es = NULL; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct flex_groups **flex_groups; - ext4_fsblk_t block; - ext4_fsblk_t logical_sb_block; - unsigned long offset = 0; unsigned long def_mount_opts; - struct inode *root; - int ret = -ENOMEM; - int blocksize, clustersize; - unsigned int db_count; - unsigned int i; - int needs_recovery, has_huge_files; - __u64 blocks_count; - int err = 0; - ext4_group_t first_not_zeroed; - struct ext4_fs_context *ctx = fc->fs_private; - int silent = fc->sb_flags & SB_SILENT; - - /* Set defaults for the variables that will be set during parsing */ - if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - - sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; - sbi->s_sectors_written_start = - part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); - - /* -EINVAL is default */ - ret = -EINVAL; - blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); - if (!blocksize) { - ext4_msg(sb, KERN_ERR, "unable to set blocksize"); - goto out_fail; - } - - /* - * The ext4 superblock will not be buffer aligned for other than 1kB - * block sizes. We need to calculate the offset from buffer start. - */ - if (blocksize != EXT4_MIN_BLOCK_SIZE) { - logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; - offset = do_div(logical_sb_block, blocksize); - } else { - logical_sb_block = sbi->s_sb_block; - } - - bh = ext4_sb_bread_unmovable(sb, logical_sb_block); - if (IS_ERR(bh)) { - ext4_msg(sb, KERN_ERR, "unable to read superblock"); - ret = PTR_ERR(bh); - goto out_fail; - } - /* - * Note: s_es must be initialized as soon as possible because - * some ext4 macro-instructions depend on its value - */ - es = (struct ext4_super_block *) (bh->b_data + offset); - sbi->s_es = es; - sb->s_magic = le16_to_cpu(es->s_magic); - if (sb->s_magic != EXT4_SUPER_MAGIC) - goto cantfind_ext4; - sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); - - /* Warn if metadata_csum and gdt_csum are both set. */ - if (ext4_has_feature_metadata_csum(sb) && - ext4_has_feature_gdt_csum(sb)) - ext4_warning(sb, "metadata_csum and uninit_bg are " - "redundant flags; please run fsck."); - - /* Check for a known checksum algorithm */ - if (!ext4_verify_csum_type(sb, es)) { - ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " - "unknown checksum algorithm."); - silent = 1; - goto cantfind_ext4; - } - ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, - ext4_orphan_file_block_trigger); - - /* Load the checksum driver */ - sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(sbi->s_chksum_driver)) { - ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); - ret = PTR_ERR(sbi->s_chksum_driver); - sbi->s_chksum_driver = NULL; - goto failed_mount; - } - - /* Check superblock checksum */ - if (!ext4_superblock_csum_verify(sb, es)) { - ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " - "invalid superblock checksum. Run e2fsck?"); - silent = 1; - ret = -EFSBADCRC; - goto cantfind_ext4; - } - - /* Precompute checksum seed for all metadata */ - if (ext4_has_feature_csum_seed(sb)) - sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) - sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, - sizeof(es->s_uuid)); /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); @@ -4445,9 +4309,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); - if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) + if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sb, ERRORS_PANIC); - else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) + else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); @@ -4456,12 +4320,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); - sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); - sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); - sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; - sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; - sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); @@ -4473,31 +4331,96 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); - /* - * set default s_li_wait_mult for lazyinit, for the case there is - * no mount option specified. - */ - sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + if (sb->s_blocksize == PAGE_SIZE) + set_opt(sb, DIOREAD_NOLOCK); +} - if (le32_to_cpu(es->s_log_block_size) > - (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log block size: %u", - le32_to_cpu(es->s_log_block_size)); - goto failed_mount; - } - if (le32_to_cpu(es->s_log_cluster_size) > - (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log cluster size: %u", - le32_to_cpu(es->s_log_cluster_size)); - goto failed_mount; +static int ext4_handle_clustersize(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int clustersize; + + /* Handle clustersize */ + clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); + if (ext4_has_feature_bigalloc(sb)) { + if (clustersize < sb->s_blocksize) { + ext4_msg(sb, KERN_ERR, + "cluster size (%d) smaller than " + "block size (%lu)", clustersize, sb->s_blocksize); + return -EINVAL; + } + sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - + le32_to_cpu(es->s_log_block_size); + sbi->s_clusters_per_group = + le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + return -EINVAL; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { + ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " + "clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, + sbi->s_clusters_per_group); + return -EINVAL; + } + } else { + if (clustersize != sb->s_blocksize) { + ext4_msg(sb, KERN_ERR, + "fragment/cluster size (%d) != " + "block size (%lu)", clustersize, sb->s_blocksize); + return -EINVAL; + } + if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + return -EINVAL; + } + sbi->s_clusters_per_group = sbi->s_blocks_per_group; + sbi->s_cluster_bits = 0; } + sbi->s_cluster_ratio = clustersize / sb->s_blocksize; - blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + /* Do we have standard group size of clustersize * 8 blocks ? */ + if (sbi->s_blocks_per_group == clustersize << 3) + set_opt2(sb, STD_GROUP_SIZE); - if (blocksize == PAGE_SIZE) - set_opt(sb, DIOREAD_NOLOCK); + return 0; +} + +static void ext4_fast_commit_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* Initialize fast commit stuff */ + atomic_set(&sbi->s_fc_subtid, 0); + INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); + INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); + INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); + INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); + sbi->s_fc_bytes = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + sbi->s_fc_ineligible_tid = 0; + spin_lock_init(&sbi->s_fc_lock); + memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); + sbi->s_fc_replay_state.fc_regions = NULL; + sbi->s_fc_replay_state.fc_regions_size = 0; + sbi->s_fc_replay_state.fc_regions_used = 0; + sbi->s_fc_replay_state.fc_regions_valid = 0; + sbi->s_fc_replay_state.fc_modified_inodes = NULL; + sbi->s_fc_replay_state.fc_modified_inodes_size = 0; + sbi->s_fc_replay_state.fc_modified_inodes_used = 0; +} + +static int ext4_inode_info_init(struct super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; @@ -4508,16 +4431,16 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { ext4_msg(sb, KERN_ERR, "invalid first ino: %u", sbi->s_first_ino); - goto failed_mount; + return -EINVAL; } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || - (sbi->s_inode_size > blocksize)) { + (sbi->s_inode_size > sb->s_blocksize)) { ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); - ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize); - goto failed_mount; + ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); + return -EINVAL; } /* * i_atime_extra is the last extra field available for @@ -4535,6 +4458,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) } sb->s_time_min = EXT4_TIMESTAMP_MIN; } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; @@ -4546,7 +4470,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_want_extra_isize: %d", v); - goto failed_mount; + return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; @@ -4555,91 +4479,112 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_min_extra_isize: %d", v); - goto failed_mount; + return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; } } - err = parse_apply_sb_mount_options(sb, ctx); - if (err < 0) - goto failed_mount; + return 0; +} - sbi->s_def_mount_opt = sbi->s_mount_opt; +#if IS_ENABLED(CONFIG_UNICODE) +static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) +{ + const struct ext4_sb_encodings *encoding_info; + struct unicode_map *encoding; + __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); - err = ext4_check_opt_consistency(fc, sb); - if (err < 0) - goto failed_mount; + if (!ext4_has_feature_casefold(sb) || sb->s_encoding) + return 0; - ext4_apply_options(fc, sb); + encoding_info = ext4_sb_read_encoding(es); + if (!encoding_info) { + ext4_msg(sb, KERN_ERR, + "Encoding requested by superblock is unknown"); + return -EINVAL; + } -#if IS_ENABLED(CONFIG_UNICODE) - if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { - const struct ext4_sb_encodings *encoding_info; - struct unicode_map *encoding; - __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); + encoding = utf8_load(encoding_info->version); + if (IS_ERR(encoding)) { + ext4_msg(sb, KERN_ERR, + "can't mount with superblock charset: %s-%u.%u.%u " + "not supported by the kernel. flags: 0x%x.", + encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); + return -EINVAL; + } + ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " + "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); - encoding_info = ext4_sb_read_encoding(es); - if (!encoding_info) { - ext4_msg(sb, KERN_ERR, - "Encoding requested by superblock is unknown"); - goto failed_mount; - } + sb->s_encoding = encoding; + sb->s_encoding_flags = encoding_flags; - encoding = utf8_load(encoding_info->version); - if (IS_ERR(encoding)) { - ext4_msg(sb, KERN_ERR, - "can't mount with superblock charset: %s-%u.%u.%u " - "not supported by the kernel. flags: 0x%x.", - encoding_info->name, - unicode_major(encoding_info->version), - unicode_minor(encoding_info->version), - unicode_rev(encoding_info->version), - encoding_flags); - goto failed_mount; - } - ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " - "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, - unicode_major(encoding_info->version), - unicode_minor(encoding_info->version), - unicode_rev(encoding_info->version), - encoding_flags); + return 0; +} +#else +static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) +{ + return 0; +} +#endif + +static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* Warn if metadata_csum and gdt_csum are both set. */ + if (ext4_has_feature_metadata_csum(sb) && + ext4_has_feature_gdt_csum(sb)) + ext4_warning(sb, "metadata_csum and uninit_bg are " + "redundant flags; please run fsck."); - sb->s_encoding = encoding; - sb->s_encoding_flags = encoding_flags; + /* Check for a known checksum algorithm */ + if (!ext4_verify_csum_type(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "unknown checksum algorithm."); + return -EINVAL; } -#endif + ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, + ext4_orphan_file_block_trigger); - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n"); - /* can't mount with both data=journal and dioread_nolock. */ - clear_opt(sb, DIOREAD_NOLOCK); - clear_opt2(sb, JOURNAL_FAST_COMMIT); - if (test_opt2(sb, EXPLICIT_DELALLOC)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and delalloc"); - goto failed_mount; - } - if (test_opt(sb, DAX_ALWAYS)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dax"); - goto failed_mount; - } - if (ext4_has_feature_encrypt(sb)) { - ext4_msg(sb, KERN_WARNING, - "encrypted files will use data=ordered " - "instead of data journaling mode"); - } - if (test_opt(sb, DELALLOC)) - clear_opt(sb, DELALLOC); - } else { - sb->s_iflags |= SB_I_CGROUPWB; + /* Load the checksum driver */ + sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + int ret = PTR_ERR(sbi->s_chksum_driver); + ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); + sbi->s_chksum_driver = NULL; + return ret; } - sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); + /* Check superblock checksum */ + if (!ext4_superblock_csum_verify(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "invalid superblock checksum. Run e2fsck?"); + return -EFSBADCRC; + } + + /* Precompute checksum seed for all metadata */ + if (ext4_has_feature_csum_seed(sb)) + sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); + else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) + sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sizeof(es->s_uuid)); + return 0; +} +static int ext4_check_feature_compatibility(struct super_block *sb, + struct ext4_super_block *es, + int silent) +{ if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || @@ -4653,7 +4598,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); - goto failed_mount; + return -EINVAL; } /* @@ -4663,7 +4608,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_has_feature_ea_inode(sb)) { ext4_msg(sb, KERN_ERR, "ea_inode feature is not supported for Hurd"); - goto failed_mount; + return -EINVAL; } } @@ -4677,10 +4622,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * it's actually an ext[34] filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) - goto failed_mount; + return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); - goto failed_mount; + return -EINVAL; } } @@ -4694,10 +4639,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * it's actually an ext4 filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) - goto failed_mount; + return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); - goto failed_mount; + return -EINVAL; } } @@ -4707,9 +4652,463 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * so there is a chance incompat flags are set on a rev 0 filesystem. */ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) + return -EINVAL; + + return 0; +} + +static int ext4_geometry_check(struct super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + __u64 blocks_count; + + /* check blocks count against device size */ + blocks_count = sb_bdev_nr_blocks(sb); + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", + ext4_blocks_count(es), blocks_count); + return -EINVAL; + } + + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. + */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block %u is beyond end of filesystem (%llu)", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); + return -EINVAL; + } + if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && + (sbi->s_cluster_ratio == 1)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block is 0 with a 1k block and cluster size"); + return -EINVAL; + } + + blocks_count = (ext4_blocks_count(es) - + le32_to_cpu(es->s_first_data_block) + + EXT4_BLOCKS_PER_GROUP(sb) - 1); + do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " + "(block count %llu, first data block %u, " + "blocks per group %lu)", blocks_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + return -EINVAL; + } + sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != + le32_to_cpu(es->s_inodes_count)) { + ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", + le32_to_cpu(es->s_inodes_count), + ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); + return -EINVAL; + } + + return 0; +} + +static void ext4_group_desc_free(struct ext4_sb_info *sbi) +{ + struct buffer_head **group_desc; + int i; + + rcu_read_lock(); + group_desc = rcu_dereference(sbi->s_group_desc); + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(group_desc[i]); + kvfree(group_desc); + rcu_read_unlock(); +} + +static int ext4_group_desc_init(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t logical_sb_block, + ext4_group_t *first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned int db_count; + ext4_fsblk_t block; + int ret; + int i; + + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + if (ext4_has_feature_meta_bg(sb)) { + if (le32_to_cpu(es->s_first_meta_bg) > db_count) { + ext4_msg(sb, KERN_WARNING, + "first meta block group too large: %u " + "(group descriptor block count %u)", + le32_to_cpu(es->s_first_meta_bg), db_count); + return -EINVAL; + } + } + rcu_assign_pointer(sbi->s_group_desc, + kvmalloc_array(db_count, + sizeof(struct buffer_head *), + GFP_KERNEL)); + if (sbi->s_group_desc == NULL) { + ext4_msg(sb, KERN_ERR, "not enough memory"); + return -ENOMEM; + } + + bgl_lock_init(sbi->s_blockgroup_lock); + + /* Pre-read the descriptors into the buffer cache */ + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + ext4_sb_breadahead_unmovable(sb, block); + } + + for (i = 0; i < db_count; i++) { + struct buffer_head *bh; + + block = descriptor_loc(sb, logical_sb_block, i); + bh = ext4_sb_bread_unmovable(sb, block); + if (IS_ERR(bh)) { + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); + sbi->s_gdb_count = i; + ret = PTR_ERR(bh); + goto out; + } + rcu_read_lock(); + rcu_dereference(sbi->s_group_desc)[i] = bh; + rcu_read_unlock(); + } + sbi->s_gdb_count = db_count; + if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); + ret = -EFSCORRUPTED; + goto out; + } + return 0; +out: + ext4_group_desc_free(sbi); + return ret; +} + +static int ext4_load_and_init_journal(struct super_block *sb, + struct ext4_super_block *es, + struct ext4_fs_context *ctx) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + err = ext4_load_journal(sb, es, ctx->journal_devnum); + if (err) + return err; + + if (ext4_has_feature_64bit(sb) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); + goto out; + } + + if (!set_journal_csum_feature_set(sb)) { + ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " + "feature set"); + goto out; + } + + if (test_opt2(sb, JOURNAL_FAST_COMMIT) && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { + ext4_msg(sb, KERN_ERR, + "Failed to set fast commit journal feature"); + goto out; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + * capabilities: ORDERED_DATA if the journal can + * cope, else JOURNAL_DATA + */ + if (jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + set_opt(sb, ORDERED_DATA); + sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; + } else { + set_opt(sb, JOURNAL_DATA); + sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; + } + break; + + case EXT4_MOUNT_ORDERED_DATA: + case EXT4_MOUNT_WRITEBACK_DATA: + if (!jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + ext4_msg(sb, KERN_ERR, "Journal does not support " + "requested data journaling mode"); + goto out; + } + break; + default: + break; + } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + goto out; + } + + set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); + + sbi->s_journal->j_submit_inode_data_buffers = + ext4_journal_submit_inode_data_buffers; + sbi->s_journal->j_finish_inode_data_buffers = + ext4_journal_finish_inode_data_buffers; + + return 0; + +out: + /* flush s_error_work before journal destroy. */ + flush_work(&sbi->s_error_work); + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + return -EINVAL; +} + +static int ext4_journal_data_mode_check(struct super_block *sb) +{ + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " + "data=journal disables delayed allocation, " + "dioread_nolock, O_DIRECT and fast_commit support!\n"); + /* can't mount with both data=journal and dioread_nolock. */ + clear_opt(sb, DIOREAD_NOLOCK); + clear_opt2(sb, JOURNAL_FAST_COMMIT); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + return -EINVAL; + } + if (test_opt(sb, DAX_ALWAYS)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + return -EINVAL; + } + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_WARNING, + "encrypted files will use data=ordered " + "instead of data journaling mode"); + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } else { + sb->s_iflags |= SB_I_CGROUPWB; + } + + return 0; +} + +static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, + int silent) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es; + ext4_fsblk_t logical_sb_block; + unsigned long offset = 0; + struct buffer_head *bh; + int ret = -EINVAL; + int blocksize; + + blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); + if (!blocksize) { + ext4_msg(sb, KERN_ERR, "unable to set blocksize"); + return -EINVAL; + } + + /* + * The ext4 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. + */ + if (blocksize != EXT4_MIN_BLOCK_SIZE) { + logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + } else { + logical_sb_block = sbi->s_sb_block; + } + + bh = ext4_sb_bread_unmovable(sb, logical_sb_block); + if (IS_ERR(bh)) { + ext4_msg(sb, KERN_ERR, "unable to read superblock"); + return PTR_ERR(bh); + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext4 macro-instructions depend on its value + */ + es = (struct ext4_super_block *) (bh->b_data + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT4_SUPER_MAGIC) { + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto out; + } + + if (le32_to_cpu(es->s_log_block_size) > + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log block size: %u", + le32_to_cpu(es->s_log_block_size)); + goto out; + } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto out; + } + + blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + /* + * If the default block size is not the same as the real block size, + * we need to reload it. + */ + if (sb->s_blocksize == blocksize) { + *lsb = logical_sb_block; + sbi->s_sbh = bh; + return 0; + } + + /* + * bh must be released before kill_bdev(), otherwise + * it won't be freed and its page also. kill_bdev() + * is called by sb_set_blocksize(). + */ + brelse(bh); + /* Validate the filesystem blocksize */ + if (!sb_set_blocksize(sb, blocksize)) { + ext4_msg(sb, KERN_ERR, "bad block size %d", + blocksize); + bh = NULL; + goto out; + } + + logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + bh = ext4_sb_bread_unmovable(sb, logical_sb_block); + if (IS_ERR(bh)) { + ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); + ret = PTR_ERR(bh); + bh = NULL; + goto out; + } + es = (struct ext4_super_block *)(bh->b_data + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { + ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); + goto out; + } + *lsb = logical_sb_block; + sbi->s_sbh = bh; + return 0; +out: + brelse(bh); + return ret; +} + +static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) +{ + struct ext4_super_block *es = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct flex_groups **flex_groups; + ext4_fsblk_t block; + ext4_fsblk_t logical_sb_block; + struct inode *root; + int ret = -ENOMEM; + unsigned int i; + int needs_recovery, has_huge_files; + int err = 0; + ext4_group_t first_not_zeroed; + struct ext4_fs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; + + /* Set defaults for the variables that will be set during parsing */ + if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) + ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); + + /* -EINVAL is default */ + ret = -EINVAL; + err = ext4_load_super(sb, &logical_sb_block, silent); + if (err) + goto out_fail; + + es = sbi->s_es; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + + err = ext4_init_metadata_csum(sb, es); + if (err) + goto failed_mount; + + ext4_set_def_opts(sb, es); + + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + + /* + * set default s_li_wait_mult for lazyinit, for the case there is + * no mount option specified. + */ + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + + if (ext4_inode_info_init(sb, es)) + goto failed_mount; + + err = parse_apply_sb_mount_options(sb, ctx); + if (err < 0) + goto failed_mount; + + sbi->s_def_mount_opt = sbi->s_mount_opt; + + err = ext4_check_opt_consistency(fc, sb); + if (err < 0) goto failed_mount; - if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { + ext4_apply_options(fc, sb); + + if (ext4_encoding_init(sb, es)) + goto failed_mount; + + if (ext4_journal_data_mode_check(sb)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); + + /* i_version is always enabled now */ + sb->s_flags |= SB_I_VERSION; + + if (ext4_check_feature_compatibility(sb, es, silent)) + goto failed_mount; + + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); @@ -4717,7 +5116,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) } if (sbi->s_daxdev) { - if (blocksize == PAGE_SIZE) + if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); else ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); @@ -4742,40 +5141,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount; } - if (sb->s_blocksize != blocksize) { - /* - * bh must be released before kill_bdev(), otherwise - * it won't be freed and its page also. kill_bdev() - * is called by sb_set_blocksize(). - */ - brelse(bh); - /* Validate the filesystem blocksize */ - if (!sb_set_blocksize(sb, blocksize)) { - ext4_msg(sb, KERN_ERR, "bad block size %d", - blocksize); - bh = NULL; - goto failed_mount; - } - - logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; - offset = do_div(logical_sb_block, blocksize); - bh = ext4_sb_bread_unmovable(sb, logical_sb_block); - if (IS_ERR(bh)) { - ext4_msg(sb, KERN_ERR, - "Can't read superblock on 2nd try"); - ret = PTR_ERR(bh); - bh = NULL; - goto failed_mount; - } - es = (struct ext4_super_block *)(bh->b_data + offset); - sbi->s_es = es; - if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { - ext4_msg(sb, KERN_ERR, - "Magic mismatch, very weird!"); - goto failed_mount; - } - } - has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); @@ -4797,19 +5162,21 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0) - goto cantfind_ext4; + sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto failed_mount; + } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || - sbi->s_inodes_per_group > blocksize * 8) { + sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", sbi->s_inodes_per_group); goto failed_mount; } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; - sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); - sbi->s_sbh = bh; + sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); @@ -4835,54 +5202,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) } } - /* Handle clustersize */ - clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); - if (ext4_has_feature_bigalloc(sb)) { - if (clustersize < blocksize) { - ext4_msg(sb, KERN_ERR, - "cluster size (%d) smaller than " - "block size (%d)", clustersize, blocksize); - goto failed_mount; - } - sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - - le32_to_cpu(es->s_log_block_size); - sbi->s_clusters_per_group = - le32_to_cpu(es->s_clusters_per_group); - if (sbi->s_clusters_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#clusters per group too big: %lu", - sbi->s_clusters_per_group); - goto failed_mount; - } - if (sbi->s_blocks_per_group != - (sbi->s_clusters_per_group * (clustersize / blocksize))) { - ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " - "clusters per group (%lu) inconsistent", - sbi->s_blocks_per_group, - sbi->s_clusters_per_group); - goto failed_mount; - } - } else { - if (clustersize != blocksize) { - ext4_msg(sb, KERN_ERR, - "fragment/cluster size (%d) != " - "block size (%d)", clustersize, blocksize); - goto failed_mount; - } - if (sbi->s_blocks_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#blocks per group too big: %lu", - sbi->s_blocks_per_group); - goto failed_mount; - } - sbi->s_clusters_per_group = sbi->s_blocks_per_group; - sbi->s_cluster_bits = 0; - } - sbi->s_cluster_ratio = clustersize / blocksize; - - /* Do we have standard group size of clustersize * 8 blocks ? */ - if (sbi->s_blocks_per_group == clustersize << 3) - set_opt2(sb, STD_GROUP_SIZE); + if (ext4_handle_clustersize(sb)) + goto failed_mount; /* * Test whether we have more sectors than will fit in sector_t, @@ -4896,111 +5217,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount; } - if (EXT4_BLOCKS_PER_GROUP(sb) == 0) - goto cantfind_ext4; - - /* check blocks count against device size */ - blocks_count = sb_bdev_nr_blocks(sb); - if (blocks_count && ext4_blocks_count(es) > blocks_count) { - ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " - "exceeds size of device (%llu blocks)", - ext4_blocks_count(es), blocks_count); + if (ext4_geometry_check(sb, es)) goto failed_mount; - } - /* - * It makes no sense for the first data block to be beyond the end - * of the filesystem. - */ - if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - ext4_msg(sb, KERN_WARNING, "bad geometry: first data " - "block %u is beyond end of filesystem (%llu)", - le32_to_cpu(es->s_first_data_block), - ext4_blocks_count(es)); - goto failed_mount; - } - if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && - (sbi->s_cluster_ratio == 1)) { - ext4_msg(sb, KERN_WARNING, "bad geometry: first data " - "block is 0 with a 1k block and cluster size"); - goto failed_mount; - } - - blocks_count = (ext4_blocks_count(es) - - le32_to_cpu(es->s_first_data_block) + - EXT4_BLOCKS_PER_GROUP(sb) - 1); - do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); - if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { - ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " - "(block count %llu, first data block %u, " - "blocks per group %lu)", blocks_count, - ext4_blocks_count(es), - le32_to_cpu(es->s_first_data_block), - EXT4_BLOCKS_PER_GROUP(sb)); - goto failed_mount; - } - sbi->s_groups_count = blocks_count; - sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, - (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); - if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != - le32_to_cpu(es->s_inodes_count)) { - ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", - le32_to_cpu(es->s_inodes_count), - ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); - ret = -EINVAL; - goto failed_mount; - } - db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / - EXT4_DESC_PER_BLOCK(sb); - if (ext4_has_feature_meta_bg(sb)) { - if (le32_to_cpu(es->s_first_meta_bg) > db_count) { - ext4_msg(sb, KERN_WARNING, - "first meta block group too large: %u " - "(group descriptor block count %u)", - le32_to_cpu(es->s_first_meta_bg), db_count); - goto failed_mount; - } - } - rcu_assign_pointer(sbi->s_group_desc, - kvmalloc_array(db_count, - sizeof(struct buffer_head *), - GFP_KERNEL)); - if (sbi->s_group_desc == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory"); - ret = -ENOMEM; + err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); + if (err) goto failed_mount; - } - - bgl_lock_init(sbi->s_blockgroup_lock); - - /* Pre-read the descriptors into the buffer cache */ - for (i = 0; i < db_count; i++) { - block = descriptor_loc(sb, logical_sb_block, i); - ext4_sb_breadahead_unmovable(sb, block); - } - - for (i = 0; i < db_count; i++) { - struct buffer_head *bh; - - block = descriptor_loc(sb, logical_sb_block, i); - bh = ext4_sb_bread_unmovable(sb, block); - if (IS_ERR(bh)) { - ext4_msg(sb, KERN_ERR, - "can't read group descriptor %d", i); - db_count = i; - ret = PTR_ERR(bh); - goto failed_mount2; - } - rcu_read_lock(); - rcu_dereference(sbi->s_group_desc)[i] = bh; - rcu_read_unlock(); - } - sbi->s_gdb_count = db_count; - if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { - ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - ret = -EFSCORRUPTED; - goto failed_mount2; - } timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); @@ -5038,24 +5260,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); - /* Initialize fast commit stuff */ - atomic_set(&sbi->s_fc_subtid, 0); - INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); - INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); - INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); - INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); - sbi->s_fc_bytes = 0; - ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - sbi->s_fc_ineligible_tid = 0; - spin_lock_init(&sbi->s_fc_lock); - memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); - sbi->s_fc_replay_state.fc_regions = NULL; - sbi->s_fc_replay_state.fc_regions_size = 0; - sbi->s_fc_replay_state.fc_regions_used = 0; - sbi->s_fc_replay_state.fc_regions_valid = 0; - sbi->s_fc_replay_state.fc_modified_inodes = NULL; - sbi->s_fc_replay_state.fc_modified_inodes_size = 0; - sbi->s_fc_replay_state.fc_modified_inodes_used = 0; + ext4_fast_commit_init(sb); sb->s_root = NULL; @@ -5072,37 +5277,37 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - err = ext4_load_journal(sb, es, ctx->journal_devnum); + err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); - goto failed_mount_wq; + goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_checksum, fs mounted w/o journal"); - goto failed_mount_wq; + goto failed_mount3a; } if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit, fs mounted w/o journal"); - goto failed_mount_wq; + goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { ext4_msg(sb, KERN_ERR, "can't mount with " "commit=%lu, fs mounted w/o journal", sbi->s_commit_interval / HZ); - goto failed_mount_wq; + goto failed_mount3a; } if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { ext4_msg(sb, KERN_ERR, "can't mount with " "data=, fs mounted w/o journal"); - goto failed_mount_wq; + goto failed_mount3a; } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); @@ -5110,76 +5315,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; - goto no_journal; - } - - if (ext4_has_feature_64bit(sb) && - !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, - JBD2_FEATURE_INCOMPAT_64BIT)) { - ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); - goto failed_mount_wq; - } - - if (!set_journal_csum_feature_set(sb)) { - ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " - "feature set"); - goto failed_mount_wq; - } - - if (test_opt2(sb, JOURNAL_FAST_COMMIT) && - !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, - JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { - ext4_msg(sb, KERN_ERR, - "Failed to set fast commit journal feature"); - goto failed_mount_wq; - } - - /* We have now updated the journal if required, so we can - * validate the data journaling mode. */ - switch (test_opt(sb, DATA_FLAGS)) { - case 0: - /* No mode set, assume a default based on the journal - * capabilities: ORDERED_DATA if the journal can - * cope, else JOURNAL_DATA - */ - if (jbd2_journal_check_available_features - (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { - set_opt(sb, ORDERED_DATA); - sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; - } else { - set_opt(sb, JOURNAL_DATA); - sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; - } - break; - - case EXT4_MOUNT_ORDERED_DATA: - case EXT4_MOUNT_WRITEBACK_DATA: - if (!jbd2_journal_check_available_features - (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { - ext4_msg(sb, KERN_ERR, "Journal does not support " - "requested data journaling mode"); - goto failed_mount_wq; - } - break; - default: - break; - } - - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && - test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit in data=ordered mode"); - goto failed_mount_wq; } - set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); - - sbi->s_journal->j_submit_inode_data_buffers = - ext4_journal_submit_inode_data_buffers; - sbi->s_journal->j_finish_inode_data_buffers = - ext4_journal_finish_inode_data_buffers; - -no_journal: if (!test_opt(sb, NO_MBCACHE)) { sbi->s_ea_block_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_block_cache) { @@ -5198,7 +5335,7 @@ no_journal: } } - if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { + if (ext4_has_feature_verity(sb) && sb->s_blocksize != PAGE_SIZE) { ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); goto failed_mount_wq; } @@ -5408,11 +5545,6 @@ no_journal: return 0; -cantfind_ext4: - if (!silent) - ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); - goto failed_mount; - failed_mount9: ext4_release_orphan_info(sb); failed_mount8: @@ -5466,13 +5598,7 @@ failed_mount3: flush_work(&sbi->s_error_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); -failed_mount2: - rcu_read_lock(); - group_desc = rcu_dereference(sbi->s_group_desc); - for (i = 0; i < db_count; i++) - brelse(group_desc[i]); - kvfree(group_desc); - rcu_read_unlock(); + ext4_group_desc_free(sbi); failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); @@ -5487,7 +5613,7 @@ failed_mount: #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */ - brelse(bh); + brelse(sbi->s_sbh); ext4_blkdev_remove(sbi); out_fail: sb->s_fs_info = NULL; @@ -6653,7 +6779,7 @@ static int ext4_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2); + handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index b051d19b5c8a..3c640bd7ecae 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -298,16 +298,14 @@ static int ext4_get_verity_descriptor_location(struct inode *inode, last_extent = path[path->p_depth].p_ext; if (!last_extent) { EXT4_ERROR_INODE(inode, "verity file has no extents"); - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); return -EFSCORRUPTED; } end_lblk = le32_to_cpu(last_extent->ee_block) + ext4_ext_get_actual_len(last_extent); desc_size_pos = (u64)end_lblk << inode->i_blkbits; - ext4_ext_drop_refs(path); - kfree(path); + ext4_free_ext_path(path); if (desc_size_pos < sizeof(desc_size_disk)) goto bad; @@ -365,13 +363,14 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 533216e80fa2..36d6ba7190b6 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2412,6 +2412,7 @@ retry_inode: if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); if (!value) no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index eaa240b21f07..5bbc44a5216e 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -219,7 +219,7 @@ static int f2fs_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8259e0fa97e1..0c82dae082aa 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,12 +26,16 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); - if (!end_io) + if (!end_io) { f2fs_flush_merged_writes(sbi); + + f2fs_handle_stop(sbi, reason); + } } /* @@ -89,7 +93,7 @@ repeat: return ERR_PTR(err); } - f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -122,7 +126,7 @@ retry: if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } return page; } @@ -140,7 +144,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int segno, offset; bool exist; - if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ) + if (type == DATA_GENERIC) return true; segno = GET_SEGNO(sbi, blkaddr); @@ -148,6 +152,13 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, se = get_seg_entry(sbi, segno); exist = f2fs_test_bit(offset, se->cur_valid_map); + if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { + f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", + blkaddr, exist); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return exist; + } + if (!exist && type == DATA_GENERIC_ENHANCE) { f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); @@ -185,6 +196,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case DATA_GENERIC: case DATA_GENERIC_ENHANCE: case DATA_GENERIC_ENHANCE_READ: + case DATA_GENERIC_ENHANCE_UPDATE: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) { f2fs_warn(sbi, "access invalid blkaddr:%u", @@ -276,7 +288,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, f2fs_put_page(page, err ? 1 : 0); if (!err) - f2fs_update_iostat(sbi, FS_META_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, + F2FS_BLKSIZE); } out: blk_finish_plug(&plug); @@ -448,8 +461,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); set_page_private_reference(&folio->page); return true; @@ -1053,7 +1065,8 @@ void f2fs_remove_dirty_inode(struct inode *inode) spin_unlock(&sbi->inode_lock[type]); } -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp) { struct list_head *head; struct inode *inode; @@ -1088,11 +1101,15 @@ retry: if (inode) { unsigned long cur_ino = inode->i_ino; - F2FS_I(inode)->cp_task = current; + if (from_cp) + F2FS_I(inode)->cp_task = current; + F2FS_I(inode)->wb_task = current; filemap_fdatawrite(inode->i_mapping); - F2FS_I(inode)->cp_task = NULL; + F2FS_I(inode)->wb_task = NULL; + if (from_cp) + F2FS_I(inode)->cp_task = NULL; iput(inode); /* We need to give cpu to another writers. */ @@ -1221,7 +1238,7 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true); if (err) return err; cond_resched(); @@ -1892,15 +1909,27 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) { struct ckpt_req_control *cprc = &sbi->cprc_info; + struct task_struct *ckpt_task; + + if (!cprc->f2fs_issue_ckpt) + return; - if (cprc->f2fs_issue_ckpt) { - struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + ckpt_task = cprc->f2fs_issue_ckpt; + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); - cprc->f2fs_issue_ckpt = NULL; - kthread_stop(ckpt_task); + f2fs_flush_ckpt_thread(sbi); +} - flush_remained_ckpt_reqs(sbi, NULL); - } +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + flush_remained_ckpt_reqs(sbi, NULL); + + /* Let's wait for the previous dispatched checkpoint. */ + while (atomic_read(&cprc->queued_ckpt)) + io_schedule_timeout(DEFAULT_IO_TIMEOUT); } void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 70e97075e535..d315c2de136f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -762,6 +762,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; } @@ -912,17 +913,15 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) reason = "[C|*|C|*]"; goto out; } - if (compressed) { - if (!__is_valid_data_blkaddr(blkaddr)) { - if (!cluster_end) - cluster_end = i; - continue; - } - /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ - if (cluster_end) { - reason = "[C|N|N|V]"; - goto out; - } + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; } } return false; @@ -952,6 +951,7 @@ static int __f2fs_cluster_blocks(struct inode *inode, if (f2fs_sanity_check_cluster(&dn)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER); goto fail; } @@ -1568,12 +1568,8 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, if (!dic->cbuf) return -ENOMEM; - if (cops->init_decompress_ctx) { - int ret = cops->init_decompress_ctx(dic); - - if (ret) - return ret; - } + if (cops->init_decompress_ctx) + return cops->init_decompress_ctx(dic); return 0; } @@ -1905,7 +1901,7 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { - struct address_space *mapping = sbi->compress_inode->i_mapping; + struct address_space *mapping = COMPRESS_MAPPING(sbi); struct folio_batch fbatch; pgoff_t index = 0; pgoff_t end = MAX_BLKADDR(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa3ccddfa037..a71e818cd67b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -139,7 +139,7 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) continue; } - /* PG_error was set if decryption or verity failed. */ + /* PG_error was set if verity failed. */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -185,7 +185,7 @@ static void f2fs_verify_bio(struct work_struct *work) struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && - !PageError(page) && !fsverity_verify_page(page)) + !fsverity_verify_page(page)) SetPageError(page); } } else { @@ -236,10 +236,9 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; - /* PG_error was set if decryption failed. */ if (f2fs_is_compressed_page(page)) - f2fs_end_read_compressed_page(page, PageError(page), - blkaddr, in_task); + f2fs_end_read_compressed_page(page, false, blkaddr, + in_task); else all_compressed = false; @@ -259,14 +258,17 @@ static void f2fs_post_read_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - if (ctx->enabled_steps & STEP_DECRYPT) - fscrypt_decrypt_bio(ctx->bio); + if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) { + f2fs_finish_read_bio(bio, true); + return; + } if (ctx->enabled_steps & STEP_DECOMPRESS) f2fs_handle_step_decompress(ctx, true); - f2fs_verify_and_finish_bio(ctx->bio, true); + f2fs_verify_and_finish_bio(bio, true); } static void f2fs_read_end_io(struct bio *bio) @@ -333,7 +335,8 @@ static void f2fs_write_end_io(struct bio *bio) mempool_free(page, sbi->write_io_dummy); if (unlikely(bio->bi_status)) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); continue; } @@ -349,7 +352,8 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) - f2fs_stop_checkpoint(sbi, true); + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); } f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && @@ -703,8 +707,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? - META_GENERIC : DATA_GENERIC_ENHANCE))) + META_GENERIC : DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -723,7 +729,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(page): WB_DATA_TYPE(fio->page)); + __read_io_type(page) : WB_DATA_TYPE(fio->page)); __submit_bio(fio->sbi, bio, fio->type); return 0; @@ -904,8 +910,10 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, - __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) + __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) { + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } trace_f2fs_submit_page_bio(page, fio); @@ -1083,7 +1091,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, } ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); __submit_bio(sbi, bio, DATA); return 0; } @@ -1215,6 +1223,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } goto got_it; @@ -1235,6 +1245,8 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto put_err; } got_it: @@ -1548,6 +1560,7 @@ next_block: if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } @@ -1593,6 +1606,8 @@ next_block: (flag != F2FS_GET_BLOCK_FIEMAP || IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); goto sync_out; } if (flag == F2FS_GET_BLOCK_BMAP) { @@ -1816,7 +1831,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); - if (err || err == 1) + if (err) return err; } @@ -2074,6 +2089,8 @@ got_it: if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_INVALID_BLKADDR); goto out; } } else { @@ -2122,7 +2139,8 @@ submit_and_realloc: goto submit_and_realloc; inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); - f2fs_update_iostat(F2FS_I_SB(inode), FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, + F2FS_BLKSIZE); ClearPageError(page); *last_block_in_bio = block_nr; goto out; @@ -2270,8 +2288,7 @@ submit_and_realloc: refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(sbi, FS_CDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); ClearPageError(page); *last_block_in_bio = blkaddr; } @@ -2543,7 +2560,7 @@ bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) return true; /* if this is cold file, we should overwrite to avoid fragmentation */ - if (file_is_cold(inode)) + if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return true; return check_inplace_update_policy(inode, fio); @@ -2617,8 +2634,11 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, - DATA_GENERIC_ENHANCE)) + DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(fio->sbi, + ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } ipu_force = true; fio->need_lock = LOCK_DONE; @@ -2646,6 +2666,7 @@ got_it: !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; + f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); goto out_writepage; } @@ -2856,7 +2877,7 @@ out: } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task && allow_balance) + !F2FS_I(inode)->wb_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -3156,7 +3177,7 @@ static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { /* to avoid deadlock in path of data flush */ - if (F2FS_I(inode)->cp_task) + if (F2FS_I(inode)->wb_task) return false; if (!S_ISREG(inode->i_mode)) @@ -3559,6 +3580,7 @@ repeat: if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); @@ -3697,8 +3719,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping, folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + if (filemap_dirty_folio(mapping, folio)) { f2fs_update_dirty_folio(inode, folio); return true; } @@ -3970,6 +3991,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret < 0) return ret; + stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; @@ -3979,6 +4001,7 @@ static void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); + stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } #else diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c01471573977..a216dcdf6941 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -91,7 +91,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->aw_cnt = sbi->atomic_files; + si->aw_cnt = atomic_read(&sbi->atomic_files); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); @@ -135,6 +135,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); si->compr_inode = atomic_read(&sbi->compr_inode); + si->swapfile_inode = atomic_read(&sbi->swapfile_inode); si->compr_blocks = atomic64_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; @@ -347,7 +348,7 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_readonly(si->sbi->sb) ? "RO" : "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good")); if (si->sbi->s_flag) { @@ -385,6 +386,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", si->compr_inode, si->compr_blocks); + seq_printf(s, " - Swapfile Inode: %u\n", + si->swapfile_inode); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", @@ -607,6 +610,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->compr_inode, 0); atomic64_set(&sbi->compr_blocks, 0); + atomic_set(&sbi->swapfile_inode, 0); + atomic_set(&sbi->atomic_files, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d5bd7932fb64..21960a899b6a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1041,6 +1041,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, __func__, le16_to_cpu(de->name_len)); set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); goto out; } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 866e72b29bd5..932c070173b9 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -544,7 +544,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); write_lock(&et->lock); @@ -583,7 +583,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, org_end = dei.fofs + dei.len; f2fs_bug_on(sbi, pos >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { en->ei.len = pos - en->ei.fofs; prev_en = en; parts = 1; @@ -675,7 +675,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen); + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) @@ -804,9 +804,8 @@ void f2fs_drop_extent_tree(struct inode *inode) if (!f2fs_may_extent_tree(inode)) return; - set_inode_flag(inode, FI_NO_EXTENT); - write_lock(&et->lock); + set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); if (et->largest.len) { et->largest.len = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3c7cdb70fe2e..e6355a5683b7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -266,6 +266,10 @@ enum { * condition of read on truncated area * by extent_cache */ + DATA_GENERIC_ENHANCE_UPDATE, /* + * strong check on range and segment + * bitmap for update case + */ META_GENERIC, }; @@ -274,7 +278,7 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ - TRANS_DIR_INO, /* for trasactions dir ino list */ + TRANS_DIR_INO, /* for transactions dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; @@ -782,6 +786,7 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ struct task_struct *cp_task; /* separate cp/wb IO stats*/ + struct task_struct *wb_task; /* indicate inode is in context of writeback */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ spinlock_t i_size_lock; /* protect last_disk_size */ @@ -1158,7 +1163,10 @@ enum iostat_type { APP_BUFFERED_IO, /* app buffered write IOs */ APP_WRITE_IO, /* app write IOs */ APP_MAPPED_IO, /* app mapped IOs */ + APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */ + APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */ FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */ FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ FS_META_IO, /* meta IOs from kworker/reclaimer */ FS_GC_DATA_IO, /* data IOs from forground gc */ @@ -1172,6 +1180,8 @@ enum iostat_type { APP_BUFFERED_READ_IO, /* app buffered read IOs */ APP_READ_IO, /* app read IOs */ APP_MAPPED_READ_IO, /* app mapped read IOs */ + APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */ + APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */ FS_DATA_READ_IO, /* data read IOs */ FS_GDATA_READ_IO, /* data read IOs from background gc */ FS_CDATA_READ_IO, /* compressed data read IOs */ @@ -1247,7 +1257,6 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ - ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -1726,11 +1735,9 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ spinlock_t gc_urgent_high_lock; - bool gc_urgent_high_limited; /* indicates having limited trial count */ unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ /* for skip statistic */ - unsigned int atomic_files; /* # of opened atomic file */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -1761,6 +1768,8 @@ struct f2fs_sb_info { atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ + atomic_t swapfile_inode; /* # of swapfile inodes */ + atomic_t atomic_files; /* # of opened atomic file */ atomic_t max_aw_cnt; /* max # of atomic writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ @@ -1806,6 +1815,10 @@ struct f2fs_sb_info { struct workqueue_struct *post_read_wq; /* post read workqueue */ + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + spinlock_t error_lock; /* protect errors array */ + bool error_dirty; /* errors of sb is dirty */ + struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ @@ -2525,7 +2538,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) - return &ckpt->sit_nat_version_bitmap; + return tmp_ptr; else return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { @@ -3547,6 +3560,8 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); @@ -3706,7 +3721,9 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason); +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); @@ -3736,7 +3753,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio); void f2fs_remove_dirty_inode(struct inode *inode); -int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp); void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -3858,7 +3876,7 @@ struct f2fs_stat_info { int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int compr_inode; + int compr_inode, swapfile_inode; unsigned long long compr_blocks; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; @@ -3947,6 +3965,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_sub_compr_blocks(inode, blocks) \ (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_inc_swapfile_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_dec_swapfile_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_inc_atomic_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) +#define stat_dec_atomic_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->atomic_files)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3966,7 +3992,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) (atomic_inc(&(sbi)->inplace_count)) #define stat_update_max_atomic_write(inode) \ do { \ - int cur = F2FS_I_SB(inode)->atomic_files; \ + int cur = atomic_read(&F2FS_I_SB(inode)->atomic_files); \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ @@ -4031,6 +4057,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) +#define stat_inc_swapfile_inode(inode) do { } while (0) +#define stat_dec_swapfile_inode(inode) do { } while (0) +#define stat_inc_atomic_inode(inode) do { } while (0) +#define stat_dec_atomic_inode(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) @@ -4471,17 +4501,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode, f2fs_mark_inode_dirty_sync(inode, true); } -static inline int block_unaligned_IO(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - unsigned int i_blkbits = READ_ONCE(inode->i_blkbits); - unsigned int blocksize_mask = (1 << i_blkbits) - 1; - loff_t offset = iocb->ki_pos; - unsigned long align = offset | iov_iter_alignment(iter); - - return align & blocksize_mask; -} - static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, int flag) { @@ -4492,35 +4511,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, return sbi->aligned_blksize; } -static inline bool f2fs_force_buffered_io(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int rw = iov_iter_rw(iter); - - if (!fscrypt_dio_supported(iocb, iter)) - return true; - if (fsverity_active(inode)) - return true; - if (f2fs_compressed_file(inode)) - return true; - - /* disallow direct IO if any of devices has unaligned blksize */ - if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) - return true; - - if (f2fs_lfs_mode(sbi) && (rw == WRITE)) { - if (block_unaligned_IO(inode, iocb, iter)) - return true; - if (F2FS_IO_ALIGNED(sbi)) - return true; - } - if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) - return true; - - return false; -} - static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ce4905a073b3..82cda1258227 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -43,8 +43,8 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) ret = filemap_fault(vmf); if (!ret) - f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO, - F2FS_BLKSIZE); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_MAPPED_READ_IO, F2FS_BLKSIZE); trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret); @@ -154,7 +154,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (!PageUptodate(page)) SetPageUptodate(page); - f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); f2fs_update_time(sbi, REQ_TIME); trace_f2fs_vm_page_mkwrite(page, DATA); @@ -808,6 +808,34 @@ int f2fs_truncate(struct inode *inode) return 0; } +static bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!fscrypt_dio_supported(inode)) + return true; + if (fsverity_active(inode)) + return true; + if (f2fs_compressed_file(inode)) + return true; + + /* disallow direct IO if any of devices has unaligned blksize */ + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) + return true; + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. + */ + if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE)) + return true; + if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi)) + return true; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + return true; + + return false; +} + int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -824,6 +852,24 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->btime.tv_nsec = fi->i_crtime.tv_nsec; } + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. + * + * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN + * cannot represent that, so in that case we report no DIO support. + */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + unsigned int bsize = i_blocksize(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (!f2fs_force_buffered_io(inode, WRITE)) { + stat->dio_mem_align = bsize; + stat->dio_offset_align = bsize; + } + } + flags = fi->i_flags; if (flags & F2FS_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; @@ -871,9 +917,10 @@ static void __setattr_copy(struct user_namespace *mnt_userns, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); - if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!vfsgid_in_group_p(vfsgid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -1155,6 +1202,7 @@ next_dnode: !f2fs_is_valid_blkaddr(sbi, *blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -1439,6 +1487,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); break; } @@ -2048,9 +2097,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) } f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files++; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_COW_FILE); @@ -2144,7 +2191,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) { if (ret == -EROFS) { ret = 0; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); trace_f2fs_shutdown(sbi, in, ret); } @@ -2157,7 +2205,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = freeze_bdev(sb->s_bdev); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev); break; @@ -2166,16 +2214,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = f2fs_sync_fs(sb, 1); if (ret) goto out; - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: @@ -3321,8 +3369,10 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3483,8 +3533,10 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) + DATA_GENERIC_ENHANCE))) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } } while (count) { @@ -3756,6 +3808,8 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; f2fs_put_dnode(&dn); + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } @@ -4182,7 +4236,7 @@ static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb, if (!(iocb->ki_flags & IOCB_DIRECT)) return false; - if (f2fs_force_buffered_io(inode, iocb, iter)) + if (f2fs_force_buffered_io(inode, iov_iter_rw(iter))) return false; /* @@ -4212,7 +4266,7 @@ static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, dec_page_count(sbi, F2FS_DIO_READ); if (error) return error; - f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size); + f2fs_update_iostat(sbi, NULL, APP_DIRECT_READ_IO, size); return 0; } @@ -4301,7 +4355,8 @@ skip_read_trace: } else { ret = filemap_read(iocb, to, 0); if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_READ_IO, ret); } if (trace_f2fs_dataread_end_enabled()) trace_f2fs_dataread_end(inode, pos, ret); @@ -4418,7 +4473,8 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, if (ret > 0) { iocb->ki_pos += ret; - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret); + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_IO, ret); } return ret; } @@ -4431,7 +4487,7 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, dec_page_count(sbi, F2FS_DIO_WRITE); if (error) return error; - f2fs_update_iostat(sbi, APP_DIRECT_IO, size); + f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size); return 0; } @@ -4619,7 +4675,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) skip_write_trace: /* Do the actual write. */ ret = dio ? - f2fs_dio_write_iter(iocb, from, &may_need_sync): + f2fs_dio_write_iter(iocb, from, &may_need_sync) : f2fs_buffered_write_iter(iocb, from); if (trace_f2fs_datawrite_end_enabled()) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6da21d405ce1..4546e01b2ee0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -74,7 +74,8 @@ static int gc_thread_func(void *data) if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FAULT_INJECT); } if (!sb_start_write_trylock(sbi->sb)) { @@ -97,14 +98,10 @@ static int gc_thread_func(void *data) */ if (sbi->gc_mode == GC_URGENT_HIGH) { spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_limited) { - if (!sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_limited = false; - spin_unlock(&sbi->gc_urgent_high_lock); - sbi->gc_mode = GC_NORMAL; - continue; - } + if (sbi->gc_urgent_high_remaining) { sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; } spin_unlock(&sbi->gc_urgent_high_lock); } @@ -285,7 +282,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, /* let's select beginning hot/small space first in no_heap mode*/ if (f2fs_need_rand_seg(sbi)) - p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; @@ -1082,7 +1079,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node; + unsigned int ofs_in_node, max_addrs; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1108,6 +1105,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } + max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE : + DEF_ADDRS_PER_BLOCK; + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", + ofs_in_node, dni->ino, dni->nid, max_addrs); + return false; + } + *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); @@ -1159,6 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } goto got_it; @@ -1177,6 +1183,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE))) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } got_it: @@ -1206,8 +1213,8 @@ got_it: f2fs_put_page(fio.encrypted_page, 0); f2fs_put_page(page, 1); - f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); return 0; put_encrypted_page: @@ -1307,8 +1314,10 @@ static int move_data_block(struct inode *inode, block_t bidx, goto up_out; } - f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); - f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO, + F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO, + F2FS_BLKSIZE); lock_page(mpage); if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) || @@ -1360,7 +1369,7 @@ static int move_data_block(struct inode *inode, block_t bidx, goto put_page_out; } - f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -1706,7 +1715,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_SUMMARY); goto skip; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bf46a7dfbea2..21a495234ffd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -64,7 +64,6 @@ bool f2fs_may_inline_dentry(struct inode *inode) void f2fs_do_read_inline_data(struct page *page, struct page *ipage) { struct inode *inode = page->mapping->host; - void *src_addr, *dst_addr; if (PageUptodate(page)) return; @@ -74,11 +73,8 @@ void f2fs_do_read_inline_data(struct page *page, struct page *ipage) zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(inode, ipage); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - flush_dcache_page(page); - kunmap_atomic(dst_addr); + memcpy_to_page(page, 0, inline_data_addr(inode, ipage), + MAX_INLINE_DATA(inode)); if (!PageUptodate(page)) SetPageUptodate(page); } @@ -164,6 +160,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_sbi_flag(fio.sbi, SBI_NEED_FSCK); f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dn->inode->i_ino, dn->data_blkaddr); + f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -246,7 +243,6 @@ out: int f2fs_write_inline_data(struct inode *inode, struct page *page) { - void *src_addr, *dst_addr; struct dnode_of_data dn; int err; @@ -263,10 +259,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true); - src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(inode, dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); - kunmap_atomic(src_addr); + memcpy_from_page(inline_data_addr(inode, dn.inode_page), + page, 0, MAX_INLINE_DATA(inode)); set_page_dirty(dn.inode_page); f2fs_clear_page_cache_dirty_tag(page); @@ -419,6 +413,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dir->i_ino, dn.data_blkaddr); + f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR); err = -EFSCORRUPTED; goto out; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6d11c365d7b4..9f0d3864d9f1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -81,8 +81,10 @@ static int __written_first_block(struct f2fs_sb_info *sbi, if (!__is_valid_data_blkaddr(addr)) return 1; - if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) { + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; + } return 0; } @@ -333,6 +335,16 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return true; } +static void init_idisk_time(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -405,6 +417,7 @@ static int do_read_inode(struct inode *inode) if (!sanity_check_inode(inode, node_page)) { f2fs_put_page(node_page, 1); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } @@ -465,10 +478,7 @@ static int do_read_inode(struct inode *inode) } } - fi->i_disk_time[0] = inode->i_atime; - fi->i_disk_time[1] = inode->i_ctime; - fi->i_disk_time[2] = inode->i_mtime; - fi->i_disk_time[3] = fi->i_crtime; + init_idisk_time(inode); f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -480,6 +490,12 @@ static int do_read_inode(struct inode *inode) return 0; } +static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino) +{ + return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) || + ino == F2FS_COMPRESS_INO(sbi); +} + struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -491,16 +507,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { + if (is_meta_ino(sbi, ino)) { + f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + ret = -EFSCORRUPTED; + trace_f2fs_iget_exit(inode, ret); + iput(inode); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return ERR_PTR(ret); + } + trace_f2fs_iget(inode); return inode; } - if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) - goto make_now; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (ino == F2FS_COMPRESS_INO(sbi)) + if (is_meta_ino(sbi, ino)) goto make_now; -#endif ret = do_read_inode(inode); if (ret) @@ -676,11 +698,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_page_private_inline(node_page); - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; - + init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); #endif @@ -699,7 +717,8 @@ retry: cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_UPDATE_INODE); } return; } diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index d84c5f6cc09d..3166a8939ed4 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -31,55 +31,65 @@ int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) /* print app write IOs */ seq_puts(seq, "[WRITE]\n"); - seq_printf(seq, "app buffered: %-16llu\n", + seq_printf(seq, "app buffered data: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct: %-16llu\n", + seq_printf(seq, "app direct data: %-16llu\n", sbi->rw_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped: %-16llu\n", + seq_printf(seq, "app mapped data: %-16llu\n", sbi->rw_iostat[APP_MAPPED_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_IO]); /* print fs write IOs */ - seq_printf(seq, "fs data: %-16llu\n", + seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", + seq_printf(seq, "fs cdata: %-16llu\n", + sbi->rw_iostat[FS_CDATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", + seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", + seq_printf(seq, "fs gc data: %-16llu\n", sbi->rw_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", + seq_printf(seq, "fs gc node: %-16llu\n", sbi->rw_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", + seq_printf(seq, "fs cp data: %-16llu\n", sbi->rw_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", + seq_printf(seq, "fs cp node: %-16llu\n", sbi->rw_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", + seq_printf(seq, "fs cp meta: %-16llu\n", sbi->rw_iostat[FS_CP_META_IO]); /* print app read IOs */ seq_puts(seq, "[READ]\n"); - seq_printf(seq, "app buffered: %-16llu\n", + seq_printf(seq, "app buffered data: %-16llu\n", sbi->rw_iostat[APP_BUFFERED_READ_IO]); - seq_printf(seq, "app direct: %-16llu\n", + seq_printf(seq, "app direct data: %-16llu\n", sbi->rw_iostat[APP_DIRECT_READ_IO]); - seq_printf(seq, "app mapped: %-16llu\n", + seq_printf(seq, "app mapped data: %-16llu\n", sbi->rw_iostat[APP_MAPPED_READ_IO]); + seq_printf(seq, "app buffered cdata: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]); + seq_printf(seq, "app mapped cdata: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]); /* print fs read IOs */ - seq_printf(seq, "fs data: %-16llu\n", + seq_printf(seq, "fs data: %-16llu\n", sbi->rw_iostat[FS_DATA_READ_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", + seq_printf(seq, "fs gc data: %-16llu\n", sbi->rw_iostat[FS_GDATA_READ_IO]); - seq_printf(seq, "fs compr_data: %-16llu\n", + seq_printf(seq, "fs cdata: %-16llu\n", sbi->rw_iostat[FS_CDATA_READ_IO]); - seq_printf(seq, "fs node: %-16llu\n", + seq_printf(seq, "fs node: %-16llu\n", sbi->rw_iostat[FS_NODE_READ_IO]); - seq_printf(seq, "fs meta: %-16llu\n", + seq_printf(seq, "fs meta: %-16llu\n", sbi->rw_iostat[FS_META_READ_IO]); /* print other IOs */ seq_puts(seq, "[OTHER]\n"); - seq_printf(seq, "fs discard: %-16llu\n", + seq_printf(seq, "fs discard: %-16llu\n", sbi->rw_iostat[FS_DISCARD]); return 0; @@ -159,7 +169,7 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) spin_unlock_irq(&sbi->iostat_lat_lock); } -void f2fs_update_iostat(struct f2fs_sb_info *sbi, +void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) { unsigned long flags; @@ -176,6 +186,28 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) sbi->rw_iostat[APP_READ_IO] += io_bytes; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (inode && f2fs_compressed_file(inode)) { + if (type == APP_BUFFERED_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes; + + if (type == APP_BUFFERED_READ_IO) + sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_READ_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes; + + if (type == APP_MAPPED_IO) + sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes; + + if (type == FS_DATA_READ_IO) + sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes; + + if (type == FS_DATA_IO) + sbi->rw_iostat[FS_CDATA_IO] += io_bytes; + } +#endif + spin_unlock_irqrestore(&sbi->iostat_lock, flags); f2fs_record_iostat(sbi); diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h index 22a2d01f57ef..2c048307b6e0 100644 --- a/fs/f2fs/iostat.h +++ b/fs/f2fs/iostat.h @@ -31,7 +31,7 @@ struct iostat_lat_info { extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset); extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); -extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes); struct bio_iostat_ctx { @@ -65,7 +65,7 @@ extern void f2fs_destroy_iostat_processing(void); extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); #else -static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, enum iostat_type type, unsigned long long io_bytes) {} static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index bf00d5057abb..a389772fd212 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -50,7 +50,7 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); F2FS_I(inode)->i_crtime = inode->i_mtime; - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if (S_ISDIR(inode->i_mode)) F2FS_I(inode)->i_current_depth = 1; @@ -845,7 +845,7 @@ out: } static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, bool is_whiteout, + struct file *file, umode_t mode, bool is_whiteout, struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -892,8 +892,8 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } else { - if (dentry) - d_tmpfile(dentry, inode); + if (file) + d_tmpfile(file, inode); else f2fs_i_links_write(inode, false); } @@ -915,16 +915,19 @@ out: } static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL); + err = __f2fs_tmpfile(mnt_userns, dir, file, mode, false, NULL); + + return finish_open_simple(file, err); } static int f2fs_create_whiteout(struct user_namespace *mnt_userns, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e06a0c478b39..983572f23896 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -36,6 +36,7 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } return 0; @@ -585,7 +586,7 @@ retry: ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - up_read(&curseg->journal_rwsem); + up_read(&curseg->journal_rwsem); if (i >= 0) { f2fs_up_read(&nm_i->nat_tree_lock); goto cache; @@ -1295,6 +1296,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) if (unlikely(new_ni.blk_addr != NULL_ADDR)) { err = -EFSCORRUPTED; set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } #endif @@ -1369,7 +1371,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags) err = f2fs_submit_page_bio(&fio); if (!err) - f2fs_update_iostat(sbi, FS_NODE_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_NODE_READ_IO, F2FS_BLKSIZE); return err; } @@ -2147,8 +2149,7 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, if (IS_INODE(&folio->page)) f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page); #endif - if (!folio_test_dirty(folio)) { - filemap_dirty_folio(mapping, folio); + if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); set_page_private_reference(&folio->page); return true; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index dcd0a1e35095..dea95b48b647 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -474,7 +474,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct dnode_of_data tdn = *dn; nid_t ino, nid; struct inode *inode; - unsigned int offset; + unsigned int offset, ofs_in_node, max_addrs; block_t bidx; int i; @@ -501,15 +501,25 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); + ofs_in_node = le16_to_cpu(sum.ofs_in_node); + + max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode); + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", + ofs_in_node, dn->inode->i_ino, nid, max_addrs); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); + return -EFSCORRUPTED; + } + if (dn->inode->i_ino == nid) { tdn.nid = nid; if (!dn->inode_page_locked) lock_page(dn->inode_page); tdn.node_page = dn->inode_page; - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + tdn.ofs_in_node = ofs_in_node; goto truncate_out; } else if (dn->nid == nid) { - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + tdn.ofs_in_node = ofs_in_node; goto truncate_out; } @@ -628,6 +638,7 @@ retry_dn: inode->i_ino, ofs_of_node(dn.node_page), ofs_of_node(page)); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; } @@ -640,12 +651,14 @@ retry_dn: if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } if (__is_valid_data_blkaddr(dest) && !f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } @@ -698,6 +711,16 @@ retry_prev: goto err; } + if (f2fs_is_valid_blkaddr(sbi, dest, + DATA_GENERIC_ENHANCE_UPDATE)) { + f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", + dest, inode->i_ino, dn.ofs_in_node); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); + goto err; + } + /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, ni.version, false, false); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0de21f82d7bc..acf3d3fa4363 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -187,7 +187,6 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) void f2fs_abort_atomic_write(struct inode *inode, bool clean) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_is_atomic_file(inode)) @@ -200,10 +199,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_FILE); - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - sbi->atomic_files--; - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + stat_dec_atomic_inode(inode); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@ -312,6 +308,8 @@ static int __f2fs_commit_atomic_write(struct inode *inode) DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); ret = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_INVALID_BLKADDR); goto out; } @@ -376,7 +374,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); } /* balance_fs_bg is able to be pending */ @@ -476,12 +474,12 @@ do_sync: mutex_lock(&sbi->flush_lock); blk_start_plug(&plug); - f2fs_sync_dirty_inodes(sbi, FILE_INODE); + f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); blk_finish_plug(&plug); mutex_unlock(&sbi->flush_lock); } - f2fs_sync_fs(sbi->sb, true); + f2fs_sync_fs(sbi->sb, 1); stat_inc_bg_cp_count(sbi->stat_info); } @@ -694,7 +692,8 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) } while (ret && --count); if (ret) { - f2fs_stop_checkpoint(sbi, false); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FLUSH_FAIL); break; } @@ -1171,7 +1170,7 @@ submit: atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1); lstart += len; start += len; @@ -2535,7 +2534,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) sanity_check_seg_type(sbi, seg_type); if (f2fs_need_rand_seg(sbi)) - return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. */ if (__is_large_section(sbi)) @@ -2589,7 +2588,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = - prandom_u32() % sbi->max_fragment_chunk + 1; + prandom_u32_max(sbi->max_fragment_chunk) + 1; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2626,9 +2625,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk <= 0) { seg->fragment_remained_chunk = - prandom_u32() % sbi->max_fragment_chunk + 1; + prandom_u32_max(sbi->max_fragment_chunk) + 1; seg->next_blkoff += - prandom_u32() % sbi->max_fragment_hole + 1; + prandom_u32_max(sbi->max_fragment_hole) + 1; } } } @@ -3388,7 +3387,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, page->index); - f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); } void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -3398,7 +3397,7 @@ void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } void f2fs_outplace_write_data(struct dnode_of_data *dn, @@ -3412,7 +3411,7 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); - f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); } int f2fs_inplace_write_data(struct f2fs_io_info *fio) @@ -3432,6 +3431,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); goto drop_bio; } @@ -3453,7 +3453,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) if (!err) { f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, fio->page->mapping->host, + fio->io_type, F2FS_BLKSIZE); } return err; @@ -4379,6 +4380,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_SUM_TYPE); return -EFSCORRUPTED; } @@ -4415,6 +4418,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } @@ -4434,6 +4438,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); break; } @@ -4465,6 +4470,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", sit_valid_blocks[NODE], valid_node_count(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); return -EFSCORRUPTED; } @@ -4473,6 +4479,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", sit_valid_blocks[DATA], sit_valid_blocks[NODE], valid_user_blocks(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); return -EFSCORRUPTED; } @@ -4623,6 +4630,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) f2fs_err(sbi, "Current segment has invalid alloc_type:%d", curseg->alloc_type); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } @@ -4640,6 +4648,7 @@ out: "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", i, curseg->segno, curseg->alloc_type, curseg->next_blkoff, blkofs); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d1d63766f2c7..be8f2d7d007b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -753,6 +753,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", GET_SIT_VBLOCKS(raw_sit), valid_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } @@ -767,6 +768,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } return 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2451623c05a7..3834ead04620 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -301,10 +301,10 @@ static void f2fs_destroy_casefold_cache(void) { } static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { - block_t limit = min((sbi->user_block_count << 1) / 1000, + block_t limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); - /* limit is 0.2% */ + /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && F2FS_OPTION(sbi).root_reserved_blocks > limit) { F2FS_OPTION(sbi).root_reserved_blocks = limit; @@ -1342,6 +1342,11 @@ default_check: return -EINVAL; } + if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { + f2fs_err(sbi, "LFS not compatible with ATGC"); + return -EINVAL; + } + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1666,9 +1671,8 @@ static int f2fs_freeze(struct super_block *sb) if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; - /* ensure no checkpoint required */ - if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) - return -EINVAL; + /* Let's flush checkpoints and stop the thread. */ + f2fs_flush_ckpt_thread(F2FS_SB(sb)); /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); @@ -2181,6 +2185,9 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) f2fs_up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); + + /* Let's ensure there's no pending checkpoint anymore */ + f2fs_flush_ckpt_thread(sbi); } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -2346,6 +2353,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_stop_ckpt_thread(sbi); need_restart_ckpt = true; } else { + /* Flush if the prevous checkpoint, if exists. */ + f2fs_flush_ckpt_thread(sbi); + err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, @@ -2465,7 +2475,6 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, size_t toread; loff_t i_size = i_size_read(inode); struct page *page; - char *kaddr; if (off > i_size) return 0; @@ -2498,9 +2507,7 @@ repeat: return -EIO; } - kaddr = kmap_atomic(page); - memcpy(data, kaddr + offset, tocopy); - kunmap_atomic(kaddr); + memcpy_from_page(data, page, offset, tocopy); f2fs_put_page(page, 1); offset = 0; @@ -2522,7 +2529,6 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, size_t towrite = len; struct page *page; void *fsdata = NULL; - char *kaddr; int err = 0; int tocopy; @@ -2541,10 +2547,7 @@ retry: break; } - kaddr = kmap_atomic(page); - memcpy(kaddr + offset, data, tocopy); - kunmap_atomic(kaddr); - flush_dcache_page(page); + memcpy_to_page(page, offset, data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, page, fsdata); @@ -3039,23 +3042,24 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb, *lblk_bits_ret = 8 * sizeof(block_t); } -static int f2fs_get_num_devices(struct super_block *sb) +static struct block_device **f2fs_get_devices(struct super_block *sb, + unsigned int *num_devs) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct block_device **devs; + int i; - if (f2fs_is_multi_device(sbi)) - return sbi->s_ndevs; - return 1; -} + if (!f2fs_is_multi_device(sbi)) + return NULL; -static void f2fs_get_devices(struct super_block *sb, - struct request_queue **devs) -{ - struct f2fs_sb_info *sbi = F2FS_SB(sb); - int i; + devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); for (i = 0; i < sbi->s_ndevs; i++) - devs[i] = bdev_get_queue(FDEV(i).bdev); + devs[i] = FDEV(i).bdev; + *num_devs = sbi->s_ndevs; + return devs; } static const struct fscrypt_operations f2fs_cryptops = { @@ -3066,7 +3070,6 @@ static const struct fscrypt_operations f2fs_cryptops = { .empty_dir = f2fs_empty_dir, .has_stable_inodes = f2fs_has_stable_inodes, .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, - .get_num_devices = f2fs_get_num_devices, .get_devices = f2fs_get_devices, }; #endif @@ -3843,6 +3846,68 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int err; + + f2fs_down_write(&sbi->sb_lock); + + if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) + raw_super->s_stop_reason[reason]++; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", + reason, err); + f2fs_up_write(&sbi->sb_lock); +} + +static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +{ + spin_lock(&sbi->error_lock); + if (!test_bit(flag, (unsigned long *)sbi->errors)) { + set_bit(flag, (unsigned long *)sbi->errors); + sbi->error_dirty = true; + } + spin_unlock(&sbi->error_lock); +} + +static bool f2fs_update_errors(struct f2fs_sb_info *sbi) +{ + bool need_update = false; + + spin_lock(&sbi->error_lock); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + need_update = true; + } + spin_unlock(&sbi->error_lock); + + return need_update; +} + +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + int err; + + f2fs_save_errors(sbi, error); + + f2fs_down_write(&sbi->sb_lock); + + if (!f2fs_update_errors(sbi)) + goto out_unlock; + + err = f2fs_commit_super(sbi, false); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", + error, err); +out_unlock: + f2fs_up_write(&sbi->sb_lock); +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -4190,6 +4255,9 @@ try_onemore: goto free_devices; } + spin_lock_init(&sbi->error_lock); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index eba5fb1629d7..df27afd71ef4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -128,6 +128,12 @@ static ssize_t sb_status_show(struct f2fs_attr *a, return sprintf(buf, "%lx\n", sbi->s_flag); } +static ssize_t cp_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); +} + static ssize_t pending_discard_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -527,7 +533,6 @@ out: if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_limited = t != 0; sbi->gc_urgent_high_remaining = t; spin_unlock(&sbi->gc_urgent_high_lock); @@ -1030,8 +1035,10 @@ static struct attribute *f2fs_feat_attrs[] = { ATTRIBUTE_GROUPS(f2fs_feat); F2FS_GENERAL_RO_ATTR(sb_status); +F2FS_GENERAL_RO_ATTR(cp_status); static struct attribute *f2fs_stat_attrs[] = { ATTR_LIST(sb_status), + ATTR_LIST(cp_status), NULL, }; ATTRIBUTE_GROUPS(f2fs_stat); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 7b8f2b41c29b..c352fff88a5e 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -47,16 +47,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *addr; page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, NULL); if (IS_ERR(page)) return PTR_ERR(page); - addr = kmap_atomic(page); - memcpy(buf, addr + offset_in_page(pos), n); - kunmap_atomic(addr); + memcpy_from_page(buf, page, offset_in_page(pos), n); put_page(page); @@ -85,16 +82,13 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, PAGE_SIZE - offset_in_page(pos)); struct page *page; void *fsdata; - void *addr; int res; res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); if (res) return res; - addr = kmap_atomic(page); - memcpy(addr + offset_in_page(pos), buf, n); - kunmap_atomic(addr); + memcpy_to_page(page, offset_in_page(pos), buf, n); res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); if (res < 0) @@ -246,6 +240,8 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes || pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_VERITY_XATTR); return -EFSCORRUPTED; } if (buf_size) { @@ -262,13 +258,14 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); if (!page || !PageUptodate(page)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + if (page) put_page(page); else if (num_ra_pages > 1) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c76c15086e5f..dc2e8637189e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -367,6 +367,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); err = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto out; } check: @@ -583,6 +585,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto cleanup; } @@ -658,6 +662,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } @@ -684,6 +690,8 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ino, ENTRY_SIZE(last)); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); goto exit; } last = XATTR_NEXT_ENTRY(last); diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 249825017da7..00235b8a1823 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -705,7 +705,7 @@ static int fat_readdir(struct file *file, struct dir_context *ctx) } #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ -static int func(struct dir_context *ctx, const char *name, int name_len, \ +static bool func(struct dir_context *ctx, const char *name, int name_len, \ loff_t offset, u64 ino, unsigned int d_type) \ { \ struct fat_ioctl_filldir_callback *buf = \ @@ -714,7 +714,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \ struct dirent_type __user *d2 = d1 + 1; \ \ if (buf->result) \ - return -EINVAL; \ + return false; \ buf->result++; \ \ if (name != NULL) { \ @@ -750,10 +750,10 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \ put_user(short_len, &d1->d_reclen)) \ goto efault; \ } \ - return 0; \ + return true; \ efault: \ buf->result = -EFAULT; \ - return -EFAULT; \ + return false; \ } FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) diff --git a/fs/fat/file.c b/fs/fat/file.c index 3e4eb3467cb4..8a6b493b5b5f 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -461,8 +461,9 @@ static int fat_allow_set_time(struct user_namespace *mnt_userns, { umode_t allow_utime = sbi->options.allow_utime; - if (!uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { - if (in_group_p(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + current_fsuid())) { + if (vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) allow_utime >>= 3; if (allow_utime & MAY_WRITE) return 1; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a38238d75c08..1cbcc4608dc7 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -523,7 +523,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { inode->i_generation &= ~1; diff --git a/fs/fhandle.c b/fs/fhandle.c index 6630c69c23a2..f2bc27d1975e 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -14,7 +14,7 @@ #include "internal.h" #include "mount.h" -static long do_sys_name_to_handle(struct path *path, +static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, int __user *mnt_id) { diff --git a/fs/file.c b/fs/file.c index 3bcc1ecc314a..c942c89ca4cd 100644 --- a/fs/file.c +++ b/fs/file.c @@ -980,6 +980,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret *ret_fd = fd; return file; } +EXPORT_SYMBOL(task_lookup_next_fd_rcu); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. @@ -1002,7 +1003,16 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) struct files_struct *files = current->files; struct file *file; - if (atomic_read(&files->count) == 1) { + /* + * If another thread is concurrently calling close_fd() followed + * by put_files_struct(), we must not observe the old table + * entry combined with the new refcount - otherwise we could + * return a file that is concurrently being freed. + * + * atomic_read_acquire() pairs with atomic_dec_and_test() in + * put_files_struct(). + */ + if (atomic_read_acquire(&files->count) == 1) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; diff --git a/fs/file_table.c b/fs/file_table.c index 99c6796c9f28..dd88701e54a9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -324,12 +324,7 @@ static void __fput(struct file *file) } fops_put(file->f_op); put_pid(file->f_owner.pid); - if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_dec(inode); - if (mode & FMODE_WRITER) { - put_write_access(inode); - __mnt_drop_write(mnt); - } + put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) dissolve_on_fput(mnt); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 08a1993ab7fd..9958d4020771 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1712,15 +1712,28 @@ static int writeback_single_inode(struct inode *inode, wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* - * If the inode is now fully clean, then it can be safely removed from - * its writeback list (if any). Otherwise the flusher threads are - * responsible for the writeback lists. + * If the inode is freeing, its i_io_list shoudn't be updated + * as it can be finally deleted at this moment. */ - if (!(inode->i_state & I_DIRTY_ALL)) - inode_cgwb_move_to_attached(inode, wb); - else if (!(inode->i_state & I_SYNC_QUEUED) && - (inode->i_state & I_DIRTY)) - redirty_tail_locked(inode, wb); + if (!(inode->i_state & I_FREEING)) { + /* + * If the inode is now fully clean, then it can be safely + * removed from its writeback list (if any). Otherwise the + * flusher threads are responsible for the writeback lists. + */ + if (!(inode->i_state & I_DIRTY_ALL)) + inode_cgwb_move_to_attached(inode, wb); + else if (!(inode->i_state & I_SYNC_QUEUED)) { + if ((inode->i_state & I_DIRTY)) + redirty_tail_locked(inode, wb); + else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; + inode_io_list_move_locked(inode, + wb, + &wb->b_dirty_time); + } + } + } spin_unlock(&wb->list_lock); inode_sync_complete(inode); @@ -2370,6 +2383,20 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (flags & I_DIRTY_INODE) { /* + * Inode timestamp update will piggback on this dirtying. + * We tell ->dirty_inode callback that timestamps need to + * be updated by setting I_DIRTY_TIME in flags. + */ + if (inode->i_state & I_DIRTY_TIME) { + spin_lock(&inode->i_lock); + if (inode->i_state & I_DIRTY_TIME) { + inode->i_state &= ~I_DIRTY_TIME; + flags |= I_DIRTY_TIME; + } + spin_unlock(&inode->i_lock); + } + + /* * Notify the filesystem about the inode being dirtied, so that * (if needed) it can update on-disk fields and journal the * inode. This is only needed when the inode itself is being @@ -2378,7 +2405,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE); + sb->s_op->dirty_inode(inode, + flags & (I_DIRTY_INODE | I_DIRTY_TIME)); trace_writeback_dirty_inode(inode, flags); /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ @@ -2399,21 +2427,15 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ smp_mb(); - if (((inode->i_state & flags) == flags) || - (dirtytime && (inode->i_state & I_DIRTY_INODE))) + if ((inode->i_state & flags) == flags) return; spin_lock(&inode->i_lock); - if (dirtytime && (inode->i_state & I_DIRTY_INODE)) - goto out_unlock_inode; if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; inode_attach_wb(inode, NULL); - /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ - if (flags & I_DIRTY_INODE) - inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; /* @@ -2486,7 +2508,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) out_unlock: if (wb) spin_unlock(&wb->list_lock); -out_unlock_inode: spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(__mark_inode_dirty); diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index a058e0136bfe..ab8ceddf9efa 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -203,7 +203,11 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key, struct fscache_volume *volume; struct fscache_cache *cache; size_t klen, hlen; - char *key; + u8 *key; + + klen = strlen(volume_key); + if (klen > NAME_MAX) + return NULL; if (!coherency_data) coherency_len = 0; @@ -229,7 +233,6 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key, /* Stick the length on the front of the key and pad it out to make * hashing easier. */ - klen = strlen(volume_key); hlen = round_up(1 + klen + 1, sizeof(__le32)); key = kzalloc(hlen, GFP_KERNEL); if (!key) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 51897427a534..b4a6e0a1b945 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -776,7 +776,8 @@ static int fuse_check_page(struct page *page) 1 << PG_active | 1 << PG_workingset | 1 << PG_reclaim | - 1 << PG_waiters))) { + 1 << PG_waiters | + LRU_GEN_MASK | LRU_REFS_MASK))) { dump_page(page, "fuse: trying to steal weird page"); return 1; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b585b04e815e..bb97a384dc5d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -529,7 +529,7 @@ out_err: */ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, - umode_t mode) + umode_t mode, u32 opcode) { int err; struct inode *inode; @@ -573,7 +573,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } - args.opcode = FUSE_CREATE; + args.opcode = opcode; args.nodeid = get_node_id(dir); args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); @@ -676,7 +676,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, mode); + err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -802,6 +802,23 @@ static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, return fuse_mknod(&init_user_ns, dir, entry, mode, 0); } +static int fuse_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + int err; + + if (fc->no_tmpfile) + return -EOPNOTSUPP; + + err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + if (err == -ENOSYS) { + fc->no_tmpfile = 1; + err = -EOPNOTSUPP; + } + return err; +} + static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -1913,6 +1930,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .setattr = fuse_setattr, .create = fuse_create, .atomic_open = fuse_atomic_open, + .tmpfile = fuse_tmpfile, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1a3afd469e3a..71bfb663aac5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3001,6 +3001,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; } + err = file_modified(file); + if (err) + goto out; + if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 488b460e046f..98a9cf531873 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -784,6 +784,9 @@ struct fuse_conn { /* Does the filesystem support per inode DAX? */ unsigned int inode_dax:1; + /* Is tmpfile not implemented by fs? */ + unsigned int no_tmpfile:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index b4e565711045..e8deaacf1832 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -77,8 +77,10 @@ static void fuse_add_dirent_to_cache(struct file *file, goto unlock; addr = kmap_local_page(page); - if (!offset) + if (!offset) { clear_page(addr); + SetPageUptodate(page); + } memcpy(addr + offset, dirent, reclen); kunmap_local(addr); fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; @@ -516,6 +518,12 @@ retry_locked: page = find_get_page_flags(file->f_mapping, index, FGP_ACCESSED | FGP_LOCK); + /* Page gone missing, then re-added to cache, but not initialized? */ + if (page && !PageUptodate(page)) { + unlock_page(page); + put_page(page); + page = NULL; + } spin_lock(&fi->rdc.lock); if (!page) { /* diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 756d05779200..cf40895233f5 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -66,7 +66,7 @@ struct get_name_filldir { char *name; }; -static int get_name_filldir(struct dir_context *ctx, const char *name, +static bool get_name_filldir(struct dir_context *ctx, const char *name, int length, loff_t offset, u64 inum, unsigned int type) { @@ -74,12 +74,12 @@ static int get_name_filldir(struct dir_context *ctx, const char *name, container_of(ctx, struct get_name_filldir, ctx); if (inum != gnfd->inum.no_addr) - return 0; + return true; memcpy(gnfd->name, name, length); gnfd->name[length] = 0; - return 1; + return false; } static int gfs2_get_name(struct dentry *parent, char *name, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 892006fbbb09..60c6fb91fb58 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1443,6 +1443,22 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); } +static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) +{ + struct gfs2_glock *gl = fl_gh->gh_gl; + + /* + * Make sure gfs2_glock_put() won't sleep under the file->f_lock + * spinlock. + */ + + gfs2_glock_hold(gl); + spin_lock(&file->f_lock); + gfs2_holder_uninit(fl_gh); + spin_unlock(&file->f_lock); + gfs2_glock_put(gl); +} + static int do_flock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_file *fp = file->private_data; @@ -1455,7 +1471,9 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int sleeptime; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT; + flags = GL_EXACT | GL_NOPID; + if (!IS_SETLKW(cmd)) + flags |= LM_FLAG_TRY_1CB; mutex_lock(&fp->f_fl_mutex); @@ -1474,18 +1492,21 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) &gfs2_flock_glops, CREATE, &gl); if (error) goto out; + spin_lock(&file->f_lock); gfs2_holder_init(gl, state, flags, fl_gh); + spin_unlock(&file->f_lock); gfs2_glock_put(gl); } for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) { error = gfs2_glock_nq(fl_gh); if (error != GLR_TRYFAILED) break; - fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; + fl_gh->gh_flags &= ~LM_FLAG_TRY_1CB; + fl_gh->gh_flags |= LM_FLAG_TRY; msleep(sleeptime); } if (error) { - gfs2_holder_uninit(fl_gh); + __flock_holder_uninit(file, fl_gh); if (error == GLR_TRYFAILED) error = -EAGAIN; } else { @@ -1507,7 +1528,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) locks_lock_file_wait(file, fl); if (gfs2_holder_initialized(fl_gh)) { gfs2_glock_dq(fl_gh); - gfs2_holder_uninit(fl_gh); + __flock_holder_uninit(file, fl_gh); } mutex_unlock(&fp->f_fl_mutex); } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 41b6c89e4bf7..df335c258eb0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -33,6 +33,9 @@ #include <linux/list_sort.h> #include <linux/lockref.h> #include <linux/rhashtable.h> +#include <linux/pid_namespace.h> +#include <linux/fdtable.h> +#include <linux/file.h> #include "gfs2.h" #include "incore.h" @@ -59,6 +62,8 @@ typedef void (*glock_examiner) (struct gfs2_glock * gl); static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static void __gfs2_glock_dq(struct gfs2_holder *gh); +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + unsigned long delay, bool remote); static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; @@ -730,7 +735,8 @@ static bool is_system_glock(struct gfs2_glock *gl) * */ -static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, + unsigned int target) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { @@ -741,7 +747,8 @@ __acquires(&gl->gl_lockref.lock) if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && gh && !(gh->gh_flags & LM_FLAG_NOEXP)) - return; + goto skip_inval; + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); GLOCK_BUG_ON(gl, gl->gl_state == target); @@ -826,6 +833,20 @@ skip_inval: (target != LM_ST_UNLOCKED || test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) { if (!is_system_glock(gl)) { + handle_callback(gl, LM_ST_UNLOCKED, 0, false); /* sets demote */ + /* + * Ordinarily, we would call dlm and its callback would call + * finish_xmote, which would call state_change() to the new state. + * Since we withdrew, we won't call dlm, so call state_change + * manually, but to the UNLOCKED state we desire. + */ + state_change(gl, LM_ST_UNLOCKED); + /* + * We skip telling dlm to do the locking, so we won't get a + * reply that would otherwise clear GLF_LOCK. So we clear it here. + */ + clear_bit(GLF_LOCK, &gl->gl_flags); + clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); goto out; } else { @@ -1018,16 +1039,18 @@ static void delete_work_func(struct work_struct *work) if (gfs2_queue_delete_work(gl, 5 * HZ)) return; } - goto out; } inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, GFS2_BLKST_UNLINKED); - if (!IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -EAGAIN && + (gfs2_queue_delete_work(gl, 5 * HZ))) + return; + } else { d_prune_aliases(inode); iput(inode); } -out: gfs2_glock_put(gl); } @@ -1436,6 +1459,15 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) va_end(args); } +static inline bool pid_is_meaningful(const struct gfs2_holder *gh) +{ + if (!(gh->gh_flags & GL_NOPID)) + return true; + if (gh->gh_state == LM_ST_UNLOCKED) + return true; + return false; +} + /** * add_to_queue - Add a holder to the wait queue (but look for recursion) * @gh: the holder structure to add @@ -1472,10 +1504,17 @@ __acquires(&gl->gl_lockref.lock) } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { - if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && - (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) && - !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags))) - goto trap_recursive; + if (likely(gh2->gh_owner_pid != gh->gh_owner_pid)) + continue; + if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK) + continue; + if (test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)) + continue; + if (!pid_is_meaningful(gh2)) + continue; + goto trap_recursive; + } + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (try_futile && !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { fail: @@ -2194,6 +2233,20 @@ static void dump_glock_func(struct gfs2_glock *gl) dump_glock(NULL, gl, true); } +static void withdraw_dq(struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_lockref.lock); + if (!__lockref_is_dead(&gl->gl_lockref) && + glock_blocked_by_withdraw(gl)) + do_error(gl, LM_OUT_ERROR); /* remove pending waiters */ + spin_unlock(&gl->gl_lockref.lock); +} + +void gfs2_gl_dq_holders(struct gfs2_sbd *sdp) +{ + glock_hash_walk(withdraw_dq, sdp); +} + /** * gfs2_gl_hash_clear - Empty out the glock hash table * @sdp: the filesystem @@ -2272,19 +2325,24 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh, const char *fs_id_buf) { - struct task_struct *gh_owner = NULL; + const char *comm = "(none)"; + pid_t owner_pid = 0; char flags_buf[32]; rcu_read_lock(); - if (gh->gh_owner_pid) + if (pid_is_meaningful(gh)) { + struct task_struct *gh_owner; + + comm = "(ended)"; + owner_pid = pid_nr(gh->gh_owner_pid); gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); + if (gh_owner) + comm = gh_owner->comm; + } gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n", fs_id_buf, state2str(gh->gh_state), hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), - gh->gh_error, - gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, - gh_owner ? gh_owner->comm : "(ended)", - (void *)gh->gh_ip); + gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip); rcu_read_unlock(); } @@ -2699,6 +2757,172 @@ static const struct file_operations gfs2_glstats_fops = { .release = gfs2_glocks_release, }; +struct gfs2_glockfd_iter { + struct super_block *sb; + unsigned int tgid; + struct task_struct *task; + unsigned int fd; + struct file *file; +}; + +static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + struct pid *pid; + + if (i->task) + put_task_struct(i->task); + + rcu_read_lock(); +retry: + i->task = NULL; + pid = find_ge_pid(i->tgid, ns); + if (pid) { + i->tgid = pid_nr_ns(pid, ns); + i->task = pid_task(pid, PIDTYPE_TGID); + if (!i->task) { + i->tgid++; + goto retry; + } + get_task_struct(i->task); + } + rcu_read_unlock(); + return i->task; +} + +static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) +{ + if (i->file) { + fput(i->file); + i->file = NULL; + } + + rcu_read_lock(); + for(;; i->fd++) { + struct inode *inode; + + i->file = task_lookup_next_fd_rcu(i->task, &i->fd); + if (!i->file) { + i->fd = 0; + break; + } + inode = file_inode(i->file); + if (inode->i_sb != i->sb) + continue; + if (get_file_rcu(i->file)) + break; + } + rcu_read_unlock(); + return i->file; +} + +static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct gfs2_glockfd_iter *i = seq->private; + + if (*pos) + return NULL; + while (gfs2_glockfd_next_task(i)) { + if (gfs2_glockfd_next_file(i)) + return i; + i->tgid++; + } + return NULL; +} + +static void *gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glockfd_iter *i = seq->private; + + (*pos)++; + i->fd++; + do { + if (gfs2_glockfd_next_file(i)) + return i; + i->tgid++; + } while (gfs2_glockfd_next_task(i)); + return NULL; +} + +static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glockfd_iter *i = seq->private; + + if (i->file) + fput(i->file); + if (i->task) + put_task_struct(i->task); +} + +static void gfs2_glockfd_seq_show_flock(struct seq_file *seq, + struct gfs2_glockfd_iter *i) +{ + struct gfs2_file *fp = i->file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED }; + + if (!READ_ONCE(fl_gh->gh_gl)) + return; + + spin_lock(&i->file->f_lock); + if (gfs2_holder_initialized(fl_gh)) + gl_name = fl_gh->gh_gl->gl_name; + spin_unlock(&i->file->f_lock); + + if (gl_name.ln_type != LM_TYPE_RESERVED) { + seq_printf(seq, "%d %u %u/%llx\n", + i->tgid, i->fd, gl_name.ln_type, + (unsigned long long)gl_name.ln_number); + } +} + +static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glockfd_iter *i = seq->private; + struct inode *inode = file_inode(i->file); + struct gfs2_glock *gl; + + inode_lock_shared(inode); + gl = GFS2_I(inode)->i_iopen_gh.gh_gl; + if (gl) { + seq_printf(seq, "%d %u %u/%llx\n", + i->tgid, i->fd, gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + } + gfs2_glockfd_seq_show_flock(seq, i); + inode_unlock_shared(inode); + return 0; +} + +static const struct seq_operations gfs2_glockfd_seq_ops = { + .start = gfs2_glockfd_seq_start, + .next = gfs2_glockfd_seq_next, + .stop = gfs2_glockfd_seq_stop, + .show = gfs2_glockfd_seq_show, +}; + +static int gfs2_glockfd_open(struct inode *inode, struct file *file) +{ + struct gfs2_glockfd_iter *i; + struct gfs2_sbd *sdp = inode->i_private; + + i = __seq_open_private(file, &gfs2_glockfd_seq_ops, + sizeof(struct gfs2_glockfd_iter)); + if (!i) + return -ENOMEM; + i->sb = sdp->sd_vfs; + return 0; +} + +static const struct file_operations gfs2_glockfd_fops = { + .owner = THIS_MODULE, + .open = gfs2_glockfd_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) @@ -2708,6 +2932,9 @@ void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glocks_fops); + debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glockfd_fops); + debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glstats_fops); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 5aed8b500cf5..0d068f4fd7d6 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -91,6 +91,7 @@ enum { #define GL_ASYNC 0x0040 #define GL_EXACT 0x0080 #define GL_SKIP 0x0100 +#define GL_NOPID 0x0200 #define GL_NOCACHE 0x0400 /* @@ -274,6 +275,7 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl); extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); extern void gfs2_glock_free(struct gfs2_glock *gl); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c8ec876f33ea..04a201584fa7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -130,6 +130,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_glock *io_gl; + int extra_flags = 0; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); @@ -141,9 +142,12 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail; - if (blktype != GFS2_BLKST_UNLINKED) + if (blktype == GFS2_BLKST_UNLINKED) + extra_flags |= LM_FLAG_TRY; + else gfs2_cancel_delete_work(io_gl); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, + GL_EXACT | GL_NOPID | extra_flags, &ip->i_iopen_gh); gfs2_glock_put(io_gl); if (unlikely(error)) @@ -210,6 +214,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, return inode; fail: + if (error == GLR_TRYFAILED) + error = -EAGAIN; if (gfs2_holder_initialized(&ip->i_iopen_gh)) gfs2_glock_dq_uninit(&ip->i_iopen_gh); if (gfs2_holder_initialized(&i_gh)) @@ -720,7 +726,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); BUG_ON(error); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT | GL_NOPID, + &ip->i_iopen_gh); if (error) goto fail_gunlock2; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 6ce369b096d4..71911bf9ab34 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1302,7 +1302,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) memcpy(cluster, table, strlen(table) - strlen(fsname)); fsname++; - flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; + flags = DLM_LSFL_NEWEXCL; /* * create/join lockspace diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 14ae9de76277..afcb32854f14 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -151,14 +151,6 @@ static int __init init_gfs2_fs(void) if (error) goto fail_shrinker; - error = register_filesystem(&gfs2_fs_type); - if (error) - goto fail_fs1; - - error = register_filesystem(&gfs2meta_fs_type); - if (error) - goto fail_fs2; - error = -ENOMEM; gfs_recovery_wq = alloc_workqueue("gfs_recovery", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); @@ -180,11 +172,23 @@ static int __init init_gfs2_fs(void) goto fail_mempool; gfs2_register_debugfs(); + error = register_filesystem(&gfs2_fs_type); + if (error) + goto fail_fs1; + + error = register_filesystem(&gfs2meta_fs_type); + if (error) + goto fail_fs2; + pr_info("GFS2 installed\n"); return 0; +fail_fs2: + unregister_filesystem(&gfs2_fs_type); +fail_fs1: + mempool_destroy(gfs2_page_pool); fail_mempool: destroy_workqueue(gfs2_freeze_wq); fail_wq3: @@ -192,10 +196,6 @@ fail_wq3: fail_wq2: destroy_workqueue(gfs_recovery_wq); fail_wq1: - unregister_filesystem(&gfs2meta_fs_type); -fail_fs2: - unregister_filesystem(&gfs2_fs_type); -fail_fs1: unregister_shrinker(&gfs2_qd_shrinker); fail_shrinker: kmem_cache_destroy(gfs2_trans_cachep); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 7e70e0ba5a6c..6ed728aae9a5 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -525,8 +525,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; - if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &first_bh); + bh_read_nowait(first_bh, REQ_META | REQ_PRIO); dblock++; extlen--; @@ -534,9 +533,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) while (extlen) { bh = gfs2_getbuf(gl, dblock, CREATE); - if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ | REQ_RAHEAD | REQ_META | - REQ_PRIO, 1, &bh); + bh_readahead(bh, REQ_RAHEAD | REQ_META | REQ_PRIO); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 549879929c84..c0cf1d2d0ef5 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -178,7 +178,10 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) pr_warn("Invalid block size\n"); return -EINVAL; } - + if (sb->sb_bsize_shift != ffs(sb->sb_bsize) - 1) { + pr_warn("Invalid block size shift\n"); + return -EINVAL; + } return 0; } @@ -381,8 +384,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent) if (!table[0]) table = sdp->sd_vfs->s_id; - strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN); - strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN); + BUILD_BUG_ON(GFS2_LOCKNAME_LEN > GFS2_FSNAME_LEN); + + strscpy(sdp->sd_proto_name, proto, GFS2_LOCKNAME_LEN); + strscpy(sdp->sd_table_name, table, GFS2_LOCKNAME_LEN); table = sdp->sd_table_name; while ((table = strchr(table, '/'))) @@ -401,7 +406,8 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, error = gfs2_glock_nq_num(sdp, GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, - LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID, mount_gh); if (error) { fs_err(sdp, "can't acquire mount glock: %d\n", error); @@ -411,7 +417,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, error = gfs2_glock_nq_num(sdp, GFS2_LIVE_LOCK, &gfs2_nondisk_glops, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, + LM_FLAG_NOEXP | GL_EXACT | GL_NOPID, &sdp->sd_live_gh); if (error) { fs_err(sdp, "can't acquire live glock: %d\n", error); @@ -687,7 +693,7 @@ static int init_statfs(struct gfs2_sbd *sdp) iput(pn); pn = NULL; ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID, &sdp->sd_sc_gh); if (error) { fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); @@ -776,7 +782,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, &gfs2_journal_glops, LM_ST_EXCLUSIVE, - LM_FLAG_NOEXP | GL_NOCACHE, + LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID, &sdp->sd_journal_gh); if (error) { fs_err(sdp, "can't acquire journal glock: %d\n", error); @@ -786,7 +792,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) ip = GFS2_I(sdp->sd_jdesc->jd_inode); sdp->sd_jinode_gl = ip->i_gl; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE, + LM_FLAG_NOEXP | GL_EXACT | + GL_NOCACHE | GL_NOPID, &sdp->sd_jinode_gh); if (error) { fs_err(sdp, "can't acquire journal inode glock: %d\n", @@ -957,7 +964,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) pn = NULL; ip = GFS2_I(sdp->sd_qc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); @@ -1439,13 +1446,13 @@ static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) switch (o) { case Opt_lockproto: - strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); break; case Opt_locktable: - strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); break; case Opt_hostdata: - strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); + strscpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); break; case Opt_spectator: args->ar_spectator = 1; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index f201eaf59d0d..1ed17226d9ed 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -745,12 +745,8 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, } if (PageUptodate(page)) set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - goto unlock_out; - } + if (bh_read(bh, REQ_META | REQ_PRIO) < 0) + goto unlock_out; if (gfs2_is_jdata(ip)) gfs2_trans_add_data(ip->i_gl, bh); else diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b5b0f285b27f..b018957a1bb2 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -346,7 +346,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) } error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, - LM_FLAG_NOEXP, &sdp->sd_freeze_gh); + LM_FLAG_NOEXP | GL_NOPID, + &sdp->sd_freeze_gh); if (error) goto out; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 8241029a2a5d..7a6aeffcdf5c 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -164,6 +164,11 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) } if (!ret) gfs2_make_fs_ro(sdp); + /* + * Dequeue any pending non-system glock holders that can no + * longer be granted because the file system is withdrawn. + */ + gfs2_gl_dq_holders(sdp); gfs2_freeze_unlock(&freeze_gh); } @@ -204,6 +209,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) * exception code in glock_dq. */ iput(inode); + sdp->sd_jdesc->jd_inode = NULL; /* * Wait until the journal inode's glock is freed. This allows try locks * on other nodes to be successful, otherwise we remain the owner of @@ -226,7 +232,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) */ fs_warn(sdp, "Requesting recovery of jid %d.\n", sdp->sd_lockstruct.ls_jid); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | LM_FLAG_NOEXP, + gfs2_holder_reinit(LM_ST_EXCLUSIVE, + LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | GL_NOPID, &sdp->sd_live_gh); msleep(GL_GLOCK_MAX_HOLD); /* @@ -251,7 +258,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) fs_warn(sdp, "Unable to recover our journal jid %d.\n", sdp->sd_lockstruct.ls_jid); gfs2_glock_dq_wait(&sdp->sd_live_gh); - gfs2_holder_reinit(LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT, + gfs2_holder_reinit(LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT | GL_NOPID, &sdp->sd_live_gh); gfs2_glock_nq(&sdp->sd_live_gh); } diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index c83fd0e8404d..2015e42e752a 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -21,7 +21,6 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) int pagenum; int bytes_read; int bytes_to_read; - void *vaddr; off += node->page_offset; pagenum = off >> PAGE_SHIFT; @@ -33,9 +32,7 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) page = node->page[pagenum]; bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off); - vaddr = kmap_atomic(page); - memcpy(buf + bytes_read, vaddr + off, bytes_to_read); - kunmap_atomic(vaddr); + memcpy_from_page(buf + bytes_read, page, off, bytes_to_read); pagenum++; off = 0; /* page offset only applies to the first page */ @@ -80,8 +77,7 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off += node->page_offset; page = node->page[0]; - memcpy(kmap(page) + off, buf, len); - kunmap(page); + memcpy_to_page(page, off, buf, len); set_page_dirty(page); } @@ -105,8 +101,7 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off += node->page_offset; page = node->page[0]; - memset(kmap(page) + off, 0, len); - kunmap(page); + memzero_page(page, off, len); set_page_dirty(page); } @@ -123,9 +118,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, src_page = src_node->page[0]; dst_page = dst_node->page[0]; - memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len); - kunmap(src_page); - kunmap(dst_page); + memcpy_page(dst_page, dst, src_page, src, len); set_page_dirty(dst_page); } @@ -140,9 +133,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) src += node->page_offset; dst += node->page_offset; page = node->page[0]; - ptr = kmap(page); + ptr = kmap_local_page(page); memmove(ptr + dst, ptr + src, len); - kunmap(page); + kunmap_local(ptr); set_page_dirty(page); } @@ -346,13 +339,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -436,14 +430,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min((int)PAGE_SIZE, (int)tree->node_size)); + memzero_page(*pagep, node->page_offset, + min((int)PAGE_SIZE, (int)tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 19017d296173..2fa4b1f8cc7f 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -80,7 +80,8 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); @@ -119,11 +120,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; @@ -169,7 +171,8 @@ void hfs_btree_write(struct hfs_btree *tree) return; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); @@ -180,7 +183,7 @@ void hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); } @@ -268,7 +271,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -281,7 +284,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -290,14 +293,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { printk(KERN_DEBUG "create new bmap node...\n"); @@ -313,7 +316,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -360,20 +363,20 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { pr_crit("trying to free free bnode %u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index cebce0cfe340..bd8dcea85588 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -39,7 +39,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; i = offset % 32; offset &= ~(PAGE_CACHE_BITS - 1); @@ -74,7 +74,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, } curr++; } - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; if (offset >= size) break; @@ -84,7 +84,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, start = size; goto out; } - curr = pptr = kmap(page); + curr = pptr = kmap_local_page(page); if ((size ^ offset) / PAGE_CACHE_BITS) end = pptr + PAGE_CACHE_BITS / 32; else @@ -127,7 +127,7 @@ found: len -= 32; } set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); offset += PAGE_CACHE_BITS; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -135,7 +135,7 @@ found: start = size; goto out; } - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -151,7 +151,7 @@ last: done: *curr = cpu_to_be32(n); set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); @@ -185,7 +185,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) page = read_mapping_page(mapping, pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; end = pptr + PAGE_CACHE_BITS / 32; len = count; @@ -215,11 +215,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) break; set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); page = read_mapping_page(mapping, ++pnr, NULL); if (IS_ERR(page)) goto kaboom; - pptr = kmap(page); + pptr = kmap_local_page(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; } @@ -231,7 +231,7 @@ done: } out: set_page_dirty(page); - kunmap(page); + kunmap_local(pptr); sbi->free_blocks += len; hfsplus_mark_mdb_dirty(sb); mutex_unlock(&sbi->alloc_mutex); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index a5ab00e54220..87974d5e6791 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -29,14 +29,12 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(buf, kmap(*pagep) + off, l); - kunmap(*pagep); + memcpy_from_page(buf, *pagep, off, l); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(buf, kmap(*++pagep), l); - kunmap(*pagep); + memcpy_from_page(buf, *++pagep, 0, l); } } @@ -82,16 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memcpy(kmap(*pagep) + off, buf, l); + memcpy_to_page(*pagep, off, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { buf += l; l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++pagep), buf, l); + memcpy_to_page(*++pagep, 0, buf, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -112,15 +108,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) off &= ~PAGE_MASK; l = min_t(int, len, PAGE_SIZE - off); - memset(kmap(*pagep) + off, 0, l); + memzero_page(*pagep, off, l); set_page_dirty(*pagep); - kunmap(*pagep); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memset(kmap(*++pagep), 0, l); + memzero_page(*++pagep, 0, l); set_page_dirty(*pagep); - kunmap(*pagep); } } @@ -142,24 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); - kunmap(*src_page); + memcpy_page(*dst_page, src, *src_page, src, l); set_page_dirty(*dst_page); - kunmap(*dst_page); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memcpy(kmap(*++dst_page), kmap(*++src_page), l); - kunmap(*src_page); + memcpy_page(*++dst_page, 0, *++src_page, 0, l); set_page_dirty(*dst_page); - kunmap(*dst_page); } } else { void *src_ptr, *dst_ptr; do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; src = 0; @@ -171,9 +161,9 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, } l = min(len, l); memcpy(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -185,6 +175,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) { struct page **src_page, **dst_page; + void *src_ptr, *dst_ptr; int l; hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); @@ -202,27 +193,28 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { while (src < len) { - memmove(kmap(*dst_page), kmap(*src_page), src); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr, src_ptr, src); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); len -= src; src = PAGE_SIZE; src_page--; dst_page--; } src -= len; - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, len); - kunmap(*src_page); + dst_ptr = kmap_local_page(*dst_page); + src_ptr = kmap_local_page(*src_page); + memmove(dst_ptr + src, src_ptr + src, len); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (src < dst) { l = src; src = PAGE_SIZE; @@ -234,9 +226,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr - l, src_ptr - l, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (dst == PAGE_SIZE) dst_page--; else @@ -251,26 +243,27 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) if (src == dst) { l = min_t(int, len, PAGE_SIZE - src); - memmove(kmap(*dst_page) + src, - kmap(*src_page) + src, l); - kunmap(*src_page); + + dst_ptr = kmap_local_page(*dst_page) + src; + src_ptr = kmap_local_page(*src_page) + src; + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); while ((len -= l) != 0) { l = min_t(int, len, PAGE_SIZE); - memmove(kmap(*++dst_page), - kmap(*++src_page), l); - kunmap(*src_page); + dst_ptr = kmap_local_page(*++dst_page); + src_ptr = kmap_local_page(*++src_page); + memmove(dst_ptr, src_ptr, l); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); } } else { - void *src_ptr, *dst_ptr; - do { - src_ptr = kmap(*src_page) + src; - dst_ptr = kmap(*dst_page) + dst; + dst_ptr = kmap_local_page(*dst_page) + dst; + src_ptr = kmap_local_page(*src_page) + src; if (PAGE_SIZE - src < PAGE_SIZE - dst) { l = PAGE_SIZE - src; @@ -283,9 +276,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) } l = min(len, l); memmove(dst_ptr, src_ptr, l); - kunmap(*src_page); + kunmap_local(src_ptr); set_page_dirty(*dst_page); - kunmap(*dst_page); + kunmap_local(dst_ptr); if (!dst) dst_page++; else @@ -498,14 +491,14 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; - desc = (struct hfs_bnode_desc *)(kmap(node->page[0]) + - node->page_offset); + desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; - kunmap(node->page[0]); + kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: @@ -589,14 +582,12 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) } pagep = node->page; - memset(kmap(*pagep) + node->page_offset, 0, - min_t(int, PAGE_SIZE, tree->node_size)); + memzero_page(*pagep, node->page_offset, + min_t(int, PAGE_SIZE, tree->node_size)); set_page_dirty(*pagep); - kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { - memset(kmap(*++pagep), 0, PAGE_SIZE); + memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); - kunmap(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 66774f4cb4fd..9e1732a2b92a 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -163,7 +163,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) goto free_inode; /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); @@ -240,11 +240,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - kunmap(page); + kunmap_local(head); put_page(page); return tree; fail_page: + kunmap_local(head); put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; @@ -291,7 +292,7 @@ int hfs_btree_write(struct hfs_btree *tree) return -EIO; /* Load the header */ page = node->page[0]; - head = (struct hfs_btree_header_rec *)(kmap(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); @@ -303,7 +304,7 @@ int hfs_btree_write(struct hfs_btree *tree) head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); - kunmap(page); + kunmap_local(head); set_page_dirty(page); hfs_bnode_put(node); return 0; @@ -394,7 +395,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; idx = 0; @@ -407,7 +408,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) idx += i; data[off] |= m; set_page_dirty(*pagep); - kunmap(*pagep); + kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); @@ -417,14 +418,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } if (++off >= PAGE_SIZE) { - kunmap(*pagep); - data = kmap(*++pagep); + kunmap_local(data); + data = kmap_local_page(*++pagep); off = 0; } idx += 8; len--; } - kunmap(*pagep); + kunmap_local(data); nidx = node->next; if (!nidx) { hfs_dbg(BNODE_MOD, "create new bmap node\n"); @@ -440,7 +441,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); - data = kmap(*pagep); + data = kmap_local_page(*pagep); off &= ~PAGE_MASK; } } @@ -490,7 +491,7 @@ void hfs_bmap_free(struct hfs_bnode *node) } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; - data = kmap(page); + data = kmap_local_page(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; @@ -498,13 +499,13 @@ void hfs_bmap_free(struct hfs_bnode *node) pr_crit("trying to free free bnode " "%u(%d)\n", node->this, node->type); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); - kunmap(page); + kunmap_local(data); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 07881b76d42f..277468783fee 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -103,7 +103,7 @@ static char *__dentry_name(struct dentry *dentry, char *name) */ BUG_ON(p + strlen(p) + 1 != name + PATH_MAX); - strlcpy(name, root, PATH_MAX); + strscpy(name, root, PATH_MAX); if (len > p - name) { __putname(name); return NULL; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f7a5b5124d8a..df7772335dc0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { unlock_page(page); + if (PageHWPoison(page)) { + put_page(page); + retval = -EIO; + break; + } + /* * We have the page, copy it to user space buffer. */ @@ -364,13 +370,155 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void remove_huge_page(struct page *page) +static void hugetlb_delete_from_page_cache(struct page *page) { ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } +/* + * Called with i_mmap_rwsem held for inode based vma maps. This makes + * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault + * mutex for the page in the mapping. So, we can not race with page being + * faulted into the vma. + */ +static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + pte_t *ptep, pte; + + ptep = huge_pte_offset(vma->vm_mm, addr, + huge_page_size(hstate_vma(vma))); + + if (!ptep) + return false; + + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte) || !pte_present(pte)) + return false; + + if (pte_page(pte) == page) + return true; + + return false; +} + +/* + * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. + */ +static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) +{ + if (vma->vm_pgoff < start) + return (start - vma->vm_pgoff) << PAGE_SHIFT; + else + return 0; +} + +static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) +{ + unsigned long t_end; + + if (!end) + return vma->vm_end; + + t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; + if (t_end > vma->vm_end) + t_end = vma->vm_end; + return t_end; +} + +/* + * Called with hugetlb fault mutex held. Therefore, no more mappings to + * this folio can be created while executing the routine. + */ +static void hugetlb_unmap_file_folio(struct hstate *h, + struct address_space *mapping, + struct folio *folio, pgoff_t index) +{ + struct rb_root_cached *root = &mapping->i_mmap; + struct hugetlb_vma_lock *vma_lock; + struct page *page = &folio->page; + struct vm_area_struct *vma; + unsigned long v_start; + unsigned long v_end; + pgoff_t start, end; + + start = index * pages_per_huge_page(h); + end = (index + 1) * pages_per_huge_page(h); + + i_mmap_lock_write(mapping); +retry: + vma_lock = NULL; + vma_interval_tree_foreach(vma, root, start, end - 1) { + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + + if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + continue; + + if (!hugetlb_vma_trylock_write(vma)) { + vma_lock = vma->vm_private_data; + /* + * If we can not get vma lock, we need to drop + * immap_sema and take locks in order. First, + * take a ref on the vma_lock structure so that + * we can be guaranteed it will not go away when + * dropping immap_sema. + */ + kref_get(&vma_lock->refs); + break; + } + + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, + NULL, ZAP_FLAG_DROP_MARKER); + hugetlb_vma_unlock_write(vma); + } + + i_mmap_unlock_write(mapping); + + if (vma_lock) { + /* + * Wait on vma_lock. We know it is still valid as we have + * a reference. We must 'open code' vma locking as we do + * not know if vma_lock is still attached to vma. + */ + down_write(&vma_lock->rw_sema); + i_mmap_lock_write(mapping); + + vma = vma_lock->vma; + if (!vma) { + /* + * If lock is no longer attached to vma, then just + * unlock, drop our reference and retry looking for + * other vmas. + */ + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + goto retry; + } + + /* + * vma_lock is still attached to vma. Check to see if vma + * still maps page and if so, unmap. + */ + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + unmap_hugepage_range(vma, vma->vm_start + v_start, + v_end, NULL, + ZAP_FLAG_DROP_MARKER); + + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + hugetlb_vma_unlock_write(vma); + + goto retry; + } +} + static void hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, zap_flags_t zap_flags) @@ -383,32 +531,66 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * an inclusive "last". */ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { - unsigned long v_offset; + unsigned long v_start; unsigned long v_end; + if (!hugetlb_vma_trylock_write(vma)) + continue; + + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, + NULL, zap_flags); + /* - * Can the expression below overflow on 32-bit arches? - * No, because the interval tree returns us only those vmas - * which overlap the truncated area starting at pgoff, - * and no vma on a 32-bit arch can span beyond the 4GB. + * Note that vma lock only exists for shared/non-private + * vmas. Therefore, lock is not held when calling + * unmap_hugepage_range for private vmas. */ - if (vma->vm_pgoff < start) - v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; - else - v_offset = 0; - - if (!end) - v_end = vma->vm_end; - else { - v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) - + vma->vm_start; - if (v_end > vma->vm_end) - v_end = vma->vm_end; - } + hugetlb_vma_unlock_write(vma); + } +} - unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, - NULL, zap_flags); +/* + * Called with hugetlb fault mutex held. + * Returns true if page was actually removed, false otherwise. + */ +static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, + struct address_space *mapping, + struct folio *folio, pgoff_t index, + bool truncate_op) +{ + bool ret = false; + + /* + * If folio is mapped, it was faulted in after being + * unmapped in caller. Unmap (again) while holding + * the fault mutex. The mutex will prevent faults + * until we finish removing the folio. + */ + if (unlikely(folio_mapped(folio))) + hugetlb_unmap_file_folio(h, mapping, folio, index); + + folio_lock(folio); + /* + * We must remove the folio from page cache before removing + * the region/ reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the region/reserve + * map could fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. + */ + VM_BUG_ON(HPageRestoreReserve(&folio->page)); + hugetlb_delete_from_page_cache(&folio->page); + ret = true; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, index, + index + 1, 1))) + hugetlb_fix_reserve_counts(inode); } + + folio_unlock(folio); + return ret; } /* @@ -418,10 +600,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve - * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents - * page faults in the truncated range by checking i_size. i_size is - * modified while holding i_mmap_rwsem. + * maps and global counts. Page faults can race with truncation. + * During faults, hugetlb_no_page() checks i_size before page allocation, + * and again after obtaining page table lock. It will 'back out' + * allocations in the truncated range. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map @@ -451,61 +633,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash = 0; index = folio->index; - if (!truncate_op) { - /* - * Only need to hold the fault mutex in the - * hole punch case. This prevents races with - * page faults. Races are not possible in the - * case of truncation. - */ - hash = hugetlb_fault_mutex_hash(mapping, index); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - } + hash = hugetlb_fault_mutex_hash(mapping, index); + mutex_lock(&hugetlb_fault_mutex_table[hash]); /* - * If folio is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) now after taking - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. - * - * This race can only happen in the hole punch case. - * Getting here in a truncate operation is a bug. + * Remove folio that was part of folio_batch. */ - if (unlikely(folio_mapped(folio))) { - BUG_ON(truncate_op); - - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_lock_write(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h), - ZAP_FLAG_DROP_MARKER); - i_mmap_unlock_write(mapping); - } + if (remove_inode_single_folio(h, inode, mapping, folio, + index, truncate_op)) + freed++; - folio_lock(folio); - /* - * We must free the huge page and remove from page - * cache (remove_huge_page) BEFORE removing the - * region/reserve map (hugetlb_unreserve_pages). In - * rare out of memory conditions, removal of the - * region/reserve map could fail. Correspondingly, - * the subpool and global reserve usage count can need - * to be adjusted. - */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - remove_huge_page(&folio->page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages(inode, - index, index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - } - - folio_unlock(folio); - if (!truncate_op) - mutex_unlock(&hugetlb_fault_mutex_table[hash]); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); cond_resched(); @@ -543,8 +681,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; - i_mmap_lock_write(mapping); i_size_write(inode, offset); + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, ZAP_FLAG_DROP_MARKER); @@ -703,11 +841,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* - * fault mutex taken here, protects against fault path - * and hole punch. inode_lock previously taken protects - * against truncation. - */ + /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -737,7 +871,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } clear_huge_page(page, addr, pages_per_huge_page(h)); __SetPageUptodate(page); - error = huge_add_to_page_cache(page, mapping, index); + error = hugetlb_add_to_page_cache(page, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, page); put_page(page); @@ -749,7 +883,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, SetHPageMigratable(page); /* - * unlock_page because locked by huge_add_to_page_cache() + * unlock_page because locked by hugetlb_add_to_page_cache() * put_page() due to reference from alloc_huge_page() */ unlock_page(page); @@ -885,33 +1019,18 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * File creation. Allocate an inode, and we're done.. */ -static int do_hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, - umode_t mode, - dev_t dev, - bool tmpfile) +static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; - int error = -ENOSPC; inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); - if (inode) { - dir->i_ctime = dir->i_mtime = current_time(dir); - if (tmpfile) { - d_tmpfile(dentry, inode); - } else { - d_instantiate(dentry, inode); - dget(dentry);/* Extra count - pin the dentry in core */ - } - error = 0; - } - return error; -} - -static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) -{ - return do_hugetlbfs_mknod(dir, dentry, mode, dev, false); + if (!inode) + return -ENOSPC; + dir->i_ctime = dir->i_mtime = current_time(dir); + d_instantiate(dentry, inode); + dget(dentry);/* Extra count - pin the dentry in core */ + return 0; } static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, @@ -932,10 +1051,17 @@ static int hugetlbfs_create(struct user_namespace *mnt_userns, } static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, - struct inode *dir, struct dentry *dentry, + struct inode *dir, struct file *file, umode_t mode) { - return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true); + struct inode *inode; + + inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); + if (!inode) + return -ENOSPC; + dir->i_ctime = dir->i_mtime = current_time(dir); + d_tmpfile(file, inode); + return finish_open_simple(file, 0); } static int hugetlbfs_symlink(struct user_namespace *mnt_userns, @@ -991,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, static int hugetlbfs_error_remove_page(struct address_space *mapping, struct page *page) { - struct inode *inode = mapping->host; - pgoff_t index = page->index; - - remove_huge_page(page); - if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - return 0; } diff --git a/fs/inode.c b/fs/inode.c index ba1de23c13c1..b608528efd3a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -192,8 +192,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_wb_frn_history = 0; #endif - if (security_inode_alloc(inode)) - goto out; spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); @@ -228,11 +226,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; + + if (unlikely(security_inode_alloc(inode))) + return -ENOMEM; this_cpu_inc(nr_inodes); return 0; -out: - return -ENOMEM; } EXPORT_SYMBOL(inode_init_always); diff --git a/fs/internal.h b/fs/internal.h index 87e96b9024ce..6f0386b34fae 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -16,6 +16,7 @@ struct shrink_control; struct fs_context; struct user_namespace; struct pipe_inode_info; +struct iov_iter; /* * block/bdev.c @@ -62,7 +63,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); -int may_linkat(struct user_namespace *mnt_userns, struct path *link); +int may_linkat(struct user_namespace *mnt_userns, const struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); @@ -101,6 +102,16 @@ extern void chroot_fs_refs(const struct path *, const struct path *); extern struct file *alloc_empty_file(int, const struct cred *); extern struct file *alloc_empty_file_noaccount(int, const struct cred *); +static inline void put_file_access(struct file *file) +{ + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { + i_readcount_dec(file->f_inode); + } else if (file->f_mode & FMODE_WRITER) { + put_write_access(file->f_inode); + __mnt_drop_write(file->f_path.mnt); + } +} + /* * super.c */ @@ -221,3 +232,5 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns, int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx); + +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index ca5c62901541..91ee0b308e13 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1360,6 +1360,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, error = wpc->ops->map_blocks(wpc, inode, pos); if (error) break; + trace_iomap_writepage_map(inode, &wpc->iomap); if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) continue; if (wpc->iomap.type == IOMAP_HOLE) @@ -1421,7 +1422,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, if (!count) folio_end_writeback(folio); done: - mapping_set_error(folio->mapping, error); + mapping_set_error(inode->i_mapping, error); return error; } diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index d48868fc40d7..f6ea9540d082 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -148,6 +148,7 @@ DEFINE_EVENT(iomap_class, name, \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); +DEFINE_IOMAP_EVENT(iomap_writepage_map); TRACE_EVENT(iomap_iter, TP_PROTO(struct iomap_iter *iter, const void *ops, diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index b466172eec25..c4da3f634b92 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -67,8 +67,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, for ( i = 0 ; i < pcount ; i++ ) { if (!pages[i]) continue; - memset(page_address(pages[i]), 0, PAGE_SIZE); - flush_dcache_page(pages[i]); + memzero_page(pages[i], 0, PAGE_SIZE); SetPageUptodate(pages[i]); } return ((loff_t)pcount) << PAGE_SHIFT; @@ -82,7 +81,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, return 0; } haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); - ll_rw_block(REQ_OP_READ, haveblocks, bhs); + bh_read_batch(haveblocks, bhs); curbh = 0; curpage = 0; @@ -120,7 +119,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, zerr != Z_STREAM_END) { if (!stream.avail_out) { if (pages[curpage]) { - stream.next_out = page_address(pages[curpage]) + stream.next_out = kmap_local_page(pages[curpage]) + poffset; stream.avail_out = PAGE_SIZE - poffset; poffset = 0; @@ -176,6 +175,10 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, flush_dcache_page(pages[curpage]); SetPageUptodate(pages[curpage]); } + if (stream.next_out != (unsigned char *)zisofs_sink_page) { + kunmap_local(stream.next_out); + stream.next_out = NULL; + } curpage++; } if (!stream.avail_in) @@ -183,6 +186,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, } inflate_out: zlib_inflateEnd(&stream); + if (stream.next_out && stream.next_out != (unsigned char *)zisofs_sink_page) + kunmap_local(stream.next_out); z_eio: mutex_unlock(&zisofs_zlib_lock); @@ -283,9 +288,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount, } if (poffset && *pages) { - memset(page_address(*pages) + poffset, 0, - PAGE_SIZE - poffset); - flush_dcache_page(*pages); + memzero_page(*pages, poffset, PAGE_SIZE - poffset); SetPageUptodate(*pages); } return 0; @@ -343,10 +346,8 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) for (i = 0; i < pcount; i++, index++) { if (i != full_page) pages[i] = grab_cache_page_nowait(mapping, index); - if (pages[i]) { + if (pages[i]) ClearPageError(pages[i]); - kmap(pages[i]); - } } err = zisofs_fill_pages(inode, full_page, pcount, pages); @@ -357,7 +358,6 @@ static int zisofs_read_folio(struct file *file, struct folio *folio) flush_dcache_page(pages[i]); if (i == full_page && err) SetPageError(pages[i]); - kunmap(pages[i]); unlock_page(pages[i]); if (i != full_page) put_page(pages[i]); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 88bf20303466..df9d70588b60 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1277,13 +1277,11 @@ static int isofs_read_level3_size(struct inode *inode) } while (more_entries); out: kfree(tmpde); - if (bh) - brelse(bh); + brelse(bh); return 0; out_nomem: - if (bh) - brelse(bh); + brelse(bh); return -ENOMEM; out_noread: @@ -1486,8 +1484,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) ret = 0; out: kfree(tmpde); - if (bh) - brelse(bh); + brelse(bh); return ret; out_badread: diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b2b2bc9b88d9..885a7a6cc53e 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -122,8 +122,8 @@ static int journal_submit_commit_record(journal_t *journal, { struct commit_header *tmp; struct buffer_head *bh; - int ret; struct timespec64 now; + blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC; *cbh = NULL; @@ -155,13 +155,11 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | - REQ_FUA, bh); - else - ret = submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); + write_flags |= REQ_PREFLUSH | REQ_FUA; + submit_bh(write_flags, bh); *cbh = bh; - return ret; + return 0; } /* @@ -570,7 +568,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) journal->j_running_transaction = NULL; start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; - wake_up(&journal->j_wait_transaction_locked); + wake_up_all(&journal->j_wait_transaction_locked); write_unlock(&journal->j_state_lock); jbd2_debug(3, "JBD2: commit phase 2a\n"); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 6350d3857c89..2696f43e7239 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -923,10 +923,16 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) { bh = journal->j_fc_wbuf[i]; wait_on_buffer(bh); + /* + * Update j_fc_off so jbd2_fc_release_bufs can release remain + * buffer head. + */ + if (unlikely(!buffer_uptodate(bh))) { + journal->j_fc_off = i + 1; + return -EIO; + } put_bh(bh); journal->j_fc_wbuf[i] = NULL; - if (unlikely(!buffer_uptodate(bh))) - return -EIO; } return 0; @@ -1606,7 +1612,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; - int ret; + int ret = 0; /* Buffer got discarded which means block device got invalidated */ if (!buffer_mapped(bh)) { @@ -1636,7 +1642,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) sb->s_checksum = jbd2_superblock_csum(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(REQ_OP_WRITE | write_flags, bh); + submit_bh(REQ_OP_WRITE | write_flags, bh); wait_on_buffer(bh); if (buffer_write_io_error(bh)) { clear_buffer_write_io_error(bh); @@ -1644,9 +1650,8 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) ret = -EIO; } if (ret) { - printk(KERN_ERR "JBD2: Error %d detected when updating " - "journal superblock for %s.\n", ret, - journal->j_devname); + printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n", + journal->j_devname); if (!is_journal_aborted(journal)) jbd2_journal_abort(journal, ret); } @@ -1893,19 +1898,16 @@ static int journal_get_superblock(journal_t *journal) { struct buffer_head *bh; journal_superblock_t *sb; - int err = -EIO; + int err; bh = journal->j_sb_buffer; J_ASSERT(bh != NULL); - if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - printk(KERN_ERR - "JBD2: IO error reading journal superblock\n"); - goto out; - } + err = bh_read(bh, 0); + if (err < 0) { + printk(KERN_ERR + "JBD2: IO error reading journal superblock\n"); + goto out; } if (buffer_verified(bh)) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index f548479615c6..8286a9ec122f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -100,7 +100,7 @@ static int do_readahead(journal_t *journal, unsigned int start) if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; if (nbufs == MAXBUF) { - ll_rw_block(REQ_OP_READ, nbufs, bufs); + bh_readahead_batch(nbufs, bufs, 0); journal_brelse_array(bufs, nbufs); nbufs = 0; } @@ -109,7 +109,7 @@ static int do_readahead(journal_t *journal, unsigned int start) } if (nbufs) - ll_rw_block(REQ_OP_READ, nbufs, bufs); + bh_readahead_batch(nbufs, bufs, 0); err = 0; failed: @@ -152,9 +152,14 @@ static int jread(struct buffer_head **bhp, journal_t *journal, return -ENOMEM; if (!buffer_uptodate(bh)) { - /* If this is a brand new buffer, start readahead. - Otherwise, we assume we are already reading it. */ - if (!buffer_req(bh)) + /* + * If this is a brand new buffer, start readahead. + * Otherwise, we assume we are already reading it. + */ + bool need_readahead = !buffer_req(bh); + + bh_read_nowait(bh, 0); + if (need_readahead) do_readahead(journal, offset); wait_on_buffer(bh); } @@ -256,6 +261,7 @@ static int fc_do_one_pass(journal_t *journal, err = journal->j_fc_replay_callback(journal, bh, pass, next_fc_block - journal->j_fc_first, expected_commit_id); + brelse(bh); next_fc_block++; if (err < 0 || err == JBD2_FC_REPLAY_STOP) break; @@ -687,7 +693,6 @@ static int do_one_pass(journal_t *journal, mark_buffer_dirty(nbh); BUFFER_TRACE(nbh, "marking uptodate"); ++info->nr_replays; - /* ll_rw_block(WRITE, 1, &nbh); */ unlock_buffer(nbh); brelse(obh); brelse(nbh); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e1be93ccd81c..6a404ac1c178 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -168,7 +168,7 @@ static void wait_transaction_locked(journal_t *journal) int need_to_start; tid_t tid = journal->j_running_transaction->t_tid; - prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); @@ -194,7 +194,7 @@ static void wait_transaction_switching(journal_t *journal) read_unlock(&journal->j_state_lock); return; } - prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); read_unlock(&journal->j_state_lock); /* @@ -920,7 +920,7 @@ void jbd2_journal_unlock_updates (journal_t *journal) write_lock(&journal->j_state_lock); --journal->j_barrier_count; write_unlock(&journal->j_state_lock); - wake_up(&journal->j_wait_transaction_locked); + wake_up_all(&journal->j_wait_transaction_locked); } static void warn_dirty_buffer(struct buffer_head *bh) diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index c6821a509481..4061e0ba7010 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1035,7 +1035,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, { int i, ret; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - struct mtd_oob_ops ops; + struct mtd_oob_ops ops = { }; ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; @@ -1076,7 +1076,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) { - struct mtd_oob_ops ops; + struct mtd_oob_ops ops = { }; int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); ops.mode = MTD_OPS_AUTO_OOB; @@ -1101,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) { int ret; - struct mtd_oob_ops ops; + struct mtd_oob_ops ops = { }; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); ops.mode = MTD_OPS_AUTO_OOB; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 1cc88ba6de90..f33b3baad07c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -31,10 +31,15 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) +static bool __kernfs_active(struct kernfs_node *kn) +{ + return atomic_read(&kn->active) >= 0; +} + static bool kernfs_active(struct kernfs_node *kn) { lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); - return atomic_read(&kn->active) >= 0; + return __kernfs_active(kn); } static bool kernfs_lockdep(struct kernfs_node *kn) @@ -472,6 +477,16 @@ static void kernfs_drain(struct kernfs_node *kn) lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); + /* + * Skip draining if already fully drained. This avoids draining and its + * lockdep annotations for nodes which have never been activated + * allowing embedding kernfs_remove() in create error paths without + * worrying about draining. + */ + if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && + !kernfs_should_drain_open_files(kn)) + return; + up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { @@ -480,7 +495,6 @@ static void kernfs_drain(struct kernfs_node *kn) lock_contended(&kn->dep_map, _RET_IP_); } - /* but everyone should wait for draining */ wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); @@ -489,7 +503,8 @@ static void kernfs_drain(struct kernfs_node *kn) rwsem_release(&kn->dep_map, _RET_IP_); } - kernfs_drain_open_files(kn); + if (kernfs_should_drain_open_files(kn)) + kernfs_drain_open_files(kn); down_write(&root->kernfs_rwsem); } @@ -696,12 +711,11 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, } /* - * ACTIVATED is protected with kernfs_mutex but it was clear when - * @kn was added to idr and we just wanna see it set. No need to - * grab kernfs_mutex. + * We should fail if @kn has never been activated and guarantee success + * if the caller knows that @kn is active. Both can be achieved by + * __kernfs_active() which tests @kn->active without kernfs_rwsem. */ - if (unlikely(!(kn->flags & KERNFS_ACTIVATED) || - !atomic_inc_not_zero(&kn->count))) + if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; spin_unlock(&kernfs_idr_lock); @@ -743,10 +757,7 @@ int kernfs_add_one(struct kernfs_node *kn) goto out_unlock; ret = -ENOENT; - if (parent->flags & KERNFS_EMPTY_DIR) - goto out_unlock; - - if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) + if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; kn->hash = kernfs_name_hash(kn->name, kn->ns); @@ -1304,6 +1315,21 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, return pos->parent; } +static void kernfs_activate_one(struct kernfs_node *kn) +{ + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); + + kn->flags |= KERNFS_ACTIVATED; + + if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) + return; + + WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb)); + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); +} + /** * kernfs_activate - activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated @@ -1325,15 +1351,42 @@ void kernfs_activate(struct kernfs_node *kn) down_write(&root->kernfs_rwsem); pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) { - if (pos->flags & KERNFS_ACTIVATED) - continue; + while ((pos = kernfs_next_descendant_post(pos, kn))) + kernfs_activate_one(pos); + + up_write(&root->kernfs_rwsem); +} + +/** + * kernfs_show - show or hide a node + * @kn: kernfs_node to show or hide + * @show: whether to show or hide + * + * If @show is %false, @kn is marked hidden and deactivated. A hidden node is + * ignored in future activaitons. If %true, the mark is removed and activation + * state is restored. This function won't implicitly activate a new node in a + * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. + * + * To avoid recursion complexities, directories aren't supported for now. + */ +void kernfs_show(struct kernfs_node *kn, bool show) +{ + struct kernfs_root *root = kernfs_root(kn); - WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); - WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS); + if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) + return; + + down_write(&root->kernfs_rwsem); - atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); - pos->flags |= KERNFS_ACTIVATED; + if (show) { + kn->flags &= ~KERNFS_HIDDEN; + if (kn->flags & KERNFS_ACTIVATED) + kernfs_activate_one(kn); + } else { + kn->flags |= KERNFS_HIDDEN; + if (kernfs_active(kn)) + atomic_add(KN_DEACTIVATED_BIAS, &kn->active); + kernfs_drain(kn); } up_write(&root->kernfs_rwsem); @@ -1358,34 +1411,27 @@ static void __kernfs_remove(struct kernfs_node *kn) pr_debug("kernfs %s: removing\n", kn->name); - /* prevent any new usage under @kn by deactivating all nodes */ + /* prevent new usage by marking all nodes removing and deactivating */ pos = NULL; - while ((pos = kernfs_next_descendant_post(pos, kn))) + while ((pos = kernfs_next_descendant_post(pos, kn))) { + pos->flags |= KERNFS_REMOVING; if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); + } /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* - * kernfs_drain() drops kernfs_rwsem temporarily and @pos's + * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); - /* - * Drain iff @kn was activated. This avoids draining and - * its lockdep annotations for nodes which have never been - * activated and allows embedding kernfs_remove() in create - * error paths without worrying about draining. - */ - if (kn->flags & KERNFS_ACTIVATED) - kernfs_drain(pos); - else - WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + kernfs_drain(pos); /* * kernfs_unlink_sibling() succeeds once per node. Use it @@ -1585,8 +1631,11 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); - if (kn) + if (kn) { + kernfs_get(kn); __kernfs_remove(kn); + kernfs_put(kn); + } up_write(&root->kernfs_rwsem); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index b3ec34386b43..9ab6c92e02da 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -23,6 +23,8 @@ struct kernfs_open_node { atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ + unsigned int nr_mmapped; + unsigned int nr_to_release; }; /* @@ -57,31 +59,17 @@ static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) } /** - * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn. - * - * @of: associated kernfs_open_file instance. - * @kn: target kernfs_node. - * - * Fetch and return ->attr.open of @kn if @of->list is non empty. - * If @of->list is not empty we can safely assume that @of is on - * @kn->attr.open->files list and this guarantees that @kn->attr.open - * will not vanish i.e. dereferencing outside RCU read-side critical - * section is safe here. - * - * The caller needs to make sure that @of->list is not empty. + * of_on - Return the kernfs_open_node of the specified kernfs_open_file + * @of: taret kernfs_open_file */ -static struct kernfs_open_node * -kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn) +static struct kernfs_open_node *of_on(struct kernfs_open_file *of) { - struct kernfs_open_node *on; - - on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list)); - - return on; + return rcu_dereference_protected(of->kn->attr.open, + !list_empty(&of->list)); } /** - * kernfs_deref_open_node_protected - Get kernfs_open_node corresponding to @kn + * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * * @kn: target kernfs_node. * @@ -96,7 +84,7 @@ kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn) * The caller needs to make sure that kernfs_open_file_mutex is held. */ static struct kernfs_open_node * -kernfs_deref_open_node_protected(struct kernfs_node *kn) +kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); @@ -207,12 +195,8 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v) static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; - struct kernfs_open_node *on = kernfs_deref_open_node(of, of->kn); - - if (!on) - return -EINVAL; - of->event = atomic_read(&on->event); + of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } @@ -235,7 +219,6 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; - struct kernfs_open_node *on; char *buf; buf = of->prealloc_buf; @@ -257,14 +240,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) goto out_free; } - on = kernfs_deref_open_node(of, of->kn); - if (!on) { - len = -EINVAL; - mutex_unlock(&of->mutex); - goto out_free; - } - - of->event = atomic_read(&on->event); + of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) @@ -553,6 +529,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) rc = 0; of->mmapped = true; + of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: @@ -580,31 +557,30 @@ out_unlock: static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { - struct kernfs_open_node *on, *new_on = NULL; - struct mutex *mutex = NULL; + struct kernfs_open_node *on; + struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); - on = kernfs_deref_open_node_protected(kn); + on = kernfs_deref_open_node_locked(kn); - if (on) { - list_add_tail(&of->list, &on->files); - mutex_unlock(mutex); - return 0; - } else { + if (!on) { /* not there, initialize a new one */ - new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); - if (!new_on) { + on = kzalloc(sizeof(*on), GFP_KERNEL); + if (!on) { mutex_unlock(mutex); return -ENOMEM; } - atomic_set(&new_on->event, 1); - init_waitqueue_head(&new_on->poll); - INIT_LIST_HEAD(&new_on->files); - list_add_tail(&of->list, &new_on->files); - rcu_assign_pointer(kn->attr.open, new_on); + atomic_set(&on->event, 1); + init_waitqueue_head(&on->poll); + INIT_LIST_HEAD(&on->files); + rcu_assign_pointer(kn->attr.open, on); } - mutex_unlock(mutex); + list_add_tail(&of->list, &on->files); + if (kn->flags & KERNFS_HAS_RELEASE) + on->nr_to_release++; + + mutex_unlock(mutex); return 0; } @@ -613,6 +589,7 @@ static int kernfs_get_open_node(struct kernfs_node *kn, * * @kn: target kernfs_node * @of: associated kernfs_open_file + * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free @@ -622,21 +599,30 @@ static int kernfs_get_open_node(struct kernfs_node *kn, * None. */ static void kernfs_unlink_open_file(struct kernfs_node *kn, - struct kernfs_open_file *of) + struct kernfs_open_file *of, + bool open_failed) { struct kernfs_open_node *on; - struct mutex *mutex = NULL; + struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); - on = kernfs_deref_open_node_protected(kn); + on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } - if (of) + if (of) { + if (kn->flags & KERNFS_HAS_RELEASE) { + WARN_ON_ONCE(of->released == open_failed); + if (open_failed) + on->nr_to_release--; + } + if (of->mmapped) + on->nr_mmapped--; list_del(&of->list); + } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); @@ -763,7 +749,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) return 0; err_put_node: - kernfs_unlink_open_file(kn, of); + kernfs_unlink_open_file(kn, of, true); err_seq_release: seq_release(inode, file); err_free: @@ -795,6 +781,7 @@ static void kernfs_release_file(struct kernfs_node *kn, */ kn->attr.ops->release(of); of->released = true; + of_on(of)->nr_to_release--; } } @@ -802,15 +789,16 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); - struct mutex *mutex = NULL; if (kn->flags & KERNFS_HAS_RELEASE) { + struct mutex *mutex; + mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } - kernfs_unlink_open_file(kn, of); + kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); @@ -818,28 +806,33 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) return 0; } -void kernfs_drain_open_files(struct kernfs_node *kn) +bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; - struct kernfs_open_file *of; - struct mutex *mutex = NULL; - - if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE))) - return; + bool ret; /* - * lockless opportunistic check is safe below because no one is adding to - * ->attr.open at this point of time. This check allows early bail out - * if ->attr.open is already NULL. kernfs_unlink_open_file makes - * ->attr.open NULL only while holding kernfs_open_file_mutex so below - * check under kernfs_open_file_mutex_ptr(kn) will ensure bailing out if - * ->attr.open became NULL while waiting for the mutex. + * @kn being deactivated guarantees that @kn->attr.open can't change + * beneath us making the lockless test below safe. */ - if (!rcu_access_pointer(kn->attr.open)) - return; + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + rcu_read_lock(); + on = rcu_dereference(kn->attr.open); + ret = on && (on->nr_mmapped || on->nr_to_release); + rcu_read_unlock(); + + return ret; +} + +void kernfs_drain_open_files(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + struct kernfs_open_file *of; + struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); - on = kernfs_deref_open_node_protected(kn); + on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; @@ -848,13 +841,17 @@ void kernfs_drain_open_files(struct kernfs_node *kn) list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); - if (kn->flags & KERNFS_HAS_MMAP) + if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); + of->mmapped = false; + on->nr_mmapped--; + } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } + WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } @@ -874,11 +871,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn) */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { - struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); - struct kernfs_open_node *on = kernfs_deref_open_node(of, kn); - - if (!on) - return EPOLLERR; + struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 3ae214d02d44..fc5821effd97 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -157,6 +157,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, */ extern const struct file_operations kernfs_file_fops; +bool kernfs_should_drain_open_files(struct kernfs_node *kn); void kernfs_drain_open_files(struct kernfs_node *kn); /* diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index c5a5c7b90d72..2a39ffb8423b 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -424,6 +424,9 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, NTLMSSP_NEGOTIATE_56); } + if (cflags & NTLMSSP_NEGOTIATE_SEAL && smb3_encryption_negotiated(conn)) + flags |= NTLMSSP_NEGOTIATE_SEAL; + if (cflags & NTLMSSP_NEGOTIATE_ALWAYS_SIGN) flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; @@ -984,13 +987,16 @@ out: return rc; } -static int ksmbd_get_encryption_key(struct ksmbd_conn *conn, __u64 ses_id, +static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id, int enc, u8 *key) { struct ksmbd_session *sess; u8 *ses_enc_key; - sess = ksmbd_session_lookup_all(conn, ses_id); + if (enc) + sess = work->sess; + else + sess = ksmbd_session_lookup_all(work->conn, ses_id); if (!sess) return -EINVAL; @@ -1078,9 +1084,10 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, return sg; } -int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, +int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov, unsigned int nvec, int enc) { + struct ksmbd_conn *conn = work->conn; struct smb2_transform_hdr *tr_hdr = smb2_get_msg(iov[0].iov_base); unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; int rc; @@ -1094,7 +1101,7 @@ int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); struct ksmbd_crypto_ctx *ctx; - rc = ksmbd_get_encryption_key(conn, + rc = ksmbd_get_encryption_key(work, le64_to_cpu(tr_hdr->SessionId), enc, key); diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h index 25b772653de0..362b6159a6cf 100644 --- a/fs/ksmbd/auth.h +++ b/fs/ksmbd/auth.h @@ -33,9 +33,10 @@ struct ksmbd_session; struct ksmbd_conn; +struct ksmbd_work; struct kvec; -int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, +int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov, unsigned int nvec, int enc); void ksmbd_copy_gss_neg_header(void *buf); int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess, diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 756ad631c019..12be8386446a 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -60,6 +60,12 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) conn->local_nls = load_nls("utf8"); if (!conn->local_nls) conn->local_nls = load_nls_default(); + if (IS_ENABLED(CONFIG_UNICODE)) + conn->um = utf8_load(UNICODE_AGE(12, 1, 0)); + else + conn->um = ERR_PTR(-EOPNOTSUPP); + if (IS_ERR(conn->um)) + conn->um = NULL; atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; @@ -350,6 +356,8 @@ out: wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0); + if (IS_ENABLED(CONFIG_UNICODE)) + utf8_unload(conn->um); unload_nls(conn->local_nls); if (default_conn_ops.terminate_fn) default_conn_ops.terminate_fn(conn); diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index e7f7d5707951..3643354a3fa7 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -14,6 +14,7 @@ #include <net/request_sock.h> #include <linux/kthread.h> #include <linux/nls.h> +#include <linux/unicode.h> #include "smb_common.h" #include "ksmbd_work.h" @@ -46,6 +47,7 @@ struct ksmbd_conn { char *request_buf; struct ksmbd_transport *transport; struct nls_table *local_nls; + struct unicode_map *um; struct list_head conns_list; /* smb session 1 per user */ struct xarray sessions; diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index e0cbcfa98c7e..ff07c67f4565 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -163,7 +163,8 @@ struct ksmbd_share_config_response { __u16 force_directory_mode; __u16 force_uid; __u16 force_gid; - __u32 reserved[128]; /* Reserved room */ + __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; + __u32 reserved[112]; /* Reserved room */ __u32 veto_list_sz; __s8 ____payload[]; }; diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/ksmbd/mgmt/share_config.c index c9bca1c2c834..328a412259dc 100644 --- a/fs/ksmbd/mgmt/share_config.c +++ b/fs/ksmbd/mgmt/share_config.c @@ -16,6 +16,7 @@ #include "user_config.h" #include "user_session.h" #include "../transport_ipc.h" +#include "../misc.h" #define SHARE_HASH_BITS 3 static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS); @@ -26,7 +27,7 @@ struct ksmbd_veto_pattern { struct list_head list; }; -static unsigned int share_name_hash(char *name) +static unsigned int share_name_hash(const char *name) { return jhash(name, strlen(name), 0); } @@ -72,7 +73,7 @@ __get_share_config(struct ksmbd_share_config *share) return share; } -static struct ksmbd_share_config *__share_lookup(char *name) +static struct ksmbd_share_config *__share_lookup(const char *name) { struct ksmbd_share_config *share; unsigned int key = share_name_hash(name); @@ -119,7 +120,8 @@ static int parse_veto_list(struct ksmbd_share_config *share, return 0; } -static struct ksmbd_share_config *share_config_request(char *name) +static struct ksmbd_share_config *share_config_request(struct unicode_map *um, + const char *name) { struct ksmbd_share_config_response *resp; struct ksmbd_share_config *share = NULL; @@ -133,6 +135,19 @@ static struct ksmbd_share_config *share_config_request(char *name) if (resp->flags == KSMBD_SHARE_FLAG_INVALID) goto out; + if (*resp->share_name) { + char *cf_resp_name; + bool equal; + + cf_resp_name = ksmbd_casefold_sharename(um, resp->share_name); + if (IS_ERR(cf_resp_name)) + goto out; + equal = !strcmp(cf_resp_name, name); + kfree(cf_resp_name); + if (!equal) + goto out; + } + share = kzalloc(sizeof(struct ksmbd_share_config), GFP_KERNEL); if (!share) goto out; @@ -190,20 +205,11 @@ out: return share; } -static void strtolower(char *share_name) -{ - while (*share_name) { - *share_name = tolower(*share_name); - share_name++; - } -} - -struct ksmbd_share_config *ksmbd_share_config_get(char *name) +struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, + const char *name) { struct ksmbd_share_config *share; - strtolower(name); - down_read(&shares_table_lock); share = __share_lookup(name); if (share) @@ -212,7 +218,7 @@ struct ksmbd_share_config *ksmbd_share_config_get(char *name) if (share) return share; - return share_config_request(name); + return share_config_request(um, name); } bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/ksmbd/mgmt/share_config.h index 902f2cb1963a..3fd338293942 100644 --- a/fs/ksmbd/mgmt/share_config.h +++ b/fs/ksmbd/mgmt/share_config.h @@ -9,6 +9,7 @@ #include <linux/workqueue.h> #include <linux/hashtable.h> #include <linux/path.h> +#include <linux/unicode.h> struct ksmbd_share_config { char *name; @@ -74,7 +75,8 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share) __ksmbd_share_config_put(share); } -struct ksmbd_share_config *ksmbd_share_config_get(char *name); +struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, + const char *name); bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, const char *filename); #endif /* __SHARE_CONFIG_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c index 97ab7987df6e..8ce17b3fb8da 100644 --- a/fs/ksmbd/mgmt/tree_connect.c +++ b/fs/ksmbd/mgmt/tree_connect.c @@ -17,7 +17,7 @@ struct ksmbd_tree_conn_status ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - char *share_name) + const char *share_name) { struct ksmbd_tree_conn_status status = {-ENOENT, NULL}; struct ksmbd_tree_connect_response *resp = NULL; @@ -26,7 +26,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, struct sockaddr *peer_addr; int ret; - sc = ksmbd_share_config_get(share_name); + sc = ksmbd_share_config_get(conn->um, share_name); if (!sc) return status; @@ -61,7 +61,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, struct ksmbd_share_config *new_sc; ksmbd_share_config_del(sc); - new_sc = ksmbd_share_config_get(share_name); + new_sc = ksmbd_share_config_get(conn->um, share_name); if (!new_sc) { pr_err("Failed to update stale share config\n"); status.ret = -ESTALE; diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/ksmbd/mgmt/tree_connect.h index 71e50271dccf..0f97ddc1e39c 100644 --- a/fs/ksmbd/mgmt/tree_connect.h +++ b/fs/ksmbd/mgmt/tree_connect.h @@ -42,7 +42,7 @@ struct ksmbd_session; struct ksmbd_tree_conn_status ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - char *share_name); + const char *share_name); int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *tree_conn); diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index df991107ad2c..9e8afaa686e3 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/xattr.h> #include <linux/fs.h> +#include <linux/unicode.h> #include "misc.h" #include "smb_common.h" @@ -159,7 +160,7 @@ out: */ char *convert_to_nt_pathname(struct ksmbd_share_config *share, - struct path *path) + const struct path *path) { char *pathname, *ab_pathname, *nt_pathname; int share_path_len = share->path_sz; @@ -226,26 +227,53 @@ void ksmbd_conv_path_to_windows(char *path) strreplace(path, '/', '\\'); } +char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name) +{ + char *cf_name; + int cf_len; + + cf_name = kzalloc(KSMBD_REQ_MAX_SHARE_NAME, GFP_KERNEL); + if (!cf_name) + return ERR_PTR(-ENOMEM); + + if (IS_ENABLED(CONFIG_UNICODE) && um) { + const struct qstr q_name = {.name = name, .len = strlen(name)}; + + cf_len = utf8_casefold(um, &q_name, cf_name, + KSMBD_REQ_MAX_SHARE_NAME); + if (cf_len < 0) + goto out_ascii; + + return cf_name; + } + +out_ascii: + cf_len = strscpy(cf_name, name, KSMBD_REQ_MAX_SHARE_NAME); + if (cf_len < 0) { + kfree(cf_name); + return ERR_PTR(-E2BIG); + } + + for (; *cf_name; ++cf_name) + *cf_name = isascii(*cf_name) ? tolower(*cf_name) : *cf_name; + return cf_name - cf_len; +} + /** * ksmbd_extract_sharename() - get share name from tree connect request * @treename: buffer containing tree name and share name * * Return: share name on success, otherwise error */ -char *ksmbd_extract_sharename(char *treename) +char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename) { - char *name = treename; - char *dst; - char *pos = strrchr(name, '\\'); + const char *name = treename, *pos = strrchr(name, '\\'); if (pos) name = (pos + 1); /* caller has to free the memory */ - dst = kstrdup(name, GFP_KERNEL); - if (!dst) - return ERR_PTR(-ENOMEM); - return dst; + return ksmbd_casefold_sharename(um, name); } /** diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h index aae2a252945f..1facfcd21200 100644 --- a/fs/ksmbd/misc.h +++ b/fs/ksmbd/misc.h @@ -15,12 +15,13 @@ int match_pattern(const char *str, size_t len, const char *pattern); int ksmbd_validate_filename(char *filename); int parse_stream_name(char *filename, char **stream_name, int *s_type); char *convert_to_nt_pathname(struct ksmbd_share_config *share, - struct path *path); + const struct path *path); int get_nlink(struct kstat *st); void ksmbd_conv_path_to_unix(char *path); void ksmbd_strip_last_slash(char *path); void ksmbd_conv_path_to_windows(char *path); -char *ksmbd_extract_sharename(char *treename); +char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name); +char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename); char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name); #define KSMBD_DIR_INFO_ALIGNMENT 8 diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c index 5052be9261d9..0ae8d08d85a8 100644 --- a/fs/ksmbd/ndr.c +++ b/fs/ksmbd/ndr.c @@ -345,6 +345,8 @@ int ndr_encode_posix_acl(struct ndr *n, { unsigned int ref_id = 0x00020000; int ret; + vfsuid_t vfsuid; + vfsgid_t vfsgid; n->offset = 0; n->length = 1024; @@ -372,10 +374,12 @@ int ndr_encode_posix_acl(struct ndr *n, if (ret) return ret; - ret = ndr_write_int64(n, from_kuid(&init_user_ns, i_uid_into_mnt(user_ns, inode))); + vfsuid = i_uid_into_vfsuid(user_ns, inode); + ret = ndr_write_int64(n, from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid))); if (ret) return ret; - ret = ndr_write_int64(n, from_kgid(&init_user_ns, i_gid_into_mnt(user_ns, inode))); + vfsgid = i_gid_into_vfsgid(user_ns, inode); + ret = ndr_write_int64(n, from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid))); if (ret) return ret; ret = ndr_write_int32(n, inode->i_mode); diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 9046cff4374b..d7d47b82451d 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1609,12 +1609,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) struct create_posix_rsp *buf; struct inode *inode = file_inode(fp->filp); struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); buf = (struct create_posix_rsp *)cc; memset(buf, 0, sizeof(struct create_posix_rsp)); buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_posix_rsp, nlink)); - buf->ccontext.DataLength = cpu_to_le32(52); + /* + * DataLength = nlink(4) + reparse_tag(4) + mode(4) + + * domain sid(28) + unix group sid(16). + */ + buf->ccontext.DataLength = cpu_to_le32(56); buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_posix_rsp, Name)); buf->ccontext.NameLength = cpu_to_le16(POSIX_CTXT_DATA_LEN); @@ -1638,13 +1644,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) buf->nlink = cpu_to_le32(inode->i_nlink); buf->reparse_tag = cpu_to_le32(fp->volatile_id); - buf->mode = cpu_to_le32(inode->i_mode); - id_to_sid(from_kuid_munged(&init_user_ns, - i_uid_into_mnt(user_ns, inode)), - SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]); - id_to_sid(from_kgid_munged(&init_user_ns, - i_gid_into_mnt(user_ns, inode)), - SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]); + buf->mode = cpu_to_le32(inode->i_mode & 0777); + /* + * SidBuffer(44) contain two sids(Domain sid(28), UNIX group sid(16)). + * Domain sid(28) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 4(num_subauth)) + RID(4). + * UNIX group id(16) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 1(num_subauth)) + RID(4). + */ + id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)), + SIDOWNER, (struct smb_sid *)&buf->SidBuffer[0]); + id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)), + SIDUNIX_GROUP, (struct smb_sid *)&buf->SidBuffer[28]); } /* diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index ce42bff42ef9..a0d635304754 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -235,10 +235,8 @@ send: if (work->sess && work->sess->enc && work->encrypted && conn->ops->encrypt_resp) { rc = conn->ops->encrypt_resp(work); - if (rc < 0) { + if (rc < 0) conn->ops->set_rsp_status(work, STATUS_DATA_ERROR); - goto send; - } } ksmbd_conn_write(work); diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 19412ac701a6..b2fc85d440d0 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -925,7 +925,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn, * * Return: true if connection should be encrypted, else false */ -static bool smb3_encryption_negotiated(struct ksmbd_conn *conn) +bool smb3_encryption_negotiated(struct ksmbd_conn *conn) { if (!conn->ops->generate_encryptionkey) return false; @@ -1883,7 +1883,7 @@ int smb2_tree_connect(struct ksmbd_work *work) goto out_err1; } - name = ksmbd_extract_sharename(treename); + name = ksmbd_extract_sharename(conn->um, treename); if (IS_ERR(name)) { status.ret = KSMBD_TREE_CONN_STATUS_ERROR; goto out_err1; @@ -2185,7 +2185,7 @@ out: * Return: 0 on success, otherwise error */ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, - struct path *path) + const struct path *path) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); char *attr_name = NULL, *value; @@ -2272,7 +2272,7 @@ next: return rc; } -static noinline int smb2_set_stream_name_xattr(struct path *path, +static noinline int smb2_set_stream_name_xattr(const struct path *path, struct ksmbd_file *fp, char *stream_name, int s_type) { @@ -2311,7 +2311,7 @@ static noinline int smb2_set_stream_name_xattr(struct path *path, return 0; } -static int smb2_remove_smb_xattrs(struct path *path) +static int smb2_remove_smb_xattrs(const struct path *path) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); char *name, *xattr_list = NULL; @@ -2345,7 +2345,7 @@ out: return err; } -static int smb2_create_truncate(struct path *path) +static int smb2_create_truncate(const struct path *path) { int rc = vfs_truncate(path, 0); @@ -2364,7 +2364,7 @@ static int smb2_create_truncate(struct path *path) return rc; } -static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path, +static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path *path, struct ksmbd_file *fp) { struct xattr_dos_attrib da = {0}; @@ -2387,7 +2387,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path, } static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon, - struct path *path, struct ksmbd_file *fp) + const struct path *path, struct ksmbd_file *fp) { struct xattr_dos_attrib da; int rc; @@ -2447,7 +2447,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, static int smb2_create_sd_buffer(struct ksmbd_work *work, struct smb2_create_req *req, - struct path *path) + const struct path *path) { struct create_context *context; struct create_sd_buf_req *sd_buf; @@ -2477,8 +2477,11 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr, struct user_namespace *mnt_userns, struct inode *inode) { - fattr->cf_uid = i_uid_into_mnt(mnt_userns, inode); - fattr->cf_gid = i_gid_into_mnt(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + + fattr->cf_uid = vfsuid_into_kuid(vfsuid); + fattr->cf_gid = vfsgid_into_kgid(vfsgid); fattr->cf_mode = inode->i_mode; fattr->cf_acls = NULL; fattr->cf_dacls = NULL; @@ -2761,7 +2764,6 @@ int smb2_open(struct ksmbd_work *work) } else { file_present = true; user_ns = mnt_user_ns(path.mnt); - generic_fillattr(user_ns, d_inode(path.dentry), &stat); } if (stream_name) { if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { @@ -2770,7 +2772,8 @@ int smb2_open(struct ksmbd_work *work) rsp->hdr.Status = STATUS_NOT_A_DIRECTORY; } } else { - if (S_ISDIR(stat.mode) && s_type == DATA_STREAM) { + if (file_present && S_ISDIR(d_inode(path.dentry)->i_mode) && + s_type == DATA_STREAM) { rc = -EIO; rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY; } @@ -2787,7 +2790,8 @@ int smb2_open(struct ksmbd_work *work) } if (file_present && req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE && - S_ISDIR(stat.mode) && !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { + S_ISDIR(d_inode(path.dentry)->i_mode) && + !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { ksmbd_debug(SMB, "open() argument is a directory: %s, %x\n", name, req->CreateOptions); rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY; @@ -2797,7 +2801,7 @@ int smb2_open(struct ksmbd_work *work) if (file_present && (req->CreateOptions & FILE_DIRECTORY_FILE_LE) && !(req->CreateDisposition == FILE_CREATE_LE) && - !S_ISDIR(stat.mode)) { + !S_ISDIR(d_inode(path.dentry)->i_mode)) { rsp->hdr.Status = STATUS_NOT_A_DIRECTORY; rc = -EIO; goto err_out; @@ -3561,17 +3565,22 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, posix_info->AllocationSize = cpu_to_le64(ksmbd_kstat->kstat->blocks << 9); posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev); posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink); - posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode); + posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode & 0777); posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino); posix_info->DosAttributes = S_ISDIR(ksmbd_kstat->kstat->mode) ? FILE_ATTRIBUTE_DIRECTORY_LE : FILE_ATTRIBUTE_ARCHIVE_LE; if (d_info->hide_dot_file && d_info->name[0] == '.') posix_info->DosAttributes |= FILE_ATTRIBUTE_HIDDEN_LE; + /* + * SidBuffer(32) contain two sids(Domain sid(16), UNIX group sid(16)). + * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 1(num_subauth)) + RID(4). + */ id_to_sid(from_kuid_munged(&init_user_ns, ksmbd_kstat->kstat->uid), - SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]); + SIDUNIX_USER, (struct smb_sid *)&posix_info->SidBuffer[0]); id_to_sid(from_kgid_munged(&init_user_ns, ksmbd_kstat->kstat->gid), - SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]); + SIDUNIX_GROUP, (struct smb_sid *)&posix_info->SidBuffer[16]); memcpy(posix_info->name, conv_name, conv_len); posix_info->name_len = cpu_to_le32(conv_len); posix_info->NextEntryOffset = cpu_to_le32(next_entry_offset); @@ -3776,7 +3785,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, return 0; } -static int __query_dir(struct dir_context *ctx, const char *name, int namlen, +static bool __query_dir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct ksmbd_readdir_data *buf; @@ -3790,27 +3799,20 @@ static int __query_dir(struct dir_context *ctx, const char *name, int namlen, /* dot and dotdot entries are already reserved */ if (!strcmp(".", name) || !strcmp("..", name)) - return 0; + return true; if (ksmbd_share_veto_filename(priv->work->tcon->share_conf, name)) - return 0; + return true; if (!match_pattern(name, namlen, priv->search_pattern)) - return 0; + return true; d_info->name = name; d_info->name_len = namlen; rc = reserve_populate_dentry(d_info, priv->info_level); if (rc) - return rc; - if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) { + return false; + if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) d_info->out_buf_len = 0; - return 0; - } - return 0; -} - -static void restart_ctx(struct dir_context *ctx) -{ - ctx->pos = 0; + return true; } static int verify_info_level(int info_level) @@ -3894,8 +3896,7 @@ int smb2_query_dir(struct ksmbd_work *work) inode_permission(file_mnt_user_ns(dir_fp->filp), file_inode(dir_fp->filp), MAY_READ | MAY_EXEC)) { - pr_err("no right to enumerate directory (%pd)\n", - dir_fp->filp->f_path.dentry); + pr_err("no right to enumerate directory (%pD)\n", dir_fp->filp); rc = -EACCES; goto err_out2; } @@ -3921,7 +3922,6 @@ int smb2_query_dir(struct ksmbd_work *work) if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) { ksmbd_debug(SMB, "Restart directory scan\n"); generic_file_llseek(dir_fp->filp, 0, SEEK_SET); - restart_ctx(&dir_fp->readdir_data.ctx); } memset(&d_info, 0, sizeof(struct ksmbd_dir_info)); @@ -3968,11 +3968,9 @@ int smb2_query_dir(struct ksmbd_work *work) */ if (!d_info.out_buf_len && !d_info.num_entry) goto no_buf_len; - if (rc == 0) - restart_ctx(&dir_fp->readdir_data.ctx); - if (rc == -ENOSPC) + if (rc > 0 || rc == -ENOSPC) rc = 0; - if (rc) + else if (rc) goto err_out; d_info.wptr = d_info.rptr; @@ -4029,6 +4027,8 @@ err_out2: rsp->hdr.Status = STATUS_NO_MEMORY; else if (rc == -EFAULT) rsp->hdr.Status = STATUS_INVALID_INFO_CLASS; + else if (rc == -EIO) + rsp->hdr.Status = STATUS_FILE_CORRUPT_ERROR; if (!rsp->hdr.Status) rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; @@ -4158,7 +4158,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, int rc, name_len, value_len, xattr_list_len, idx; ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0; struct smb2_ea_info_req *ea_req = NULL; - struct path *path; + const struct path *path; struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); if (!(fp->daccess & FILE_READ_EA_LE)) { @@ -4495,7 +4495,7 @@ static void get_file_stream_info(struct ksmbd_work *work, struct smb2_file_stream_info *file_info; char *stream_name, *xattr_list = NULL, *stream_buf; struct kstat stat; - struct path *path = &fp->filp->f_path; + const struct path *path = &fp->filp->f_path; ssize_t xattr_list_len; int nbytes = 0, streamlen, stream_name_len, next, idx = 0; int buf_free_len; @@ -4720,7 +4720,11 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, { struct smb311_posix_qinfo *file_info; struct inode *inode = file_inode(fp->filp); + struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode); u64 time; + int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32; file_info = (struct smb311_posix_qinfo *)rsp->Buffer; file_info->CreationTime = cpu_to_le64(fp->create_time); @@ -4735,12 +4739,22 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->EndOfFile = cpu_to_le64(inode->i_size); file_info->AllocationSize = cpu_to_le64(inode->i_blocks << 9); file_info->HardLinks = cpu_to_le32(inode->i_nlink); - file_info->Mode = cpu_to_le32(inode->i_mode); + file_info->Mode = cpu_to_le32(inode->i_mode & 0777); file_info->DeviceId = cpu_to_le32(inode->i_rdev); - rsp->OutputBufferLength = - cpu_to_le32(sizeof(struct smb311_posix_qinfo)); - inc_rfc1001_len(rsp_org, sizeof(struct smb311_posix_qinfo)); - return 0; + + /* + * Sids(32) contain two sids(Domain sid(16), UNIX group sid(16)). + * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) + + * sub_auth(4 * 1(num_subauth)) + RID(4). + */ + id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)), + SIDUNIX_USER, (struct smb_sid *)&file_info->Sids[0]); + id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)), + SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]); + + rsp->OutputBufferLength = cpu_to_le32(out_buf_len); + inc_rfc1001_len(rsp_org, out_buf_len); + return out_buf_len; } static int smb2_get_info_file(struct ksmbd_work *work, @@ -4860,8 +4874,8 @@ static int smb2_get_info_file(struct ksmbd_work *work, pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); rc = -EOPNOTSUPP; } else { - rc = find_file_posix_info(rsp, fp, work->response_buf); - file_infoclass_size = sizeof(struct smb311_posix_qinfo); + file_infoclass_size = find_file_posix_info(rsp, fp, + work->response_buf); } break; default: @@ -5413,7 +5427,7 @@ static int smb2_rename(struct ksmbd_work *work, if (!pathname) return -ENOMEM; - abs_oldname = d_path(&fp->filp->f_path, pathname, PATH_MAX); + abs_oldname = file_path(fp->filp, pathname, PATH_MAX); if (IS_ERR(abs_oldname)) { rc = -EINVAL; goto out; @@ -5548,7 +5562,7 @@ static int smb2_create_link(struct ksmbd_work *work, } ksmbd_debug(SMB, "link name is %s\n", link_name); - target_name = d_path(&filp->f_path, pathname, PATH_MAX); + target_name = file_path(filp, pathname, PATH_MAX); if (IS_ERR(target_name)) { rc = -EINVAL; goto out; @@ -6266,8 +6280,8 @@ int smb2_read(struct ksmbd_work *work) goto out; } - ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", - fp->filp->f_path.dentry, offset, length); + ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", + fp->filp, offset, length); work->aux_payload_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!work->aux_payload_buf) { @@ -6531,8 +6545,8 @@ int smb2_write(struct ksmbd_work *work) data_buf = (char *)(((char *)&req->hdr.ProtocolId) + le16_to_cpu(req->DataOffset)); - ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", - fp->filp->f_path.dentry, offset, length); + ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", + fp->filp, offset, length); err = ksmbd_vfs_write(work, fp, data_buf, length, &offset, writethrough, &nbytes); if (err < 0) @@ -7643,11 +7657,16 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } - if (in_buf_len < sizeof(struct validate_negotiate_info_req)) - return -EINVAL; + if (in_buf_len < offsetof(struct validate_negotiate_info_req, + Dialects)) { + ret = -EINVAL; + goto out; + } - if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) - return -EINVAL; + if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) { + ret = -EINVAL; + goto out; + } ret = fsctl_validate_negotiate_info(conn, (struct validate_negotiate_info_req *)&req->Buffer[0], @@ -8573,7 +8592,7 @@ int smb3_encrypt_resp(struct ksmbd_work *work) buf_size += iov[1].iov_len; work->resp_hdr_sz = iov[1].iov_len; - rc = ksmbd_crypt_message(work->conn, iov, rq_nvec, 1); + rc = ksmbd_crypt_message(work, iov, rq_nvec, 1); if (rc) return rc; @@ -8592,7 +8611,6 @@ bool smb3_is_transform_hdr(void *buf) int smb3_decrypt_req(struct ksmbd_work *work) { - struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess; char *buf = work->request_buf; unsigned int pdu_length = get_rfc1002_len(buf); @@ -8612,7 +8630,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) return -ECONNABORTED; } - sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId)); + sess = ksmbd_session_lookup_all(work->conn, le64_to_cpu(tr_hdr->SessionId)); if (!sess) { pr_err("invalid session id(%llx) in transform header\n", le64_to_cpu(tr_hdr->SessionId)); @@ -8623,7 +8641,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4; iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr) + 4; iov[1].iov_len = buf_data_size; - rc = ksmbd_crypt_message(conn, iov, 2, 0); + rc = ksmbd_crypt_message(work, iov, 2, 0); if (rc) return rc; diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index af455278d005..092fdd3f8750 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -158,7 +158,8 @@ struct create_posix_rsp { __le32 nlink; __le32 reparse_tag; __le32 mode; - u8 SidBuffer[40]; + /* SidBuffer contain two sids(Domain sid(28), UNIX group sid(16)) */ + u8 SidBuffer[44]; } __packed; struct smb2_buffer_desc_v1 { @@ -439,7 +440,8 @@ struct smb2_posix_info { __le32 HardLinks; __le32 ReparseTag; __le32 Mode; - u8 SidBuffer[40]; + /* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */ + u8 SidBuffer[32]; __le32 name_len; u8 name[1]; /* @@ -492,6 +494,7 @@ int smb3_decrypt_req(struct ksmbd_work *work); int smb3_encrypt_resp(struct ksmbd_work *work); bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work); int smb2_set_rsp_credits(struct ksmbd_work *work); +bool smb3_encryption_negotiated(struct ksmbd_conn *conn); /* smb2 misc functions */ int ksmbd_smb2_check_message(struct ksmbd_work *work); diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 7f8ab14fb8ec..d96da872d70a 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -4,6 +4,8 @@ * Copyright (C) 2018 Namjae Jeon <linkinjeon@kernel.org> */ +#include <linux/user_namespace.h> + #include "smb_common.h" #include "server.h" #include "misc.h" @@ -625,8 +627,8 @@ int ksmbd_override_fsids(struct ksmbd_work *work) if (!cred) return -ENOMEM; - cred->fsuid = make_kuid(current_user_ns(), uid); - cred->fsgid = make_kgid(current_user_ns(), gid); + cred->fsuid = make_kuid(&init_user_ns, uid); + cred->fsgid = make_kgid(&init_user_ns, gid); gi = groups_alloc(0); if (!gi) { diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index 3781bca2c8fc..b05ff9b146b5 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -275,7 +275,8 @@ static int sid_to_id(struct user_namespace *user_ns, uid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - uid = mapped_kuid_user(user_ns, &init_user_ns, KUIDT_INIT(id)); + uid = KUIDT_INIT(id); + uid = from_vfsuid(user_ns, &init_user_ns, VFSUIDT_INIT(uid)); if (uid_valid(uid)) { fattr->cf_uid = uid; rc = 0; @@ -285,7 +286,8 @@ static int sid_to_id(struct user_namespace *user_ns, gid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - gid = mapped_kgid_user(user_ns, &init_user_ns, KGIDT_INIT(id)); + gid = KGIDT_INIT(id); + gid = from_vfsgid(user_ns, &init_user_ns, VFSGIDT_INIT(gid)); if (gid_valid(gid)) { fattr->cf_gid = gid; rc = 0; @@ -991,7 +993,7 @@ static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type, } int smb_inherit_dacl(struct ksmbd_conn *conn, - struct path *path, + const struct path *path, unsigned int uid, unsigned int gid) { const struct smb_sid *psid, *creator = NULL; @@ -1185,7 +1187,7 @@ bool smb_inherit_flags(int flags, bool is_dir) return false; } -int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, +int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, __le32 *pdaccess, int uid) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); @@ -1352,7 +1354,7 @@ err_out: } int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, - struct path *path, struct smb_ntsd *pntsd, int ntsd_len, + const struct path *path, struct smb_ntsd *pntsd, int ntsd_len, bool type_check) { int rc; diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h index fcb2c83f2992..618f2e0236b3 100644 --- a/fs/ksmbd/smbacl.h +++ b/fs/ksmbd/smbacl.h @@ -201,12 +201,12 @@ void posix_state_to_acl(struct posix_acl_state *state, struct posix_acl_entry *pace); int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid); bool smb_inherit_flags(int flags, bool is_dir); -int smb_inherit_dacl(struct ksmbd_conn *conn, struct path *path, +int smb_inherit_dacl(struct ksmbd_conn *conn, const struct path *path, unsigned int uid, unsigned int gid); -int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, +int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, __le32 *pdaccess, int uid); int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, - struct path *path, struct smb_ntsd *pntsd, int ntsd_len, + const struct path *path, struct smb_ntsd *pntsd, int ntsd_len, bool type_check); void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid); void ksmbd_init_domain(u32 *sub_auth); @@ -214,25 +214,25 @@ void ksmbd_init_domain(u32 *sub_auth); static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns, struct posix_acl_entry *pace) { - kuid_t kuid; + vfsuid_t vfsuid; /* If this is an idmapped mount, apply the idmapping. */ - kuid = mapped_kuid_fs(mnt_userns, &init_user_ns, pace->e_uid); + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pace->e_uid); /* Translate the kuid into a userspace id ksmbd would see. */ - return from_kuid(&init_user_ns, kuid); + return from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid)); } static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns, struct posix_acl_entry *pace) { - kgid_t kgid; + vfsgid_t vfsgid; /* If this is an idmapped mount, apply the idmapping. */ - kgid = mapped_kgid_fs(mnt_userns, &init_user_ns, pace->e_gid); + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pace->e_gid); /* Translate the kgid into a userspace id ksmbd would see. */ - return from_kgid(&init_user_ns, kgid); + return from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid)); } #endif /* _SMBACL_H */ diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 7cb0eeb07c80..c9aca21637d5 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -197,6 +197,7 @@ static struct genl_family ksmbd_genl_family = { .module = THIS_MODULE, .ops = ksmbd_genl_ops, .n_ops = ARRAY_SIZE(ksmbd_genl_ops), + .resv_start_op = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE + 1, }; static void ksmbd_nl_init_fixup(void) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 35b55ee94fe5..096eda9ef873 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -32,7 +32,7 @@ /* SMB_DIRECT negotiation timeout in seconds */ #define SMB_DIRECT_NEGOTIATE_TIMEOUT 120 -#define SMB_DIRECT_MAX_SEND_SGES 8 +#define SMB_DIRECT_MAX_SEND_SGES 6 #define SMB_DIRECT_MAX_RECV_SGES 1 /* @@ -62,13 +62,13 @@ static int smb_direct_receive_credit_max = 255; static int smb_direct_send_credit_target = 255; /* The maximum single message size can be sent to remote peer */ -static int smb_direct_max_send_size = 8192; +static int smb_direct_max_send_size = 1364; /* The maximum fragmented upper-layer payload receive size supported */ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ -static int smb_direct_max_receive_size = 8192; +static int smb_direct_max_receive_size = 1364; static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE; @@ -1527,6 +1527,8 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, } case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_DISCONNECTED: { + ib_drain_qp(t->qp); + t->status = SMB_DIRECT_CS_DISCONNECTED; wake_up_interruptible(&t->wait_status); wake_up_interruptible(&t->wait_reassembly_queue); diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 143bba4e4db8..63d55f543bd2 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -399,7 +399,8 @@ static int create_socket(struct interface *iface) ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); if (ret) { - pr_err("Can't create socket for ipv6, try ipv4: %d\n", ret); + if (ret != -EAFNOSUPPORT) + pr_err("Can't create socket for ipv6, fallback to ipv4: %d\n", ret); ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); if (ret) { diff --git a/fs/ksmbd/unicode.h b/fs/ksmbd/unicode.h index 5593024230ae..076f6034a789 100644 --- a/fs/ksmbd/unicode.h +++ b/fs/ksmbd/unicode.h @@ -24,6 +24,7 @@ #include <asm/byteorder.h> #include <linux/types.h> #include <linux/nls.h> +#include <linux/unicode.h> #define UNIUPR_NOLOWER /* Example to not expand lower case tables */ @@ -69,7 +70,7 @@ char *smb_strndup_from_utf16(const char *src, const int maxlen, const struct nls_table *codepage); int smbConvertToUTF16(__le16 *target, const char *source, int srclen, const struct nls_table *cp, int mapchars); -char *ksmbd_extract_sharename(char *treename); +char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename); #endif /* diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 78d01033604c..8de970d6146f 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -377,8 +377,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, if (work->conn->connection_type) { if (!(fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) { - pr_err("no right to read(%pd)\n", - fp->filp->f_path.dentry); + pr_err("no right to read(%pD)\n", fp->filp); return -EACCES; } } @@ -487,8 +486,7 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, if (work->conn->connection_type) { if (!(fp->daccess & FILE_WRITE_DATA_LE)) { - pr_err("no right to write(%pd)\n", - fp->filp->f_path.dentry); + pr_err("no right to write(%pD)\n", fp->filp); err = -EACCES; goto out; } @@ -527,8 +525,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, if (sync) { err = vfs_fsync_range(filp, offset, offset + *written, 0); if (err < 0) - pr_err("fsync failed for filename = %pd, err = %d\n", - fp->filp->f_path.dentry, err); + pr_err("fsync failed for filename = %pD, err = %d\n", + fp->filp, err); } out: @@ -543,7 +541,7 @@ out: * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_getattr(struct path *path, struct kstat *stat) +int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat) { int err; @@ -1105,7 +1103,7 @@ int ksmbd_vfs_unlink(struct user_namespace *user_ns, return err; } -static int __dir_empty(struct dir_context *ctx, const char *name, int namlen, +static bool __dir_empty(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct ksmbd_readdir_data *buf; @@ -1113,9 +1111,7 @@ static int __dir_empty(struct dir_context *ctx, const char *name, int namlen, buf = container_of(ctx, struct ksmbd_readdir_data, ctx); buf->dirent_count++; - if (buf->dirent_count > 2) - return -ENOTEMPTY; - return 0; + return buf->dirent_count <= 2; } /** @@ -1142,22 +1138,33 @@ int ksmbd_vfs_empty_dir(struct ksmbd_file *fp) return err; } -static int __caseless_lookup(struct dir_context *ctx, const char *name, +static bool __caseless_lookup(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct ksmbd_readdir_data *buf; + int cmp = -EINVAL; buf = container_of(ctx, struct ksmbd_readdir_data, ctx); if (buf->used != namlen) - return 0; - if (!strncasecmp((char *)buf->private, name, namlen)) { + return true; + if (IS_ENABLED(CONFIG_UNICODE) && buf->um) { + const struct qstr q_buf = {.name = buf->private, + .len = buf->used}; + const struct qstr q_name = {.name = name, + .len = namlen}; + + cmp = utf8_strncasecmp(buf->um, &q_buf, &q_name); + } + if (cmp < 0) + cmp = strncasecmp((char *)buf->private, name, namlen); + if (!cmp) { memcpy((char *)buf->private, name, namlen); buf->dirent_count = 1; - return -EEXIST; + return false; } - return 0; + return true; } /** @@ -1168,7 +1175,8 @@ static int __caseless_lookup(struct dir_context *ctx, const char *name, * * Return: 0 on success, otherwise error */ -static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen) +static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, + size_t namelen, struct unicode_map *um) { int ret; struct file *dfilp; @@ -1178,6 +1186,7 @@ static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen) .private = name, .used = namelen, .dirent_count = 0, + .um = um, }; dfilp = dentry_open(dir, flags, current_cred()); @@ -1240,7 +1249,8 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, break; err = ksmbd_vfs_lookup_in_dir(&parent, filename, - filename_len); + filename_len, + work->conn->um); path_put(&parent); if (err) goto out; @@ -1743,11 +1753,11 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, *total_size_written = 0; if (!(src_fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) { - pr_err("no right to read(%pd)\n", src_fp->filp->f_path.dentry); + pr_err("no right to read(%pD)\n", src_fp->filp); return -EACCES; } if (!(dst_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) { - pr_err("no right to write(%pd)\n", dst_fp->filp->f_path.dentry); + pr_err("no right to write(%pD)\n", dst_fp->filp); return -EACCES; } diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 70da4c0ba7ad..593059ca8511 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -12,6 +12,7 @@ #include <linux/namei.h> #include <uapi/linux/xattr.h> #include <linux/posix_acl.h> +#include <linux/unicode.h> #include "smbacl.h" #include "xattr.h" @@ -60,6 +61,7 @@ struct ksmbd_readdir_data { unsigned int used; unsigned int dirent_count; unsigned int file_attr; + struct unicode_map *um; }; /* ksmbd kstat wrapper to get valid create time when reading dir entry */ @@ -85,7 +87,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id); int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name); int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, const char *newname); -int ksmbd_vfs_getattr(struct path *path, struct kstat *stat); +int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat); int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, char *newname); int ksmbd_vfs_truncate(struct ksmbd_work *work, diff --git a/fs/libfs.c b/fs/libfs.c index 31b0ddf01c31..682d56345a1c 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -15,6 +15,7 @@ #include <linux/mutex.h> #include <linux/namei.h> #include <linux/exportfs.h> +#include <linux/iversion.h> #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> @@ -1520,3 +1521,48 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) #endif } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); + +/** + * inode_maybe_inc_iversion - increments i_version + * @inode: inode with the i_version that should be updated + * @force: increment the counter even if it's not necessary? + * + * Every time the inode is modified, the i_version field must be seen to have + * changed by any observer. + * + * If "force" is set or the QUERIED flag is set, then ensure that we increment + * the value, and clear the queried flag. + * + * In the common case where neither is set, then we can return "false" without + * updating i_version. + * + * If this function returns false, and no other metadata has changed, then we + * can avoid logging the metadata. + */ +bool inode_maybe_inc_iversion(struct inode *inode, bool force) +{ + u64 cur, new; + + /* + * The i_version field is not strictly ordered with any other inode + * information, but the legacy inode_inc_iversion code used a spinlock + * to serialize increments. + * + * Here, we add full memory barriers to ensure that any de-facto + * ordering with other info is preserved. + * + * This barrier pairs with the barrier in inode_query_iversion() + */ + smp_mb(); + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is clear then we needn't do anything */ + if (!force && !(cur & I_VERSION_QUERIED)) + return false; + + /* Since lowest bit is flag, add 2 to avoid it */ + new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return true; +} +EXPORT_SYMBOL(inode_maybe_inc_iversion); diff --git a/fs/mbcache.c b/fs/mbcache.c index 47ccfcbe0a22..e272ad738faf 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -90,8 +90,14 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, return -ENOMEM; INIT_LIST_HEAD(&entry->e_list); - /* Initial hash reference */ - atomic_set(&entry->e_refcnt, 1); + /* + * We create entry with two references. One reference is kept by the + * hash table, the other reference is used to protect us from + * mb_cache_entry_delete_or_get() until the entry is fully setup. This + * avoids nesting of cache->c_list_lock into hash table bit locks which + * is problematic for RT. + */ + atomic_set(&entry->e_refcnt, 2); entry->e_key = key; entry->e_value = value; entry->e_reusable = reusable; @@ -106,15 +112,12 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, } } hlist_bl_add_head(&entry->e_hash_list, head); - /* - * Add entry to LRU list before it can be found by - * mb_cache_entry_delete() to avoid races - */ + hlist_bl_unlock(head); spin_lock(&cache->c_list_lock); list_add_tail(&entry->e_list, &cache->c_list); cache->c_entry_count++; spin_unlock(&cache->c_list_lock); - hlist_bl_unlock(head); + mb_cache_entry_put(cache, entry); return 0; } diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 937fa5fae2b8..8afdc408ca4f 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -53,16 +53,16 @@ static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, } static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { int error; struct inode *inode = minix_new_inode(dir, mode, &error); if (inode) { minix_set_inode(inode, 0); mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); } - return error; + return finish_open_simple(file, error); } static int minix_create(struct user_namespace *mnt_userns, struct inode *dir, diff --git a/fs/namei.c b/fs/namei.c index 53b4bc094db2..9155ecb547ce 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -986,7 +986,7 @@ static int nd_jump_root(struct nameidata *nd) * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ -int nd_jump_link(struct path *path) +int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; @@ -1178,7 +1178,7 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns, * * Returns 0 if successful, -ve on error. */ -int may_linkat(struct user_namespace *mnt_userns, struct path *link) +int may_linkat(struct user_namespace *mnt_userns, const struct path *link) { struct inode *inode = link->dentry->d_inode; @@ -3583,72 +3583,95 @@ static int do_open(struct nameidata *nd, * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply passs init_user_ns. */ -struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, - struct dentry *dentry, umode_t mode, int open_flag) +static int vfs_tmpfile(struct user_namespace *mnt_userns, + const struct path *parentpath, + struct file *file, umode_t mode) { - struct dentry *child = NULL; - struct inode *dir = dentry->d_inode; + struct dentry *child; + struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; + int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) - goto out_err; - error = -EOPNOTSUPP; + return error; if (!dir->i_op->tmpfile) - goto out_err; - error = -ENOMEM; - child = d_alloc(dentry, &slash_name); + return -EOPNOTSUPP; + child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) - goto out_err; + return -ENOMEM; + file->f_path.mnt = parentpath->mnt; + file->f_path.dentry = child; mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode); - error = dir->i_op->tmpfile(mnt_userns, dir, child, mode); + error = dir->i_op->tmpfile(mnt_userns, dir, file, mode); + dput(child); if (error) - goto out_err; - error = -ENOENT; - inode = child->d_inode; - if (unlikely(!inode)) - goto out_err; + return error; + /* Don't check for other permissions, the inode was just created */ + error = may_open(mnt_userns, &file->f_path, 0, file->f_flags); + if (error) + return error; + inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } ima_post_create_tmpfile(mnt_userns, inode); - return child; + return 0; +} -out_err: - dput(child); - return ERR_PTR(error); +/** + * vfs_tmpfile_open - open a tmpfile for kernel internal use + * @mnt_userns: user namespace of the mount the inode was found from + * @parentpath: path of the base directory + * @mode: mode of the new tmpfile + * @open_flag: flags + * @cred: credentials for open + * + * Create and open a temporary file. The file is not accounted in nr_files, + * hence this is only for kernel internal use, and must not be installed into + * file tables or such. + */ +struct file *vfs_tmpfile_open(struct user_namespace *mnt_userns, + const struct path *parentpath, + umode_t mode, int open_flag, const struct cred *cred) +{ + struct file *file; + int error; + + file = alloc_empty_file_noaccount(open_flag, cred); + if (!IS_ERR(file)) { + error = vfs_tmpfile(mnt_userns, parentpath, file, mode); + if (error) { + fput(file); + file = ERR_PTR(error); + } + } + return file; } -EXPORT_SYMBOL(vfs_tmpfile); +EXPORT_SYMBOL(vfs_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct user_namespace *mnt_userns; - struct dentry *child; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); + if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; mnt_userns = mnt_user_ns(path.mnt); - child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag); - error = PTR_ERR(child); - if (IS_ERR(child)) + error = vfs_tmpfile(mnt_userns, &path, file, op->mode); + if (error) goto out2; - dput(path.dentry); - path.dentry = child; - audit_inode(nd->name, child, 0); - /* Don't check for other permissions, the inode was just created */ - error = may_open(mnt_userns, &path, 0, op->open_flag); - if (!error) - error = vfs_open(&path, file); + audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: @@ -5088,7 +5111,7 @@ int page_symlink(struct inode *inode, const char *symname, int len) const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct page *page; - void *fsdata; + void *fsdata = NULL; int err; unsigned int flags; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 0ce535852151..7679a68e8193 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -17,9 +17,9 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct folio *folio; - unsigned int iopos, account = 0; pgoff_t start_page = rreq->start / PAGE_SIZE; pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + size_t account = 0; bool subreq_failed = false; XA_STATE(xas, &rreq->mapping->i_pages, start_page); @@ -39,18 +39,23 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) */ subreq = list_first_entry(&rreq->subrequests, struct netfs_io_subrequest, rreq_link); - iopos = 0; subreq_failed = (subreq->error < 0); trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); rcu_read_lock(); xas_for_each(&xas, folio, last_page) { - unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; - unsigned int pgend = pgpos + folio_size(folio); + loff_t pg_end; bool pg_failed = false; + if (xas_retry(&xas, folio)) + continue; + + pg_end = folio_pos(folio) + folio_size(folio) - 1; + for (;;) { + loff_t sreq_end; + if (!subreq) { pg_failed = true; break; @@ -58,11 +63,11 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) folio_start_fscache(folio); pg_failed |= subreq_failed; - if (pgend < iopos + subreq->len) + sreq_end = subreq->start + subreq->len - 1; + if (pg_end < sreq_end) break; account += subreq->transferred; - iopos += subreq->len; if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { subreq = list_next_entry(subreq, rreq_link); subreq_failed = (subreq->error < 0); @@ -70,7 +75,8 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) subreq = NULL; subreq_failed = false; } - if (pgend == iopos) + + if (pg_end == sreq_end) break; } diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 428925899282..e374767d1b68 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -121,6 +121,9 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + if (xas_retry(&xas, folio)) + continue; + /* We might have multiple writes from the same huge * folio, but we mustn't unlock a folio more than once. */ diff --git a/fs/nfs/client.c b/fs/nfs/client.c index da8da5cdbbc1..f50e025ae406 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -280,7 +280,7 @@ EXPORT_SYMBOL_GPL(nfs_put_client); static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; - const struct sockaddr *sap = data->addr; + const struct sockaddr *sap = (struct sockaddr *)data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); int error; @@ -666,7 +666,7 @@ static int nfs_init_server(struct nfs_server *server, struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = ctx->nfs_server.hostname, - .addr = (const struct sockaddr *)&ctx->nfs_server.address, + .addr = &ctx->nfs_server._address, .addrlen = ctx->nfs_server.addrlen, .nfs_mod = ctx->nfs_mod, .proto = ctx->nfs_server.protocol, diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5c97cad741a7..ead8a0e06abf 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -228,8 +228,7 @@ again: * */ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, - fmode_t type, - const nfs4_stateid *stateid, + fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit) { struct nfs_delegation *delegation; @@ -239,25 +238,24 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation != NULL) { spin_lock(&delegation->lock); - if (nfs4_is_valid_delegation(delegation, 0)) { - nfs4_stateid_copy(&delegation->stateid, stateid); - delegation->type = type; - delegation->pagemod_limit = pagemod_limit; - oldcred = delegation->cred; - delegation->cred = get_cred(cred); - clear_bit(NFS_DELEGATION_NEED_RECLAIM, - &delegation->flags); - spin_unlock(&delegation->lock); - rcu_read_unlock(); - put_cred(oldcred); - trace_nfs4_reclaim_delegation(inode, type); - return; - } - /* We appear to have raced with a delegation return. */ + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; + oldcred = delegation->cred; + delegation->cred = get_cred(cred); + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + if (test_and_clear_bit(NFS_DELEGATION_REVOKED, + &delegation->flags)) + atomic_long_inc(&nfs_active_delegations); spin_unlock(&delegation->lock); + rcu_read_unlock(); + put_cred(oldcred); + trace_nfs4_reclaim_delegation(inode, type); + } else { + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, type, stateid, + pagemod_limit); } - rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit); } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5d6c2ddc7ea6..f594dac436a7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2022,7 +2022,7 @@ static int nfs_finish_open(struct nfs_open_context *ctx, err = finish_open(file, dentry, do_open); if (err) goto out; - if (S_ISREG(file->f_path.dentry->d_inode->i_mode)) + if (S_ISREG(file_inode(file)->i_mode)) nfs_file_set_open_context(file, ctx); else err = -EOPENSTALE; @@ -2489,9 +2489,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); goto out; } - if (dentry->d_fsdata) - /* old devname */ - kfree(dentry->d_fsdata); + /* old devname */ + kfree(dentry->d_fsdata); dentry->d_fsdata = NFS_FSDATA_BLOCKED; spin_unlock(&dentry->d_lock); diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index e87d500ad95a..6603b5cee029 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -16,8 +16,9 @@ #include "dns_resolve.h" ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, - struct sockaddr *sa, size_t salen) + struct sockaddr_storage *ss, size_t salen) { + struct sockaddr *sa = (struct sockaddr *)ss; ssize_t ret; char *ip_addr = NULL; int ip_len; @@ -341,7 +342,7 @@ out: } ssize_t nfs_dns_resolve_name(struct net *net, char *name, - size_t namelen, struct sockaddr *sa, size_t salen) + size_t namelen, struct sockaddr_storage *ss, size_t salen) { struct nfs_dns_ent key = { .hostname = name, @@ -354,7 +355,7 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item); if (ret == 0) { if (salen >= item->addrlen) { - memcpy(sa, &item->addr, item->addrlen); + memcpy(ss, &item->addr, item->addrlen); ret = item->addrlen; } else ret = -EOVERFLOW; diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h index 576ff4b54c82..fe3b172c4de1 100644 --- a/fs/nfs/dns_resolve.h +++ b/fs/nfs/dns_resolve.h @@ -32,6 +32,6 @@ extern void nfs_dns_resolver_cache_destroy(struct net *net); #endif extern ssize_t nfs_dns_resolve_name(struct net *net, char *name, - size_t namelen, struct sockaddr *sa, size_t salen); + size_t namelen, struct sockaddr_storage *sa, size_t salen); #endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e032fe201a36..d8ec889a4b3f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -567,7 +567,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) } wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); lock_page(page); mapping = page_file_mapping(page); @@ -655,9 +656,9 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } if (mntflags & NFS_MOUNT_WRITE_WAIT) { - result = filemap_fdatawait_range(file->f_mapping, - iocb->ki_pos - written, - iocb->ki_pos - 1); + filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); } result = generic_write_sync(iocb, written); if (result < 0) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7d285561e59f..1ec79ccf89ad 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -30,14 +30,20 @@ #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) #define FF_LAYOUTRETURN_MAXERR 20 +enum nfs4_ff_op_type { + NFS4_FF_OP_LAYOUTSTATS, + NFS4_FF_OP_LAYOUTRETURN, +}; + static unsigned short io_maxretrans; static const struct pnfs_commit_ops ff_layout_commit_ops; static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr); -static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, +static int +ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, struct nfs42_layoutstat_devinfo *devinfo, - int dev_limit); + int dev_limit, enum nfs4_ff_op_type type); static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, struct nfs4_ff_layout_mirror *mirror); @@ -1373,6 +1379,11 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, return -EIO; } + if (!pnfs_is_valid_lseg(hdr->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_read_record_layoutstats_start(task, hdr); return 0; } @@ -1553,6 +1564,11 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } + if (!pnfs_is_valid_lseg(hdr->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1645,15 +1661,23 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags); } -static void ff_layout_commit_prepare_common(struct rpc_task *task, - struct nfs_commit_data *cdata) +static int ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) { + if (!pnfs_is_valid_lseg(cdata->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_commit_record_layoutstats_start(task, cdata); + return 0; } static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { - ff_layout_commit_prepare_common(task, data); + if (ff_layout_commit_prepare_common(task, data)) + return; + rpc_call_start(task); } @@ -1949,6 +1973,65 @@ ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, ff_layout_initiate_commit); } +static bool ff_layout_match_rw(const struct rpc_task *task, + const struct nfs_pgio_header *hdr, + const struct pnfs_layout_segment *lseg) +{ + return hdr->lseg == lseg; +} + +static bool ff_layout_match_commit(const struct rpc_task *task, + const struct nfs_commit_data *cdata, + const struct pnfs_layout_segment *lseg) +{ + return cdata->lseg == lseg; +} + +static bool ff_layout_match_io(const struct rpc_task *task, const void *data) +{ + const struct rpc_call_ops *ops = task->tk_ops; + + if (ops == &ff_layout_read_call_ops_v3 || + ops == &ff_layout_read_call_ops_v4 || + ops == &ff_layout_write_call_ops_v3 || + ops == &ff_layout_write_call_ops_v4) + return ff_layout_match_rw(task, task->tk_calldata, data); + if (ops == &ff_layout_commit_call_ops_v3 || + ops == &ff_layout_commit_call_ops_v4) + return ff_layout_match_commit(task, task->tk_calldata, data); + return false; +} + +static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds *mirror_ds; + struct nfs4_pnfs_ds *ds; + struct nfs_client *ds_clp; + struct rpc_clnt *clnt; + u32 idx; + + for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { + mirror = flseg->mirror_array[idx]; + mirror_ds = mirror->mirror_ds; + if (!mirror_ds) + continue; + ds = mirror->mirror_ds->ds; + if (!ds) + continue; + ds_clp = ds->ds_clp; + if (!ds_clp) + continue; + clnt = ds_clp->cl_rpcclient; + if (!clnt) + continue; + if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg)) + continue; + rpc_clnt_disconnect(clnt); + } +} + static struct pnfs_ds_commit_info * ff_layout_get_ds_info(struct inode *inode) { @@ -2161,8 +2244,9 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) FF_LAYOUTRETURN_MAXERR); spin_lock(&args->inode->i_lock); - ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo)); + ff_args->num_dev = ff_layout_mirror_prepare_stats( + &ff_layout->generic_hdr, &ff_args->devinfo[0], + ARRAY_SIZE(ff_args->devinfo), NFS4_FF_OP_LAYOUTRETURN); spin_unlock(&args->inode->i_lock); args->ld_private->ops = &layoutreturn_ops; @@ -2396,7 +2480,7 @@ static const struct nfs4_xdr_opaque_ops layoutstat_ops = { static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, struct nfs42_layoutstat_devinfo *devinfo, - int dev_limit) + int dev_limit, enum nfs4_ff_op_type type) { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; @@ -2408,7 +2492,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, break; if (IS_ERR_OR_NULL(mirror->mirror_ds)) continue; - if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, + &mirror->flags) && + type != NFS4_FF_OP_LAYOUTRETURN) continue; /* mirror refcount put in cleanup_layoutstats */ if (!refcount_inc_not_zero(&mirror->ref)) @@ -2448,7 +2534,9 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) spin_lock(&args->inode->i_lock); ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &args->devinfo[0], dev_count); + &args->devinfo[0], + dev_count, + NFS4_FF_OP_LAYOUTSTATS); spin_unlock(&args->inode->i_lock); if (!args->num_dev) { kfree(args->devinfo); @@ -2501,6 +2589,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .prepare_layoutreturn = ff_layout_prepare_layoutreturn, .sync = pnfs_nfs_generic_sync, .prepare_layoutstats = ff_layout_prepare_layoutstats, + .cancel_io = ff_layout_cancel_io, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 4da701fd1424..09833ec102fc 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -273,9 +273,9 @@ static const struct constant_table nfs_secflavor_tokens[] = { * Address family must be initialized, and address must not be * the ANY address for that family. */ -static int nfs_verify_server_address(struct sockaddr *addr) +static int nfs_verify_server_address(struct sockaddr_storage *addr) { - switch (addr->sa_family) { + switch (addr->ss_family) { case AF_INET: { struct sockaddr_in *sa = (struct sockaddr_in *)addr; return sa->sin_addr.s_addr != htonl(INADDR_ANY); @@ -969,7 +969,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_fh *mntfh = ctx->mntfh; - struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + struct sockaddr_storage *sap = &ctx->nfs_server._address; int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; int ret; @@ -1044,7 +1044,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, memcpy(sap, &data->addr, sizeof(data->addr)); ctx->nfs_server.addrlen = sizeof(data->addr); ctx->nfs_server.port = ntohs(data->addr.sin_port); - if (sap->sa_family != AF_INET || + if (sap->ss_family != AF_INET || !nfs_verify_server_address(sap)) goto out_no_address; @@ -1200,7 +1200,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc, struct nfs4_mount_data *data) { struct nfs_fs_context *ctx = nfs_fc2context(fc); - struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + struct sockaddr_storage *sap = &ctx->nfs_server._address; int ret; char *c; @@ -1314,7 +1314,7 @@ static int nfs_fs_context_validate(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_subversion *nfs_mod; - struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + struct sockaddr_storage *sap = &ctx->nfs_server._address; int max_namelen = PAGE_SIZE; int max_pathlen = NFS_MAXPATHLEN; int port = 0; @@ -1540,7 +1540,7 @@ static int nfs_init_fs_context(struct fs_context *fc) ctx->version = nfss->nfs_client->rpc_ops->version; ctx->minorversion = nfss->nfs_client->cl_minorversion; - memcpy(&ctx->nfs_server.address, &nfss->nfs_client->cl_addr, + memcpy(&ctx->nfs_server._address, &nfss->nfs_client->cl_addr, ctx->nfs_server.addrlen); if (fc->net_ns != net) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bea7c005119c..6b2cfa59a1a2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -72,18 +72,13 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -static int nfs_wait_killable(int mode) +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } - -int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) -{ - return nfs_wait_killable(mode); -} EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); /** @@ -318,7 +313,7 @@ struct nfs_find_desc { static int nfs_find_actor(struct inode *inode, void *opaque) { - struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_find_desc *desc = opaque; struct nfs_fh *fh = desc->fh; struct nfs_fattr *fattr = desc->fattr; @@ -336,7 +331,7 @@ nfs_find_actor(struct inode *inode, void *opaque) static int nfs_init_locked(struct inode *inode, void *opaque) { - struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_find_desc *desc = opaque; struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); @@ -1332,7 +1327,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) */ for (;;) { ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; spin_lock(&inode->i_lock); @@ -2271,7 +2267,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) static void init_once(void *foo) { - struct nfs_inode *nfsi = (struct nfs_inode *) foo; + struct nfs_inode *nfsi = foo; inode_init_once(&nfsi->vfs_inode); INIT_LIST_HEAD(&nfsi->open_files); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 898dd95bc7a7..647fc3f547cb 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -69,7 +69,7 @@ static inline fmode_t flags_to_mode(int flags) struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ - const struct sockaddr *addr; /* Address of the server */ + const struct sockaddr_storage *addr; /* Address of the server */ const char *nodename; /* Hostname of the client */ const char *ip_addr; /* IP address of the client */ size_t addrlen; @@ -180,7 +180,7 @@ static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc) /* mount_clnt.c */ struct nfs_mount_request { - struct sockaddr *sap; + struct sockaddr_storage *sap; size_t salen; char *hostname; char *dirpath; @@ -223,7 +223,7 @@ extern void nfs4_server_set_init_caps(struct nfs_server *); extern struct nfs_server *nfs4_create_server(struct fs_context *); extern struct nfs_server *nfs4_create_referral_server(struct fs_context *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, - struct sockaddr *sap, size_t salen, + struct sockaddr_storage *sap, size_t salen, struct net *net); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, @@ -235,7 +235,7 @@ extern int nfs_client_init_status(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, @@ -243,7 +243,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans); #ifdef CONFIG_PROC_FS @@ -435,7 +435,6 @@ extern void nfs_zap_acl_cache(struct inode *inode); extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); -extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode); /* super.c */ extern const struct super_operations nfs_sops; @@ -503,7 +502,6 @@ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_commit_free(struct nfs_commit_data *p); -extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, @@ -896,13 +894,13 @@ static inline bool nfs_error_is_fatal_on_server(int err) * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. */ -static inline void nfs_set_port(struct sockaddr *sap, int *port, +static inline void nfs_set_port(struct sockaddr_storage *sap, int *port, const unsigned short default_port) { if (*port == NFS_UNSPEC_PORT) *port = default_port; - rpc_set_port(sap, *port); + rpc_set_port((struct sockaddr *)sap, *port); } struct nfs_direct_req { diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index c5e3b6b3366a..68e76b626371 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -158,7 +158,7 @@ int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans) struct rpc_create_args args = { .net = info->net, .protocol = info->protocol, - .address = info->sap, + .address = (struct sockaddr *)info->sap, .addrsize = info->salen, .timeout = &mnt_timeout, .servername = info->hostname, @@ -245,7 +245,7 @@ void nfs_umount(const struct nfs_mount_request *info) struct rpc_create_args args = { .net = info->net, .protocol = IPPROTO_UDP, - .address = info->sap, + .address = (struct sockaddr *)info->sap, .addrsize = info->salen, .timeout = &nfs_umnt_timeout, .servername = info->hostname, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 3295af4110f1..2f336ace7555 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -175,7 +175,7 @@ struct vfsmount *nfs_d_automount(struct path *path) } /* for submounts we want the same server; referrals will reassign */ - memcpy(&ctx->nfs_server.address, &client->cl_addr, client->cl_addrlen); + memcpy(&ctx->nfs_server._address, &client->cl_addr, client->cl_addrlen); ctx->nfs_server.addrlen = client->cl_addrlen; ctx->nfs_server.port = server->port; diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index b49359afac88..669cda757a5c 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -78,7 +78,7 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, * the MDS. */ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; @@ -98,7 +98,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, char buf[INET6_ADDRSTRLEN + 1]; /* fake a hostname because lockd wants it */ - if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0) return ERR_PTR(-EINVAL); cl_init.hostname = buf; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1597eef40d54..2e7579626cf0 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -36,7 +36,8 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX) break; - freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME); + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6dab9e408372..ecb428512fe1 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -341,7 +341,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, return status; } } - status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, + status = nfs_filemap_write_and_wait_range(src->f_mapping, pos_src, pos_src + (loff_t)count - 1); if (status) return status; @@ -1093,6 +1093,9 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, &args.seq_args, &res.seq_res, 0); trace_nfs4_clone(src_inode, dst_inode, &args, status); if (status == 0) { + /* a zero-length count means clone to EOF in src */ + if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) + count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; nfs42_copy_dest_done(dst_inode, dst_offset, count); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); } @@ -1175,6 +1178,7 @@ static int _nfs42_proc_removexattr(struct inode *inode, const char *name) ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + trace_nfs4_removexattr(inode, name, ret); if (!ret) nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); @@ -1214,6 +1218,7 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + trace_nfs4_setxattr(inode, name, ret); for (; np > 0; np--) put_page(pages[np - 1]); @@ -1246,6 +1251,7 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); + trace_nfs4_getxattr(inode, name, ret); if (ret < 0) return ret; @@ -1317,6 +1323,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); + trace_nfs4_listxattr(inode, ret); if (ret >= 0) { ret = res.copied; diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index a9bf09fdf2c3..76ae11834206 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -981,7 +981,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) static void nfs4_xattr_cache_init_once(void *p) { - struct nfs4_xattr_cache *cache = (struct nfs4_xattr_cache *)p; + struct nfs4_xattr_cache *cache = p; spin_lock_init(&cache->listxattr_lock); atomic_long_set(&cache->nent, 0); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index b56f05113d36..fe1aeb0f048f 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -569,6 +569,14 @@ static int decode_listxattrs(struct xdr_stream *xdr, */ if (status == -ETOOSMALL) status = -ERANGE; + /* + * Special case: for LISTXATTRS, NFS4ERR_NOXATTR + * should be translated to success with zero-length reply. + */ + if (status == -ENODATA) { + res->eof = true; + status = 0; + } goto out; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 79df6e83881b..cfef738d765e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -281,7 +281,7 @@ struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, int nfs4_submount(struct fs_context *, struct nfs_server *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); -size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss, size_t salen, struct net *net, int port); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); @@ -459,7 +459,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *); /* nfs4renewd.c */ extern void nfs4_schedule_state_renewal(struct nfs_client *); -extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); extern void nfs4_set_lease_period(struct nfs_client *clp, unsigned long lease); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 3c5678aec006..d3051b051a56 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -254,7 +254,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) goto error; ip_addr = (const char *)buf; } - strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + strscpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); err = nfs_idmap_new(clp); if (err < 0) { @@ -346,6 +346,7 @@ int nfs40_init_client(struct nfs_client *clp) ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE, "NFSv4.0 transport Slot table"); if (ret) { + nfs4_shutdown_slot_table(tbl); kfree(tbl); return ret; } @@ -889,7 +890,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, */ static int nfs4_set_client(struct nfs_server *server, const char *hostname, - const struct sockaddr *addr, + const struct sockaddr_storage *addr, const size_t addrlen, const char *ip_addr, int proto, const struct rpc_timeout *timeparms, @@ -924,7 +925,7 @@ static int nfs4_set_client(struct nfs_server *server, __set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); - server->port = rpc_get_port(addr); + server->port = rpc_get_port((struct sockaddr *)addr); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); @@ -960,7 +961,7 @@ static int nfs4_set_client(struct nfs_server *server, * the MDS. */ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version) { @@ -980,7 +981,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, }; char buf[INET6_ADDRSTRLEN + 1]; - if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0) return ERR_PTR(-EINVAL); cl_init.hostname = buf; @@ -1148,7 +1149,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) /* Get a client record */ error = nfs4_set_client(server, ctx->nfs_server.hostname, - &ctx->nfs_server.address, + &ctx->nfs_server._address, ctx->nfs_server.addrlen, ctx->client_address, ctx->nfs_server.protocol, @@ -1238,7 +1239,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT); error = nfs4_set_client(server, ctx->nfs_server.hostname, - &ctx->nfs_server.address, + &ctx->nfs_server._address, ctx->nfs_server.addrlen, parent_client->cl_ipaddr, XPRT_TRANSPORT_RDMA, @@ -1254,7 +1255,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) rpc_set_port(&ctx->nfs_server.address, NFS_PORT); error = nfs4_set_client(server, ctx->nfs_server.hostname, - &ctx->nfs_server.address, + &ctx->nfs_server._address, ctx->nfs_server.addrlen, parent_client->cl_ipaddr, XPRT_TRANSPORT_TCP, @@ -1303,14 +1304,14 @@ error: * Returns zero on success, or a negative errno value. */ int nfs4_update_server(struct nfs_server *server, const char *hostname, - struct sockaddr *sap, size_t salen, struct net *net) + struct sockaddr_storage *sap, size_t salen, struct net *net) { struct nfs_client *clp = server->nfs_client; struct rpc_clnt *clnt = server->client; struct xprt_create xargs = { .ident = clp->cl_proto, .net = net, - .dstaddr = sap, + .dstaddr = (struct sockaddr *)sap, .addrlen = salen, .servername = hostname, }; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index ec6afd3c4bca..e3fdd2f45b01 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -583,7 +583,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; - struct idmap *idmap = (struct idmap *)aux; + struct idmap *idmap = aux; struct key *key = rka->target_key; int ret = -ENOKEY; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index f2dbf904c598..9a98595bb160 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -164,16 +164,17 @@ static int nfs4_validate_fspath(struct dentry *dentry, return 0; } -size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss, size_t salen, struct net *net, int port) { + struct sockaddr *sa = (struct sockaddr *)ss; ssize_t ret; ret = rpc_pton(net, string, len, sa, salen); if (ret == 0) { ret = rpc_uaddr2sockaddr(net, string, len, sa, salen); if (ret == 0) { - ret = nfs_dns_resolve_name(net, string, len, sa, salen); + ret = nfs_dns_resolve_name(net, string, len, ss, salen); if (ret < 0) ret = 0; } @@ -331,7 +332,7 @@ static int try_location(struct fs_context *fc, ctx->nfs_server.addrlen = nfs_parse_server_name(buf->data, buf->len, - &ctx->nfs_server.address, + &ctx->nfs_server._address, sizeof(ctx->nfs_server._address), fc->net_ns, 0); if (ctx->nfs_server.addrlen == 0) @@ -483,14 +484,13 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, char *page, char *page2, const struct nfs4_fs_location *location) { - const size_t addr_bufsize = sizeof(struct sockaddr_storage); struct net *net = rpc_net_ns(server->client); - struct sockaddr *sap; + struct sockaddr_storage *sap; unsigned int s; size_t salen; int error; - sap = kmalloc(addr_bufsize, GFP_KERNEL); + sap = kmalloc(sizeof(*sap), GFP_KERNEL); if (sap == NULL) return -ENOMEM; @@ -506,10 +506,10 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, continue; salen = nfs_parse_server_name(buf->data, buf->len, - sap, addr_bufsize, net, 0); + sap, sizeof(*sap), net, 0); if (salen == 0) continue; - rpc_set_port(sap, NFS_PORT); + rpc_set_port((struct sockaddr *)sap, NFS_PORT); error = -ENOMEM; hostname = kmemdup_nul(buf->data, buf->len, GFP_KERNEL); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3ed14a2a84a4..86ed5c0142c3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -416,8 +416,8 @@ static int nfs4_delay_killable(long *timeout) { might_sleep(); - freezable_schedule_timeout_killable_unsafe( - nfs4_update_delay(timeout)); + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(nfs4_update_delay(timeout)); if (!__fatal_signal_pending(current)) return 0; return -EINTR; @@ -427,7 +427,8 @@ static int nfs4_delay_interruptible(long *timeout) { might_sleep(); - freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout)); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(nfs4_update_delay(timeout)); if (!signal_pending(current)) return 0; return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS; @@ -3950,7 +3951,7 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location, for (i = 0; i < location->nservers; i++) { struct nfs4_string *srv_loc = &location->servers[i]; - struct sockaddr addr; + struct sockaddr_storage addr; size_t addrlen; struct xprt_create xprt_args = { .ident = 0, @@ -3973,7 +3974,7 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location, clp->cl_net, server->port); if (!addrlen) return; - xprt_args.dstaddr = &addr; + xprt_args.dstaddr = (struct sockaddr *)&addr; xprt_args.addrlen = addrlen; servername = kmalloc(srv_loc->len + 1, GFP_KERNEL); if (!servername) @@ -6607,7 +6608,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) struct nfs4_delegreturndata *d_data; struct pnfs_layout_hdr *lo; - d_data = (struct nfs4_delegreturndata *)data; + d_data = data; if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) { nfs4_sequence_done(task, &d_data->res.seq_res); @@ -7137,6 +7138,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) { struct nfs4_lockdata *data = calldata; struct nfs4_lock_state *lsp = data->lsp; + struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry)); if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -7144,8 +7146,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; switch (task->tk_status) { case 0: - renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), - data->timestamp); + renew_lease(server, data->timestamp); if (data->arg.new_lock && !data->cancelled) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) @@ -7166,6 +7167,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) if (!nfs4_stateid_match(&data->arg.open_stateid, &lsp->ls_state->open_stateid)) goto out_restart; + else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN) + goto out_restart; } else if (!nfs4_stateid_match(&data->arg.lock_stateid, &lsp->ls_stateid)) goto out_restart; @@ -7406,7 +7409,8 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) break; - freezable_schedule_timeout_interruptible(timeout); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + schedule_timeout(timeout); timeout *= 2; timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout); status = -ERESTARTSYS; @@ -7474,10 +7478,8 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) break; status = -ERESTARTSYS; - freezer_do_not_count(); - wait_woken(&waiter.wait, TASK_INTERRUPTIBLE, + wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE, NFS4_LOCK_MAXTIMEOUT); - freezer_count(); } while (!signalled()); remove_wait_queue(q, &waiter.wait); @@ -8900,7 +8902,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred) void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + struct nfs4_add_xprt_data *adata = data; struct rpc_task *task; int status; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9bab3e9c702a..a2d2d5d1b088 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -497,8 +497,7 @@ nfs4_alloc_state_owner(struct nfs_server *server, sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; - sp->so_seqid.owner_id = ida_simple_get(&server->openowner_id, 0, 0, - gfp_flags); + sp->so_seqid.owner_id = ida_alloc(&server->openowner_id, gfp_flags); if (sp->so_seqid.owner_id < 0) { kfree(sp); return NULL; @@ -534,7 +533,7 @@ static void nfs4_free_state_owner(struct nfs4_state_owner *sp) { nfs4_destroy_seqid_counter(&sp->so_seqid); put_cred(sp->so_cred); - ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id); + ida_free(&sp->so_server->openowner_id, sp->so_seqid.owner_id); kfree(sp); } @@ -877,8 +876,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; lsp->ls_owner = fl_owner; - lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, - 0, 0, GFP_KERNEL_ACCOUNT); + lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT); if (lsp->ls_seqid.owner_id < 0) goto out_free; INIT_LIST_HEAD(&lsp->ls_locks); @@ -890,7 +888,7 @@ out_free: void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id); + ida_free(&server->lockowner_id, lsp->ls_seqid.owner_id); nfs4_destroy_seqid_counter(&lsp->ls_seqid); kfree(lsp); } @@ -1314,7 +1312,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) refcount_inc(&clp->cl_count); res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (res) goto out; if (clp->cl_cons_state < 0) @@ -1787,6 +1786,7 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) { + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); /* Mark all delegations for reclaim */ nfs_delegation_mark_reclaim(clp); nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); @@ -2671,6 +2671,7 @@ static void nfs4_state_manager(struct nfs_client *clp) if (status < 0) goto out_error; nfs4_state_end_reclaim_reboot(clp); + continue; } /* Detect expired delegations... */ diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 6ee6ad3674a2..2cff5901c689 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -2097,6 +2097,7 @@ TRACE_EVENT(ff_layout_commit_error, ) ); +#ifdef CONFIG_NFS_V4_2 TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA); TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE); @@ -2105,7 +2106,6 @@ TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE); { NFS4_CONTENT_DATA, "DATA" }, \ { NFS4_CONTENT_HOLE, "HOLE" }) -#ifdef CONFIG_NFS_V4_2 TRACE_EVENT(nfs4_llseek, TP_PROTO( const struct inode *inode, @@ -2496,6 +2496,54 @@ TRACE_EVENT(nfs4_offload_cancel, __entry->stateid_seq, __entry->stateid_hash ) ); + +DECLARE_EVENT_CLASS(nfs4_xattr_event, + TP_PROTO( + const struct inode *inode, + const char *name, + int error + ), + + TP_ARGS(inode, name, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(name, name) + ), + + TP_fast_assign( + __entry->error = error < 0 ? -error : 0; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __assign_str(name, name); + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "name=%s", + -__entry->error, show_nfs4_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __get_str(name) + ) +); +#define DEFINE_NFS4_XATTR_EVENT(name) \ + DEFINE_EVENT(nfs4_xattr_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const char *name, \ + int error \ + ), \ + TP_ARGS(inode, name, error)) +DEFINE_NFS4_XATTR_EVENT(nfs4_getxattr); +DEFINE_NFS4_XATTR_EVENT(nfs4_setxattr); +DEFINE_NFS4_XATTR_EVENT(nfs4_removexattr); + +DEFINE_NFS4_INODE_EVENT(nfs4_listxattr); #endif /* CONFIG_NFS_V4_2 */ #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index fa148308822c..620329b7e6ae 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -139,7 +139,7 @@ static int __init nfs_root_setup(char *line) ROOT_DEV = Root_NFS; if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { - strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); + strscpy(nfs_root_parms, line, sizeof(nfs_root_parms)); } else { size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; if (n >= sizeof(nfs_root_parms)) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2613b7e36eb9..a5db5158c634 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -710,6 +710,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -722,8 +723,10 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg->pls_range.offset, lseg->pls_range.length); - if (!mark_lseg_invalid(lseg, tmp_list)) - remaining++; + if (mark_lseg_invalid(lseg, tmp_list)) + continue; + remaining++; + pnfs_lseg_cancel_io(server, lseg); } dprintk("%s:Return %i\n", __func__, remaining); return remaining; @@ -1908,7 +1911,7 @@ static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) pnfs_layoutcommit_inode(lo->plh_inode, false); return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); } static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) @@ -2485,6 +2488,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -2507,6 +2511,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + pnfs_lseg_cancel_io(server, lseg); } if (remaining) { @@ -3192,7 +3197,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = wait_on_bit_lock_action(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (status) goto out; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f331f067691b..e3e6a41f19de 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -169,6 +169,8 @@ struct pnfs_layoutdriver_type { void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); + + void (*cancel_io)(struct pnfs_layout_segment *lseg); }; struct pnfs_commit_ops { @@ -685,6 +687,13 @@ pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page req_offset(req), req_last); } +static inline void pnfs_lseg_cancel_io(struct nfs_server *server, + struct pnfs_layout_segment *lseg) +{ + if (server->pnfs_curr_ld->cancel_io) + server->pnfs_curr_ld->cancel_io(lseg); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 657c242a18ff..5d035dd2d7bf 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -374,12 +374,12 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, return NULL; } -/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head reqest +/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request * for @page * @cinfo - commit info for current inode * @page - page to search for matching head request * - * Returns a the head request if one is found, otherwise returns NULL. + * Return: the head request if one is found, otherwise %NULL. */ struct nfs_page * pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) @@ -821,7 +821,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) static struct nfs_client *(*get_v3_ds_connect)( struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, @@ -882,7 +882,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, continue; } clp = get_v3_ds_connect(mds_srv, - (struct sockaddr *)&da->da_addr, + &da->da_addr, da->da_addrlen, da->da_transport, timeo, retrans); if (IS_ERR(clp)) @@ -951,7 +951,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, put_cred(xprtdata.cred); } else { clp = nfs4_set_ds_client(mds_srv, - (struct sockaddr *)&da->da_addr, + &da->da_addr, da->da_addrlen, da->da_transport, timeo, retrans, minor_version); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ee66ffdb985e..05ae23657527 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -822,8 +822,7 @@ static int nfs_request_mount(struct fs_context *fc, { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_mount_request request = { - .sap = (struct sockaddr *) - &ctx->mount_server.address, + .sap = &ctx->mount_server._address, .dirpath = ctx->nfs_server.export_path, .protocol = ctx->mount_server.protocol, .fh = root_fh, @@ -854,7 +853,7 @@ static int nfs_request_mount(struct fs_context *fc, * Construct the mount server's address. */ if (ctx->mount_server.address.sa_family == AF_UNSPEC) { - memcpy(request.sap, &ctx->nfs_server.address, + memcpy(request.sap, &ctx->nfs_server._address, ctx->nfs_server.addrlen); ctx->mount_server.addrlen = ctx->nfs_server.addrlen; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5d680045fa2c..78b8cd9651d5 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -266,7 +266,7 @@ struct nfs4_dir_ctx { struct list_head names; }; -static int +static bool nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -275,14 +275,14 @@ nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, struct name_list *entry; if (namlen != HEXDIR_LEN - 1) - return 0; + return true; entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); if (entry == NULL) - return -ENOMEM; + return false; memcpy(entry->name, name, HEXDIR_LEN - 1); entry->name[HEXDIR_LEN - 1] = '\0'; list_add(&entry->list, &ctx->names); - return 0; + return true; } static int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4f4706f7139b..836bd825ca4a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4375,8 +4375,8 @@ nfsd4_init_leases_net(struct nfsd_net *nn) nn->nfsd4_grace = 90; nn->somebody_reclaimed = false; nn->track_reclaim_completes = false; - nn->clverifier_counter = prandom_u32(); - nn->clientid_base = prandom_u32(); + nn->clverifier_counter = get_random_u32(); + nn->clientid_base = get_random_u32(); nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 31bc7cc82439..ac3c3844cfc6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1855,7 +1855,7 @@ struct readdir_data { int full; }; -static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, +static bool nfsd_buffered_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -1867,7 +1867,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64)); if (buf->used + reclen > PAGE_SIZE) { buf->full = 1; - return -EINVAL; + return false; } de->namlen = namlen; @@ -1877,7 +1877,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, memcpy(de->name, name, namlen); buf->used += reclen; - return 0; + return true; } static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp, diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 9f4d9432d38a..b9d15c3df3cc 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1668,8 +1668,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? nilfs_btree_node_get_key(node, nchildren - 2) : 0; - if (bh != NULL) - brelse(bh); + brelse(bh); return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); } @@ -1717,8 +1716,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *btree, ptrs[i] = le64_to_cpu(dptrs[i]); } - if (bh != NULL) - brelse(bh); + brelse(bh); return nitems; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 67f63cfeade5..232dd7b6cca1 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -328,6 +328,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) struct inode *inode; struct nilfs_inode_info *ii; struct nilfs_root *root; + struct buffer_head *bh; int err = -ENOMEM; ino_t ino; @@ -343,11 +344,25 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) ii->i_state = BIT(NILFS_I_NEW); ii->i_root = root; - err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); + err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); if (unlikely(err)) goto failed_ifile_create_inode; /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + if (unlikely(ino < NILFS_USER_INO)) { + nilfs_warn(sb, + "inode bitmap is inconsistent for reserved inodes"); + do { + brelse(bh); + err = nilfs_ifile_create_inode(root->ifile, &ino, &bh); + if (unlikely(err)) + goto failed_ifile_create_inode; + } while (ino < NILFS_USER_INO); + + nilfs_info(sb, "repaired inode bitmap for reserved inodes"); + } + ii->i_bh = bh; + atomic64_inc(&root->inodes_count); inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = ino; @@ -440,6 +455,8 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) + return -EIO; /* this inode is for metadata and corrupted */ if (inode->i_nlink == 0) return -ESTALE; /* this inode is deleted */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 3267e96c256c..39b7eea2642a 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -480,41 +480,36 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff) { - unsigned int i; + unsigned int i, nr_folios; pgoff_t index; - unsigned int nblocks_in_page; unsigned long length = 0; - sector_t b; - struct pagevec pvec; - struct page *page; + struct folio_batch fbatch; + struct folio *folio; if (inode->i_mapping->nrpages == 0) return 0; index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); - nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, - pvec.pages); - if (pvec.nr == 0) + nr_folios = filemap_get_folios_contig(inode->i_mapping, &index, ULONG_MAX, + &fbatch); + if (nr_folios == 0) return length; - if (length > 0 && pvec.pages[0]->index > index) - goto out; - - b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits); i = 0; do { - page = pvec.pages[i]; + folio = fbatch.folios[i]; - lock_page(page); - if (page_has_buffers(page)) { + folio_lock(folio); + if (folio_buffers(folio)) { struct buffer_head *bh, *head; + sector_t b; - bh = head = page_buffers(page); + b = folio->index << (PAGE_SHIFT - inode->i_blkbits); + bh = head = folio_buffers(folio); do { if (b < start_blk) continue; @@ -529,21 +524,17 @@ repeat: } else { if (length > 0) goto out_locked; - - b += nblocks_in_page; } - unlock_page(page); + folio_unlock(folio); - } while (++i < pagevec_count(&pvec)); + } while (++i < nr_folios); - index = page->index + 1; - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; out_locked: - unlock_page(page); -out: - pagevec_release(&pvec); + folio_unlock(folio); + folio_batch_release(&fbatch); return length; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0afe0832c754..3335ef352915 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -317,7 +317,7 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb) struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; - if (!sci || !sci->sc_flush_request) + if (sb_rdonly(sb) || unlikely(!sci) || !sci->sc_flush_request) return; set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); @@ -875,9 +875,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) nilfs_mdt_mark_dirty(nilfs->ns_cpfile); nilfs_cpfile_put_checkpoint( nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); - } else - WARN_ON(err == -EINVAL || err == -ENOENT); - + } else if (err == -EINVAL || err == -ENOENT) { + nilfs_error(sci->sc_super, + "checkpoint creation failed due to metadata corruption."); + err = -EIO; + } return err; } @@ -891,7 +893,11 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, &raw_cp, &bh_cp); if (unlikely(err)) { - WARN_ON(err == -EINVAL || err == -ENOENT); + if (err == -EINVAL || err == -ENOENT) { + nilfs_error(sci->sc_super, + "checkpoint finalization failed due to metadata corruption."); + err = -EIO; + } goto failed_ibh; } raw_cp->cp_snapshot_list.ssl_next = 0; @@ -2235,16 +2241,14 @@ int nilfs_construct_segment(struct super_block *sb) struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info *ti; - int err; - if (!sci) + if (sb_rdonly(sb) || unlikely(!sci)) return -EROFS; /* A call inside transactions causes a deadlock. */ BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); - err = nilfs_segctor_sync(sci); - return err; + return nilfs_segctor_sync(sci); } /** @@ -2276,7 +2280,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, struct nilfs_transaction_info ti; int err = 0; - if (!sci) + if (sb_rdonly(sb) || unlikely(!sci)) return -EROFS; nilfs_transaction_lock(sb, &ti, 0); @@ -2772,11 +2776,12 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) if (nilfs->ns_writer) { /* - * This happens if the filesystem was remounted - * read/write after nilfs_error degenerated it into a - * read-only mount. + * This happens if the filesystem is made read-only by + * __nilfs_error or nilfs_remount and then remounted + * read/write. In these cases, reuse the existing + * writer. */ - nilfs_detach_log_writer(sb); + return 0; } nilfs->ns_writer = nilfs_segctor_new(sb, root); @@ -2786,10 +2791,9 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root) inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL); err = nilfs_segctor_start_thread(nilfs->ns_writer); - if (err) { - kfree(nilfs->ns_writer); - nilfs->ns_writer = NULL; - } + if (unlikely(err)) + nilfs_detach_log_writer(sb); + return err; } diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 77ff8e95421f..dc359b56fdfa 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -495,14 +495,22 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; + void *kaddr; + struct nilfs_segment_usage *su; int ret; + down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (!ret) { mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr); brelse(bh); } + up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index ba108f915391..6edb6e0dd61f 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1133,8 +1133,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; if (*flags & SB_RDONLY) { - /* Shutting down log writer */ - nilfs_detach_log_writer(sb); sb->s_flags |= SB_RDONLY; /* diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 3b4a079c9617..c8b89b4f94e0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -690,9 +690,7 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) { unsigned long ncleansegs; - down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); - up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; return 0; } diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index cd7d09a569ff..a2a15bc4df28 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -18,7 +18,7 @@ #include "fanotify.h" -static bool fanotify_path_equal(struct path *p1, struct path *p2) +static bool fanotify_path_equal(const struct path *p1, const struct path *p2) { return p1->mnt == p2->mnt && p1->dentry == p2->dentry; } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 1d9f11255c64..57f51a9a3015 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -452,13 +452,7 @@ static inline bool fanotify_is_error_event(u32 mask) return mask & FAN_FS_ERROR; } -static inline bool fanotify_event_has_path(struct fanotify_event *event) -{ - return event->type == FANOTIFY_EVENT_TYPE_PATH || - event->type == FANOTIFY_EVENT_TYPE_PATH_PERM; -} - -static inline struct path *fanotify_event_path(struct fanotify_event *event) +static inline const struct path *fanotify_event_path(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_PATH) return &FANOTIFY_PE(event)->path; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f0e49a406ffa..4546da4a54f9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -249,7 +249,7 @@ out: return event; } -static int create_fd(struct fsnotify_group *group, struct path *path, +static int create_fd(struct fsnotify_group *group, const struct path *path, struct file **file) { int client_fd; @@ -619,7 +619,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, char __user *buf, size_t count) { struct fanotify_event_metadata metadata; - struct path *path = fanotify_event_path(event); + const struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; @@ -1553,7 +1553,7 @@ static int fanotify_test_fid(struct dentry *dentry) } static int fanotify_events_supported(struct fsnotify_group *group, - struct path *path, __u64 mask, + const struct path *path, __u64 mask, unsigned int flags) { unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 87d8a50ee803..fde74eb333cc 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -76,10 +76,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) */ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); -/* allocate and destroy and event holder to attach events to notification/access queues */ -extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); -extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); - extern struct kmem_cache *fsnotify_mark_connector_cachep; #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/nsfs.c b/fs/nsfs.c index 800c1d0eb0d0..3506f6074288 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -28,7 +28,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = dentry->d_fsdata; - return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + return dynamic_dname(buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); } diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 52615e6090e1..a3865bc4a0c6 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -594,17 +594,37 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { u8 *mrec_end = (u8 *)ctx->mrec + le32_to_cpu(ctx->mrec->bytes_allocated); - u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + - a->name_length * sizeof(ntfschar); - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || - name_end > mrec_end) + u8 *name_end; + + /* check whether ATTR_RECORD wrap */ + if ((u8 *)a < (u8 *)ctx->mrec) + break; + + /* check whether Attribute Record Header is within bounds */ + if ((u8 *)a > mrec_end || + (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) + break; + + /* check whether ATTR_RECORD's name is within bounds */ + name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if (name_end > mrec_end) break; + ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || a->type == AT_END)) return -ENOENT; if (unlikely(!a->length)) break; + + /* check whether ATTR_RECORD's length wrap */ + if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) + break; + /* check whether ATTR_RECORD's length is within bounds */ + if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) + break; + if (a->type != type) continue; /* diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 58b660dbbee9..c481b14e4fd9 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -527,12 +527,12 @@ err_out: goto out; } -static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) +static inline void ntfs_submit_bh_for_read(struct buffer_head *bh) { lock_buffer(bh); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - return submit_bh(REQ_OP_READ, bh); + submit_bh(REQ_OP_READ, bh); } /** diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index db0f1995aedd..08c659332e26 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1829,6 +1829,13 @@ int ntfs_read_inode_mount(struct inode *vi) goto err_out; } + /* Sanity check offset to the first attribute */ + if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) { + ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", + le16_to_cpu(m->attrs_offset)); + goto err_out; + } + /* Need this to sanity check attribute list references to $MFT. */ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 5ae8de09b271..001f4e053c85 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2092,7 +2092,8 @@ get_ctx_vol_failed: // TODO: Initialize security. /* Get the extended system files' directory inode. */ vol->extend_ino = ntfs_iget(sb, FILE_Extend); - if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) { + if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) || + !S_ISDIR(vol->extend_ino->i_mode)) { if (!IS_ERR(vol->extend_ino)) iput(vol->extend_ino); ntfs_error(sb, "Failed to load $Extend."); diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 5d44ceac855b..e92bbd754365 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -560,7 +560,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd) buf = (ulong *)bh->b_data; - used = __bitmap_weight(buf, wbits); + used = bitmap_weight(buf, wbits); if (used < wbits) { frb = wbits - used; wnd->free_bits[iw] = frb; @@ -1364,7 +1364,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) buf = (ulong *)bh->b_data; __bitmap_clear(buf, b0, blocksize * 8 - b0); - frb = wbits - __bitmap_weight(buf, wbits); + frb = wbits - bitmap_weight(buf, wbits); wnd->total_zeroes += frb - wnd->free_bits[iw]; wnd->free_bits[iw] = frb; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index e7c494005122..0d611a6c5511 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -3819,7 +3819,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) } log_init_pg_hdr(log, page_size, page_size, 1, 1); - log_create(log, l_size, 0, get_random_int(), false, false); + log_create(log, l_size, 0, get_random_u32(), false, false); log->ra = ra; @@ -3893,7 +3893,7 @@ check_restart_area: /* Do some checks based on whether we have a valid log page. */ if (!rst_info.valid_page) { - open_log_count = get_random_int(); + open_log_count = get_random_u32(); goto init_log_instance; } open_log_count = le32_to_cpu(ra2->open_log_count); @@ -4044,7 +4044,7 @@ find_oldest: memcpy(ra->clients, Add2Ptr(ra2, t16), le16_to_cpu(ra2->ra_len) - t16); - log->current_openlog_count = get_random_int(); + log->current_openlog_count = get_random_u32(); ra->open_log_count = cpu_to_le32(log->current_openlog_count); log->ra_size = offsetof(struct RESTART_AREA, clients) + sizeof(struct CLIENT_REC); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 51363d4e8636..d5a3afbbbfd8 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -630,12 +630,9 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); set_bh_page(bh, page, off); - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; + err = bh_read(bh, 0); + if (err < 0) goto out; - } zero_user_segment(page, off + voff, off + block_size); } } @@ -1927,8 +1924,6 @@ const struct inode_operations ntfs_link_inode_operations = { .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, - .set_acl = ntfs_set_acl, }; const struct address_space_operations ntfs_aops = { diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 6ae1f56b7358..7de8718c68a9 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -625,67 +625,6 @@ int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false); } -static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, void *buffer, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - acl = ntfs_get_acl(inode, type, false); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) - return -ENODATA; - - err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return err; -} - -static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, const void *value, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EPERM; - - if (!value) { - acl = NULL; - } else { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - err = posix_acl_valid(&init_user_ns, acl); - if (err) - goto release_and_out; - } - } - - err = ntfs_set_acl(mnt_userns, inode, acl, type); - -release_and_out: - posix_acl_release(acl); - return err; -} - /* * ntfs_init_acl - Initialize the ACLs of a new inode. * @@ -852,23 +791,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - /* TODO: init_user_ns? */ - err = ntfs_xattr_get_acl( - &init_user_ns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - buffer, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); @@ -981,22 +903,6 @@ set_new_fa: goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - err = ntfs_xattr_set_acl( - mnt_userns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - value, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); @@ -1086,7 +992,7 @@ static bool ntfs_xattr_user_list(struct dentry *dentry) } // clang-format off -static const struct xattr_handler ntfs_xattr_handler = { +static const struct xattr_handler ntfs_other_xattr_handler = { .prefix = "", .get = ntfs_getxattr, .set = ntfs_setxattr, @@ -1094,7 +1000,11 @@ static const struct xattr_handler ntfs_xattr_handler = { }; const struct xattr_handler *ntfs_xattr_handlers[] = { - &ntfs_xattr_handler, +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &ntfs_other_xattr_handler, NULL, }; // clang-format on diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index af4157f61927..1d65f6ef00ca 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -636,7 +636,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, !buffer_new(bh) && ocfs2_should_read_blk(inode, page, block_start) && (block_start < from || block_end > to)) { - ll_rw_block(REQ_OP_READ, 1, &bh); + bh_read_nowait(bh, 0); *wait_bh++=bh; } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 81c3d65d68fe..694471fc46b8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2032,7 +2032,7 @@ struct ocfs2_empty_dir_priv { unsigned seen_other; unsigned dx_dir; }; -static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, +static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, int name_len, loff_t pos, u64 ino, unsigned type) { @@ -2052,7 +2052,7 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, */ if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { p->seen_dot = 1; - return 0; + return true; } if (name_len == 2 && !strncmp("..", name, 2) && @@ -2060,13 +2060,13 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, p->seen_dot_dot = 1; if (p->dx_dir && p->seen_dot) - return 1; + return false; - return 0; + return true; } p->seen_other = 1; - return 1; + return false; } static int ocfs2_empty_dir_dx(struct inode *inode, diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index fa87d89cf754..126671e6caed 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2057,7 +2057,7 @@ struct ocfs2_orphan_filldir_priv { enum ocfs2_orphan_reco_type orphan_reco_type; }; -static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, +static bool ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, int name_len, loff_t pos, u64 ino, unsigned type) { @@ -2066,21 +2066,21 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, struct inode *iter; if (name_len == 1 && !strncmp(".", name, 1)) - return 0; + return true; if (name_len == 2 && !strncmp("..", name, 2)) - return 0; + return true; /* do not include dio entry in case of orphan scan */ if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) && (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, OCFS2_DIO_ORPHAN_PREFIX_LEN))) - return 0; + return true; /* Skip bad inodes so that recovery can continue */ iter = ocfs2_iget(p->osb, ino, OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); if (IS_ERR(iter)) - return 0; + return true; if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, OCFS2_DIO_ORPHAN_PREFIX_LEN)) @@ -2090,7 +2090,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, * happen concurrently with unlink/rename */ if (OCFS2_I(iter)->ip_next_orphan) { iput(iter); - return 0; + return true; } trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); @@ -2099,7 +2099,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, OCFS2_I(iter)->ip_next_orphan = p->head; p->head = iter; - return 0; + return true; } static int ocfs2_queue_orphans(struct ocfs2_super *osb, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 961d1cf54388..05f32989bad6 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -232,6 +232,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, handle_t *handle = NULL; struct ocfs2_super *osb; struct ocfs2_dinode *dirfe; + struct ocfs2_dinode *fe = NULL; struct buffer_head *new_fe_bh = NULL; struct inode *inode = NULL; struct ocfs2_alloc_context *inode_ac = NULL; @@ -382,6 +383,7 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, goto leave; } + fe = (struct ocfs2_dinode *) new_fe_bh->b_data; if (S_ISDIR(mode)) { status = ocfs2_fill_new_dir(osb, handle, dir, inode, new_fe_bh, data_ac, meta_ac); @@ -454,8 +456,11 @@ roll_back: leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && fe) + ocfs2_set_links_count(fe, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -632,18 +637,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, return status; } - status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { - u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); - int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, - inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); - if (tmp) - mlog_errno(tmp); - } - - return status; } static int ocfs2_mkdir(struct user_namespace *mnt_userns, @@ -2028,8 +2024,11 @@ bail: ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && fe) + ocfs2_set_links_count(fe, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 638d875eccc7..7aebdbf5cc0a 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -527,7 +527,7 @@ struct ocfs2_extent_block * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. */ struct ocfs2_slot_map { -/*00*/ __le16 sm_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots); /* * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. @@ -548,7 +548,7 @@ struct ocfs2_extended_slot { * i_size. */ struct ocfs2_slot_map_extended { -/*00*/ struct ocfs2_extended_slot se_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots); /* * Actual size is i_size of the slot_map system file. It should * match s_max_slots * sizeof(struct ocfs2_extended_slot) @@ -727,7 +727,7 @@ struct ocfs2_dinode { struct ocfs2_extent_list i_list; struct ocfs2_truncate_log i_dealloc; struct ocfs2_inline_data i_data; - __u8 i_symlink[0]; + DECLARE_FLEX_ARRAY(__u8, i_symlink); } id2; /* Actual on-disk size is one block */ }; @@ -892,7 +892,7 @@ struct ocfs2_group_desc /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; /*40*/ union { - __u8 bg_bitmap[0]; + DECLARE_FLEX_ARRAY(__u8, bg_bitmap); struct { /* * Block groups may be discontiguous when diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1358981e80a3..623db358b1ef 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2614,7 +2614,7 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, } /* - * Calculate out the start and number of virtual clusters we need to to CoW. + * Calculate out the start and number of virtual clusters we need to CoW. * * cpos is vitual start cluster position we want to do CoW in a * file and write_len is the cluster length. diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index a75e2b7d67f5..64e6ddcfe329 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -991,7 +991,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) lc->oc_type = NO_CONTROLD; rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, - DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN, + DLM_LSFL_NEWEXCL, DLM_LVB_LEN, &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); if (rc) { if (rc == -EEXIST || rc == -EPROTO) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index dd77b7aaabf5..317126261523 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -334,10 +334,10 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); + strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; if (cluster_name_len) - strlcpy(new_conn->cc_cluster_name, cluster_name, + strscpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 5805a03d100b..9c74eace3adc 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -106,7 +106,7 @@ int ocfs2_claim_clusters(handle_t *handle, u32 *cluster_start, u32 *num_clusters); /* - * Use this variant of ocfs2_claim_clusters to specify a maxiumum + * Use this variant of ocfs2_claim_clusters to specify a maximum * number of clusters smaller than the allocation reserved. */ int __ocfs2_claim_clusters(handle_t *handle, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e2cc9eec287c..42c993e53924 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1764,9 +1764,7 @@ static int ocfs2_get_sector(struct super_block *sb, if (!buffer_dirty(*bh)) clear_buffer_uptodate(*bh); unlock_buffer(*bh); - ll_rw_block(REQ_OP_READ, 1, bh); - wait_on_buffer(*bh); - if (!buffer_uptodate(*bh)) { + if (bh_read(*bh, 0) < 0) { mlog_errno(-EIO); brelse(*bh); *bh = NULL; @@ -2221,7 +2219,7 @@ static int ocfs2_initialize_super(struct super_block *sb, goto out_journal; } - strlcpy(osb->vol_label, di->id2.i_super.s_label, + strscpy(osb->vol_label, di->id2.i_super.s_label, OCFS2_MAX_VOL_LABEL_LEN); osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); diff --git a/fs/open.c b/fs/open.c index 8a813fa5ca56..a81319b6177f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -716,6 +716,8 @@ int chown_common(const struct path *path, uid_t user, gid_t group) fs_userns = i_user_ns(inode); retry_deleg: + newattrs.ia_vfsuid = INVALID_VFSUID; + newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) return -EINVAL; @@ -840,7 +842,9 @@ static int do_dentry_open(struct file *f, return 0; } - if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { + i_readcount_inc(inode); + } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) goto cleanup_file; @@ -880,8 +884,6 @@ static int do_dentry_open(struct file *f, goto cleanup_all; } f->f_mode |= FMODE_OPENED; - if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_inc(inode); if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; @@ -935,10 +937,7 @@ cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); - if (f->f_mode & FMODE_WRITER) { - put_write_access(inode); - __mnt_drop_write(f->f_path.mnt); - } + put_file_access(f); cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index e2c2699d8016..9cacce5d55c1 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -398,7 +398,7 @@ static int orangefs_dir_release(struct inode *inode, struct file *file) const struct file_operations orangefs_dir_operations = { .llseek = orangefs_dir_llseek, .read = generic_read_dir, - .iterate = orangefs_dir_iterate, + .iterate_shared = orangefs_dir_iterate, .open = orangefs_dir_open, .release = orangefs_dir_release }; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 86810e5d7914..732661aa2680 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -417,9 +417,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file) * readahead cache (if any); this forces an expensive refresh of * data for the next caller of mmap (or 'get_block' accesses) */ - if (file_inode(file) && - file_inode(file)->i_mapping && - mapping_nrpages(&file_inode(file)->i_data)) { + if (mapping_nrpages(file->f_mapping)) { if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { gossip_debug(GOSSIP_INODE_DEBUG, "calling flush_racache on %pU\n", diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index fdde6c56cc3d..f436d8847f08 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -44,7 +44,7 @@ static bool ovl_must_copy_xattr(const char *name) !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } -int ovl_copy_xattr(struct super_block *sb, struct path *oldpath, struct dentry *new) +int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new) { struct dentry *old = oldpath->dentry; ssize_t list_size, size, value_size = 0; @@ -132,8 +132,8 @@ out: return error; } -static int ovl_copy_fileattr(struct inode *inode, struct path *old, - struct path *new) +static int ovl_copy_fileattr(struct inode *inode, const struct path *old, + const struct path *new) { struct fileattr oldfa = { .flags_valid = true }; struct fileattr newfa = { .flags_valid = true }; @@ -193,11 +193,11 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old, return ovl_real_fileattr_set(new, &newfa); } -static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, - struct path *new, loff_t len) +static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, + struct file *new_file, loff_t len) { + struct path datapath; struct file *old_file; - struct file *new_file; loff_t old_pos = 0; loff_t new_pos = 0; loff_t cloned; @@ -206,23 +206,18 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, bool skip_hole = false; int error = 0; - if (len == 0) - return 0; + ovl_path_lowerdata(dentry, &datapath); + if (WARN_ON(datapath.dentry == NULL)) + return -EIO; - old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY); + old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY); if (IS_ERR(old_file)) return PTR_ERR(old_file); - new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY); - if (IS_ERR(new_file)) { - error = PTR_ERR(new_file); - goto out_fput; - } - /* Try to use clone_file_range to clone up within the same fs */ cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); if (cloned == len) - goto out; + goto out_fput; /* Couldn't clone, so now we try to copy the data */ /* Check if lower fs supports seek operation */ @@ -282,10 +277,8 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, len -= bytes; } -out: if (!error && ovl_should_sync(ofs)) error = vfs_fsync(new_file, 0); - fput(new_file); out_fput: fput(old_file); return error; @@ -556,30 +549,31 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) return err; } -static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) +static int ovl_copy_up_data(struct ovl_copy_up_ctx *c, const struct path *temp) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); - struct inode *inode = d_inode(c->dentry); - struct path upperpath, datapath; + struct file *new_file; int err; - ovl_path_upper(c->dentry, &upperpath); - if (WARN_ON(upperpath.dentry != NULL)) - return -EIO; + if (!S_ISREG(c->stat.mode) || c->metacopy || !c->stat.size) + return 0; - upperpath.dentry = temp; + new_file = ovl_path_open(temp, O_LARGEFILE | O_WRONLY); + if (IS_ERR(new_file)) + return PTR_ERR(new_file); - /* - * Copy up data first and then xattrs. Writing data after - * xattrs will remove security.capability xattr automatically. - */ - if (S_ISREG(c->stat.mode) && !c->metacopy) { - ovl_path_lowerdata(c->dentry, &datapath); - err = ovl_copy_up_data(ofs, &datapath, &upperpath, - c->stat.size); - if (err) - return err; - } + err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size); + fput(new_file); + + return err; +} + +static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) +{ + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct inode *inode = d_inode(c->dentry); + struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = temp }; + int err; err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp); if (err) @@ -662,6 +656,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); + struct path path = { .mnt = ovl_upper_mnt(ofs) }; struct dentry *temp, *upper; struct ovl_cu_creds cc; int err; @@ -688,7 +683,16 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) if (IS_ERR(temp)) goto unlock; - err = ovl_copy_up_inode(c, temp); + /* + * Copy up data first and then xattrs. Writing data after + * xattrs will remove security.capability xattr automatically. + */ + path.dentry = temp; + err = ovl_copy_up_data(c, &path); + if (err) + goto cleanup; + + err = ovl_copy_up_metadata(c, temp); if (err) goto cleanup; @@ -732,6 +736,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(c->destdir); struct dentry *temp, *upper; + struct file *tmpfile; struct ovl_cu_creds cc; int err; @@ -739,15 +744,22 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) return err; - temp = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); + tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); ovl_revert_cu_creds(&cc); - if (IS_ERR(temp)) - return PTR_ERR(temp); + if (IS_ERR(tmpfile)) + return PTR_ERR(tmpfile); - err = ovl_copy_up_inode(c, temp); + temp = tmpfile->f_path.dentry; + if (!c->metacopy && c->stat.size) { + err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size); + if (err) + return err; + } + + err = ovl_copy_up_metadata(c, temp); if (err) - goto out_dput; + goto out_fput; inode_lock_nested(udir, I_MUTEX_PARENT); @@ -761,16 +773,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) inode_unlock(udir); if (err) - goto out_dput; + goto out_fput; if (!c->metacopy) ovl_set_upperdata(d_inode(c->dentry)); - ovl_inode_update(d_inode(c->dentry), temp); + ovl_inode_update(d_inode(c->dentry), dget(temp)); - return 0; - -out_dput: - dput(temp); +out_fput: + fput(tmpfile); return err; } @@ -872,7 +882,7 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, return true; } -static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value) +static ssize_t ovl_getxattr_value(const struct path *path, char *name, char **value) { ssize_t res; char *buf; @@ -899,7 +909,7 @@ static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value) static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); - struct path upperpath, datapath; + struct path upperpath; int err; char *capability = NULL; ssize_t cap_size; @@ -908,10 +918,6 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) if (WARN_ON(upperpath.dentry == NULL)) return -EIO; - ovl_path_lowerdata(c->dentry, &datapath); - if (WARN_ON(datapath.dentry == NULL)) - return -EIO; - if (c->stat.size) { err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS, &capability); @@ -919,7 +925,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) goto out; } - err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size); + err = ovl_copy_up_data(c, &upperpath); if (err) goto out_free; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index daff601b5c41..a1a22f58ba18 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -38,7 +38,7 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) #define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) static struct file *ovl_open_realfile(const struct file *file, - struct path *realpath) + const struct path *realpath) { struct inode *realinode = d_inode(realpath->dentry); struct inode *inode = file_inode(file); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 0fbcb590af84..9e61511de7a7 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -588,7 +588,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Introducing security_inode_fileattr_get/set() hooks would solve this issue * properly. */ -static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa, +static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa, bool set) { struct file *file; @@ -610,7 +610,7 @@ static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa, return err; } -int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa) +int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa) { int err; @@ -685,7 +685,7 @@ static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa) } } -int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa) +int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa) { int err; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 69dc577974f8..0fd1d5fdfc72 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -26,7 +26,7 @@ struct ovl_lookup_data { bool metacopy; }; -static int ovl_check_redirect(struct path *path, struct ovl_lookup_data *d, +static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d, size_t prelen, const char *post) { int res; @@ -194,7 +194,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, return real; } -static bool ovl_is_opaquedir(struct ovl_fs *ofs, struct path *path) +static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path) { return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE); } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 87759165d32b..eee8f08d32b6 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -208,7 +208,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs, return err; } -static inline ssize_t ovl_do_getxattr(struct path *path, const char *name, +static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name, void *value, size_t size) { int err, len; @@ -238,7 +238,7 @@ static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs, } static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs, - struct path *path, + const struct path *path, enum ovl_xattr ox, void *value, size_t size) { @@ -250,7 +250,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, size_t size, int flags) { int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, - (void *)value, size, flags); + value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", dentry, name, min((int)size, 48), value, size, flags, err); @@ -310,14 +310,16 @@ static inline int ovl_do_whiteout(struct ovl_fs *ofs, return err; } -static inline struct dentry *ovl_do_tmpfile(struct ovl_fs *ofs, - struct dentry *dentry, umode_t mode) +static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, + struct dentry *dentry, umode_t mode) { - struct dentry *ret = vfs_tmpfile(ovl_upper_mnt_userns(ofs), dentry, mode, 0); - int err = PTR_ERR_OR_ZERO(ret); + struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; + struct file *file = vfs_tmpfile_open(ovl_upper_mnt_userns(ofs), &path, mode, + O_LARGEFILE | O_WRONLY, current_cred()); + int err = PTR_ERR_OR_ZERO(file); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); - return ret; + return file; } static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, @@ -401,13 +403,13 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); u64 ovl_dentry_version_get(struct dentry *dentry); bool ovl_is_whiteout(struct dentry *dentry); -struct file *ovl_path_open(struct path *path, int flags); +struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path, +bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox); -bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path); +bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *upperdentry) @@ -430,9 +432,9 @@ bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry); void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path); +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_is_metacopy_dentry(struct dentry *dentry); -char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding); +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); int ovl_sync_status(struct ovl_fs *ofs); static inline void ovl_set_flag(unsigned long flag, struct inode *inode) @@ -556,7 +558,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); void ovl_dir_cache_free(struct inode *inode); -int ovl_check_d_type_supported(struct path *realpath); +int ovl_check_d_type_supported(const struct path *realpath); int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct ovl_fs *ofs); @@ -673,8 +675,8 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, extern const struct file_operations ovl_file_operations; int __init ovl_aio_request_cache_init(void); void ovl_aio_request_cache_destroy(void); -int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa); -int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa); +int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); +int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); int ovl_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); @@ -683,7 +685,7 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns, int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); -int ovl_copy_xattr(struct super_block *sb, struct path *path, struct dentry *new); +int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new); int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, bool is_upper); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 78f62cc1797b..2b210640036c 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -170,7 +170,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, return p; } -static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, +static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { @@ -179,22 +179,22 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, struct ovl_cache_entry *p; if (ovl_cache_entry_find_link(name, len, &newp, &parent)) - return 0; + return true; p = ovl_cache_entry_new(rdd, name, len, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; - return -ENOMEM; + return false; } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, rdd->root); - return 0; + return true; } -static int ovl_fill_lowest(struct ovl_readdir_data *rdd, +static bool ovl_fill_lowest(struct ovl_readdir_data *rdd, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -211,7 +211,7 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd, list_add_tail(&p->l_node, &rdd->middle); } - return rdd->err; + return rdd->err == 0; } void ovl_cache_free(struct list_head *list) @@ -250,7 +250,7 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) } } -static int ovl_fill_merge(struct dir_context *ctx, const char *name, +static bool ovl_fill_merge(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -264,7 +264,7 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name, return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); } -static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd) +static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { int err; struct ovl_cache_entry *p; @@ -291,7 +291,7 @@ static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd) return err; } -static inline int ovl_dir_read(struct path *realpath, +static inline int ovl_dir_read(const struct path *realpath, struct ovl_readdir_data *rdd) { struct file *realfile; @@ -455,7 +455,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, * copy up origin, call vfs_getattr() on the overlay entry to make * sure that d_ino will be consistent with st_ino from stat(2). */ -static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) +static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p) { struct dentry *dir = path->dentry; @@ -528,7 +528,7 @@ fail: goto out; } -static int ovl_fill_plain(struct dir_context *ctx, const char *name, +static bool ovl_fill_plain(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -540,14 +540,14 @@ static int ovl_fill_plain(struct dir_context *ctx, const char *name, p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; - return -ENOMEM; + return false; } list_add_tail(&p->l_node, rdd->list); - return 0; + return true; } -static int ovl_dir_read_impure(struct path *path, struct list_head *list, +static int ovl_dir_read_impure(const struct path *path, struct list_head *list, struct rb_root *root) { int err; @@ -592,7 +592,7 @@ static int ovl_dir_read_impure(struct path *path, struct list_head *list, return 0; } -static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) +static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) { int res; struct dentry *dentry = path->dentry; @@ -648,7 +648,7 @@ struct ovl_readdir_translate { bool xinowarn; }; -static int ovl_fill_real(struct dir_context *ctx, const char *name, +static bool ovl_fill_real(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -834,7 +834,7 @@ out_unlock: } static struct file *ovl_dir_open_realfile(const struct file *file, - struct path *realpath) + const struct path *realpath) { struct file *res; const struct cred *old_cred; @@ -1027,7 +1027,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, inode_unlock(upper->d_inode); } -static int ovl_check_d_type(struct dir_context *ctx, const char *name, +static bool ovl_check_d_type(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -1036,19 +1036,19 @@ static int ovl_check_d_type(struct dir_context *ctx, const char *name, /* Even if d_type is not supported, DT_DIR is returned for . and .. */ if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen)) - return 0; + return true; if (d_type != DT_UNKNOWN) rdd->d_type_supported = true; - return 0; + return true; } /* * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values * if error is encountered. */ -int ovl_check_d_type_supported(struct path *realpath) +int ovl_check_d_type_supported(const struct path *realpath) { int err; struct ovl_readdir_data rdd = { @@ -1065,7 +1065,7 @@ int ovl_check_d_type_supported(struct path *realpath) #define OVL_INCOMPATDIR_NAME "incompat" -static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, struct path *path, +static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path, int level) { int err; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ec746d447f1b..a29a8afe9b26 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> #include <linux/exportfs.h> +#include <linux/file.h> #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); @@ -908,7 +909,7 @@ static int ovl_mount_dir(const char *name, struct path *path) return err; } -static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, +static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs, const char *name) { struct kstatfs statfs; @@ -1022,7 +1023,20 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, /* Check that everything is OK before copy-up */ if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); + /* The above comment can be understood in two ways: + * + * 1. We just want to check whether the basic POSIX ACL format + * is ok. For example, if the header is correct and the size + * is sane. + * 2. We want to know whether the ACL_{GROUP,USER} entries can + * be mapped according to the underlying filesystem. + * + * Currently, we only check 1. If we wanted to check 2. we + * would need to pass the mnt_userns and the fs_userns of the + * underlying filesystem. But frankly, I think checking 1. is + * enough to start the copy-up. + */ + acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); } @@ -1353,10 +1367,11 @@ static int ovl_create_volatile_dirty(struct ovl_fs *ofs) } static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, - struct path *workpath) + const struct path *workpath) { struct vfsmount *mnt = ovl_upper_mnt(ofs); - struct dentry *temp, *workdir; + struct dentry *workdir; + struct file *tmpfile; bool rename_whiteout; bool d_type; int fh_type; @@ -1392,10 +1407,10 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ - temp = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); - ofs->tmpfile = !IS_ERR(temp); + tmpfile = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); + ofs->tmpfile = !IS_ERR(tmpfile); if (ofs->tmpfile) - dput(temp); + fput(tmpfile); else pr_warn("upper fs does not support tmpfile.\n"); @@ -1482,7 +1497,7 @@ out: } static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, - struct path *upperpath) + const struct path *upperpath) { int err; struct path workpath = { }; @@ -1525,7 +1540,7 @@ out: } static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, - struct ovl_entry *oe, struct path *upperpath) + struct ovl_entry *oe, const struct path *upperpath) { struct vfsmount *mnt = ovl_upper_mnt(ofs); struct dentry *indexdir; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 87f811c089e4..81a57a8d80d9 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -490,7 +490,7 @@ bool ovl_is_whiteout(struct dentry *dentry) return inode && IS_WHITEOUT(inode); } -struct file *ovl_path_open(struct path *path, int flags) +struct file *ovl_path_open(const struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt); @@ -578,7 +578,7 @@ void ovl_copy_up_end(struct dentry *dentry) ovl_inode_unlock(d_inode(dentry)); } -bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path) +bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path) { int res; @@ -591,7 +591,7 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path) return false; } -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path, +bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox) { int res; @@ -971,7 +971,7 @@ err: } /* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */ -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path) +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path) { int res; @@ -1015,7 +1015,7 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return (oe->numlower > 1); } -char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding) +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding) { int res; char *s, *next, *buf = NULL; diff --git a/fs/pipe.c b/fs/pipe.c index 74ae9fafd25a..42c7ff41c2db 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -860,7 +860,7 @@ static struct vfsmount *pipe_mnt __read_mostly; */ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", + return dynamic_dname(buffer, buflen, "pipe:[%lu]", d_inode(dentry)->i_ino); } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 5af33800743e..74dc0f571dc9 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -24,6 +24,7 @@ #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/mnt_idmapping.h> +#include <linux/iversion.h> static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -710,9 +711,9 @@ EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. */ -static int posix_acl_fix_xattr_common(void *value, size_t size) +static int posix_acl_fix_xattr_common(const void *value, size_t size) { - struct posix_acl_xattr_header *header = value; + const struct posix_acl_xattr_header *header = value; int count; if (!header) @@ -720,13 +721,13 @@ static int posix_acl_fix_xattr_common(void *value, size_t size) if (size < sizeof(struct posix_acl_xattr_header)) return -EINVAL; if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return -EINVAL; + return -EOPNOTSUPP; count = posix_acl_xattr_count(size); if (count < 0) return -EINVAL; if (count == 0) - return -EINVAL; + return 0; return count; } @@ -748,7 +749,7 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, return; count = posix_acl_fix_xattr_common(value, size); - if (count < 0) + if (count <= 0) return; for (end = entry + count; entry != end; entry++) { @@ -771,46 +772,6 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, } } -void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, - void *value, size_t size) -{ - struct posix_acl_xattr_header *header = value; - struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; - struct user_namespace *fs_userns = i_user_ns(inode); - int count; - vfsuid_t vfsuid; - vfsgid_t vfsgid; - kuid_t uid; - kgid_t gid; - - if (no_idmapping(mnt_userns, i_user_ns(inode))) - return; - - count = posix_acl_fix_xattr_common(value, size); - if (count < 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch (le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsuid = VFSUIDT_INIT(uid); - uid = from_vfsuid(mnt_userns, fs_userns, vfsuid); - entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid)); - break; - case ACL_GROUP: - gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsgid = VFSGIDT_INIT(gid); - gid = from_vfsgid(mnt_userns, fs_userns, vfsgid); - entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid)); - break; - default: - break; - } - } -} - static void posix_acl_fix_xattr_userns( struct user_namespace *to, struct user_namespace *from, void *value, size_t size) @@ -822,7 +783,7 @@ static void posix_acl_fix_xattr_userns( kgid_t gid; count = posix_acl_fix_xattr_common(value, size); - if (count < 0) + if (count <= 0) return; for (end = entry + count; entry != end; entry++) { @@ -857,12 +818,32 @@ void posix_acl_fix_xattr_to_user(void *value, size_t size) posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); } -/* - * Convert from extended attribute to in-memory representation. +/** + * make_posix_acl - convert POSIX ACLs from uapi to VFS format using the + * provided callbacks to map ACL_{GROUP,USER} entries into the + * appropriate format + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @value: the uapi representation of POSIX ACLs + * @size: the size of @void + * @uid_cb: callback to use for mapping the uid stored in ACL_USER entries + * @gid_cb: callback to use for mapping the gid stored in ACL_GROUP entries + * + * The make_posix_acl() helper is an abstraction to translate from uapi format + * into the VFS format allowing the caller to specific callbacks to map + * ACL_{GROUP,USER} entries into the expected format. This is used in + * posix_acl_from_xattr() and vfs_set_acl_prepare() and avoids pointless code + * duplication. + * + * Return: Allocated struct posix_acl on success, NULL for a valid header but + * without actual POSIX ACL entries, or ERR_PTR() encoded error code. */ -struct posix_acl * -posix_acl_from_xattr(struct user_namespace *user_ns, - const void *value, size_t size) +static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, const void *value, size_t size, + kuid_t (*uid_cb)(struct user_namespace *, struct user_namespace *, + const struct posix_acl_xattr_entry *), + kgid_t (*gid_cb)(struct user_namespace *, struct user_namespace *, + const struct posix_acl_xattr_entry *)) { const struct posix_acl_xattr_header *header = value; const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end; @@ -870,16 +851,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns, struct posix_acl *acl; struct posix_acl_entry *acl_e; - if (!value) - return NULL; - if (size < sizeof(struct posix_acl_xattr_header)) - return ERR_PTR(-EINVAL); - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return ERR_PTR(-EOPNOTSUPP); - - count = posix_acl_xattr_count(size); + count = posix_acl_fix_xattr_common(value, size); if (count < 0) - return ERR_PTR(-EINVAL); + return ERR_PTR(count); if (count == 0) return NULL; @@ -900,16 +874,12 @@ posix_acl_from_xattr(struct user_namespace *user_ns, break; case ACL_USER: - acl_e->e_uid = - make_kuid(user_ns, - le32_to_cpu(entry->e_id)); + acl_e->e_uid = uid_cb(mnt_userns, fs_userns, entry); if (!uid_valid(acl_e->e_uid)) goto fail; break; case ACL_GROUP: - acl_e->e_gid = - make_kgid(user_ns, - le32_to_cpu(entry->e_id)); + acl_e->e_gid = gid_cb(mnt_userns, fs_userns, entry); if (!gid_valid(acl_e->e_gid)) goto fail; break; @@ -924,6 +894,181 @@ fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } + +/** + * vfs_set_acl_prepare_kuid - map ACL_USER uid according to mount- and + * filesystem idmapping + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @e: a ACL_USER entry in POSIX ACL uapi format + * + * The uid stored as ACL_USER entry in @e is a kuid_t stored as a raw {g,u}id + * value. The vfs_set_acl_prepare_kuid() will recover the kuid_t through + * KUIDT_INIT() and then map it according to the idmapped mount. The resulting + * kuid_t is the value which the filesystem can map up into a raw backing store + * id in the filesystem's idmapping. + * + * This is used in vfs_set_acl_prepare() to generate the proper VFS + * representation of POSIX ACLs with ACL_USER entries during setxattr(). + * + * Return: A kuid in @fs_userns for the uid stored in @e. + */ +static inline kuid_t +vfs_set_acl_prepare_kuid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const struct posix_acl_xattr_entry *e) +{ + kuid_t kuid = KUIDT_INIT(le32_to_cpu(e->e_id)); + return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid)); +} + +/** + * vfs_set_acl_prepare_kgid - map ACL_GROUP gid according to mount- and + * filesystem idmapping + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @e: a ACL_GROUP entry in POSIX ACL uapi format + * + * The gid stored as ACL_GROUP entry in @e is a kgid_t stored as a raw {g,u}id + * value. The vfs_set_acl_prepare_kgid() will recover the kgid_t through + * KGIDT_INIT() and then map it according to the idmapped mount. The resulting + * kgid_t is the value which the filesystem can map up into a raw backing store + * id in the filesystem's idmapping. + * + * This is used in vfs_set_acl_prepare() to generate the proper VFS + * representation of POSIX ACLs with ACL_GROUP entries during setxattr(). + * + * Return: A kgid in @fs_userns for the gid stored in @e. + */ +static inline kgid_t +vfs_set_acl_prepare_kgid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const struct posix_acl_xattr_entry *e) +{ + kgid_t kgid = KGIDT_INIT(le32_to_cpu(e->e_id)); + return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid)); +} + +/** + * vfs_set_acl_prepare - convert POSIX ACLs from uapi to VFS format taking + * mount and filesystem idmappings into account + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @value: the uapi representation of POSIX ACLs + * @size: the size of @void + * + * When setting POSIX ACLs with ACL_{GROUP,USER} entries they need to be + * mapped according to the relevant mount- and filesystem idmapping. It is + * important that the ACL_{GROUP,USER} entries in struct posix_acl will be + * mapped into k{g,u}id_t that are supposed to be mapped up in the filesystem + * idmapping. This is crucial since the resulting struct posix_acl might be + * cached filesystem wide. The vfs_set_acl_prepare() function will take care to + * perform all necessary idmappings. + * + * Note, that since basically forever the {g,u}id values encoded as + * ACL_{GROUP,USER} entries in the uapi POSIX ACLs passed via @value contain + * values that have been mapped according to the caller's idmapping. In other + * words, POSIX ACLs passed in uapi format as @value during setxattr() contain + * {g,u}id values in their ACL_{GROUP,USER} entries that should actually have + * been stored as k{g,u}id_t. + * + * This means, vfs_set_acl_prepare() needs to first recover the k{g,u}id_t by + * calling K{G,U}IDT_INIT(). Afterwards they can be interpreted as vfs{g,u}id_t + * through from_vfs{g,u}id() to account for any idmapped mounts. The + * vfs_set_acl_prepare_k{g,u}id() helpers will take care to generate the + * correct k{g,u}id_t. + * + * The filesystem will then receive the POSIX ACLs ready to be cached + * filesystem wide and ready to be written to the backing store taking the + * filesystem's idmapping into account. + * + * Return: Allocated struct posix_acl on success, NULL for a valid header but + * without actual POSIX ACL entries, or ERR_PTR() encoded error code. + */ +struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const void *value, size_t size) +{ + return make_posix_acl(mnt_userns, fs_userns, value, size, + vfs_set_acl_prepare_kuid, + vfs_set_acl_prepare_kgid); +} +EXPORT_SYMBOL(vfs_set_acl_prepare); + +/** + * posix_acl_from_xattr_kuid - map ACL_USER uid into filesystem idmapping + * @mnt_userns: unused + * @fs_userns: the filesystem's idmapping + * @e: a ACL_USER entry in POSIX ACL uapi format + * + * Map the uid stored as ACL_USER entry in @e into the filesystem's idmapping. + * This is used in posix_acl_from_xattr() to generate the proper VFS + * representation of POSIX ACLs with ACL_USER entries. + * + * Return: A kuid in @fs_userns for the uid stored in @e. + */ +static inline kuid_t +posix_acl_from_xattr_kuid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const struct posix_acl_xattr_entry *e) +{ + return make_kuid(fs_userns, le32_to_cpu(e->e_id)); +} + +/** + * posix_acl_from_xattr_kgid - map ACL_GROUP gid into filesystem idmapping + * @mnt_userns: unused + * @fs_userns: the filesystem's idmapping + * @e: a ACL_GROUP entry in POSIX ACL uapi format + * + * Map the gid stored as ACL_GROUP entry in @e into the filesystem's idmapping. + * This is used in posix_acl_from_xattr() to generate the proper VFS + * representation of POSIX ACLs with ACL_GROUP entries. + * + * Return: A kgid in @fs_userns for the gid stored in @e. + */ +static inline kgid_t +posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const struct posix_acl_xattr_entry *e) +{ + return make_kgid(fs_userns, le32_to_cpu(e->e_id)); +} + +/** + * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format + * @fs_userns: the filesystem's idmapping + * @value: the uapi representation of POSIX ACLs + * @size: the size of @void + * + * Filesystems that store POSIX ACLs in the unaltered uapi format should use + * posix_acl_from_xattr() when reading them from the backing store and + * converting them into the struct posix_acl VFS format. The helper is + * specifically intended to be called from the ->get_acl() inode operation. + * + * The posix_acl_from_xattr() function will map the raw {g,u}id values stored + * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The + * posix_acl_from_xattr_k{g,u}id() helpers will take care to generate the + * correct k{g,u}id_t. The returned struct posix_acl can be cached. + * + * Note that posix_acl_from_xattr() does not take idmapped mounts into account. + * If it did it calling is from the ->get_acl() inode operation would return + * POSIX ACLs mapped according to an idmapped mount which would mean that the + * value couldn't be cached for the filesystem. Idmapped mounts are taken into + * account on the fly during permission checking or right at the VFS - + * userspace boundary before reporting them to the user. + * + * Return: Allocated struct posix_acl on success, NULL for a valid header but + * without actual POSIX ACL entries, or ERR_PTR() encoded error code. + */ +struct posix_acl * +posix_acl_from_xattr(struct user_namespace *fs_userns, + const void *value, size_t size) +{ + return make_posix_acl(&init_user_ns, fs_userns, value, size, + posix_acl_from_xattr_kuid, + posix_acl_from_xattr_kgid); +} EXPORT_SYMBOL (posix_acl_from_xattr); /* @@ -1027,7 +1172,17 @@ posix_acl_xattr_set(const struct xattr_handler *handler, int ret; if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); + /* + * By the time we end up here the {g,u}ids stored in + * ACL_{GROUP,USER} have already been mapped according to the + * caller's idmapping. The vfs_set_acl_prepare() helper will + * recover them and take idmapped mounts into account. The + * filesystem will receive the POSIX ACLs in the correct + * format ready to be cached or written to the backing store + * taking the filesystem idmapping into account. + */ + acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode), + value, size); if (IS_ERR(acl)) return PTR_ERR(acl); } @@ -1073,6 +1228,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, } inode->i_ctime = current_time(inode); + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); set_cached_acl(inode, type, acl); return 0; } diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index c930001056f9..32b1116ae137 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -92,6 +92,7 @@ config PROC_PAGE_MONITOR config PROC_CHILDREN bool "Include /proc/<pid>/task/<tid>/children file" + depends on PROC_FS default n help Provides a fast way to retrieve first level children pids of a task. See diff --git a/fs/proc/array.c b/fs/proc/array.c index 99fcbfda8e25..49283b8103c7 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -279,7 +279,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) collect_sigign_sigcatch(p, &ignored, &caught); num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? */ - qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); + qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f7e3d971e4..9e479d7d202b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1761,7 +1761,7 @@ out: return ERR_PTR(error); } -static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) +static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen) { char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; @@ -2350,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + struct vma_iterator vmi; genradix_init(&fa); @@ -2388,7 +2389,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) * routine might require mmap_lock taken in might_fault(). */ - for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + pos = 2; + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) @@ -2728,7 +2731,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, return -ESRCH; length = security_getprocattr(task, PROC_I(inode)->op.lsm, - (char*)file->f_path.dentry->d_name.name, + file->f_path.dentry->d_name.name, &p); put_task_struct(task); if (length > 0) @@ -3196,6 +3199,19 @@ static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace * return 0; } +static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + mmput(mm); + } + + return 0; +} #endif /* CONFIG_KSM */ #ifdef CONFIG_STACKLEAK_METRICS @@ -3331,6 +3347,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; @@ -3668,6 +3685,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 837971e74109..fe7bfcb7d049 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -4,6 +4,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/blkdev.h> +#include "internal.h" static int devinfo_show(struct seq_file *f, void *v) { @@ -54,7 +55,10 @@ static const struct seq_operations devinfo_ops = { static int __init proc_devices_init(void) { - proc_create_seq("devices", 0, NULL, &devinfo_ops); + struct proc_dir_entry *pde; + + pde = proc_create_seq("devices", 0, NULL, &devinfo_ops); + pde_make_permanent(pde); return 0; } fs_initcall(proc_devices_init); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 06a80f78433d..b701d0207edf 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,6 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +static inline void pde_make_permanent(struct proc_dir_entry *pde) +{ + pde->flags |= PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -285,7 +290,7 @@ struct proc_maps_private { struct task_struct *task; struct mm_struct *mm; #ifdef CONFIG_MMU - struct vm_area_struct *tail_vma; + struct vma_iterator iter; #endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 592e6dc7c110..2fc92a13f9f8 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -17,8 +17,6 @@ #include <asm/io.h> -extern wait_queue_head_t log_wait; - static int kmsg_open(struct inode * inode, struct file * file) { return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index f32878d9a39f..817981e57223 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/seqlock.h> #include <linux/time.h> +#include "internal.h" static int loadavg_proc_show(struct seq_file *m, void *v) { @@ -27,7 +28,10 @@ static int loadavg_proc_show(struct seq_file *m, void *v) static int __init proc_loadavg_init(void) { - proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6e89f0e2fd20..440960110a42 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -115,6 +115,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif show_val_kb(m, "PageTables: ", global_node_page_state(NR_PAGETABLE)); + show_val_kb(m, "SecPageTables: ", + global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", @@ -162,7 +164,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) static int __init proc_meminfo_init(void) { - proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_meminfo_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index a2873a617ae8..f2273b164535 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -91,6 +91,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecount_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; @@ -268,6 +269,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, } static const struct proc_ops kpageflags_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpageflags_read, }; @@ -322,6 +324,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecgroup_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecgroup_read, }; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 021e83fe831f..48f2d60bd78a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -28,13 +28,6 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; -/* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; -EXPORT_SYMBOL(sysctl_vals); - -const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; -EXPORT_SYMBOL_GPL(sysctl_long_vals); - /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { @@ -1246,7 +1239,7 @@ static bool get_links(struct ctl_dir *dir, static int insert_links(struct ctl_table_header *head) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; - struct ctl_dir *core_parent = NULL; + struct ctl_dir *core_parent; struct ctl_table_header *links; int err; diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 12901dcf57e2..f4616083faef 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -3,6 +3,7 @@ #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include "internal.h" /* * /proc/softirqs ... display the number of softirqs @@ -27,7 +28,10 @@ static int show_softirqs(struct seq_file *p, void *v) static int __init proc_softirqs_init(void) { - proc_create_single("softirqs", 0, NULL, show_softirqs); + struct proc_dir_entry *pde; + + pde = proc_create_single("softirqs", 0, NULL, show_softirqs); + pde_make_permanent(pde); return 0; } fs_initcall(proc_softirqs_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4e0023643f8b..8a74cdcc9af0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> -#include <linux/vmacache.h> #include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> @@ -124,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -2UL; + vma = get_gate_vma(priv->mm); + } + + return vma; +} + static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; /* See m_next(). Zero at the start or after lseek. */ if (last_addr == -1UL) @@ -153,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-EINTR); } + vma_iter_init(&priv->iter, mm, last_addr); hold_task_mempolicy(priv); - priv->tail_vma = get_gate_vma(mm); - - vma = find_vma(mm, last_addr); - if (vma) - return vma; + if (last_addr == -2UL) + return get_gate_vma(mm); - return priv->tail_vma; + return proc_get_vma(priv, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { - struct proc_maps_private *priv = m->private; - struct vm_area_struct *next, *vma = v; - - if (vma == priv->tail_vma) - next = NULL; - else if (vma->vm_next) - next = vma->vm_next; - else - next = priv->tail_vma; - - *ppos = next ? next->vm_start : -1UL; - - return next; + if (*ppos == -2UL) { + *ppos = -1UL; + return NULL; + } + return proc_get_vma(m->private, ppos); } static void m_stop(struct seq_file *m, void *v) @@ -864,7 +867,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - hugepage_vma_check(vma, vma->vm_flags, true, false)); + hugepage_vma_check(vma, vma->vm_flags, true, false, true)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); @@ -877,16 +880,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mem_size_stats mss; - struct mm_struct *mm; + struct mm_struct *mm = priv->mm; struct vm_area_struct *vma; - unsigned long last_vma_end = 0; + unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) return -ESRCH; - mm = priv->mm; if (!mm || !mmget_not_zero(mm)) { ret = -ESRCH; goto out_put_task; @@ -899,8 +902,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); + vma = mas_find(&mas, ULONG_MAX); + + if (unlikely(!vma)) + goto empty_set; - for (vma = priv->mm->mmap; vma;) { + vma_start = vma->vm_start; + do { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; @@ -909,6 +917,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. */ if (mmap_lock_is_contended(mm)) { + mas_pause(&mas); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -952,7 +961,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * contains last_vma_end. * Iterate VMA' from last_vma_end. */ - vma = find_vma(mm, last_vma_end - 1); + vma = mas_find(&mas, ULONG_MAX); /* Case 3 above */ if (!vma) break; @@ -966,11 +975,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v) smap_gather_stats(vma, &mss, last_vma_end); } /* Case 2 above */ - vma = vma->vm_next; - } + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); - show_vma_header_prefix(m, priv->mm->mmap->vm_start, - last_vma_end, 0, 0, 0, 0); +empty_set: + show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); @@ -1263,6 +1271,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + MA_STATE(mas, &mm->mm_mt, 0, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1282,7 +1291,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vma->vm_flags &= ~VM_SOFTDIRTY; @@ -1294,8 +1303,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, - &cp); + walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); @@ -1418,9 +1426,19 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); - if (pm->show_pfn) + if (pm->show_pfn) { + pgoff_t offset; + /* + * For PFN swap offsets, keeping the offset field + * to be PFN only to be compatible with old smaps. + */ + if (is_pfn_swap_entry(entry)) + offset = swp_offset_pfn(entry); + else + offset = swp_offset(entry); frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + (offset << MAX_SWAPFILES_SHIFT); + } flags |= PM_SWAP; migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) @@ -1477,7 +1495,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned long offset; if (pm->show_pfn) { - offset = swp_offset(entry) + + if (is_pfn_swap_entry(entry)) + offset = swp_offset_pfn(entry); + else + offset = swp_offset(entry); + offset = offset + ((addr & ~PMD_MASK) >> PAGE_SHIFT); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index a6d21fc0033c..2fd06f52b6a4 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -20,15 +20,13 @@ */ void task_mem(struct seq_file *m, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; struct vm_region *region; - struct rb_node *p; unsigned long bytes = 0, sbytes = 0, slack = 0, size; - - mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + mmap_read_lock(mm); + for_each_vma(vmi, vma) { bytes += kobjsize(vma); region = vma->vm_region; @@ -82,15 +80,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long task_vsize(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - struct rb_node *p; unsigned long vsize = 0; mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; - } mmap_read_unlock(mm); return vsize; } @@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; struct vm_region *region; - struct rb_node *p; unsigned long size = kobjsize(mm); mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + for_each_vma(vmi, vma) { size += kobjsize(vma); region = vma->vm_region; if (region) { @@ -190,17 +185,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) */ static int show_map(struct seq_file *m, void *_p) { - struct rb_node *p = _p; - - return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); + return nommu_vma_show(m, _p); } static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; struct mm_struct *mm; - struct rb_node *p; - loff_t n = *pos; + struct vm_area_struct *vma; + unsigned long addr = *pos; + + /* See m_next(). Zero at the start or after lseek. */ + if (addr == -1UL) + return NULL; /* pin the task and mm whilst we play with them */ priv->task = get_proc_task(priv->inode); @@ -216,10 +213,10 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EINTR); } - /* start from the Nth VMA */ - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) - if (n-- == 0) - return p; + /* start the next element from addr */ + vma = find_vma(mm, addr); + if (vma) + return vma; mmap_read_unlock(mm); mmput(mm); @@ -242,10 +239,10 @@ static void m_stop(struct seq_file *m, void *_vml) static void *m_next(struct seq_file *m, void *_p, loff_t *pos) { - struct rb_node *p = _p; + struct vm_area_struct *vma = _p; - (*pos)++; - return p ? rb_next(p) : NULL; + *pos = vma->vm_end; + return find_vma(vma->vm_mm, vma->vm_end); } static const struct seq_operations proc_pid_maps_ops = { diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index deb99bc9b7e6..b5343d209381 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -7,6 +7,7 @@ #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/kernel_stat.h> +#include "internal.h" static int uptime_proc_show(struct seq_file *m, void *v) { @@ -39,7 +40,10 @@ static int uptime_proc_show(struct seq_file *m, void *v) static int __init proc_uptime_init(void) { - proc_create_single("uptime", 0, NULL, uptime_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("uptime", 0, NULL, uptime_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index b449f186577f..02e3c3cd4a9a 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -5,6 +5,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include "internal.h" static int version_proc_show(struct seq_file *m, void *v) { @@ -17,7 +18,10 @@ static int version_proc_show(struct seq_file *m, void *v) static int __init proc_version_init(void) { - proc_create_single("version", 0, NULL, version_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("version", 0, NULL, version_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_version_init); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b2fd3c20e7c2..0c034ea39954 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,14 +28,11 @@ #include <linux/crypto.h> #include <linux/string.h> #include <linux/timer.h> -#include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/jiffies.h> #include <linux/workqueue.h> -#include <crypto/acompress.h> - #include "internal.h" /* @@ -93,8 +90,7 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); /* Compression parameters */ -static struct crypto_acomp *tfm; -static struct acomp_req *creq; +static struct crypto_comp *tfm; struct pstore_zbackend { int (*zbufsize)(size_t size); @@ -272,21 +268,12 @@ static const struct pstore_zbackend zbackends[] = { static int pstore_compress(const void *in, void *out, unsigned int inlen, unsigned int outlen) { - struct scatterlist src, dst; int ret; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS)) return -EINVAL; - sg_init_table(&src, 1); - sg_set_buf(&src, in, inlen); - - sg_init_table(&dst, 1); - sg_set_buf(&dst, out, outlen); - - acomp_request_set_params(creq, &src, &dst, inlen, outlen); - - ret = crypto_acomp_compress(creq); + ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); if (ret) { pr_err("crypto_comp_compress failed, ret = %d!\n", ret); return ret; @@ -297,7 +284,7 @@ static int pstore_compress(const void *in, void *out, static void allocate_buf_for_compression(void) { - struct crypto_acomp *acomp; + struct crypto_comp *ctx; int size; char *buf; @@ -309,7 +296,7 @@ static void allocate_buf_for_compression(void) if (!psinfo || tfm) return; - if (!crypto_has_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC)) { + if (!crypto_has_comp(zbackend->name, 0, 0)) { pr_err("Unknown compression: %s\n", zbackend->name); return; } @@ -328,24 +315,16 @@ static void allocate_buf_for_compression(void) return; } - acomp = crypto_alloc_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(acomp)) { + ctx = crypto_alloc_comp(zbackend->name, 0, 0); + if (IS_ERR_OR_NULL(ctx)) { kfree(buf); pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, - PTR_ERR(acomp)); - return; - } - - creq = acomp_request_alloc(acomp); - if (!creq) { - crypto_free_acomp(acomp); - kfree(buf); - pr_err("acomp_request_alloc('%s') failed\n", zbackend->name); + PTR_ERR(ctx)); return; } /* A non-NULL big_oops_buf indicates compression is available. */ - tfm = acomp; + tfm = ctx; big_oops_buf_sz = size; big_oops_buf = buf; @@ -355,8 +334,7 @@ static void allocate_buf_for_compression(void) static void free_buf_for_compression(void) { if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) { - acomp_request_free(creq); - crypto_free_acomp(tfm); + crypto_free_comp(tfm); tfm = NULL; } kfree(big_oops_buf); @@ -693,8 +671,6 @@ static void decompress_record(struct pstore_record *record) int ret; int unzipped_len; char *unzipped, *workspace; - struct acomp_req *dreq; - struct scatterlist src, dst; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed) return; @@ -718,30 +694,16 @@ static void decompress_record(struct pstore_record *record) if (!workspace) return; - dreq = acomp_request_alloc(tfm); - if (!dreq) { - kfree(workspace); - return; - } - - sg_init_table(&src, 1); - sg_set_buf(&src, record->buf, record->size); - - sg_init_table(&dst, 1); - sg_set_buf(&dst, workspace, unzipped_len); - - acomp_request_set_params(dreq, &src, &dst, record->size, unzipped_len); - /* After decompression "unzipped_len" is almost certainly smaller. */ - ret = crypto_acomp_decompress(dreq); + ret = crypto_comp_decompress(tfm, record->buf, record->size, + workspace, &unzipped_len); if (ret) { - pr_err("crypto_acomp_decompress failed, ret = %d!\n", ret); + pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); kfree(workspace); return; } /* Append ECC notice to decompressed buffer. */ - unzipped_len = dreq->dlen; memcpy(workspace + unzipped_len, record->buf + record->size, record->ecc_notice_size); @@ -749,7 +711,6 @@ static void decompress_record(struct pstore_record *record) unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size, GFP_KERNEL); kfree(workspace); - acomp_request_free(dreq); if (!unzipped) return; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index b9895afca9d1..85b2fa3b211c 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -470,10 +470,8 @@ out2: out1: iput(sbi->inodes); out: - if (bh1) - brelse(bh1); - if (bh2) - brelse(bh2); + brelse(bh1); + brelse(bh2); outnobh: kfree(qs); s->s_fs_info = NULL; diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 5f2405994280..0f1493e0f6d0 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -71,6 +71,40 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) return ret; } +static inline int do_check_range(struct super_block *sb, const char *val_name, + uint val, uint min_val, uint max_val) +{ + if (val < min_val || val > max_val) { + quota_error(sb, "Getting %s %u out of range %u-%u", + val_name, val, min_val, max_val); + return -EUCLEAN; + } + + return 0; +} + +static int check_dquot_block_header(struct qtree_mem_dqinfo *info, + struct qt_disk_dqdbheader *dh) +{ + int err = 0; + + err = do_check_range(info->dqi_sb, "dqdh_next_free", + le32_to_cpu(dh->dqdh_next_free), 0, + info->dqi_blocks - 1); + if (err) + return err; + err = do_check_range(info->dqi_sb, "dqdh_prev_free", + le32_to_cpu(dh->dqdh_prev_free), 0, + info->dqi_blocks - 1); + if (err) + return err; + err = do_check_range(info->dqi_sb, "dqdh_entries", + le16_to_cpu(dh->dqdh_entries), 0, + qtree_dqstr_in_blk(info)); + + return err; +} + /* Remove empty block from list and return it */ static int get_free_dqblk(struct qtree_mem_dqinfo *info) { @@ -85,6 +119,9 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info) ret = read_blk(info, blk, buf); if (ret < 0) goto out_buf; + ret = check_dquot_block_header(info, dh); + if (ret) + goto out_buf; info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); } else { @@ -232,6 +269,9 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, *err = read_blk(info, blk, buf); if (*err < 0) goto out_buf; + *err = check_dquot_block_header(info, dh); + if (*err) + goto out_buf; } else { blk = get_free_dqblk(info); if ((int)blk < 0) { @@ -313,6 +353,10 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } ref = (__le32 *)buf; newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + ret = do_check_range(dquot->dq_sb, "block", newblk, 0, + info->dqi_blocks - 1); + if (ret) + goto out_buf; if (!newblk) newson = 1; if (depth == info->dqi_qtree_depth - 1) { @@ -424,6 +468,9 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; + ret = check_dquot_block_header(info, dh); + if (ret) + goto out_buf; le16_add_cpu(&dh->dqdh_entries, -1); if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ ret = remove_free_dqentry(info, buf, blk); @@ -480,12 +527,10 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); - if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) { - quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", - newblk, info->dqi_blocks); - ret = -EUCLEAN; + ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF, + info->dqi_blocks - 1); + if (ret) goto out_buf; - } if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); @@ -586,12 +631,10 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; - if (blk < QT_TREEOFF || blk >= info->dqi_blocks) { - quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", - blk, info->dqi_blocks); - ret = -EUCLEAN; + ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF, + info->dqi_blocks - 1); + if (ret) goto out_buf; - } if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1); @@ -705,15 +748,21 @@ static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id, goto out_buf; } for (i = __get_index(info, *id, depth); i < epb; i++) { - if (ref[i] == cpu_to_le32(0)) { + uint blk_no = le32_to_cpu(ref[i]); + + if (blk_no == 0) { *id += level_inc; continue; } + ret = do_check_range(info->dqi_sb, "block", blk_no, 0, + info->dqi_blocks - 1); + if (ret) + goto out_buf; if (depth == info->dqi_qtree_depth - 1) { ret = 0; goto out_buf; } - ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1); + ret = find_next_id(info, id, blk_no, depth + 1); if (ret != -ENOENT) break; } diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ba3525ccc27e..cb240eac5036 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -203,9 +203,9 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long maxpages, lpages, nr, loop, ret; + unsigned long maxpages, lpages, nr_folios, loop, ret, nr_pages, pfn; struct inode *inode = file_inode(file); - struct page **pages = NULL, **ptr, *page; + struct folio_batch fbatch; loff_t isize; /* the mapping mustn't extend beyond the EOF */ @@ -221,31 +221,39 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, goto out; /* gang-find the pages */ - pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - goto out_free; - - nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); - if (nr != lpages) - goto out_free_pages; /* leave if some pages were missing */ + folio_batch_init(&fbatch); + nr_pages = 0; +repeat: + nr_folios = filemap_get_folios_contig(inode->i_mapping, &pgoff, + ULONG_MAX, &fbatch); + if (!nr_folios) { + ret = -ENOSYS; + return ret; + } + if (ret == -ENOSYS) { + ret = (unsigned long) folio_address(fbatch.folios[0]); + pfn = folio_pfn(fbatch.folios[0]); + } /* check the pages for physical adjacency */ - ptr = pages; - page = *ptr++; - page++; - for (loop = lpages; loop > 1; loop--) - if (*ptr++ != page++) - goto out_free_pages; + for (loop = 0; loop < nr_folios; loop++) { + if (pfn + nr_pages != folio_pfn(fbatch.folios[loop])) { + ret = -ENOSYS; + goto out_free; /* leave if not physical adjacent */ + } + nr_pages += folio_nr_pages(fbatch.folios[loop]); + if (nr_pages >= lpages) + goto out_free; /* successfully found desired pages*/ + } + if (nr_pages < lpages) { + folio_batch_release(&fbatch); + goto repeat; /* loop if pages are missing */ + } /* okay - all conditions fulfilled */ - ret = (unsigned long) page_address(pages[0]); -out_free_pages: - ptr = pages; - for (loop = nr; loop > 0; loop--) - put_page(*ptr++); out_free: - kfree(pages); + folio_batch_release(&fbatch); out: return ret; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index bc66d0173e33..b3257e852820 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -146,15 +146,15 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, } static int ramfs_tmpfile(struct user_namespace *mnt_userns, - struct inode *dir, struct dentry *dentry, umode_t mode) + struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); if (!inode) return -ENOSPC; - d_tmpfile(dentry, inode); - return 0; + d_tmpfile(file, inode); + return finish_open_simple(file, 0); } static const struct inode_operations ramfs_dir_inode_operations = { diff --git a/fs/read_write.c b/fs/read_write.c index 1a261dcf1778..328ce8cf9a85 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t } /* caller is responsible for file_start_write/file_end_write */ -ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { - struct kvec iov = { - .iov_base = (void *)buf, - .iov_len = min_t(size_t, count, MAX_RW_COUNT), - }; struct kiocb kiocb; - struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) @@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; - iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); - ret = file->f_op->write_iter(&kiocb, &iter); + ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; @@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t inc_syscw(current); return ret; } + +/* caller is responsible for file_start_write/file_end_write */ +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +{ + struct kvec iov = { + .iov_base = (void *)buf, + .iov_len = min_t(size_t, count, MAX_RW_COUNT), + }; + struct iov_iter iter; + iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); + return __kernel_write_iter(file, &iter, pos); +} /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually diff --git a/fs/readdir.c b/fs/readdir.c index 09e8ed7d4161..9c53edb60c03 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -140,7 +140,7 @@ struct readdir_callback { int result; }; -static int fillonedir(struct dir_context *ctx, const char *name, int namlen, +static bool fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct readdir_callback *buf = @@ -149,14 +149,14 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, unsigned long d_ino; if (buf->result) - return -EINVAL; + return false; buf->result = verify_dirent_name(name, namlen); - if (buf->result < 0) - return buf->result; + if (buf->result) + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; - return -EOVERFLOW; + return false; } buf->result++; dirent = buf->dirent; @@ -169,12 +169,12 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, unsafe_put_user(namlen, &dirent->d_namlen, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); - return 0; + return true; efault_end: user_write_access_end(); efault: buf->result = -EFAULT; - return -EFAULT; + return false; } SYSCALL_DEFINE3(old_readdir, unsigned int, fd, @@ -219,7 +219,7 @@ struct getdents_callback { int error; }; -static int filldir(struct dir_context *ctx, const char *name, int namlen, +static bool filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent __user *dirent, *prev; @@ -232,18 +232,18 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) - return buf->error; + return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) - return -EINVAL; + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; - return -EOVERFLOW; + return false; } prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) - return -EINTR; + return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) @@ -260,12 +260,12 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; buf->count -= reclen; - return 0; + return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; - return -EFAULT; + return false; } SYSCALL_DEFINE3(getdents, unsigned int, fd, @@ -307,7 +307,7 @@ struct getdents_callback64 { int error; }; -static int filldir64(struct dir_context *ctx, const char *name, int namlen, +static bool filldir64(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent64 __user *dirent, *prev; @@ -319,13 +319,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) - return buf->error; + return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) - return -EINVAL; + return false; prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) - return -EINTR; + return false; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) @@ -342,13 +342,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; - return 0; + return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; - return -EFAULT; + return false; } SYSCALL_DEFINE3(getdents64, unsigned int, fd, @@ -397,7 +397,7 @@ struct compat_readdir_callback { int result; }; -static int compat_fillonedir(struct dir_context *ctx, const char *name, +static bool compat_fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -407,14 +407,14 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, compat_ulong_t d_ino; if (buf->result) - return -EINVAL; + return false; buf->result = verify_dirent_name(name, namlen); - if (buf->result < 0) - return buf->result; + if (buf->result) + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; - return -EOVERFLOW; + return false; } buf->result++; dirent = buf->dirent; @@ -427,12 +427,12 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, unsafe_put_user(namlen, &dirent->d_namlen, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); - return 0; + return true; efault_end: user_write_access_end(); efault: buf->result = -EFAULT; - return -EFAULT; + return false; } COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, @@ -471,7 +471,7 @@ struct compat_getdents_callback { int error; }; -static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, +static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct compat_linux_dirent __user *dirent, *prev; @@ -484,18 +484,18 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) - return buf->error; + return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) - return -EINVAL; + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; - return -EOVERFLOW; + return false; } prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) - return -EINTR; + return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) @@ -511,12 +511,12 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; - return 0; + return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; - return -EFAULT; + return false; } COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 94addfcefede..9f62da7471c9 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -868,7 +868,7 @@ loop_next: */ if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { spin_unlock(lock); - ll_rw_block(REQ_OP_WRITE, 1, &bh); + write_dirty_buffer(bh, 0); spin_lock(lock); } put_bh(bh); @@ -1054,7 +1054,7 @@ static int flush_commit_list(struct super_block *s, if (tbh) { if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_WRITE, 1, &tbh); + write_dirty_buffer(tbh, 0); reiserfs_write_lock_nested(s, depth); } put_bh(tbh) ; @@ -2240,7 +2240,7 @@ abort_replay: } } /* read in the log blocks, memcpy to the corresponding real block */ - ll_rw_block(REQ_OP_READ, get_desc_trans_len(desc), log_blocks); + bh_read_batch(get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(log_blocks[i]); @@ -2342,10 +2342,11 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, } else bhlist[j++] = bh; } - ll_rw_block(REQ_OP_READ, j, bhlist); + bh = bhlist[0]; + bh_read_nowait(bh, 0); + bh_readahead_batch(j - 1, &bhlist[1], 0); for (i = 1; i < j; i++) brelse(bhlist[i]); - bh = bhlist[0]; wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 30319dc33c18..84a194b77f19 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -456,7 +456,7 @@ static int print_internal(struct buffer_head *bh, int first, int last) to = B_NR_ITEMS(bh); } else { from = first; - to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh); + to = min_t(int, last, B_NR_ITEMS(bh)); } reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 4a7cb16e9345..3dba8acf4e83 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -411,7 +411,7 @@ int reiserfs_proc_info_init(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; @@ -441,7 +441,7 @@ int reiserfs_proc_info_done(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, sb->s_id, BDEVNAME_SIZE); + strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 8096c74c38ac..7b498a0d060b 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -97,7 +97,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) * using the copy_size var below allows this code to work for * both shrinking and expanding the FS. */ - copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr; + copy_size = min(bmap_nr_new, bmap_nr); copy_size = copy_size * sizeof(struct reiserfs_list_bitmap_node *); for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 9a293609a022..84c12a1947b2 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -579,7 +579,7 @@ static int search_by_key_reada(struct super_block *s, if (!buffer_uptodate(bh[j])) { if (depth == -1) depth = reiserfs_write_unlock_nested(s); - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, bh + j); + bh_readahead(bh[j], REQ_RAHEAD); } brelse(bh[j]); } @@ -685,7 +685,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, if (!buffer_uptodate(bh) && depth == -1) depth = reiserfs_write_unlock_nested(sb); - ll_rw_block(REQ_OP_READ, 1, &bh); + bh_read_nowait(bh, 0); wait_on_buffer(bh); if (depth != -1) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c88cd2ce0665..929acce6e731 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1702,9 +1702,7 @@ static int read_super_block(struct super_block *s, int offset) /* after journal replay, reread all bitmap and super blocks */ static int reread_meta_blocks(struct super_block *s) { - ll_rw_block(REQ_OP_READ, 1, &SB_BUFFER_WITH_SB(s)); - wait_on_buffer(SB_BUFFER_WITH_SB(s)); - if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { + if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; } @@ -2504,9 +2502,7 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, len = i_size - off; toread = len; while (toread > 0) { - tocopy = - sb->s_blocksize - offset < - toread ? sb->s_blocksize - offset : toread; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; /* * Quota files are without tails so we can safely @@ -2554,8 +2550,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, return -EIO; } while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; reiserfs_write_lock(sb); err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 436641369283..8b2d52443f41 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -189,7 +189,7 @@ struct reiserfs_dentry_buf { struct dentry *dentries[8]; }; -static int +static bool fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -200,16 +200,16 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir))); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) - return -ENOSPC; + return false; if (name[0] == '.' && (namelen < 2 || (namelen == 2 && name[1] == '.'))) - return 0; + return true; dentry = lookup_one_len(name, dbuf->xadir, namelen); if (IS_ERR(dentry)) { dbuf->err = PTR_ERR(dentry); - return PTR_ERR(dentry); + return false; } else if (d_really_is_negative(dentry)) { /* A directory entry exists, but no file? */ reiserfs_error(dentry->d_sb, "xattr-20003", @@ -218,11 +218,11 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, dentry, dbuf->xadir); dput(dentry); dbuf->err = -EIO; - return -EIO; + return false; } dbuf->dentries[dbuf->count++] = dentry; - return 0; + return true; } static void @@ -797,7 +797,7 @@ struct listxattr_buf { struct dentry *dentry; }; -static int listxattr_filler(struct dir_context *ctx, const char *name, +static bool listxattr_filler(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -813,19 +813,19 @@ static int listxattr_filler(struct dir_context *ctx, const char *name, name); if (!handler /* Unsupported xattr name */ || (handler->list && !handler->list(b->dentry))) - return 0; + return true; size = namelen + 1; if (b->buf) { if (b->pos + size > b->size) { b->pos = -ERANGE; - return -ERANGE; + return false; } memcpy(b->buf + b->pos, name, namelen); b->buf[b->pos + namelen] = 0; } b->pos += size; } - return 0; + return true; } /* diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 2cab413fffee..7d605db3bb3b 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -1101,7 +1101,11 @@ struct smb2_change_notify_rsp { #define SMB2_CREATE_REQUEST_LEASE "RqLs" #define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" -#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" +#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" +#define SMB2_CREATE_APP_INSTANCE_ID "\x45\xBC\xA6\x6A\xEF\xA7\xF7\x4A\x90\x08\xFA\x46\x2E\x14\x4D\x74" +#define SMB2_CREATE_APP_INSTANCE_VERSION "\xB9\x82\xD0\xB7\x3B\x56\x07\x4F\xA0\x7B\x52\x4A\x81\x16\xA0\x10" +#define SVHDX_OPEN_DEVICE_CONTEXT "\x9C\xCB\xCF\x9E\x04\xC1\xE6\x43\x98\x0E\x15\x8D\xA1\xF6\xEC\x83" +#define SMB2_CREATE_TAG_AAPL "AAPL" /* Flag (SMB3 open response) values */ #define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e56510964b22..8ba8c4c50770 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -506,8 +506,9 @@ static int squashfs_readahead_fragment(struct page **page, squashfs_i(inode)->fragment_size); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + int error = buffer->error; - if (buffer->error) + if (error) goto out; expected += squashfs_i(inode)->fragment_offset; @@ -529,7 +530,7 @@ static int squashfs_readahead_fragment(struct page **page, out: squashfs_cache_put(buffer); - return buffer->error; + return error; } static void squashfs_readahead(struct readahead_control *ractl) @@ -557,6 +558,13 @@ static void squashfs_readahead(struct readahead_control *ractl) int res, bsize; u64 block = 0; unsigned int expected; + struct page *last_page; + + expected = start >> msblk->block_log == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + max_pages = (expected + PAGE_SIZE - 1) >> PAGE_SHIFT; nr_pages = __readahead_batch(ractl, pages, max_pages); if (!nr_pages) @@ -566,13 +574,10 @@ static void squashfs_readahead(struct readahead_control *ractl) goto skip_pages; index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) goto skip_pages; - expected = index == file_end ? - (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; - if (index == file_end && squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { res = squashfs_readahead_fragment(pages, nr_pages, @@ -593,15 +598,15 @@ static void squashfs_readahead(struct readahead_control *ractl) res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - squashfs_page_actor_free(actor); + last_page = squashfs_page_actor_free(actor); if (res == expected) { int bytes; /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (pages[nr_pages - 1]->index == file_end && bytes) - memzero_page(pages[nr_pages - 1], bytes, + if (index == file_end && bytes && last_page) + memzero_page(last_page, bytes, PAGE_SIZE - bytes); for (i = 0; i < nr_pages; i++) { diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 54b93bf4a25c..81af6c4ca115 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -71,11 +71,13 @@ static void *handle_next_page(struct squashfs_page_actor *actor) (actor->next_index != actor->page[actor->next_page]->index)) { actor->next_index++; actor->returned_pages++; + actor->last_page = NULL; return actor->alloc_buffer ? actor->tmp_buffer : ERR_PTR(-ENOMEM); } actor->next_index++; actor->returned_pages++; + actor->last_page = actor->page[actor->next_page]; return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); } @@ -125,6 +127,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_ actor->returned_pages = 0; actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; + actor->last_page = NULL; actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 95ffbb543d91..97d4983559b1 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -16,6 +16,7 @@ struct squashfs_page_actor { void *(*squashfs_first_page)(struct squashfs_page_actor *); void *(*squashfs_next_page)(struct squashfs_page_actor *); void (*squashfs_finish_page)(struct squashfs_page_actor *); + struct page *last_page; int pages; int length; int next_page; @@ -29,10 +30,13 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, extern struct squashfs_page_actor *squashfs_page_actor_init_special( struct squashfs_sb_info *msblk, struct page **page, int pages, int length); -static inline void squashfs_page_actor_free(struct squashfs_page_actor *actor) +static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor) { + struct page *last_page = actor->last_page; + kfree(actor->tmp_buffer); kfree(actor); + return last_page; } static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { diff --git a/fs/stat.c b/fs/stat.c index 9ced8860e0f3..ef50573c72a2 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -5,6 +5,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ +#include <linux/blkdev.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/errno.h> @@ -230,11 +231,22 @@ retry: goto out; error = vfs_getattr(&path, stat, request_mask, flags); + stat->mnt_id = real_mount(path.mnt)->mnt_id; stat->result_mask |= STATX_MNT_ID; + if (path.mnt->mnt_root == path.dentry) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; + + /* Handle STATX_DIOALIGN for block devices. */ + if (request_mask & STATX_DIOALIGN) { + struct inode *inode = d_backing_inode(path.dentry); + + if (S_ISBLK(inode->i_mode)) + bdev_statx_dioalign(inode, stat); + } + path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -611,6 +623,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); tmp.stx_mnt_id = stat->mnt_id; + tmp.stx_dio_mem_align = stat->dio_mem_align; + tmp.stx_dio_offset_align = stat->dio_offset_align; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } diff --git a/fs/super.c b/fs/super.c index 734ed584a946..8d39e4f11cfa 100644 --- a/fs/super.c +++ b/fs/super.c @@ -291,7 +291,7 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); - fscrypt_sb_free(s); + fscrypt_destroy_keyring(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); @@ -480,6 +480,7 @@ void generic_shutdown_super(struct super_block *sb) evict_inodes(sb); /* only nonzero refcount inodes can have marks */ fsnotify_sb_delete(sb); + fscrypt_destroy_keyring(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index c57b46a352d8..3125e76376ee 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,6 +24,17 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; } +/** + * ubifs_encrypt - Encrypt data. + * @inode: inode which refers to the data node + * @dn: data node to encrypt + * @in_len: length of data to be compressed + * @out_len: allocated memory size for the data area of @dn + * @block: logical block number of the block + * + * This function encrypt a possibly-compressed data in the data node. + * The encrypted data length will store in @out_len. + */ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index fc718f6178f2..3f128b9fdfbb 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2467,7 +2467,7 @@ error_dump: static inline int chance(unsigned int n, unsigned int out_of) { - return !!((prandom_u32() % out_of) + 1 <= n); + return !!(prandom_u32_max(out_of) + 1 <= n); } @@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) if (chance(1, 2)) { d->pc_delay = 1; /* Fail within 1 minute */ - delay = prandom_u32() % 60000; + delay = prandom_u32_max(60000); d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); ubifs_warn(c, "failing after %lums", delay); } else { d->pc_delay = 2; - delay = prandom_u32() % 10000; + delay = prandom_u32_max(10000); /* Fail within 10000 operations */ d->pc_cnt_max = delay; ubifs_warn(c, "failing after %lu calls", delay); @@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, unsigned int from, to, ffs = chance(1, 2); unsigned char *p = (void *)buf; - from = prandom_u32() % len; + from = prandom_u32_max(len); /* Corruption span max to end of write unit */ to = min(len, ALIGN(from + 1, c->max_write_size)); @@ -2581,7 +2581,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, if (ffs) memset(p + from, 0xFF, to - from); else - prandom_bytes(p + from, to - from); + get_random_bytes(p + from, to - from); return to; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 86151889548e..0f29cf201136 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -68,13 +68,14 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @c: UBIFS file-system description object * @dir: parent directory inode * @mode: inode mode flags + * @is_xattr: whether the inode is xattr inode * * This function finds an unused inode number, allocates new inode and * initializes it. Returns new inode in case of success and an error code in * case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode) + umode_t mode, bool is_xattr) { int err; struct inode *inode; @@ -99,10 +100,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, current_time(inode); inode->i_mapping->nrpages = 0; - err = fscrypt_prepare_new_inode(dir, inode, &encrypted); - if (err) { - ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); - goto out_iput; + if (!is_xattr) { + err = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (err) { + ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err); + goto out_iput; + } } switch (mode & S_IFMT) { @@ -309,7 +312,7 @@ static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -370,7 +373,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) if (err) return ERR_PTR(err); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_free; @@ -424,8 +427,9 @@ static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) } static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, @@ -462,7 +466,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, return err; } - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; @@ -475,7 +479,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); ubifs_assert(c, ui->dirty); instantiated = 1; @@ -489,7 +493,7 @@ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, ubifs_release_budget(c, &req); - return 0; + return finish_open_simple(file, 0); out_cancel: unlock_2_inodes(dir, inode); @@ -872,7 +876,7 @@ out_fname: } /** - * check_dir_empty - check if a directory is empty or not. + * ubifs_check_dir_empty - check if a directory is empty or not. * @dir: VFS inode object of the directory to check * * This function checks if directory @dir is empty. Returns zero if the @@ -1004,7 +1008,7 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFDIR | mode); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; @@ -1091,7 +1095,7 @@ static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, mode); + inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); @@ -1173,7 +1177,7 @@ static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir, sz_change = CALC_DENT_SIZE(fname_len(&nm)); - inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fname; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 75dab0ae3939..d02509920baf 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -503,7 +503,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) { if (c->double_hash) - dent->cookie = (__force __le32) prandom_u32(); + dent->cookie = (__force __le32) get_random_u32(); else dent->cookie = 0; } @@ -1472,23 +1472,25 @@ out_free: * @block: data block number * @dn: data node to re-compress * @new_len: new length + * @dn_size: size of the data node @dn in memory * * This function is used when an inode is truncated and the last data node of * the inode has to be re-compressed/encrypted and re-written. */ static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, unsigned int block, struct ubifs_data_node *dn, - int *new_len) + int *new_len, int dn_size) { void *buf; - int err, dlen, compr_type, out_len, old_dlen; + int err, dlen, compr_type, out_len, data_size; out_len = le32_to_cpu(dn->size); buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + data_size = dn_size - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); if (IS_ENCRYPTED(inode)) { @@ -1508,11 +1510,11 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in } if (IS_ENCRYPTED(inode)) { - err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + err = ubifs_encrypt(inode, dn, out_len, &data_size, block); if (err) goto out; - out_len = old_dlen; + out_len = data_size; } else { dn->compr_size = 0; } @@ -1550,6 +1552,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, struct ubifs_trun_node *trun; struct ubifs_data_node *dn; int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode); + int dn_size; struct ubifs_inode *ui = ubifs_inode(inode); ino_t inum = inode->i_ino; unsigned int blk; @@ -1562,10 +1565,13 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_assert(c, S_ISREG(inode->i_mode)); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); - sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + - UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR; + dn_size = COMPRESSED_DATA_NODE_BUF_SZ; - sz += ubifs_auth_node_sz(c); + if (IS_ENCRYPTED(inode)) + dn_size += UBIFS_CIPHER_BLOCK_SIZE; + + sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ + + dn_size + ubifs_auth_node_sz(c); ino = kmalloc(sz, GFP_NOFS); if (!ino) @@ -1596,15 +1602,15 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); - ubifs_dump_node(c, dn, sz - UBIFS_INO_NODE_SZ - - UBIFS_TRUN_NODE_SZ); + ubifs_dump_node(c, dn, dn_size); goto out_free; } if (dn_len <= dlen) dlen = 0; /* Nothing to do */ else { - err = truncate_data_node(c, inode, blk, dn, &dlen); + err = truncate_data_node(c, inode, blk, dn, + &dlen, dn_size); if (err) goto out_free; } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index d76a19e460cd..cfbc31f709f4 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c) if (!dbg_is_chk_gen(c)) return 0; - if (prandom_u32() & 3) + if (prandom_u32_max(4)) return 0; for (i = 0; i < c->lsave_cnt; i++) c->lsave[i] = c->main_first; list_for_each_entry(lprops, &c->empty_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->freeable_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; list_for_each_entry(lprops, &c->frdi_idx_list, list) - c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum; heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_DIRTY - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; heap = &c->lpt_heap[LPROPS_FREE - 1]; for (i = 0; i < heap->cnt; i++) - c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum; + c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum; return 1; } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 58c92c96ecef..01362ad5f804 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt) c->ilebs[c->ileb_cnt++] = lnum; dbg_cmt("LEB %d", lnum); } - if (dbg_is_chk_index(c) && !(prandom_u32() & 7)) + if (dbg_is_chk_index(c) && !prandom_u32_max(8)) return -ENOSPC; return 0; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 7d6d2f152e03..478bbbb5382f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2026,7 +2026,7 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, - umode_t mode); + umode_t mode, bool is_xattr); int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e4c4761aff7f..3db8486e3725 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -110,7 +110,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (err) return err; - inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO); + inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO, true); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_budg; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index cad3772f9dbe..be640f4b2f2c 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -130,7 +130,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha); + bh_readahead_batch(num, bha, REQ_RAHEAD); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index a2adf6293093..16bcf2c6b8b3 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -89,7 +89,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, brelse(tmp); } if (num) { - ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha); + bh_readahead_batch(num, bha, REQ_RAHEAD); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/file.c b/fs/udf/file.c index 09aef77269fe..5c659e23e578 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -252,6 +252,7 @@ const struct file_operations udf_file_operations = { .release = udf_release_file, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 8d06daed549f..dce6ae9ae306 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1211,13 +1211,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, if (!bh) return NULL; - if (buffer_uptodate(bh)) - return bh; - - ll_rw_block(REQ_OP_READ, 1, &bh); - - wait_on_buffer(bh); - if (buffer_uptodate(bh)) + if (bh_read(bh, 0) >= 0) return bh; brelse(bh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index b3d5f97f16cd..ae7bc13a5298 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -240,7 +240,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, poffset - lfi); else { if (!copy_name) { - copy_name = kmalloc(UDF_NAME_LEN, + copy_name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS); if (!copy_name) { fi = ERR_PTR(-ENOMEM); @@ -626,7 +626,7 @@ static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, } static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct inode *inode = udf_new_inode(dir, mode); @@ -640,9 +640,9 @@ static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); unlock_new_inode(inode); - return 0; + return finish_open_simple(file, 0); } static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index bd810d8239f2..2436e3f82147 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -295,14 +295,10 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, oldb + pos); - if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - ufs_error(inode->i_sb, __func__, - "read of block failed\n"); - break; - } + if (bh_read(bh, 0) < 0) { + ufs_error(inode->i_sb, __func__, + "read of block failed\n"); + break; } UFSD(" change from %llu to %llu, pos %u\n", diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 175de70e3adf..98ac37e34e3d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -30,6 +30,7 @@ #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/swapops.h> +#include <linux/miscdevice.h> int sysctl_unprivileged_userfaultfd __read_mostly; @@ -415,13 +416,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; - if ((vmf->flags & FAULT_FLAG_USER) == 0 && - ctx->flags & UFFD_USER_MODE_ONLY) { - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " - "sysctl knob to 1 if kernel faults must be handled " - "without obtaining CAP_SYS_PTRACE capability\n"); + if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) goto out; - } /* * If it's already released don't get it. This avoids to loop @@ -615,14 +611,16 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, if (release_new_ctx) { struct vm_area_struct *vma; struct mm_struct *mm = release_new_ctx->mm; + VMA_ITERATOR(vmi, mm, 0); /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; vma->vm_flags &= ~__VM_UFFD_FLAGS; } + } mmap_write_unlock(mm); userfaultfd_ctx_put(release_new_ctx); @@ -803,11 +801,13 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, return false; } -int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *unmaps) +int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *unmaps) { - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + VMA_ITERATOR(vmi, mm, start); + struct vm_area_struct *vma; + + for_each_vma_range(vmi, vma, end) { struct userfaultfd_unmap_ctx *unmap_ctx; struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; @@ -857,6 +857,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; + MA_STATE(mas, &mm->mm_mt, 0, 0); WRITE_ONCE(ctx->released, true); @@ -873,7 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -887,10 +888,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) + if (prev) { + mas_pause(&mas); vma = prev; - else + } else { prev = vma; + } + vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -991,7 +995,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new, int fd; fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new, - O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); + O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; @@ -1272,6 +1276,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; + MA_STATE(mas, &mm->mm_mt, 0, 0); user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1314,7 +1319,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1339,7 +1345,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1399,8 +1405,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1430,6 +1438,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto next; } @@ -1437,11 +1447,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = split_vma(mm, vma, start, 1); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end > end) { ret = split_vma(mm, vma, end, 0); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } next: /* @@ -1458,8 +1472,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1503,6 +1517,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; + MA_STATE(mas, &mm->mm_mt, 0, 0); ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1521,7 +1536,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1546,7 +1562,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, */ found = false; ret = -EINVAL; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1566,8 +1582,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1612,17 +1630,20 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; + mas_pause(&mas); goto next; } if (vma->vm_start < start) { ret = split_vma(mm, vma, start, 1); if (ret) break; + mas_pause(&mas); } if (vma->vm_end > end) { ret = split_vma(mm, vma, end, 0); if (ret) break; + mas_pause(&mas); } next: /* @@ -1636,8 +1657,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -2056,20 +2077,11 @@ static void init_once_userfaultfd_ctx(void *mem) seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); } -SYSCALL_DEFINE1(userfaultfd, int, flags) +static int new_userfaultfd(int flags) { struct userfaultfd_ctx *ctx; int fd; - if (!sysctl_unprivileged_userfaultfd && - (flags & UFFD_USER_MODE_ONLY) == 0 && - !capable(CAP_SYS_PTRACE)) { - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " - "sysctl knob to 1 if kernel faults must be handled " - "without obtaining CAP_SYS_PTRACE capability\n"); - return -EPERM; - } - BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. */ @@ -2094,7 +2106,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) mmgrab(ctx->mm); fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); + O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); if (fd < 0) { mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); @@ -2102,8 +2114,60 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) return fd; } +static inline bool userfaultfd_syscall_allowed(int flags) +{ + /* Userspace-only page faults are always allowed */ + if (flags & UFFD_USER_MODE_ONLY) + return true; + + /* + * The user is requesting a userfaultfd which can handle kernel faults. + * Privileged users are always allowed to do this. + */ + if (capable(CAP_SYS_PTRACE)) + return true; + + /* Otherwise, access to kernel fault handling is sysctl controlled. */ + return sysctl_unprivileged_userfaultfd; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + if (!userfaultfd_syscall_allowed(flags)) + return -EPERM; + + return new_userfaultfd(flags); +} + +static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) +{ + if (cmd != USERFAULTFD_IOC_NEW) + return -EINVAL; + + return new_userfaultfd(flags); +} + +static const struct file_operations userfaultfd_dev_fops = { + .unlocked_ioctl = userfaultfd_dev_ioctl, + .compat_ioctl = userfaultfd_dev_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice userfaultfd_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "userfaultfd", + .fops = &userfaultfd_dev_fops +}; + static int __init userfaultfd_init(void) { + int ret; + + ret = misc_register(&userfaultfd_misc); + if (ret) + return ret; + userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", sizeof(struct userfaultfd_ctx), 0, diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 629785c95007..dbe1ce5b450a 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -70,8 +70,6 @@ struct fsverity_info { const struct inode *inode; }; -/* Arbitrary limit to bound the kmalloc() size. Can be changed. */ -#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 #define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \ sizeof(struct fsverity_descriptor)) diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 6ee849dc7bc1..2aefc5565152 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -53,14 +53,14 @@ static int fsverity_read_merkle_tree(struct inode *inode, break; } - virt = kmap(page); + virt = kmap_local_page(page); if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) { - kunmap(page); + kunmap_local(virt); put_page(page); err = -EFAULT; break; } - kunmap(page); + kunmap_local(virt); put_page(page); retval += bytes_to_copy; diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 14e2fb49cff5..bde8c9b7d25f 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -39,16 +39,6 @@ static void hash_at_level(const struct merkle_tree_params *params, (params->log_blocksize - params->log_arity); } -/* Extract a hash from a hash page */ -static void extract_hash(struct page *hpage, unsigned int hoffset, - unsigned int hsize, u8 *out) -{ - void *virt = kmap_atomic(hpage); - - memcpy(out, virt + hoffset, hsize); - kunmap_atomic(virt); -} - static inline int cmp_hashes(const struct fsverity_info *vi, const u8 *want_hash, const u8 *real_hash, pgoff_t index, int level) @@ -129,7 +119,7 @@ static bool verify_page(struct inode *inode, const struct fsverity_info *vi, } if (PageChecked(hpage)) { - extract_hash(hpage, hoffset, hsize, _want_hash); + memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n", @@ -158,7 +148,7 @@ descend: if (err) goto out; SetPageChecked(hpage); - extract_hash(hpage, hoffset, hsize, _want_hash); + memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); pr_debug("Verified hash page at level %d, now want %s:%*phN\n", diff --git a/fs/xattr.c b/fs/xattr.c index a1f4998bc6be..61107b6bbed2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -290,7 +290,7 @@ static inline bool is_posix_acl_xattr(const char *name) int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, - const char *name, void *value, size_t size, int flags) + const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -298,16 +298,12 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, - (const void **)&value, size); + error = cap_convert_nscap(mnt_userns, dentry, &value, size); if (error < 0) return error; size = error; } - if (size && is_posix_acl_xattr(name)) - posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size); - retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, @@ -587,9 +583,7 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) static void setxattr_convert(struct user_namespace *mnt_userns, struct dentry *d, struct xattr_ctx *ctx) { - if (ctx->size && - ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) + if (ctx->size && is_posix_acl_xattr(ctx->kname->name)) posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); } @@ -705,8 +699,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); if (error > 0) { - if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) + if (is_posix_acl_xattr(kname)) posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 517a138faa66..191b22b9a35b 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -133,6 +133,21 @@ xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) return true; } +static inline bool +xfs_verify_agbext( + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_agblock_t len) +{ + if (agbno + len <= agbno) + return false; + + if (!xfs_verify_agbno(pag, agbno)) + return false; + + return xfs_verify_agbno(pag, agbno + len - 1); +} + /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata. diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index e2bdf089c0a3..de79f5d07f65 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -263,11 +263,7 @@ xfs_alloc_get_rec( goto out_bad_rec; /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(pag, *bno)) - goto out_bad_rec; - if (*bno > *bno + *len) - goto out_bad_rec; - if (!xfs_verify_agbno(pag, *bno + *len - 1)) + if (!xfs_verify_agbext(pag, *bno, *len)) goto out_bad_rec; return 0; @@ -1520,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock( #ifdef DEBUG /* Randomly don't execute the first algorithm. */ - if (prandom_u32() & 1) + if (prandom_u32_max(2)) return 0; #endif diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e56723dc9cd5..49d0d4ea63fc 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -294,7 +294,7 @@ xfs_check_block( else thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); if (*thispa == *pp) { - xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld", __func__, j, i, (unsigned long long)be64_to_cpu(*thispa)); xfs_err(mp, "%s: ptrs are equal in node\n", diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e7201dc68f43..e576560b46e9 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2192,8 +2192,8 @@ xfs_da_grow_inode_int( */ mapp = kmem_alloc(sizeof(*mapp) * count, 0); for (b = *bno, mapi = 0; b < *bno + count; ) { - nmap = min(XFS_BMAP_MAX_NMAP, count); c = (int)(*bno + count - b); + nmap = min(XFS_BMAP_MAX_NMAP, c); error = xfs_bmapi_write(tp, dp, b, c, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->total, &mapp[mapi], &nmap); diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 76eedc2756b3..92bac3373f1f 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -261,7 +261,7 @@ xfs_dir_createname( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -357,7 +357,7 @@ xfs_dir_lookup( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -435,7 +435,7 @@ xfs_dir_removename( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); @@ -493,7 +493,7 @@ xfs_dir_replace( { struct xfs_da_args *args; int rval; - int v; /* type-checking value */ + bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); @@ -610,19 +610,23 @@ xfs_dir2_grow_inode( int xfs_dir2_isblock( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isblock) { - xfs_fileoff_t last; /* last file offset */ - int rval; + struct xfs_mount *mp = args->dp->i_mount; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - if (XFS_IS_CORRUPT(args->dp->i_mount, - rval != 0 && - args->dp->i_disk_size != args->geo->blksize)) + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isblock = false; + if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) + return 0; + + *isblock = true; + if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) return -EFSCORRUPTED; - *vp = rval; return 0; } @@ -632,14 +636,20 @@ xfs_dir2_isblock( int xfs_dir2_isleaf( struct xfs_da_args *args, - int *vp) /* out: 1 is block, 0 is not block */ + bool *isleaf) { - xfs_fileoff_t last; /* last file offset */ - int rval; + xfs_fileoff_t eof; + int error; - if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) - return rval; - *vp = last == args->geo->leafblk + args->geo->fsbcount; + error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); + if (error) + return error; + + *isleaf = false; + if (eof != args->geo->leafblk + args->geo->fsbcount) + return 0; + + *isleaf = true; return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index b6df3c34b26a..dd39f17dd9a9 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -61,8 +61,8 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); -extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index d9b66306a9a7..cb9e950a911d 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -146,6 +146,8 @@ xfs_dir3_leaf_check_int( xfs_dir2_leaf_tail_t *ltp; int stale; int i; + bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC); ltp = xfs_dir2_leaf_tail_p(geo, leaf); @@ -158,8 +160,7 @@ xfs_dir3_leaf_check_int( return __this_address; /* Leaves and bests don't overlap in leaf format. */ - if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || - hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + if (isleaf1 && (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) return __this_address; @@ -175,6 +176,10 @@ xfs_dir3_leaf_check_int( } if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; + if (isleaf1 && xfs_dir2_dataptr_to_db(geo, + be32_to_cpu(hdr->ents[i].address)) >= + be32_to_cpu(ltp->bestcount)) + return __this_address; } if (hdr->stale != stale) return __this_address; diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 003812fd7d35..8cd37e6e9d38 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -865,7 +865,6 @@ xfs_dir2_sf_lookup( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; int i; /* entry index */ - int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ @@ -929,8 +928,7 @@ xfs_dir2_sf_lookup( if (!ci_sfep) return -ENOENT; /* otherwise process the CI match as required by the caller */ - error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); - return error; + return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); } /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b55bdfa9c8a8..371dc07233e0 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1564,20 +1564,6 @@ struct xfs_rmap_rec { #define RMAPBT_UNUSED_OFFSET_BITLEN 7 #define RMAPBT_OFFSET_BITLEN 54 -#define XFS_RMAP_ATTR_FORK (1 << 0) -#define XFS_RMAP_BMBT_BLOCK (1 << 1) -#define XFS_RMAP_UNWRITTEN (1 << 2) -#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ - XFS_RMAP_BMBT_BLOCK) -#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) -struct xfs_rmap_irec { - xfs_agblock_t rm_startblock; /* extent start block */ - xfs_extlen_t rm_blockcount; /* extent length */ - uint64_t rm_owner; /* extent owner */ - uint64_t rm_offset; /* offset within the owner */ - unsigned int rm_flags; /* state flags */ -}; - /* * Key structure * @@ -1626,7 +1612,7 @@ unsigned int xfs_refc_block(struct xfs_mount *mp); * on the startblock. This speeds up mount time deletion of stale * staging extents because they're all at the right side of the tree. */ -#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31)) +#define XFS_REFC_COWFLAG (1U << 31) #define REFCNTBT_COWFLAG_BITLEN 1 #define REFCNTBT_AGBLOCK_BITLEN 31 @@ -1640,12 +1626,6 @@ struct xfs_refcount_key { __be32 rc_startblock; /* starting block number */ }; -struct xfs_refcount_irec { - xfs_agblock_t rc_startblock; /* starting block number */ - xfs_extlen_t rc_blockcount; /* count of free blocks */ - xfs_nlink_t rc_refcount; /* number of inodes linked here */ -}; - #define MAXREFCOUNT ((xfs_nlink_t)~0U) #define MAXREFCEXTLEN ((xfs_extlen_t)~0U) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 6cdfd64bc56b..94db50eb706a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc( /* randomly do sparse inode allocations */ if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) - do_sparse = prandom_u32() & 1; + do_sparse = prandom_u32_max(2); #endif /* @@ -805,7 +805,7 @@ sparse_alloc: * number from being easily guessable. */ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, - args.agbno, args.len, prandom_u32()); + args.agbno, args.len, get_random_u32()); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9327a4f39206..6b21760184d9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -78,7 +78,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %zd).", + "corrupt inode %llu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, @@ -192,7 +192,7 @@ xfs_iformat_btree( XFS_DFORK_SIZE(dip, mp, whichfork) || ifp->if_nextents > ip->i_nblocks) || level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) { - xfs_warn(mp, "corrupt inode %Lu (btree).", + xfs_warn(mp, "corrupt inode %llu (btree).", (unsigned long long) ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_btree", dfp, size, diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index b351b9dc6561..f13e0809dc63 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -613,25 +613,49 @@ typedef struct xfs_efi_log_format { uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[1]; /* array of extents to free */ + xfs_extent_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_t; +static inline size_t +xfs_efi_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format) + + nr * sizeof(struct xfs_extent); +} + typedef struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[1]; /* array of extents to free */ + xfs_extent_32_t efi_extents[]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; +static inline size_t +xfs_efi_log_format32_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format_32) + + nr * sizeof(struct xfs_extent_32); +} + typedef struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[1]; /* array of extents to free */ + xfs_extent_64_t efi_extents[]; /* array of extents to free */ } xfs_efi_log_format_64_t; +static inline size_t +xfs_efi_log_format64_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efi_log_format_64) + + nr * sizeof(struct xfs_extent_64); +} + /* * This is the structure used to lay out an efd log item in the * log. The efd_extents array is a variable size array whose @@ -642,25 +666,49 @@ typedef struct xfs_efd_log_format { uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[1]; /* array of extents freed */ + xfs_extent_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_t; +static inline size_t +xfs_efd_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format) + + nr * sizeof(struct xfs_extent); +} + typedef struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[1]; /* array of extents freed */ + xfs_extent_32_t efd_extents[]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; +static inline size_t +xfs_efd_log_format32_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format_32) + + nr * sizeof(struct xfs_extent_32); +} + typedef struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[1]; /* array of extents freed */ + xfs_extent_64_t efd_extents[]; /* array of extents freed */ } xfs_efd_log_format_64_t; +static inline size_t +xfs_efd_log_format64_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_efd_log_format_64) + + nr * sizeof(struct xfs_extent_64); +} + /* * RUI/RUD (reverse mapping) log format definitions */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 64b910caafaa..3f34bafe18dd 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -46,13 +46,16 @@ STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur, int xfs_refcount_lookup_le( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); } @@ -63,13 +66,16 @@ xfs_refcount_lookup_le( int xfs_refcount_lookup_ge( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } @@ -80,13 +86,16 @@ xfs_refcount_lookup_ge( int xfs_refcount_lookup_eq( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, + xfs_refcount_encode_startblock(bno, domain), XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; + cur->bc_rec.rc.rc_domain = domain; return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); } @@ -96,7 +105,17 @@ xfs_refcount_btrec_to_irec( const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) { - irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); + uint32_t start; + + start = be32_to_cpu(rec->refc.rc_startblock); + if (start & XFS_REFC_COWFLAG) { + start &= ~XFS_REFC_COWFLAG; + irec->rc_domain = XFS_REFC_DOMAIN_COW; + } else { + irec->rc_domain = XFS_REFC_DOMAIN_SHARED; + } + + irec->rc_startblock = start; irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount); irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); } @@ -114,7 +133,6 @@ xfs_refcount_get_rec( struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; - xfs_agblock_t realstart; error = xfs_btree_get_rec(cur, &rec, stat); if (error || !*stat) @@ -124,22 +142,11 @@ xfs_refcount_get_rec( if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; - /* handle special COW-staging state */ - realstart = irec->rc_startblock; - if (realstart & XFS_REFC_COW_START) { - if (irec->rc_refcount != 1) - goto out_bad_rec; - realstart &= ~XFS_REFC_COW_START; - } else if (irec->rc_refcount < 2) { + if (!xfs_refcount_check_domain(irec)) goto out_bad_rec; - } /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(pag, realstart)) - goto out_bad_rec; - if (realstart > realstart + irec->rc_blockcount) - goto out_bad_rec; - if (!xfs_verify_agbno(pag, realstart + irec->rc_blockcount - 1)) + if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) goto out_bad_rec; if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) @@ -169,12 +176,17 @@ xfs_refcount_update( struct xfs_refcount_irec *irec) { union xfs_btree_rec rec; + uint32_t start; int error; trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); - rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec.refc.rc_startblock = cpu_to_be32(start); rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); + error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, @@ -196,9 +208,12 @@ xfs_refcount_insert( int error; trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; + cur->bc_rec.rc.rc_domain = irec->rc_domain; + error = xfs_btree_insert(cur, i); if (error) goto out_error; @@ -244,7 +259,8 @@ xfs_refcount_delete( } if (error) goto out_error; - error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); + error = xfs_refcount_lookup_ge(cur, irec.rc_domain, irec.rc_startblock, + &found_rec); out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, @@ -343,6 +359,7 @@ xfs_refc_next( STATIC int xfs_refcount_split_extent( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t agbno, bool *shape_changed) { @@ -351,7 +368,7 @@ xfs_refcount_split_extent( int error; *shape_changed = false; - error = xfs_refcount_lookup_le(cur, agbno, &found_rec); + error = xfs_refcount_lookup_le(cur, domain, agbno, &found_rec); if (error) goto out_error; if (!found_rec) @@ -364,6 +381,8 @@ xfs_refcount_split_extent( error = -EFSCORRUPTED; goto out_error; } + if (rcext.rc_domain != domain) + return 0; if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno) return 0; @@ -415,6 +434,9 @@ xfs_refcount_merge_center_extents( trace_xfs_refcount_merge_center_extents(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, center, right); + ASSERT(left->rc_domain == center->rc_domain); + ASSERT(right->rc_domain == center->rc_domain); + /* * Make sure the center and right extents are not in the btree. * If the center extent was synthesized, the first delete call @@ -423,8 +445,8 @@ xfs_refcount_merge_center_extents( * call removes the center and the second one removes the right * extent. */ - error = xfs_refcount_lookup_ge(cur, center->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_ge(cur, center->rc_domain, + center->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -451,8 +473,8 @@ xfs_refcount_merge_center_extents( } /* Enlarge the left extent. */ - error = xfs_refcount_lookup_le(cur, left->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, left->rc_domain, + left->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -491,10 +513,12 @@ xfs_refcount_merge_left_extent( trace_xfs_refcount_merge_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft); + ASSERT(left->rc_domain == cleft->rc_domain); + /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ if (cleft->rc_refcount > 1) { - error = xfs_refcount_lookup_le(cur, cleft->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, cleft->rc_domain, + cleft->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -512,8 +536,8 @@ xfs_refcount_merge_left_extent( } /* Enlarge the left extent. */ - error = xfs_refcount_lookup_le(cur, left->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, left->rc_domain, + left->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -552,13 +576,15 @@ xfs_refcount_merge_right_extent( trace_xfs_refcount_merge_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right); + ASSERT(right->rc_domain == cright->rc_domain); + /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, * remove it. */ if (cright->rc_refcount > 1) { - error = xfs_refcount_lookup_le(cur, cright->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, cright->rc_domain, + cright->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -576,8 +602,8 @@ xfs_refcount_merge_right_extent( } /* Enlarge the right extent. */ - error = xfs_refcount_lookup_le(cur, right->rc_startblock, - &found_rec); + error = xfs_refcount_lookup_le(cur, right->rc_domain, + right->rc_startblock, &found_rec); if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { @@ -600,8 +626,6 @@ out_error: return error; } -#define XFS_FIND_RCEXT_SHARED 1 -#define XFS_FIND_RCEXT_COW 2 /* * Find the left extent and the one after it (cleft). This function assumes * that we've already split any extent crossing agbno. @@ -611,16 +635,16 @@ xfs_refcount_find_left_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *left, struct xfs_refcount_irec *cleft, + enum xfs_refc_domain domain, xfs_agblock_t agbno, - xfs_extlen_t aglen, - int flags) + xfs_extlen_t aglen) { struct xfs_refcount_irec tmp; int error; int found_rec; left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK; - error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec); + error = xfs_refcount_lookup_le(cur, domain, agbno - 1, &found_rec); if (error) goto out_error; if (!found_rec) @@ -634,11 +658,9 @@ xfs_refcount_find_left_extents( goto out_error; } - if (xfs_refc_next(&tmp) != agbno) - return 0; - if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + if (tmp.rc_domain != domain) return 0; - if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + if (xfs_refc_next(&tmp) != agbno) return 0; /* We have a left extent; retrieve (or invent) the next right one */ *left = tmp; @@ -655,6 +677,9 @@ xfs_refcount_find_left_extents( goto out_error; } + if (tmp.rc_domain != domain) + goto not_found; + /* if tmp starts at the end of our range, just use that */ if (tmp.rc_startblock == agbno) *cleft = tmp; @@ -671,8 +696,10 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = min(aglen, tmp.rc_startblock - agbno); cleft->rc_refcount = 1; + cleft->rc_domain = domain; } } else { +not_found: /* * No extents, so pretend that there's one covering the whole * range. @@ -680,6 +707,7 @@ xfs_refcount_find_left_extents( cleft->rc_startblock = agbno; cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; + cleft->rc_domain = domain; } trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft, agbno); @@ -700,16 +728,16 @@ xfs_refcount_find_right_extents( struct xfs_btree_cur *cur, struct xfs_refcount_irec *right, struct xfs_refcount_irec *cright, + enum xfs_refc_domain domain, xfs_agblock_t agbno, - xfs_extlen_t aglen, - int flags) + xfs_extlen_t aglen) { struct xfs_refcount_irec tmp; int error; int found_rec; right->rc_startblock = cright->rc_startblock = NULLAGBLOCK; - error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec); + error = xfs_refcount_lookup_ge(cur, domain, agbno + aglen, &found_rec); if (error) goto out_error; if (!found_rec) @@ -723,11 +751,9 @@ xfs_refcount_find_right_extents( goto out_error; } - if (tmp.rc_startblock != agbno + aglen) - return 0; - if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + if (tmp.rc_domain != domain) return 0; - if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + if (tmp.rc_startblock != agbno + aglen) return 0; /* We have a right extent; retrieve (or invent) the next left one */ *right = tmp; @@ -744,6 +770,9 @@ xfs_refcount_find_right_extents( goto out_error; } + if (tmp.rc_domain != domain) + goto not_found; + /* if tmp ends at the end of our range, just use that */ if (xfs_refc_next(&tmp) == agbno + aglen) *cright = tmp; @@ -760,8 +789,10 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = right->rc_startblock - cright->rc_startblock; cright->rc_refcount = 1; + cright->rc_domain = domain; } } else { +not_found: /* * No extents, so pretend that there's one covering the whole * range. @@ -769,6 +800,7 @@ xfs_refcount_find_right_extents( cright->rc_startblock = agbno; cright->rc_blockcount = aglen; cright->rc_refcount = 1; + cright->rc_domain = domain; } trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right, agbno + aglen); @@ -794,10 +826,10 @@ xfs_refc_valid( STATIC int xfs_refcount_merge_extents( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t *agbno, xfs_extlen_t *aglen, enum xfs_refc_adjust_op adjust, - int flags, bool *shape_changed) { struct xfs_refcount_irec left = {0}, cleft = {0}; @@ -812,12 +844,12 @@ xfs_refcount_merge_extents( * just below (agbno + aglen) [cright], and just above (agbno + aglen) * [right]. */ - error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno, - *aglen, flags); + error = xfs_refcount_find_left_extents(cur, &left, &cleft, domain, + *agbno, *aglen); if (error) return error; - error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno, - *aglen, flags); + error = xfs_refcount_find_right_extents(cur, &right, &cright, domain, + *agbno, *aglen); if (error) return error; @@ -870,7 +902,7 @@ xfs_refcount_merge_extents( aglen); } - return error; + return 0; } /* @@ -933,7 +965,8 @@ xfs_refcount_adjust_extents( if (*aglen == 0) return 0; - error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec); + error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_SHARED, *agbno, + &found_rec); if (error) goto out_error; @@ -941,10 +974,11 @@ xfs_refcount_adjust_extents( error = xfs_refcount_get_rec(cur, &ext, &found_rec); if (error) goto out_error; - if (!found_rec) { + if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; ext.rc_blockcount = 0; ext.rc_refcount = 0; + ext.rc_domain = XFS_REFC_DOMAIN_SHARED; } /* @@ -957,6 +991,8 @@ xfs_refcount_adjust_extents( tmp.rc_blockcount = min(*aglen, ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; + tmp.rc_domain = XFS_REFC_DOMAIN_SHARED; + trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &tmp); @@ -986,15 +1022,30 @@ xfs_refcount_adjust_extents( (*agbno) += tmp.rc_blockcount; (*aglen) -= tmp.rc_blockcount; - error = xfs_refcount_lookup_ge(cur, *agbno, + /* Stop if there's nothing left to modify */ + if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) + break; + + /* Move the cursor to the start of ext. */ + error = xfs_refcount_lookup_ge(cur, + XFS_REFC_DOMAIN_SHARED, *agbno, &found_rec); if (error) goto out_error; } - /* Stop if there's nothing left to modify */ - if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) - break; + /* + * A previous step trimmed agbno/aglen such that the end of the + * range would not be in the middle of the record. If this is + * no longer the case, something is seriously wrong with the + * btree. Make sure we never feed the synthesized record into + * the processing loop below. + */ + if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) || + XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) { + error = -EFSCORRUPTED; + goto out_error; + } /* * Adjust the reference count and either update the tree @@ -1070,13 +1121,15 @@ xfs_refcount_adjust( /* * Ensure that no rcextents cross the boundary of the adjustment range. */ - error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, + agbno, &shape_changed); if (error) goto out_error; if (shape_changed) shape_changes++; - error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, + agbno + aglen, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1085,8 +1138,8 @@ xfs_refcount_adjust( /* * Try to merge with the left or right extents of the range. */ - error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj, - XFS_FIND_RCEXT_SHARED, &shape_changed); + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, + new_agbno, new_aglen, adj, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1125,6 +1178,32 @@ xfs_refcount_finish_one_cleanup( } /* + * Set up a continuation a deferred refcount operation by updating the intent. + * Checks to make sure we're not going to run off the end of the AG. + */ +static inline int +xfs_refcount_continue_op( + struct xfs_btree_cur *cur, + xfs_fsblock_t startblock, + xfs_agblock_t new_agbno, + xfs_extlen_t new_len, + xfs_fsblock_t *new_fsbno) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; + + if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) + return -EFSCORRUPTED; + + *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + + ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); + ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); + + return 0; +} + +/* * Process one of the deferred refcount operations. We pass back the * btree cursor to maintain our lock on the btree between calls. * This saves time and eliminates a buffer deadlock between the @@ -1191,12 +1270,20 @@ xfs_refcount_finish_one( case XFS_REFCOUNT_INCREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_INCREASE); - *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + if (error) + goto out_drop; + if (*new_len > 0) + error = xfs_refcount_continue_op(rcur, startblock, + new_agbno, *new_len, new_fsb); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_DECREASE); - *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + if (error) + goto out_drop; + if (*new_len > 0) + error = xfs_refcount_continue_op(rcur, startblock, + new_agbno, *new_len, new_fsb); break; case XFS_REFCOUNT_ALLOC_COW: *new_fsb = startblock + blockcount; @@ -1307,7 +1394,8 @@ xfs_refcount_find_shared( *flen = 0; /* Try to find a refcount extent that crosses the start */ - error = xfs_refcount_lookup_le(cur, agbno, &have); + error = xfs_refcount_lookup_le(cur, XFS_REFC_DOMAIN_SHARED, agbno, + &have); if (error) goto out_error; if (!have) { @@ -1325,6 +1413,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED) + goto done; /* If the extent ends before the start, look at the next one */ if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { @@ -1340,6 +1430,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED) + goto done; } /* If the extent starts after the range we want, bail out */ @@ -1371,7 +1463,8 @@ xfs_refcount_find_shared( error = -EFSCORRUPTED; goto out_error; } - if (tmp.rc_startblock >= agbno + aglen || + if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED || + tmp.rc_startblock >= agbno + aglen || tmp.rc_startblock != *fbno + *flen) break; *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno); @@ -1455,17 +1548,23 @@ xfs_refcount_adjust_cow_extents( return 0; /* Find any overlapping refcount records */ - error = xfs_refcount_lookup_ge(cur, agbno, &found_rec); + error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_COW, agbno, + &found_rec); if (error) goto out_error; error = xfs_refcount_get_rec(cur, &ext, &found_rec); if (error) goto out_error; + if (XFS_IS_CORRUPT(cur->bc_mp, found_rec && + ext.rc_domain != XFS_REFC_DOMAIN_COW)) { + error = -EFSCORRUPTED; + goto out_error; + } if (!found_rec) { - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks + - XFS_REFC_COW_START; + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; ext.rc_blockcount = 0; ext.rc_refcount = 0; + ext.rc_domain = XFS_REFC_DOMAIN_COW; } switch (adj) { @@ -1480,6 +1579,8 @@ xfs_refcount_adjust_cow_extents( tmp.rc_startblock = agbno; tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; + tmp.rc_domain = XFS_REFC_DOMAIN_COW; + trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &tmp); @@ -1542,24 +1643,24 @@ xfs_refcount_adjust_cow( bool shape_changed; int error; - agbno += XFS_REFC_COW_START; - /* * Ensure that no rcextents cross the boundary of the adjustment range. */ - error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW, + agbno, &shape_changed); if (error) goto out_error; - error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW, + agbno + aglen, &shape_changed); if (error) goto out_error; /* * Try to merge with the left or right extents of the range. */ - error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj, - XFS_FIND_RCEXT_COW, &shape_changed); + error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_COW, &agbno, + &aglen, adj, &shape_changed); if (error) goto out_error; @@ -1666,10 +1767,18 @@ xfs_refcount_recover_extent( be32_to_cpu(rec->refc.rc_refcount) != 1)) return -EFSCORRUPTED; - rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0); + rr = kmalloc(sizeof(struct xfs_refcount_recovery), + GFP_KERNEL | __GFP_NOFAIL); + INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - list_add_tail(&rr->rr_list, debris); + if (XFS_IS_CORRUPT(cur->bc_mp, + rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { + kfree(rr); + return -EFSCORRUPTED; + } + + list_add_tail(&rr->rr_list, debris); return 0; } @@ -1687,10 +1796,11 @@ xfs_refcount_recover_cow_leftovers( union xfs_btree_irec low; union xfs_btree_irec high; xfs_fsblock_t fsb; - xfs_agblock_t agbno; int error; - if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) + /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ + BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); + if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) return -EOPNOTSUPP; INIT_LIST_HEAD(&debris); @@ -1717,7 +1827,7 @@ xfs_refcount_recover_cow_leftovers( /* Find all the leftover CoW staging extents. */ memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); - low.rc.rc_startblock = XFS_REFC_COW_START; + low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW; high.rc.rc_startblock = -1U; error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); @@ -1738,8 +1848,8 @@ xfs_refcount_recover_cow_leftovers( &rr->rr_rrec); /* Free the orphan record */ - agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; - fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno); + fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, + rr->rr_rrec.rc_startblock); xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); @@ -1751,7 +1861,7 @@ xfs_refcount_recover_cow_leftovers( goto out_free; list_del(&rr->rr_list); - kmem_free(rr); + kfree(rr); } return error; @@ -1761,7 +1871,7 @@ out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { list_del(&rr->rr_list); - kmem_free(rr); + kfree(rr); } return error; } @@ -1770,6 +1880,7 @@ out_free: int xfs_refcount_has_record( struct xfs_btree_cur *cur, + enum xfs_refc_domain domain, xfs_agblock_t bno, xfs_extlen_t len, bool *exists) @@ -1781,6 +1892,7 @@ xfs_refcount_has_record( low.rc.rc_startblock = bno; memset(&high, 0xFF, sizeof(high)); high.rc.rc_startblock = bno + len - 1; + low.rc.rc_domain = high.rc.rc_domain = domain; return xfs_btree_has_record(cur, &low, &high, exists); } diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index e8b322de7f3d..452f30556f5a 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -14,14 +14,33 @@ struct xfs_bmbt_irec; struct xfs_refcount_irec; extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur, - xfs_agblock_t bno, int *stat); + enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); +static inline uint32_t +xfs_refcount_encode_startblock( + xfs_agblock_t startblock, + enum xfs_refc_domain domain) +{ + uint32_t start; + + /* + * low level btree operations need to handle the generic btree range + * query functions (which set rc_domain == -1U), so we check that the + * domain is /not/ shared. + */ + start = startblock & ~XFS_REFC_COWFLAG; + if (domain != XFS_REFC_DOMAIN_SHARED) + start |= XFS_REFC_COWFLAG; + + return start; +} + enum xfs_refcount_intent_type { XFS_REFCOUNT_INCREASE = 1, XFS_REFCOUNT_DECREASE, @@ -36,6 +55,18 @@ struct xfs_refcount_intent { xfs_fsblock_t ri_startblock; }; +/* Check that the refcount is appropriate for the record domain. */ +static inline bool +xfs_refcount_check_domain( + const struct xfs_refcount_irec *irec) +{ + if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1) + return false; + if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2) + return false; + return true; +} + void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); void xfs_refcount_decrease_extent(struct xfs_trans *tp, @@ -79,7 +110,8 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, #define XFS_REFCOUNT_ITEM_OVERHEAD 32 extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, - xfs_agblock_t bno, xfs_extlen_t len, bool *exists); + enum xfs_refc_domain domain, xfs_agblock_t bno, + xfs_extlen_t len, bool *exists); union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 316c1ec0c3c2..e1f789866683 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -13,6 +13,7 @@ #include "xfs_btree.h" #include "xfs_btree_staging.h" #include "xfs_refcount_btree.h" +#include "xfs_refcount.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -160,7 +161,12 @@ xfs_refcountbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { - rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock); + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; + + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + rec->refc.rc_startblock = cpu_to_be32(start); rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); } @@ -182,10 +188,13 @@ xfs_refcountbt_key_diff( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { - struct xfs_refcount_irec *rec = &cur->bc_rec.rc; const struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + uint32_t start; - return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; + start = xfs_refcount_encode_startblock(irec->rc_startblock, + irec->rc_domain); + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; } STATIC int64_t diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 094dfc897ebc..b56aca1e7c66 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -235,13 +235,8 @@ xfs_rmap_get_rec( goto out_bad_rec; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(pag, irec->rm_startblock)) - goto out_bad_rec; - if (irec->rm_startblock > - irec->rm_startblock + irec->rm_blockcount) - goto out_bad_rec; - if (!xfs_verify_agbno(pag, - irec->rm_startblock + irec->rm_blockcount - 1)) + if (!xfs_verify_agbext(pag, irec->rm_startblock, + irec->rm_blockcount)) goto out_bad_rec; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 2c4ad6e4bb14..5b2f27cbdb80 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -422,7 +422,7 @@ xfs_calc_itruncate_reservation_minlogsize( /* * In renaming a files we can modify: - * the four inodes involved: 4 * inode size + * the five inodes involved: 5 * inode size * the two directory btrees: 2 * (max depth + v2) * dir block size * the two directory bmap btrees: 2 * max depth * block size * And the bmap_finish transaction can free dir and bmap blocks (two sets @@ -437,7 +437,7 @@ xfs_calc_rename_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 4) + + max((xfs_calc_inode_res(mp, 5) + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index a6b7d98cf68f..5ebdda7e1078 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -166,6 +166,36 @@ typedef struct xfs_bmbt_irec xfs_exntst_t br_state; /* extent state */ } xfs_bmbt_irec_t; +enum xfs_refc_domain { + XFS_REFC_DOMAIN_SHARED = 0, + XFS_REFC_DOMAIN_COW, +}; + +#define XFS_REFC_DOMAIN_STRINGS \ + { XFS_REFC_DOMAIN_SHARED, "shared" }, \ + { XFS_REFC_DOMAIN_COW, "cow" } + +struct xfs_refcount_irec { + xfs_agblock_t rc_startblock; /* starting block number */ + xfs_extlen_t rc_blockcount; /* count of free blocks */ + xfs_nlink_t rc_refcount; /* number of inodes linked here */ + enum xfs_refc_domain rc_domain; /* shared or cow staging extent? */ +}; + +#define XFS_RMAP_ATTR_FORK (1 << 0) +#define XFS_RMAP_BMBT_BLOCK (1 << 1) +#define XFS_RMAP_UNWRITTEN (1 << 2) +#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ + XFS_RMAP_BMBT_BLOCK) +#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) +struct xfs_rmap_irec { + xfs_agblock_t rm_startblock; /* extent start block */ + xfs_extlen_t rm_blockcount; /* extent length */ + uint64_t rm_owner; /* extent owner */ + uint64_t rm_offset; /* offset within the owner */ + unsigned int rm_flags; /* state flags */ +}; + /* per-AG block reservation types */ enum xfs_ag_resv_type { XFS_AG_RESV_NONE = 0, diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index ab427b4d7fe0..3b38f4e2a537 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -100,9 +100,7 @@ xchk_allocbt_rec( bno = be32_to_cpu(rec->alloc.ar_startblock); len = be32_to_cpu(rec->alloc.ar_blockcount); - if (bno + len <= bno || - !xfs_verify_agbno(pag, bno) || - !xfs_verify_agbno(pag, bno + len - 1)) + if (!xfs_verify_agbext(pag, bno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_allocbt_xref(bs->sc, bno, len); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 5abb5fdb71d9..5c87800ab223 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -99,7 +99,7 @@ out: * we check the inode number to make sure it's sane, then we check that * we can look up this filename. Finally, we check the ftype. */ -STATIC int +STATIC bool xchk_dir_actor( struct dir_context *dir_iter, const char *name, @@ -124,7 +124,7 @@ xchk_dir_actor( xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); if (xchk_should_terminate(sdc->sc, &error)) - return error; + return !error; /* Does this inode number make sense? */ if (!xfs_verify_dir_ino(mp, ino)) { @@ -191,8 +191,8 @@ out: * and return zero to xchk_directory. */ if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return -EFSCORRUPTED; - return error; + return false; + return !error; } /* Scrub a directory btree record. */ @@ -676,7 +676,7 @@ xchk_directory_blocks( xfs_dablk_t dabno; xfs_dir2_db_t last_data_db = 0; bool found; - int is_block = 0; + bool is_block = false; int error; /* Ignore local format directories. */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index e1026e07bf94..e312be7cd375 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -108,9 +108,8 @@ xchk_iallocbt_chunk( xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); - if (bno + len <= bno || - !xfs_verify_agbno(pag, bno) || - !xfs_verify_agbno(pag, bno + len - 1)) + + if (!xfs_verify_agbext(pag, bno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index ab182a5cd0c0..d8dff3fd8053 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -38,7 +38,7 @@ struct xchk_parent_ctx { }; /* Look for a single entry in a directory pointing to an inode. */ -STATIC int +STATIC bool xchk_parent_actor( struct dir_context *dc, const char *name, @@ -62,7 +62,7 @@ xchk_parent_actor( if (xchk_should_terminate(spc->sc, &error)) spc->cancelled = true; - return error; + return !error; } /* Count the number of dentries in the parent dir that point to this inode. */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index c68b767dc08f..a26ee0f24ef2 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -269,15 +269,13 @@ done: STATIC void xchk_refcountbt_xref_rmap( struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_nlink_t refcount) + const struct xfs_refcount_irec *irec) { struct xchk_refcnt_check refchk = { - .sc = sc, - .bno = bno, - .len = len, - .refcount = refcount, + .sc = sc, + .bno = irec->rc_startblock, + .len = irec->rc_blockcount, + .refcount = irec->rc_refcount, .seen = 0, }; struct xfs_rmap_irec low; @@ -291,9 +289,9 @@ xchk_refcountbt_xref_rmap( /* Cross-reference with the rmapbt to confirm the refcount. */ memset(&low, 0, sizeof(low)); - low.rm_startblock = bno; + low.rm_startblock = irec->rc_startblock; memset(&high, 0xFF, sizeof(high)); - high.rm_startblock = bno + len - 1; + high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1; INIT_LIST_HEAD(&refchk.fragments); error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, @@ -302,7 +300,7 @@ xchk_refcountbt_xref_rmap( goto out_free; xchk_refcountbt_process_rmap_fragments(&refchk); - if (refcount != refchk.seen) + if (irec->rc_refcount != refchk.seen) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); out_free: @@ -315,17 +313,16 @@ out_free: /* Cross-reference with the other btrees. */ STATIC void xchk_refcountbt_xref( - struct xfs_scrub *sc, - xfs_agblock_t agbno, - xfs_extlen_t len, - xfs_nlink_t refcount) + struct xfs_scrub *sc, + const struct xfs_refcount_irec *irec) { if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; - xchk_xref_is_used_space(sc, agbno, len); - xchk_xref_is_not_inode_chunk(sc, agbno, len); - xchk_refcountbt_xref_rmap(sc, agbno, len, refcount); + xchk_xref_is_used_space(sc, irec->rc_startblock, irec->rc_blockcount); + xchk_xref_is_not_inode_chunk(sc, irec->rc_startblock, + irec->rc_blockcount); + xchk_refcountbt_xref_rmap(sc, irec); } /* Scrub a refcountbt record. */ @@ -334,35 +331,27 @@ xchk_refcountbt_rec( struct xchk_btree *bs, const union xfs_btree_rec *rec) { + struct xfs_refcount_irec irec; xfs_agblock_t *cow_blocks = bs->private; struct xfs_perag *pag = bs->cur->bc_ag.pag; - xfs_agblock_t bno; - xfs_extlen_t len; - xfs_nlink_t refcount; - bool has_cowflag; - bno = be32_to_cpu(rec->refc.rc_startblock); - len = be32_to_cpu(rec->refc.rc_blockcount); - refcount = be32_to_cpu(rec->refc.rc_refcount); + xfs_refcount_btrec_to_irec(rec, &irec); - /* Only CoW records can have refcount == 1. */ - has_cowflag = (bno & XFS_REFC_COW_START); - if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag)) + /* Check the domain and refcount are not incompatible. */ + if (!xfs_refcount_check_domain(&irec)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (has_cowflag) - (*cow_blocks) += len; + + if (irec.rc_domain == XFS_REFC_DOMAIN_COW) + (*cow_blocks) += irec.rc_blockcount; /* Check the extent. */ - bno &= ~XFS_REFC_COW_START; - if (bno + len <= bno || - !xfs_verify_agbno(pag, bno) || - !xfs_verify_agbno(pag, bno + len - 1)) + if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (refcount == 0) + if (irec.rc_refcount == 0) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - xchk_refcountbt_xref(bs->sc, bno, len, refcount); + xchk_refcountbt_xref(bs->sc, &irec); return 0; } @@ -426,7 +415,6 @@ xchk_xref_is_cow_staging( xfs_extlen_t len) { struct xfs_refcount_irec rc; - bool has_cowflag; int has_refcount; int error; @@ -434,8 +422,8 @@ xchk_xref_is_cow_staging( return; /* Find the CoW staging extent. */ - error = xfs_refcount_lookup_le(sc->sa.refc_cur, - agbno + XFS_REFC_COW_START, &has_refcount); + error = xfs_refcount_lookup_le(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW, + agbno, &has_refcount); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; if (!has_refcount) { @@ -451,9 +439,8 @@ xchk_xref_is_cow_staging( return; } - /* CoW flag must be set, refcount must be 1. */ - has_cowflag = (rc.rc_startblock & XFS_REFC_COW_START); - if (!has_cowflag || rc.rc_refcount != 1) + /* CoW lookup returned a shared extent record? */ + if (rc.rc_domain != XFS_REFC_DOMAIN_COW) xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); /* Must be at least as long as what was passed in */ @@ -477,7 +464,8 @@ xchk_xref_is_not_shared( if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared); + error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED, + agbno, len, &shared); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; if (shared) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5077a7ad5646..2788a6f2edcd 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -86,8 +86,6 @@ xfs_attri_log_nameval_alloc( */ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + name_len + value_len); - if (!nv) - return nv; nv->name.i_addr = nv + 1; nv->name.i_len = name_len; @@ -247,28 +245,6 @@ xfs_attri_init( return attrip; } -/* - * Copy an attr format buffer from the given buf, and into the destination attr - * format structure. - */ -STATIC int -xfs_attri_copy_format( - struct xfs_log_iovec *buf, - struct xfs_attri_log_format *dst_attr_fmt) -{ - struct xfs_attri_log_format *src_attr_fmt = buf->i_addr; - size_t len; - - len = sizeof(struct xfs_attri_log_format); - if (buf->i_len != len) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; - } - - memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len); - return 0; -} - static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_attrd_log_item, attrd_item); @@ -441,8 +417,6 @@ xfs_attr_create_intent( attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, args->namelen, args->value, args->valuelen); } - if (!attr->xattri_nameval) - return ERR_PTR(-ENOMEM); attrip = xfs_attri_init(mp, attr->xattri_nameval); xfs_trans_add_item(tp, &attrip->attri_item); @@ -735,24 +709,50 @@ xlog_recover_attri_commit_pass2( struct xfs_attri_log_nameval *nv; const void *attr_value = NULL; const void *attr_name; - int error; + size_t len; attri_formatp = item->ri_buf[0].i_addr; attr_name = item->ri_buf[1].i_addr; /* Validate xfs_attri_log_format before the large memory allocation */ + len = sizeof(struct xfs_attri_log_format); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + if (!xfs_attri_validate(mp, attri_formatp)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + /* Validate the attr name */ + if (item->ri_buf[1].i_len != + xlog_calc_iovec_len(attri_formatp->alfi_name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[1].i_addr, item->ri_buf[1].i_len); return -EFSCORRUPTED; } - if (attri_formatp->alfi_value_len) + /* Validate the attr value, if present */ + if (attri_formatp->alfi_value_len != 0) { + if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + attr_value = item->ri_buf[2].i_addr; + } /* * Memory alloc failure will cause replay to abort. We attach the @@ -762,13 +762,9 @@ xlog_recover_attri_commit_pass2( nv = xfs_attri_log_nameval_alloc(attr_name, attri_formatp->alfi_name_len, attr_value, attri_formatp->alfi_value_len); - if (!nv) - return -ENOMEM; attrip = xfs_attri_init(mp, nv); - error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format); - if (error) - goto out; + memcpy(&attrip->attri_format, attri_formatp, len); /* * The ATTRI has two references. One for the ATTRD and one for ATTRI to @@ -780,10 +776,6 @@ xlog_recover_attri_commit_pass2( xfs_attri_release(attrip); xfs_attri_log_nameval_put(nv); return 0; -out: - xfs_attri_item_free(attrip); - xfs_attri_log_nameval_put(nv); - return error; } /* @@ -848,7 +840,8 @@ xlog_recover_attrd_commit_pass2( attrd_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 51f66e982484..41323da523d1 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -608,28 +608,18 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_relog = xfs_bui_item_relog, }; -/* - * Copy an BUI format buffer from the given buf, and into the destination - * BUI format structure. The BUI/BUD items were designed not to need any - * special alignment handling. - */ -static int +static inline void xfs_bui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_bui_log_format *dst_bui_fmt) + struct xfs_bui_log_format *dst, + const struct xfs_bui_log_format *src) { - struct xfs_bui_log_format *src_bui_fmt; - uint len; + unsigned int i; - src_bui_fmt = buf->i_addr; - len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + memcpy(dst, src, offsetof(struct xfs_bui_log_format, bui_extents)); - if (buf->i_len == len) { - memcpy(dst_bui_fmt, src_bui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; + for (i = 0; i < src->bui_nextents; i++) + memcpy(&dst->bui_extents[i], &src->bui_extents[i], + sizeof(struct xfs_map_extent)); } /* @@ -646,23 +636,34 @@ xlog_recover_bui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_bui_log_item *buip; struct xfs_bui_log_format *bui_formatp; + size_t len; bui_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } - buip = xfs_bui_init(mp); - error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); - if (error) { - xfs_bui_item_free(buip); - return error; + + len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + buip = xfs_bui_init(mp); + xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -696,7 +697,8 @@ xlog_recover_bud_commit_pass2( bud_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index e295fc8062d8..9f3ceb461515 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -512,7 +512,7 @@ xfs_readdir( { struct xfs_da_args args = { NULL }; unsigned int lock_mode; - int isblock; + bool isblock; int error; trace_xfs_readdir(dp); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 296faa41d81d..c6b2aabd6f18 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -234,13 +234,18 @@ int xfs_errortag_init( struct xfs_mount *mp) { + int ret; + mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, KM_MAYFAIL); if (!mp->m_errortag) return -ENOMEM; - return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, - &mp->m_kobj, "errortag"); + ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, + &mp->m_kobj, "errortag"); + if (ret) + kmem_free(mp->m_errortag); + return ret; } void @@ -274,7 +279,7 @@ xfs_errortag_test( ASSERT(error_tag < XFS_ERRTAG_MAX); randfactor = mp->m_errortag[error_tag]; - if (!randfactor || prandom_u32() % randfactor) + if (!randfactor || prandom_u32_max(randfactor)) return false; xfs_warn_ratelimited(mp, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 27ccfcd82f04..d5130d1fcfae 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -66,27 +66,16 @@ xfs_efi_release( xfs_efi_item_free(efip); } -/* - * This returns the number of iovecs needed to log the given efi item. - * We only need 1 iovec for an efi item. It just logs the efi_log_format - * structure. - */ -static inline int -xfs_efi_item_sizeof( - struct xfs_efi_log_item *efip) -{ - return sizeof(struct xfs_efi_log_format) + - (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); -} - STATIC void xfs_efi_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip)); + *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents); } /* @@ -112,7 +101,7 @@ xfs_efi_item_format( xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, &efip->efi_format, - xfs_efi_item_sizeof(efip)); + xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents)); } @@ -155,13 +144,11 @@ xfs_efi_init( { struct xfs_efi_log_item *efip; - uint size; ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(struct xfs_efi_log_item) + - ((nextents - 1) * sizeof(xfs_extent_t))); - efip = kmem_zalloc(size, 0); + efip = kzalloc(xfs_efi_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); } else { efip = kmem_cache_zalloc(xfs_efi_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -188,15 +175,17 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; uint i; - uint len = sizeof(xfs_efi_log_format_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); - uint len32 = sizeof(xfs_efi_log_format_32_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); - uint len64 = sizeof(xfs_efi_log_format_64_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents); + uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents); + uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents); if (buf->i_len == len) { - memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); + memcpy(dst_efi_fmt, src_efi_fmt, + offsetof(struct xfs_efi_log_format, efi_extents)); + for (i = 0; i < src_efi_fmt->efi_nextents; i++) + memcpy(&dst_efi_fmt->efi_extents[i], + &src_efi_fmt->efi_extents[i], + sizeof(struct xfs_extent)); return 0; } else if (buf->i_len == len32) { xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; @@ -227,7 +216,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr, + buf->i_len); return -EFSCORRUPTED; } @@ -246,27 +236,16 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp) kmem_cache_free(xfs_efd_cache, efdp); } -/* - * This returns the number of iovecs needed to log the given efd item. - * We only need 1 iovec for an efd item. It just logs the efd_log_format - * structure. - */ -static inline int -xfs_efd_item_sizeof( - struct xfs_efd_log_item *efdp) -{ - return sizeof(xfs_efd_log_format_t) + - (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); -} - STATIC void xfs_efd_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip)); + *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents); } /* @@ -291,7 +270,7 @@ xfs_efd_item_format( xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, &efdp->efd_format, - xfs_efd_item_sizeof(efdp)); + xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents)); } /* @@ -340,9 +319,8 @@ xfs_trans_get_efd( ASSERT(nextents > 0); if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + - (nextents - 1) * sizeof(struct xfs_extent), - 0); + efdp = kzalloc(xfs_efd_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); } else { efdp = kmem_cache_zalloc(xfs_efd_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -733,6 +711,12 @@ xlog_recover_efi_commit_pass2( efi_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); if (error) { @@ -769,12 +753,24 @@ xlog_recover_efd_commit_pass2( xfs_lsn_t lsn) { struct xfs_efd_log_format *efd_formatp; + int buflen = item->ri_buf[0].i_len; efd_formatp = item->ri_buf[0].i_addr; - ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || - (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + + if (buflen < sizeof(struct xfs_efd_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } + + if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof( + efd_formatp->efd_nextents) && + item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof( + efd_formatp->efd_nextents)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id); return 0; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 186d0f2137f1..da6a5afa607c 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -52,6 +52,14 @@ struct xfs_efi_log_item { xfs_efi_log_format_t efi_format; }; +static inline size_t +xfs_efi_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_efi_log_item, efi_format) + + xfs_efi_log_format_sizeof(nr); +} + /* * This is the "extent free done" log item. It is used to log * the fact that some extents earlier mentioned in an efi item @@ -64,6 +72,14 @@ struct xfs_efd_log_item { xfs_efd_log_format_t efd_format; }; +static inline size_t +xfs_efd_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_efd_log_item, efd_format) + + xfs_efd_log_format_sizeof(nr); +} + /* * Max number of extents in fast allocation path. */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c6c80265c0b2..e462d39c840e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1261,7 +1261,7 @@ xfs_file_llseek( } #ifdef CONFIG_FS_DAX -static int +static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, enum page_entry_size pe_size, @@ -1274,14 +1274,15 @@ xfs_dax_fault( &xfs_read_iomap_ops); } #else -static int +static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, enum page_entry_size pe_size, bool write_fault, pfn_t *pfn) { - return 0; + ASSERT(0); + return VM_FAULT_SIGBUS; } #endif diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2bbe7916a998..eae7427062cf 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -596,7 +596,7 @@ xfs_iget_cache_miss( */ if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { - VFS_I(ip)->i_generation = prandom_u32(); + VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 28493c8e9bb2..aa303be11576 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -835,9 +835,8 @@ xfs_init_new_inode( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). */ - if (irix_sgid_inherit && - (inode->i_mode & S_ISGID) && - !in_group_p(i_gid_into_mnt(mnt_userns, inode))) + if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && + !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) inode->i_mode &= ~S_ISGID; ip->i_disk_size = 0; @@ -2819,7 +2818,7 @@ retry: * Lock all the participating inodes. Depending upon whether * the target_name exists in the target directory, and * whether the target directory is the same as the source - * directory, we can lock from 2 to 4 inodes. + * directory, we can lock from 2 to 5 inodes. */ xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); @@ -3119,7 +3118,7 @@ xfs_iflush( if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, + "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto flush_out; } @@ -3129,7 +3128,7 @@ xfs_iflush( ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad regular inode %Lu, ptr "PTR_FMT, + "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } @@ -3140,7 +3139,7 @@ xfs_iflush( ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: Bad directory inode %Lu, ptr "PTR_FMT, + "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } @@ -3158,7 +3157,7 @@ xfs_iflush( if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, - "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, + "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); goto flush_out; } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6e19ece916bf..ca2941ab6cbc 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -550,7 +550,7 @@ xfs_inode_item_push( if (!bp || (ip->i_flags & XFS_ISTALE)) { /* - * Inode item/buffer is being being aborted due to cluster + * Inode item/buffer is being aborted due to cluster * buffer deletion. Trigger a log force to have that operation * completed and items removed from the AIL before the next push * attempt. diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index d28ffaebd067..0e5dba2343ea 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -321,7 +321,7 @@ xlog_recover_inode_commit_pass2( */ if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, - "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", + "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld", __func__, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; @@ -329,7 +329,7 @@ xlog_recover_inode_commit_pass2( ldip = item->ri_buf[1].i_addr; if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", + "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld", __func__, item, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 45518b8c613c..2e10e1c66ad6 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -167,7 +167,7 @@ xfs_generic_create( struct dentry *dentry, umode_t mode, dev_t rdev, - bool tmpfile) /* unnamed file */ + struct file *tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -234,7 +234,7 @@ xfs_generic_create( * d_tmpfile can immediately set it back to zero. */ set_nlink(inode, 1); - d_tmpfile(dentry, inode); + d_tmpfile(tmpfile, inode); } else d_instantiate(dentry, inode); @@ -261,7 +261,7 @@ xfs_vn_mknod( umode_t mode, dev_t rdev) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL); } STATIC int @@ -272,7 +272,7 @@ xfs_vn_create( umode_t mode, bool flags) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL); } STATIC int @@ -283,7 +283,7 @@ xfs_vn_mkdir( umode_t mode) { return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0, - false); + NULL); } STATIC struct dentry * @@ -558,6 +558,8 @@ xfs_vn_getattr( struct inode *inode = d_inode(path->dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); trace_xfs_getattr(ip); @@ -568,8 +570,8 @@ xfs_vn_getattr( stat->dev = inode->i_sb->s_dev; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = i_uid_into_mnt(mnt_userns, inode); - stat->gid = i_gid_into_mnt(mnt_userns, inode); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; @@ -604,6 +606,16 @@ xfs_vn_getattr( stat->blksize = BLKDEV_IOSIZE; stat->rdev = inode->i_rdev; break; + case S_IFREG: + if (request_mask & STATX_DIOALIGN) { + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct block_device *bdev = target->bt_bdev; + + stat->result_mask |= STATX_DIOALIGN; + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + } + fallthrough; default: stat->blksize = xfs_stat_blksize(ip); stat->rdev = 0; @@ -1080,10 +1092,12 @@ STATIC int xfs_vn_tmpfile( struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, + struct file *file, umode_t mode) { - return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true); + int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file); + + return finish_open_simple(file, err); } static const struct inode_operations xfs_inode_operations = { diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index cb5fc68c9ea0..e570dcb5df8d 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -13,7 +13,6 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); int xfs_vn_setattr_size(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *vap); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 36312b00b164..a1c2bcf65d37 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -66,6 +66,8 @@ xfs_bulkstat_one_int( struct xfs_bulkstat *buf = bc->buf; xfs_extnum_t nextents; int error = -EINVAL; + vfsuid_t vfsuid; + vfsgid_t vfsgid; if (xfs_internal_inum(mp, ino)) goto out_advance; @@ -81,14 +83,16 @@ xfs_bulkstat_one_int( ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* xfs_iget returns the following without needing * further change. */ buf->bs_projectid = ip->i_projid; buf->bs_ino = ino; - buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode)); - buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode)); + buf->bs_uid = from_kuid(sb_userns, vfsuid_into_kuid(vfsuid)); + buf->bs_gid = from_kgid(sb_userns, vfsgid_into_kgid(vfsgid)); buf->bs_size = ip->i_disk_size; buf->bs_nlink = inode->i_nlink; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 386b0307aed8..f02a0dd522b3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -226,12 +226,12 @@ xlog_ticket_reservation( if (head == &log->l_write_head) { ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); return tic->t_unit_res; - } else { - if (tic->t_flags & XLOG_TIC_PERM_RESERV) - return tic->t_unit_res * tic->t_cnt; - else - return tic->t_unit_res; } + + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + return tic->t_unit_res * tic->t_cnt; + + return tic->t_unit_res; } STATIC bool @@ -3544,7 +3544,7 @@ xlog_ticket_alloc( tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = prandom_u32(); + tic->t_tid = get_random_u32(); if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 17e923b9c5fa..322eb2ee6c55 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2552,6 +2552,8 @@ xlog_recover_process_intents( for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; lip = xfs_trans_ail_cursor_next(ailp, &cur)) { + const struct xfs_item_ops *ops; + if (!xlog_item_is_intent(lip)) break; @@ -2567,13 +2569,17 @@ xlog_recover_process_intents( * deferred ops, you /must/ attach them to the capture list in * the recover routine or else those subsequent intents will be * replayed in the wrong order! + * + * The recovery function can free the log item, so we must not + * access lip after it returns. */ spin_unlock(&ailp->ail_lock); - error = lip->li_ops->iop_recover(lip, &capture_list); + ops = lip->li_ops; + error = ops->iop_recover(lip, &capture_list); spin_lock(&ailp->ail_lock); if (error) { trace_xlog_intent_recovery_failed(log->l_mp, error, - lip->li_ops->iop_recover); + ops->iop_recover); break; } } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f10c88cee116..e8bb3c2e847e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -300,26 +300,28 @@ xfs_validate_new_dalign( "alignment check failed: sunit/swidth vs. blocksize(%d)", mp->m_sb.sb_blocksize); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - mp->m_sb.sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, mp->m_sb.sb_blocksize); - return -EINVAL; - } } + /* + * Convert the stripe unit and width to FSBs. + */ + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); + return -EINVAL; + } + + if (!mp->m_dalign) { + xfs_warn(mp, + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); + return -EINVAL; + } + + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); + if (!xfs_has_dalign(mp)) { xfs_warn(mp, "cannot change alignment: superblock does not support data alignment"); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 69d9c83ea4b2..c4078d0ec108 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -23,17 +23,18 @@ #include <linux/mm.h> #include <linux/dax.h> -struct failure_info { +struct xfs_failure_info { xfs_agblock_t startblock; xfs_extlen_t blockcount; int mf_flags; + bool want_shutdown; }; static pgoff_t xfs_failure_pgoff( struct xfs_mount *mp, const struct xfs_rmap_irec *rec, - const struct failure_info *notify) + const struct xfs_failure_info *notify) { loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); @@ -47,7 +48,7 @@ static unsigned long xfs_failure_pgcnt( struct xfs_mount *mp, const struct xfs_rmap_irec *rec, - const struct failure_info *notify) + const struct xfs_failure_info *notify) { xfs_agblock_t end_rec; xfs_agblock_t end_notify; @@ -71,13 +72,13 @@ xfs_dax_failure_fn( { struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip; - struct failure_info *notify = data; + struct xfs_failure_info *notify = data; int error = 0; if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); - return -EFSCORRUPTED; + notify->want_shutdown = true; + return 0; } /* Get files that incore, filter out others that are not in use. */ @@ -86,8 +87,10 @@ xfs_dax_failure_fn( /* Continue the rmap query if the inode isn't incore */ if (error == -ENODATA) return 0; - if (error) - return error; + if (error) { + notify->want_shutdown = true; + return 0; + } error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, xfs_failure_pgoff(mp, rec, notify), @@ -104,6 +107,7 @@ xfs_dax_notify_ddev_failure( xfs_daddr_t bblen, int mf_flags) { + struct xfs_failure_info notify = { .mf_flags = mf_flags }; struct xfs_trans *tp = NULL; struct xfs_btree_cur *cur = NULL; struct xfs_buf *agf_bp = NULL; @@ -120,7 +124,6 @@ xfs_dax_notify_ddev_failure( for (; agno <= end_agno; agno++) { struct xfs_rmap_irec ri_low = { }; struct xfs_rmap_irec ri_high; - struct failure_info notify; struct xfs_agf *agf; xfs_agblock_t agend; struct xfs_perag *pag; @@ -161,6 +164,11 @@ xfs_dax_notify_ddev_failure( } xfs_trans_cancel(tp); + if (error || notify.want_shutdown) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + if (!error) + error = -EFSCORRUPTED; + } return error; } @@ -175,13 +183,13 @@ xfs_dax_notify_failure( u64 ddev_start; u64 ddev_end; - if (!(mp->m_sb.sb_flags & SB_BORN)) { + if (!(mp->m_super->s_flags & SB_BORN)) { xfs_warn(mp, "filesystem is not ready for notify_failure()!"); return -EIO; } if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { - xfs_warn(mp, + xfs_debug(mp, "notify_failure() not supported on realtime device!"); return -EOPNOTSUPP; } @@ -194,7 +202,7 @@ xfs_dax_notify_failure( } if (!xfs_has_rmapbt(mp)) { - xfs_warn(mp, "notify_failure() needs rmapbt enabled!"); + xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); return -EOPNOTSUPP; } diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 758702b9495f..9737b5a9f405 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -118,10 +118,10 @@ xfs_check_ondisk_structs(void) /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); - XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); - XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28); - XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); @@ -134,6 +134,21 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40); XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_cui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_cud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rui_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); + + XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format, efi_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); + XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16); /* * The v5 superblock format extended several v4 header structures with diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 7e97bf19793d..858e3e9eb4a8 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -523,7 +523,9 @@ xfs_cui_item_recover( type = refc_type; break; default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &cuip->cui_format, + sizeof(cuip->cui_format)); error = -EFSCORRUPTED; goto abort_error; } @@ -536,7 +538,8 @@ xfs_cui_item_recover( &new_fsb, &new_len, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - refc, sizeof(*refc)); + &cuip->cui_format, + sizeof(cuip->cui_format)); if (error) goto abort_error; @@ -622,28 +625,18 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_relog = xfs_cui_item_relog, }; -/* - * Copy an CUI format buffer from the given buf, and into the destination - * CUI format structure. The CUI/CUD items were designed not to need any - * special alignment handling. - */ -static int +static inline void xfs_cui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_cui_log_format *dst_cui_fmt) + struct xfs_cui_log_format *dst, + const struct xfs_cui_log_format *src) { - struct xfs_cui_log_format *src_cui_fmt; - uint len; + unsigned int i; - src_cui_fmt = buf->i_addr; - len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + memcpy(dst, src, offsetof(struct xfs_cui_log_format, cui_extents)); - if (buf->i_len == len) { - memcpy(dst_cui_fmt, src_cui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; + for (i = 0; i < src->cui_nextents; i++) + memcpy(&dst->cui_extents[i], &src->cui_extents[i], + sizeof(struct xfs_phys_extent)); } /* @@ -660,19 +653,28 @@ xlog_recover_cui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_cui_log_item *cuip; struct xfs_cui_log_format *cui_formatp; + size_t len; cui_formatp = item->ri_buf[0].i_addr; - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); - error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); - if (error) { - xfs_cui_item_free(cuip); - return error; + if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -706,7 +708,8 @@ xlog_recover_cud_commit_pass2( cud_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 251f20ddd368..93bdd25680bc 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -200,7 +200,9 @@ xfs_reflink_trim_around_shared( if (fbno == NULLAGBLOCK) { /* No shared blocks at all. */ return 0; - } else if (fbno == agbno) { + } + + if (fbno == agbno) { /* * The start of this extent is shared. Truncate the * mapping at the end of the shared region so that a @@ -210,16 +212,16 @@ xfs_reflink_trim_around_shared( irec->br_blockcount = flen; *shared = true; return 0; - } else { - /* - * There's a shared extent midway through this extent. - * Truncate the mapping at the start of the shared - * extent so that a subsequent iteration starts at the - * start of the shared region. - */ - irec->br_blockcount = fbno - agbno; - return 0; } + + /* + * There's a shared extent midway through this extent. + * Truncate the mapping at the start of the shared + * extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = fbno - agbno; + return 0; } int diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index fef92e02f3bb..534504ede1a3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -155,31 +155,6 @@ xfs_rui_init( return ruip; } -/* - * Copy an RUI format buffer from the given buf, and into the destination - * RUI format structure. The RUI/RUD items were designed not to need any - * special alignment handling. - */ -STATIC int -xfs_rui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_rui_log_format *dst_rui_fmt) -{ - struct xfs_rui_log_format *src_rui_fmt; - uint len; - - src_rui_fmt = buf->i_addr; - len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); - - if (buf->i_len != len) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; - } - - memcpy(dst_rui_fmt, src_rui_fmt, len); - return 0; -} - static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rud_log_item, rud_item); @@ -582,7 +557,9 @@ xfs_rui_item_recover( type = XFS_RMAP_FREE; break; default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &ruip->rui_format, + sizeof(ruip->rui_format)); error = -EFSCORRUPTED; goto abort_error; } @@ -652,6 +629,20 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_relog = xfs_rui_item_relog, }; +static inline void +xfs_rui_copy_format( + struct xfs_rui_log_format *dst, + const struct xfs_rui_log_format *src) +{ + unsigned int i; + + memcpy(dst, src, offsetof(struct xfs_rui_log_format, rui_extents)); + + for (i = 0; i < src->rui_nextents; i++) + memcpy(&dst->rui_extents[i], &src->rui_extents[i], + sizeof(struct xfs_map_extent)); +} + /* * This routine is called to create an in-core extent rmap update * item from the rui format structure which was logged on disk. @@ -666,19 +657,28 @@ xlog_recover_rui_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_rui_log_item *ruip; struct xfs_rui_log_format *rui_formatp; + size_t len; rui_formatp = item->ri_buf[0].i_addr; - ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); - error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); - if (error) { - xfs_rui_item_free(ruip); - return error; + if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); + if (item->ri_buf[0].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; } + + ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); /* * Insert the intent into the AIL directly and drop one reference so @@ -711,7 +711,11 @@ xlog_recover_rud_commit_pass2( struct xfs_rud_log_format *rud_formatp; rud_formatp = item->ri_buf[0].i_addr; - ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); + if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + rud_formatp, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id); return 0; diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 20e0534a772c..90a77cd3ebad 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -74,7 +74,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; } - len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + len += scnprintf(buf + len, PATH_MAX-len, "xpc %llu %llu %llu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", defer_relog); @@ -125,7 +125,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) { int j; - seq_printf(m, "qm"); + seq_puts(m, "qm"); for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++) seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9ac59814bbb6..ee4b429a2f2c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -653,7 +653,7 @@ xfs_fs_destroy_inode( static void xfs_fs_dirty_inode( struct inode *inode, - int flag) + int flags) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -661,7 +661,13 @@ xfs_fs_dirty_inode( if (!(inode->i_sb->s_flags & SB_LAZYTIME)) return; - if (flag != I_DIRTY_SYNC || !(inode->i_state & I_DIRTY_TIME)) + + /* + * Only do the timestamp update if the inode is dirty (I_DIRTY_SYNC) + * and has dirty timestamp (I_DIRTY_TIME). I_DIRTY_TIME can be passed + * in flags possibly together with I_DIRTY_SYNC. + */ + if ((flags & ~I_DIRTY_TIME) != I_DIRTY_SYNC || !(flags & I_DIRTY_TIME)) return; if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp)) @@ -2022,18 +2028,14 @@ xfs_init_caches(void) goto out_destroy_trans_cache; xfs_efd_cache = kmem_cache_create("xfs_efd_item", - (sizeof(struct xfs_efd_log_item) + - (XFS_EFD_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_extent)), - 0, 0, NULL); + xfs_efd_log_item_sizeof(XFS_EFD_MAX_FAST_EXTENTS), + 0, 0, NULL); if (!xfs_efd_cache) goto out_destroy_buf_item_cache; xfs_efi_cache = kmem_cache_create("xfs_efi_item", - (sizeof(struct xfs_efi_log_item) + - (XFS_EFI_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_extent)), - 0, 0, NULL); + xfs_efi_log_item_sizeof(XFS_EFI_MAX_FAST_EXTENTS), + 0, 0, NULL); if (!xfs_efi_cache) goto out_destroy_efd_cache; diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 43585850f154..513095e353a5 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -33,10 +33,15 @@ xfs_sysfs_init( const char *name) { struct kobject *parent; + int err; parent = parent_kobj ? &parent_kobj->kobject : NULL; init_completion(&kobj->complete); - return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); + err = kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); + if (err) + kobject_put(&kobj->kobject); + + return err; } static inline void diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f9057af6e0c8..372d871bccc5 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -799,6 +799,9 @@ TRACE_DEFINE_ENUM(PE_SIZE_PTE); TRACE_DEFINE_ENUM(PE_SIZE_PMD); TRACE_DEFINE_ENUM(PE_SIZE_PUD); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); + TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault), @@ -1170,7 +1173,7 @@ DECLARE_EVENT_CLASS(xfs_dqtrx_class, __entry->ino_res_used = qtrx->qt_ino_res_used; __entry->icount_delta = qtrx->qt_icount_delta; ), - TP_printk("dev %d:%d dquot id 0x%x type %s flags %s" + TP_printk("dev %d:%d dquot id 0x%x type %s flags %s " "blk_res %llu bcount_delta %lld delbcnt_delta %lld " "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld " "ino_res %llu ino_res_used %llu icount_delta %lld", @@ -1602,7 +1605,7 @@ TRACE_EVENT(xfs_bunmap, __entry->caller_ip = caller_ip; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx" + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx " "flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -2925,6 +2928,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) @@ -2932,13 +2936,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) @@ -2958,6 +2964,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) @@ -2966,14 +2973,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount, @@ -2994,9 +3003,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) @@ -3004,20 +3015,24 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount) @@ -3038,9 +3053,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) @@ -3049,21 +3066,25 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, @@ -3086,12 +3107,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, i1_domain) __field(xfs_agblock_t, i1_startblock) __field(xfs_extlen_t, i1_blockcount) __field(xfs_nlink_t, i1_refcount) + __field(enum xfs_refc_domain, i2_domain) __field(xfs_agblock_t, i2_startblock) __field(xfs_extlen_t, i2_blockcount) __field(xfs_nlink_t, i2_refcount) + __field(enum xfs_refc_domain, i3_domain) __field(xfs_agblock_t, i3_startblock) __field(xfs_extlen_t, i3_blockcount) __field(xfs_nlink_t, i3_refcount) @@ -3099,27 +3123,33 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; + __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; __entry->i1_refcount = i1->rc_refcount; + __entry->i2_domain = i2->rc_domain; __entry->i2_startblock = i2->rc_startblock; __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; + __entry->i3_domain = i3->rc_domain; __entry->i3_startblock = i3->rc_startblock; __entry->i3_blockcount = i3->rc_blockcount; __entry->i3_refcount = i3->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u -- " - "agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " + "dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i1_startblock, __entry->i1_blockcount, __entry->i1_refcount, + __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i2_startblock, __entry->i2_blockcount, __entry->i2_refcount, + __print_symbolic(__entry->i3_domain, XFS_REFC_DOMAIN_STRINGS), __entry->i3_startblock, __entry->i3_blockcount, __entry->i3_refcount) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d3a97a028560..f51df7d94ef7 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -602,9 +602,9 @@ xfsaild( while (1) { if (tout && tout <= 20) - set_current_state(TASK_KILLABLE); + set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); /* * Check kthread_should_stop() after we set the task state to @@ -653,14 +653,14 @@ xfsaild( ailp->ail_target == ailp->ail_target_prev && list_empty(&ailp->ail_buf_list)) { spin_unlock(&ailp->ail_lock); - freezable_schedule(); + schedule(); tout = 0; continue; } spin_unlock(&ailp->ail_lock); if (tout) - freezable_schedule_timeout(msecs_to_jiffies(tout)); + schedule_timeout(msecs_to_jiffies(tout)); __set_current_state(TASK_RUNNING); @@ -730,11 +730,10 @@ void xfs_ail_push_all_sync( struct xfs_ail *ailp) { - struct xfs_log_item *lip; DEFINE_WAIT(wait); spin_lock(&ailp->ail_lock); - while ((lip = xfs_ail_max(ailp)) != NULL) { + while (xfs_ail_max(ailp) != NULL) { prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE); wake_up_process(ailp->ail_task); spin_unlock(&ailp->ail_lock); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 860f0b1032c6..2c53fbb8d918 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -41,6 +41,13 @@ static void zonefs_account_active(struct inode *inode) return; /* + * For zones that transitioned to the offline or readonly condition, + * we only need to clear the active state. + */ + if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) + goto out; + + /* * If the zone is active, that is, if it is explicitly open or * partially written, check if it was already accounted as active. */ @@ -53,6 +60,7 @@ static void zonefs_account_active(struct inode *inode) return; } +out: /* The zone is not active. If it was, update the active count */ if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; @@ -324,6 +332,7 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, inode->i_flags |= S_IMMUTABLE; inode->i_mode &= ~0777; zone->wp = zone->start; + zi->i_flags |= ZONEFS_ZONE_OFFLINE; return 0; case BLK_ZONE_COND_READONLY: /* @@ -342,8 +351,10 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, zone->cond = BLK_ZONE_COND_OFFLINE; inode->i_mode &= ~0777; zone->wp = zone->start; + zi->i_flags |= ZONEFS_ZONE_OFFLINE; return 0; } + zi->i_flags |= ZONEFS_ZONE_READONLY; inode->i_mode &= ~0222; return i_size_read(inode); case BLK_ZONE_COND_FULL: @@ -478,8 +489,7 @@ static void __zonefs_io_error(struct inode *inode, bool write) struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); unsigned int noio_flag; - unsigned int nr_zones = - zi->i_zone_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT); + unsigned int nr_zones = 1; struct zonefs_ioerr_data err = { .inode = inode, .write = write, @@ -487,6 +497,15 @@ static void __zonefs_io_error(struct inode *inode, bool write) int ret; /* + * The only files that have more than one zone are conventional zone + * files with aggregated conventional zones, for which the inode zone + * size is always larger than the device zone size. + */ + if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev)) + nr_zones = zi->i_zone_size >> + (sbi->s_zone_sectors_shift + SECTOR_SHIFT); + + /* * Memory allocations in blkdev_report_zones() can trigger a memory * reclaim which may in turn cause a recursion into zonefs as well as * struct request allocations for the same device. The former case may @@ -1407,6 +1426,14 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, zi->i_ztype = type; zi->i_zsector = zone->start; zi->i_zone_size = zone->len << SECTOR_SHIFT; + if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && + !(sbi->s_features & ZONEFS_F_AGGRCNV)) { + zonefs_err(sb, + "zone size %llu doesn't match device's zone sectors %llu\n", + zi->i_zone_size, + bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); + return -EINVAL; + } zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE, zone->capacity << SECTOR_SHIFT); @@ -1456,11 +1483,11 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, struct inode *dir = d_inode(parent); struct dentry *dentry; struct inode *inode; - int ret; + int ret = -ENOMEM; dentry = d_alloc_name(parent, name); if (!dentry) - return NULL; + return ERR_PTR(ret); inode = new_inode(parent->d_sb); if (!inode) @@ -1485,7 +1512,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, dput: dput(dentry); - return NULL; + return ERR_PTR(ret); } struct zonefs_zone_data { @@ -1505,7 +1532,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd, struct blk_zone *zone, *next, *end; const char *zgroup_name; char *file_name; - struct dentry *dir; + struct dentry *dir, *dent; unsigned int n = 0; int ret; @@ -1523,8 +1550,8 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd, zgroup_name = "seq"; dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type); - if (!dir) { - ret = -ENOMEM; + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); goto free; } @@ -1570,8 +1597,9 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd, * Use the file number within its group as file name. */ snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n); - if (!zonefs_create_inode(dir, file_name, zone, type)) { - ret = -ENOMEM; + dent = zonefs_create_inode(dir, file_name, zone, type); + if (IS_ERR(dent)) { + ret = PTR_ERR(dent); goto free; } @@ -1905,18 +1933,18 @@ static int __init zonefs_init(void) if (ret) return ret; - ret = register_filesystem(&zonefs_type); + ret = zonefs_sysfs_init(); if (ret) goto destroy_inodecache; - ret = zonefs_sysfs_init(); + ret = register_filesystem(&zonefs_type); if (ret) - goto unregister_fs; + goto sysfs_exit; return 0; -unregister_fs: - unregister_filesystem(&zonefs_type); +sysfs_exit: + zonefs_sysfs_exit(); destroy_inodecache: zonefs_destroy_inodecache(); @@ -1925,9 +1953,9 @@ destroy_inodecache: static void __exit zonefs_exit(void) { + unregister_filesystem(&zonefs_type); zonefs_sysfs_exit(); zonefs_destroy_inodecache(); - unregister_filesystem(&zonefs_type); } MODULE_AUTHOR("Damien Le Moal"); diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c index 9cb6755ce39a..9920689dc098 100644 --- a/fs/zonefs/sysfs.c +++ b/fs/zonefs/sysfs.c @@ -15,11 +15,6 @@ struct zonefs_sysfs_attr { ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf); }; -static inline struct zonefs_sysfs_attr *to_attr(struct attribute *attr) -{ - return container_of(attr, struct zonefs_sysfs_attr, attr); -} - #define ZONEFS_SYSFS_ATTR_RO(name) \ static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name) diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 4b3de66c3233..1dbe78119ff1 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -39,8 +39,10 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } -#define ZONEFS_ZONE_OPEN (1 << 0) -#define ZONEFS_ZONE_ACTIVE (1 << 1) +#define ZONEFS_ZONE_OPEN (1U << 0) +#define ZONEFS_ZONE_ACTIVE (1U << 1) +#define ZONEFS_ZONE_OFFLINE (1U << 2) +#define ZONEFS_ZONE_READONLY (1U << 3) /* * In-memory inode data. |