diff options
Diffstat (limited to 'fs')
212 files changed, 4611 insertions, 3230 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 21e154516bf2..f14478643b91 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -142,39 +142,6 @@ config BINFMT_ZFLAT help Support FLAT format compressed binaries -config HAVE_AOUT - def_bool n - -config BINFMT_AOUT - tristate "Kernel support for a.out and ECOFF binaries" - depends on HAVE_AOUT - help - A.out (Assembler.OUTput) is a set of formats for libraries and - executables used in the earliest versions of UNIX. Linux used - the a.out formats QMAGIC and ZMAGIC until they were replaced - with the ELF format. - - The conversion to ELF started in 1995. This option is primarily - provided for historical interest and for the benefit of those - who need to run binaries from that era. - - Most people should answer N here. If you think you may have - occasional use for this format, enable module support above - and answer M here to compile this support as a module called - binfmt_aout. - - If any crucial components of your system (such as /sbin/init - or /lib/ld.so) are still in a.out format, you will have to - say Y here. - -config OSF4_COMPAT - bool "OSF/1 v4 readv/writev compatibility" - depends on ALPHA && BINFMT_AOUT - help - Say Y if you are using OSF/1 binaries (like Netscape and Acrobat) - with v4 shared libraries freely available from Compaq. If you're - going to use shared libraries from Tru64 version 5.0 or later, say N. - config BINFMT_MISC tristate "Kernel support for MISC binaries" help diff --git a/fs/Makefile b/fs/Makefile index 93b80529f8e8..4dea17840761 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -38,7 +38,6 @@ obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o -obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 56ae5cd5184f..230c2d19116d 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -24,9 +24,9 @@ static int afs_readdir(struct file *file, struct dir_context *ctx); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_iput(struct dentry *dentry, struct inode *inode); -static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, +static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, +static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); @@ -568,7 +568,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, +static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype) { struct afs_lookup_one_cookie *cookie = @@ -584,16 +584,16 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, if (cookie->name.len != nlen || memcmp(cookie->name.name, name, nlen) != 0) { - _leave(" = 0 [no]"); - return 0; + _leave(" = true [keep looking]"); + return true; } cookie->fid.vnode = ino; cookie->fid.unique = dtype; cookie->found = 1; - _leave(" = -1 [found]"); - return -1; + _leave(" = false [found]"); + return false; } /* @@ -636,12 +636,11 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, +static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype) { struct afs_lookup_cookie *cookie = container_of(ctx, struct afs_lookup_cookie, ctx); - int ret; _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, @@ -663,12 +662,10 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name, cookie->fids[1].unique = dtype; cookie->found = 1; if (cookie->one_only) - return -1; + return false; } - ret = cookie->nr_fids >= 50 ? -1 : 0; - _leave(" = %d", ret); - return ret; + return cookie->nr_fids < 50; } /* diff --git a/fs/afs/flock.c b/fs/afs/flock.c index c4210a3964d8..bbcc5afd1576 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -76,7 +76,7 @@ void afs_lock_op_done(struct afs_call *call) if (call->error == 0) { spin_lock(&vnode->lock); trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0); - vnode->locked_at = call->reply_time; + vnode->locked_at = call->issue_time; afs_schedule_lock_extension(vnode); spin_unlock(&vnode->lock); } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 4943413d9c5f..7d37f63ef0f0 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -131,7 +131,7 @@ bad: static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry) { - return ktime_divns(call->reply_time, NSEC_PER_SEC) + expiry; + return ktime_divns(call->issue_time, NSEC_PER_SEC) + expiry; } static void xdr_decode_AFSCallBack(const __be32 **_bp, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 64ad55494349..723d162078a3 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -137,7 +137,6 @@ struct afs_call { bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ - bool have_reply_time; /* T if have got reply_time */ bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ u16 service_id; /* Actual service ID (after upgrade) */ @@ -151,7 +150,7 @@ struct afs_call { } __attribute__((packed)); __be64 tmp64; }; - ktime_t reply_time; /* Time of first reply packet */ + ktime_t issue_time; /* Time of issue of operation */ }; struct afs_call_type { diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 933e67fcdab1..805328ca5428 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -69,6 +69,7 @@ int afs_abort_to_error(u32 abort_code) /* Unified AFS error table */ case UAEPERM: return -EPERM; case UAENOENT: return -ENOENT; + case UAEAGAIN: return -EAGAIN; case UAEACCES: return -EACCES; case UAEBUSY: return -EBUSY; case UAEEXIST: return -EEXIST; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d5c4785c862d..eccc3cd0cb70 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -351,6 +351,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) if (call->max_lifespan) rxrpc_kernel_set_max_life(call->net->socket, rxcall, call->max_lifespan); + call->issue_time = ktime_get_real(); /* send the request */ iov[0].iov_base = call->request; @@ -501,12 +502,6 @@ static void afs_deliver_to_call(struct afs_call *call) return; } - if (!call->have_reply_time && - rxrpc_kernel_get_reply_time(call->net->socket, - call->rxcall, - &call->reply_time)) - call->have_reply_time = true; - ret = call->type->deliver(call); state = READ_ONCE(call->state); if (ret == 0 && call->unmarshalling_error) diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index fdc7d675b4b0..11571cca86c1 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -232,8 +232,7 @@ static void xdr_decode_YFSCallBack(const __be32 **_bp, struct afs_callback *cb = &scb->callback; ktime_t cb_expiry; - cb_expiry = call->reply_time; - cb_expiry = ktime_add(cb_expiry, xdr_to_u64(x->expiration_time) * 100); + cb_expiry = ktime_add(call->issue_time, xdr_to_u64(x->expiration_time) * 100); cb->expires_at = ktime_divns(cb_expiry, NSEC_PER_SEC); scb->have_cb = true; *_bp += xdr_size(x); diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index e0c3e33c4177..24192a7667ed 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -32,7 +32,7 @@ static struct inode *anon_inode_inode; */ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s", + return dynamic_dname(buffer, buflen, "anon_inode:%s", dentry->d_name.name); } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c deleted file mode 100644 index 0dcfc691e7e2..000000000000 --- a/fs/binfmt_aout.c +++ /dev/null @@ -1,342 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/fs/binfmt_aout.c - * - * Copyright (C) 1991, 1992, 1996 Linus Torvalds - */ - -#include <linux/module.h> - -#include <linux/time.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/a.out.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/binfmts.h> -#include <linux/personality.h> -#include <linux/init.h> -#include <linux/coredump.h> -#include <linux/slab.h> -#include <linux/sched/task_stack.h> - -#include <linux/uaccess.h> -#include <asm/cacheflush.h> - -static int load_aout_binary(struct linux_binprm *); -static int load_aout_library(struct file*); - -static struct linux_binfmt aout_format = { - .module = THIS_MODULE, - .load_binary = load_aout_binary, - .load_shlib = load_aout_library, -}; - -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) - -static int set_brk(unsigned long start, unsigned long end) -{ - start = PAGE_ALIGN(start); - end = PAGE_ALIGN(end); - if (end > start) - return vm_brk(start, end - start); - return 0; -} - -/* - * create_aout_tables() parses the env- and arg-strings in new user - * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. - */ -static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm) -{ - char __user * __user *argv; - char __user * __user *envp; - unsigned long __user *sp; - int argc = bprm->argc; - int envc = bprm->envc; - - sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); -#ifdef __alpha__ -/* whee.. test-programs are so much fun. */ - put_user(0, --sp); - put_user(0, --sp); - if (bprm->loader) { - put_user(0, --sp); - put_user(1003, --sp); - put_user(bprm->loader, --sp); - put_user(1002, --sp); - } - put_user(bprm->exec, --sp); - put_user(1001, --sp); -#endif - sp -= envc+1; - envp = (char __user * __user *) sp; - sp -= argc+1; - argv = (char __user * __user *) sp; -#ifndef __alpha__ - put_user((unsigned long) envp,--sp); - put_user((unsigned long) argv,--sp); -#endif - put_user(argc,--sp); - current->mm->arg_start = (unsigned long) p; - while (argc-->0) { - char c; - put_user(p,argv++); - do { - get_user(c,p++); - } while (c); - } - put_user(NULL,argv); - current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-->0) { - char c; - put_user(p,envp++); - do { - get_user(c,p++); - } while (c); - } - put_user(NULL,envp); - current->mm->env_end = (unsigned long) p; - return sp; -} - -/* - * These are the functions used to load a.out style executables and shared - * libraries. There is no binary dependent code anywhere else. - */ - -static int load_aout_binary(struct linux_binprm * bprm) -{ - struct pt_regs *regs = current_pt_regs(); - struct exec ex; - unsigned long error; - unsigned long fd_offset; - unsigned long rlim; - int retval; - - ex = *((struct exec *) bprm->buf); /* exec-header */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && - N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || - N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - return -ENOEXEC; - } - - /* - * Requires a mmap handler. This prevents people from using a.out - * as part of an exploit attack against /proc-related vulnerabilities. - */ - if (!bprm->file->f_op->mmap) - return -ENOEXEC; - - fd_offset = N_TXTOFF(ex); - - /* Check initial limits. This avoids letting people circumvent - * size limits imposed on them by creating programs with large - * arrays in the data or bss. - */ - rlim = rlimit(RLIMIT_DATA); - if (rlim >= RLIM_INFINITY) - rlim = ~0; - if (ex.a_data + ex.a_bss > rlim) - return -ENOMEM; - - /* Flush all traces of the currently running executable */ - retval = begin_new_exec(bprm); - if (retval) - return retval; - - /* OK, This is the point of no return */ -#ifdef __alpha__ - SET_AOUT_PERSONALITY(bprm, ex); -#else - set_personality(PER_LINUX); -#endif - setup_new_exec(bprm); - - current->mm->end_code = ex.a_text + - (current->mm->start_code = N_TXTADDR(ex)); - current->mm->end_data = ex.a_data + - (current->mm->start_data = N_DATADDR(ex)); - current->mm->brk = ex.a_bss + - (current->mm->start_brk = N_BSSADDR(ex)); - - retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) - return retval; - - - if (N_MAGIC(ex) == OMAGIC) { - unsigned long text_addr, map_size; - loff_t pos; - - text_addr = N_TXTADDR(ex); - -#ifdef __alpha__ - pos = fd_offset; - map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; -#else - pos = 32; - map_size = ex.a_text+ex.a_data; -#endif - error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error) - return error; - - error = read_code(bprm->file, text_addr, pos, - ex.a_text+ex.a_data); - if ((signed long)error < 0) - return error; - } else { - if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) - { - printk(KERN_NOTICE "executable not page aligned\n"); - } - - if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) - { - printk(KERN_WARNING - "fd_offset is not page aligned. Please convert program: %pD\n", - bprm->file); - } - - if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { - error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (error) - return error; - - read_code(bprm->file, N_TXTADDR(ex), fd_offset, - ex.a_text + ex.a_data); - goto beyond_if; - } - - error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, - PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, - fd_offset); - - if (error != N_TXTADDR(ex)) - return error; - - error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE, - fd_offset + ex.a_text); - if (error != N_DATADDR(ex)) - return error; - } -beyond_if: - set_binfmt(&aout_format); - - retval = set_brk(current->mm->start_brk, current->mm->brk); - if (retval < 0) - return retval; - - current->mm->start_stack = - (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); -#ifdef __alpha__ - regs->gp = ex.a_gpvalue; -#endif - finalize_exec(bprm); - start_thread(regs, ex.a_entry, current->mm->start_stack); - return 0; -} - -static int load_aout_library(struct file *file) -{ - struct inode * inode; - unsigned long bss, start_addr, len; - unsigned long error; - int retval; - struct exec ex; - loff_t pos = 0; - - inode = file_inode(file); - - retval = -ENOEXEC; - error = kernel_read(file, &ex, sizeof(ex), &pos); - if (error != sizeof(ex)) - goto out; - - /* We come in here for the regular a.out style of shared libraries */ - if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || - N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || - i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { - goto out; - } - - /* - * Requires a mmap handler. This prevents people from using a.out - * as part of an exploit attack against /proc-related vulnerabilities. - */ - if (!file->f_op->mmap) - goto out; - - if (N_FLAGS(ex)) - goto out; - - /* For QMAGIC, the starting address is 0x20 into the page. We mask - this off to get the starting address for the page */ - - start_addr = ex.a_entry & 0xfffff000; - - if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { - if (printk_ratelimit()) - { - printk(KERN_WARNING - "N_TXTOFF is not page aligned. Please convert library: %pD\n", - file); - } - retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (retval) - goto out; - - read_code(file, start_addr, N_TXTOFF(ex), - ex.a_text + ex.a_data); - retval = 0; - goto out; - } - /* Now use mmap to map the library into memory. */ - error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE, - N_TXTOFF(ex)); - retval = error; - if (error != start_addr) - goto out; - - len = PAGE_ALIGN(ex.a_text + ex.a_data); - bss = ex.a_text + ex.a_data + ex.a_bss; - if (bss > len) { - retval = vm_brk(start_addr + len, bss - len); - if (retval) - goto out; - } - retval = 0; -out: - return retval; -} - -static int __init init_aout_binfmt(void) -{ - register_binfmt(&aout_format); - return 0; -} - -static void __exit exit_aout_binfmt(void) -{ - unregister_binfmt(&aout_format); -} - -core_initcall(init_aout_binfmt); -module_exit(exit_aout_binfmt); -MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c3aecfb0a71d..e0375ba9d0fe 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -440,39 +440,26 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, btrfs_put_caching_control(caching_ctl); } -int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) +static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache, + struct btrfs_caching_control *caching_ctl) +{ + wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); + return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0; +} + +static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) { struct btrfs_caching_control *caching_ctl; - int ret = 0; + int ret; caching_ctl = btrfs_get_caching_control(cache); if (!caching_ctl) return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; - - wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); - if (cache->cached == BTRFS_CACHE_ERROR) - ret = -EIO; + ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); btrfs_put_caching_control(caching_ctl); return ret; } -static bool space_cache_v1_done(struct btrfs_block_group *cache) -{ - bool ret; - - spin_lock(&cache->lock); - ret = cache->cached != BTRFS_CACHE_FAST; - spin_unlock(&cache->lock); - - return ret; -} - -void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, - struct btrfs_caching_control *caching_ctl) -{ - wait_event(caching_ctl->wait, space_cache_v1_done(cache)); -} - #ifdef CONFIG_BTRFS_DEBUG static void fragment_free_space(struct btrfs_block_group *block_group) { @@ -750,9 +737,8 @@ done: btrfs_put_block_group(block_group); } -int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only) +int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) { - DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl = NULL; int ret = 0; @@ -785,10 +771,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only } WARN_ON(cache->caching_ctl); cache->caching_ctl = caching_ctl; - if (btrfs_test_opt(fs_info, SPACE_CACHE)) - cache->cached = BTRFS_CACHE_FAST; - else - cache->cached = BTRFS_CACHE_STARTED; + cache->cached = BTRFS_CACHE_STARTED; cache->has_caching_ctl = 1; spin_unlock(&cache->lock); @@ -801,8 +784,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); out: - if (load_cache_only && caching_ctl) - btrfs_wait_space_cache_v1_finished(cache, caching_ctl); + if (wait && caching_ctl) + ret = btrfs_caching_ctl_wait_done(cache, caching_ctl); if (caching_ctl) btrfs_put_caching_control(caching_ctl); @@ -1640,9 +1623,11 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); - if (ret) + if (ret) { + btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); + } next: btrfs_put_block_group(bg); @@ -3310,7 +3295,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. */ if (!alloc && !btrfs_block_group_done(cache)) - btrfs_cache_block_group(cache, 1); + btrfs_cache_block_group(cache, true); byte_in_group = bytenr - cache->start; WARN_ON(byte_in_group > cache->length); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 35e0e860cc0b..6b3cdc4cbc41 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -263,9 +263,7 @@ void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); -int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache); -int btrfs_cache_block_group(struct btrfs_block_group *cache, - int load_cache_only); +int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6e556031a8f3..ebfa35fe1c38 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2075,6 +2075,9 @@ cow_done: if (!p->skip_locking) { level = btrfs_header_level(b); + + btrfs_maybe_reset_lockdep_class(root, b); + if (level <= write_lock_level) { btrfs_tree_lock(b); p->locks[level] = BTRFS_WRITE_LOCK; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4db85b9dc7ed..df8c99c99df9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -505,7 +505,6 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO, BTRFS_CACHE_STARTED, - BTRFS_CACHE_FAST, BTRFS_CACHE_FINISHED, BTRFS_CACHE_ERROR, }; @@ -1089,8 +1088,6 @@ struct btrfs_fs_info { spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; - /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */ - wait_queue_head_t zone_finish_wait; /* Updates are not protected by any lock */ struct btrfs_commit_stats commit_stats; @@ -1173,6 +1170,8 @@ enum { BTRFS_ROOT_ORPHAN_CLEANUP, /* This root has a drop operation that was started previously. */ BTRFS_ROOT_UNFINISHED_DROP, + /* This reloc root needs to have its buffers lockdep class reset. */ + BTRFS_ROOT_RESET_LOCKDEP_CLASS, }; static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f43196a893ca..41cddd3ff059 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -165,7 +165,7 @@ no_valid_dev_replace_entry_found: */ if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, - "replace devid present without an active replace item"); +"replace without active item, run 'device scan --forget' on the target device"); ret = -EUCLEAN; } else { dev_replace->srcdev = NULL; @@ -1129,8 +1129,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) up_write(&dev_replace->rwsem); /* Scrub for replace must not be running in suspended state */ - ret = btrfs_scrub_cancel(fs_info); - ASSERT(ret != -ENOTCONN); + btrfs_scrub_cancel(fs_info); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4c3166f3c725..2633137c3e9f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -87,88 +87,6 @@ struct async_submit_bio { }; /* - * Lockdep class keys for extent_buffer->lock's in this root. For a given - * eb, the lockdep key is determined by the btrfs_root it belongs to and - * the level the eb occupies in the tree. - * - * Different roots are used for different purposes and may nest inside each - * other and they require separate keysets. As lockdep keys should be - * static, assign keysets according to the purpose of the root as indicated - * by btrfs_root->root_key.objectid. This ensures that all special purpose - * roots have separate keysets. - * - * Lock-nesting across peer nodes is always done with the immediate parent - * node locked thus preventing deadlock. As lockdep doesn't know this, use - * subclass to avoid triggering lockdep warning in such cases. - * - * The key is set by the readpage_end_io_hook after the buffer has passed - * csum validation but before the pages are unlocked. It is also set by - * btrfs_init_new_buffer on freshly allocated blocks. - * - * We also add a check to make sure the highest level of the tree is the - * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code - * needs update as well. - */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# if BTRFS_MAX_LEVEL != 8 -# error -# endif - -#define DEFINE_LEVEL(stem, level) \ - .names[level] = "btrfs-" stem "-0" #level, - -#define DEFINE_NAME(stem) \ - DEFINE_LEVEL(stem, 0) \ - DEFINE_LEVEL(stem, 1) \ - DEFINE_LEVEL(stem, 2) \ - DEFINE_LEVEL(stem, 3) \ - DEFINE_LEVEL(stem, 4) \ - DEFINE_LEVEL(stem, 5) \ - DEFINE_LEVEL(stem, 6) \ - DEFINE_LEVEL(stem, 7) - -static struct btrfs_lockdep_keyset { - u64 id; /* root objectid */ - /* Longest entry: btrfs-free-space-00 */ - char names[BTRFS_MAX_LEVEL][20]; - struct lock_class_key keys[BTRFS_MAX_LEVEL]; -} btrfs_lockdep_keysets[] = { - { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, - { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") }, - { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") }, - { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") }, - { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") }, - { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") }, - { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") }, - { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") }, - { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, - { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, - { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, - { .id = 0, DEFINE_NAME("tree") }, -}; - -#undef DEFINE_LEVEL -#undef DEFINE_NAME - -void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, - int level) -{ - struct btrfs_lockdep_keyset *ks; - - BUG_ON(level >= ARRAY_SIZE(ks->keys)); - - /* find the matching keyset, id 0 is the default entry */ - for (ks = btrfs_lockdep_keysets; ks->id; ks++) - if (ks->id == objectid) - break; - - lockdep_set_class_and_name(&eb->lock, - &ks->keys[level], ks->names[level]); -} - -#endif - -/* * Compute the csum of a btree block and store the result to provided buffer. */ static void csum_tree_block(struct extent_buffer *buf, u8 *result) @@ -3150,7 +3068,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); init_waitqueue_head(&fs_info->delayed_iputs_wait); - init_waitqueue_head(&fs_info->zone_finish_wait); /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; @@ -4558,6 +4475,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + * We must do this before stopping the block group reclaim task, because + * at btrfs_relocate_block_group() we wait for this bit, and after the + * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we + * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will + * return 1. + */ + btrfs_wake_unfinished_drop(fs_info); + + /* * We may have the reclaim task running and relocating a data block group, * in which case it may create delayed iputs. So stop it before we park * the cleaner kthread otherwise we can get new delayed iputs after @@ -4575,12 +4503,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ kthread_park(fs_info->cleaner_kthread); - /* - * If we had UNFINISHED_DROPS we could still be processing them, so - * clear that bit and wake up relocation so it can stop. - */ - btrfs_wake_unfinished_drop(fs_info); - /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); @@ -4603,6 +4525,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); + /* + * After we parked the cleaner kthread, ordered extents may have + * completed and created new delayed iputs. If one of the async reclaim + * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we + * can hang forever trying to stop it, because if a delayed iput is + * added after it ran btrfs_run_delayed_iputs() and before it called + * btrfs_wait_on_delayed_iputs(), it will hang forever since there is + * no one else to run iputs. + * + * So wait for all ongoing ordered extents to complete and then run + * delayed iputs. This works because once we reach this point no one + * can either create new ordered extents nor create delayed iputs + * through some other means. + * + * Also note that btrfs_wait_ordered_roots() is not safe here, because + * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, + * but the delayed iput for the respective inode is made only when doing + * the final btrfs_put_ordered_extent() (which must happen at + * btrfs_finish_ordered_io() when we are unmounting). + */ + btrfs_flush_workqueue(fs_info->endio_write_workers); + /* Ordered extents for free space inodes. */ + btrfs_flush_workqueue(fs_info->endio_freespace_worker); + btrfs_run_delayed_iputs(fs_info); + cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8993b428e09c..47ad8e0a2d33 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -137,14 +137,4 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_set_buffer_lockdep_class(u64 objectid, - struct extent_buffer *eb, int level); -#else -static inline void btrfs_set_buffer_lockdep_class(u64 objectid, - struct extent_buffer *eb, int level) -{ -} -#endif - #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ea3ec1e761e8..6914cd8024ba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2551,17 +2551,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, return -EINVAL; /* - * pull in the free space cache (if any) so that our pin - * removes the free space from the cache. We have load_only set - * to one because the slow code to read in the free extents does check - * the pinned extents. + * Fully cache the free space first so that our pin removes the free space + * from the cache. */ - btrfs_cache_block_group(cache, 1); - /* - * Make sure we wait until the cache is completely built in case it is - * missing or is invalid and therefore needs to be rebuilt. - */ - ret = btrfs_wait_block_group_cache_done(cache); + ret = btrfs_cache_block_group(cache, true); if (ret) goto out; @@ -2584,12 +2577,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, if (!block_group) return -EINVAL; - btrfs_cache_block_group(block_group, 1); - /* - * Make sure we wait until the cache is completely built in case it is - * missing or is invalid and therefore needs to be rebuilt. - */ - ret = btrfs_wait_block_group_cache_done(block_group); + ret = btrfs_cache_block_group(block_group, true); if (ret) goto out; @@ -4399,7 +4387,7 @@ have_block_group: ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; - ret = btrfs_cache_block_group(block_group, 0); + ret = btrfs_cache_block_group(block_group, false); /* * If we get ENOMEM here or something else we want to @@ -4867,6 +4855,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; + u64 lockdep_owner = owner; buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); if (IS_ERR(buf)) @@ -4886,11 +4875,26 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* + * The reloc trees are just snapshots, so we need them to appear to be + * just like any other fs tree WRT lockdep. + * + * The exception however is in replace_path() in relocation, where we + * hold the lock on the original fs root and then search for the reloc + * root. At that point we need to make sure any reloc root buffers are + * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make + * lockdep happy. + */ + if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID && + !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) + lockdep_owner = BTRFS_FS_TREE_OBJECTID; + + /* * This needs to stay, because we could allocate a freed block from an * old tree into a new tree, so we need to make sure this new block is * set to the appropriate level and owner. */ - btrfs_set_buffer_lockdep_class(owner, buf, level); + btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); + __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); @@ -6153,13 +6157,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (end - start >= range->minlen) { if (!btrfs_block_group_done(cache)) { - ret = btrfs_cache_block_group(cache, 0); - if (ret) { - bg_failed++; - bg_ret = ret; - continue; - } - ret = btrfs_wait_block_group_cache_done(cache); + ret = btrfs_cache_block_group(cache, true); if (ret) { bg_failed++; bg_ret = ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bfae67c593c5..cf4f19e80e2f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3233,7 +3233,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, u32 bio_size = bio->bi_iter.bi_size; u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - bool contig; + bool contig = false; int ret; ASSERT(bio); @@ -3242,10 +3242,35 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (bio_ctrl->compress_type != compress_type) return 0; - if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) + + if (bio->bi_iter.bi_size == 0) { + /* We can always add a page into an empty bio. */ + contig = true; + } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) { + struct bio_vec *bvec = bio_last_bvec_all(bio); + + /* + * The contig check requires the following conditions to be met: + * 1) The pages are belonging to the same inode + * This is implied by the call chain. + * + * 2) The range has adjacent logical bytenr + * + * 3) The range has adjacent file offset + * This is required for the usage of btrfs_bio->file_offset. + */ + if (bio_end_sector(bio) == sector && + page_offset(bvec->bv_page) + bvec->bv_offset + + bvec->bv_len == page_offset(page) + pg_offset) + contig = true; + } else { + /* + * For compression, all IO should have its logical bytenr + * set to the starting bytenr of the compressed extent. + */ contig = bio->bi_iter.bi_sector == sector; - else - contig = bio_end_sector(bio) == sector; + } + if (!contig) return 0; @@ -6140,6 +6165,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *exists = NULL; struct page *p; struct address_space *mapping = fs_info->btree_inode->i_mapping; + u64 lockdep_owner = owner_root; int uptodate = 1; int ret; @@ -6164,7 +6190,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return ERR_PTR(-ENOMEM); - btrfs_set_buffer_lockdep_class(owner_root, eb, level); + + /* + * The reloc trees are just snapshots, so we need them to appear to be + * just like any other fs tree WRT lockdep. + */ + if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID) + lockdep_owner = BTRFS_FS_TREE_OBJECTID; + + btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66c822182ecc..5a3f6e0d9688 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2482,6 +2482,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_mark_buffer_dirty(leaf); goto out; } @@ -2498,6 +2499,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_mark_buffer_dirty(leaf); goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f0c97d25b4a0..1372210869b1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1644,10 +1644,9 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, done_offset = end; if (done_offset == start) { - struct btrfs_fs_info *info = inode->root->fs_info; - - wait_var_event(&info->zone_finish_wait, - !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags)); + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); continue; } @@ -7694,6 +7693,20 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, bool unlock_extents = false; /* + * We could potentially fault if we have a buffer > PAGE_SIZE, and if + * we're NOWAIT we may submit a bio for a partial range and return + * EIOCBQUEUED, which would result in an errant short read. + * + * The best way to handle this would be to allow for partial completions + * of iocb's, so we could submit the partial bio, return and fault in + * the rest of the pages, and then submit the io for the rest of the + * range. However we don't have that currently, so simply return + * -EAGAIN at this point so that the normal path is used. + */ + if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) + return -EAGAIN; + + /* * Cap the size of reads to that usually seen in buffered I/O as we need * to allocate a contiguous array for the checksums. */ diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 33461b4f9c8b..9063072b399b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -14,6 +14,93 @@ #include "locking.h" /* + * Lockdep class keys for extent_buffer->lock's in this root. For a given + * eb, the lockdep key is determined by the btrfs_root it belongs to and + * the level the eb occupies in the tree. + * + * Different roots are used for different purposes and may nest inside each + * other and they require separate keysets. As lockdep keys should be + * static, assign keysets according to the purpose of the root as indicated + * by btrfs_root->root_key.objectid. This ensures that all special purpose + * roots have separate keysets. + * + * Lock-nesting across peer nodes is always done with the immediate parent + * node locked thus preventing deadlock. As lockdep doesn't know this, use + * subclass to avoid triggering lockdep warning in such cases. + * + * The key is set by the readpage_end_io_hook after the buffer has passed + * csum validation but before the pages are unlocked. It is also set by + * btrfs_init_new_buffer on freshly allocated blocks. + * + * We also add a check to make sure the highest level of the tree is the + * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code + * needs update as well. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if BTRFS_MAX_LEVEL != 8 +#error +#endif + +#define DEFINE_LEVEL(stem, level) \ + .names[level] = "btrfs-" stem "-0" #level, + +#define DEFINE_NAME(stem) \ + DEFINE_LEVEL(stem, 0) \ + DEFINE_LEVEL(stem, 1) \ + DEFINE_LEVEL(stem, 2) \ + DEFINE_LEVEL(stem, 3) \ + DEFINE_LEVEL(stem, 4) \ + DEFINE_LEVEL(stem, 5) \ + DEFINE_LEVEL(stem, 6) \ + DEFINE_LEVEL(stem, 7) + +static struct btrfs_lockdep_keyset { + u64 id; /* root objectid */ + /* Longest entry: btrfs-free-space-00 */ + char names[BTRFS_MAX_LEVEL][20]; + struct lock_class_key keys[BTRFS_MAX_LEVEL]; +} btrfs_lockdep_keysets[] = { + { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") }, + { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") }, + { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") }, + { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") }, + { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, + { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, + { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, + { .id = 0, DEFINE_NAME("tree") }, +}; + +#undef DEFINE_LEVEL +#undef DEFINE_NAME + +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) +{ + struct btrfs_lockdep_keyset *ks; + + BUG_ON(level >= ARRAY_SIZE(ks->keys)); + + /* Find the matching keyset, id 0 is the default entry */ + for (ks = btrfs_lockdep_keysets; ks->id; ks++) + if (ks->id == objectid) + break; + + lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]); +} + +void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb) +{ + if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) + btrfs_set_buffer_lockdep_class(root->root_key.objectid, + eb, btrfs_header_level(eb)); +} + +#endif + +/* * Extent buffer locking * ===================== * @@ -164,6 +251,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) while (1) { eb = btrfs_root_node(root); + + btrfs_maybe_reset_lockdep_class(root, eb); btrfs_tree_lock(eb); if (eb == root->node) break; @@ -185,6 +274,8 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) while (1) { eb = btrfs_root_node(root); + + btrfs_maybe_reset_lockdep_class(root, eb); btrfs_tree_read_lock(eb); if (eb == root->node) break; diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index bbc45534ae9a..ab268be09bb5 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -131,4 +131,18 @@ void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); void btrfs_drew_read_lock(struct btrfs_drew_lock *lock); void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level); +void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb); +#else +static inline void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level) +{ +} +static inline void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, + struct extent_buffer *eb) +{ +} +#endif + #endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a6dc827e75af..45c02aba2492 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1326,7 +1326,9 @@ again: btrfs_release_path(path); path->lowest_level = level; + set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state); ret = btrfs_search_slot(trans, src, &key, path, 0, 1); + clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state); path->lowest_level = 0; if (ret) { if (ret > 0) @@ -3573,7 +3575,12 @@ int prepare_to_relocate(struct reloc_control *rc) */ return PTR_ERR(trans); } - return btrfs_commit_transaction(trans); + + ret = btrfs_commit_transaction(trans); + if (ret) + unset_reloc_control(rc); + + return ret; } static noinline_for_stack int relocate_block_group(struct reloc_control *rc) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index a64b26b16904..d647cb2938c0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -349,9 +349,10 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - if (ret < 0) + if (ret < 0) { + err = ret; goto out; - if (ret == 0) { + } else if (ret == 0) { leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d0cbeb7ae81c..435559ba94fa 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -199,7 +199,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); if (flags & BTRFS_BLOCK_GROUP_DATA) - return SZ_1G; + return BTRFS_MAX_DATA_CHUNK_SIZE; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return SZ_32M; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 9e0e0ae2288c..43f905ab0a18 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1233,7 +1233,8 @@ static void extent_err(const struct extent_buffer *eb, int slot, } static int check_extent_item(struct extent_buffer *leaf, - struct btrfs_key *key, int slot) + struct btrfs_key *key, int slot, + struct btrfs_key *prev_key) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; @@ -1453,6 +1454,26 @@ static int check_extent_item(struct extent_buffer *leaf, total_refs, inline_refs); return -EUCLEAN; } + + if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) || + (prev_key->type == BTRFS_METADATA_ITEM_KEY)) { + u64 prev_end = prev_key->objectid; + + if (prev_key->type == BTRFS_METADATA_ITEM_KEY) + prev_end += fs_info->nodesize; + else + prev_end += prev_key->offset; + + if (unlikely(prev_end > key->objectid)) { + extent_err(leaf, slot, + "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]", + prev_key->objectid, prev_key->type, + prev_key->offset, key->objectid, key->type, + key->offset); + return -EUCLEAN; + } + } + return 0; } @@ -1621,7 +1642,7 @@ static int check_leaf_item(struct extent_buffer *leaf, break; case BTRFS_EXTENT_ITEM_KEY: case BTRFS_METADATA_ITEM_KEY: - ret = check_extent_item(leaf, key, slot); + ret = check_extent_item(leaf, key, slot, prev_key); break; case BTRFS_TREE_BLOCK_REF_KEY: case BTRFS_SHARED_DATA_REF_KEY: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index dcf75a8daa20..9205c4a5ca81 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1146,7 +1146,9 @@ again: extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, inode_objectid, parent_objectid, 0, 0); - if (!IS_ERR_OR_NULL(extref)) { + if (IS_ERR(extref)) { + return PTR_ERR(extref); + } else if (extref) { u32 item_size; u32 cur_offset = 0; unsigned long base; @@ -1457,7 +1459,7 @@ static int add_link(struct btrfs_trans_handle *trans, * on the inode will not free it. We will fixup the link count later. */ if (other_inode->i_nlink == 0) - inc_nlink(other_inode); + set_nlink(other_inode, 1); add_link: ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, namelen, 0, ref_index); @@ -1600,7 +1602,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * free it. We will fixup the link count later. */ if (!ret && inode->i_nlink == 0) - inc_nlink(inode); + set_nlink(inode, 1); } if (ret < 0) goto out; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 272901514b0c..f63ff91e2883 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2345,8 +2345,11 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, &bdev, &disk_super); - if (ret) + if (ret) { + btrfs_put_dev_args_from_path(args); return ret; + } + args->devid = btrfs_stack_device_id(&disk_super->dev_item); memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); if (btrfs_fs_incompat(fs_info, METADATA_UUID)) @@ -5264,6 +5267,9 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, ctl->stripe_size); } + /* Stripe size should not go beyond 1G. */ + ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); + /* Align to BTRFS_STRIPE_LEN */ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); ctl->chunk_size = ctl->stripe_size * data_stripes; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 7421abcf325a..5bb8d8c86311 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -371,6 +371,9 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, const char *name, const void *buffer, size_t size, int flags) { + if (btrfs_root_readonly(BTRFS_I(inode)->root)) + return -EROFS; + name = xattr_full_name(handler, name); return btrfs_setxattr_trans(inode, name, buffer, size, flags); } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b150b07ba1a7..73c6929f7be6 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -421,10 +421,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) * since btrfs adds the pages one by one to a bio, and btrfs cannot * increase the metadata reservation even if it increases the number of * extents, it is safe to stick with the limit. + * + * With the zoned emulation, we can have non-zoned device on the zoned + * mode. In this case, we don't have a valid max zone append size. So, + * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. */ - zone_info->max_zone_append_size = - min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, - (u64)bdev_max_segments(bdev) << PAGE_SHIFT); + if (bdev_is_zoned(bdev)) { + zone_info->max_zone_append_size = min_t(u64, + (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, + (u64)bdev_max_segments(bdev) << PAGE_SHIFT); + } else { + zone_info->max_zone_append_size = + (u64)bdev_max_segments(bdev) << PAGE_SHIFT; + } if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -1178,7 +1187,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) * offset. */ static int calculate_alloc_pointer(struct btrfs_block_group *cache, - u64 *offset_ret) + u64 *offset_ret, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; @@ -1188,6 +1197,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, int ret; u64 length; + /* + * Avoid tree lookups for a new block group, there's no use for it. + * It must always be 0. + * + * Also, we have a lock chain of extent buffer lock -> chunk mutex. + * For new a block group, this function is called from + * btrfs_make_block_group() which is already taking the chunk mutex. + * Thus, we cannot call calculate_alloc_pointer() which takes extent + * buffer locks to avoid deadlock. + */ + if (new) { + *offset_ret = 0; + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1323,6 +1347,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) else num_conventional++; + /* + * Consider a zone as active if we can allow any number of + * active zones. + */ + if (!device->zone_info->max_active_zones) + __set_bit(i, active); + if (!is_sequential) { alloc_offsets[i] = WP_CONVENTIONAL; continue; @@ -1389,45 +1420,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) __set_bit(i, active); break; } - - /* - * Consider a zone as active if we can allow any number of - * active zones. - */ - if (!device->zone_info->max_active_zones) - __set_bit(i, active); } if (num_sequential > 0) cache->seq_zone = true; if (num_conventional > 0) { - /* - * Avoid calling calculate_alloc_pointer() for new BG. It - * is no use for new BG. It must be always 0. - * - * Also, we have a lock chain of extent buffer lock -> - * chunk mutex. For new BG, this function is called from - * btrfs_make_block_group() which is already taking the - * chunk mutex. Thus, we cannot call - * calculate_alloc_pointer() which takes extent buffer - * locks to avoid deadlock. - */ - /* Zone capacity is always zone size in emulation */ cache->zone_capacity = cache->length; - if (new) { - cache->alloc_offset = 0; - goto out; - } - ret = calculate_alloc_pointer(cache, &last_alloc); - if (ret || map->num_stripes == num_conventional) { - if (!ret) - cache->alloc_offset = last_alloc; - else - btrfs_err(fs_info, + ret = calculate_alloc_pointer(cache, &last_alloc, new); + if (ret) { + btrfs_err(fs_info, "zoned: failed to determine allocation offset of bg %llu", - cache->start); + cache->start); + goto out; + } else if (map->num_stripes == num_conventional) { + cache->alloc_offset = last_alloc; + cache->zone_is_active = 1; goto out; } } @@ -1495,13 +1504,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } - if (cache->zone_is_active) { - btrfs_get_block_group(cache); - spin_lock(&fs_info->zone_active_bgs_lock); - list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); - } - out: if (cache->alloc_offset > fs_info->zone_size) { btrfs_err(fs_info, @@ -1526,10 +1528,16 @@ out: ret = -EIO; } - if (!ret) + if (!ret) { cache->meta_write_pointer = cache->alloc_offset + cache->start; - - if (ret) { + if (cache->zone_is_active) { + btrfs_get_block_group(cache); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&cache->active_bg_list, + &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } + } else { kfree(cache->physical_map); cache->physical_map = NULL; } @@ -1910,10 +1918,44 @@ out_unlock: return ret; } +static void wait_eb_writebacks(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + const u64 end = block_group->start + block_group->length; + struct radix_tree_iter iter; + struct extent_buffer *eb; + void __rcu **slot; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, + block_group->start >> fs_info->sectorsize_bits) { + eb = radix_tree_deref_slot(slot); + if (!eb) + continue; + if (radix_tree_deref_retry(eb)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + if (eb->start < block_group->start) + continue; + if (eb->start >= end) + break; + + slot = radix_tree_iter_resume(slot, &iter); + rcu_read_unlock(); + wait_on_extent_buffer_writeback(eb); + rcu_read_lock(); + } + rcu_read_unlock(); +} + static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct map_lookup *map; + const bool is_metadata = (block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); int ret = 0; int i; @@ -1924,8 +1966,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ } /* Check if we have unwritten allocated space */ - if ((block_group->flags & - (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && + if (is_metadata && block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { spin_unlock(&block_group->lock); return -EAGAIN; @@ -1950,6 +1991,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* No need to wait for NOCOW writers. Zoned mode does not allow that */ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, block_group->length); + /* Wait for extent buffers to be written. */ + if (is_metadata) + wait_eb_writebacks(block_group); spin_lock(&block_group->lock); @@ -2007,8 +2051,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* For active_bg_list */ btrfs_put_block_group(block_group); - clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); - wake_up_all(&fs_info->zone_finish_wait); + clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); return 0; } diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 6cba2c6de2f9..2ad58c465208 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -111,6 +111,7 @@ struct cachefiles_cache { char *tag; /* cache binding tag */ refcount_t unbind_pincount;/* refcount to do daemon unbind */ struct xarray reqs; /* xarray of pending on-demand requests */ + unsigned long req_id_next; struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ u32 ondemand_id_next; }; diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 1fee702d5529..0254ed39f68c 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -158,9 +158,13 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) /* fail OPEN request if daemon reports an error */ if (size < 0) { - if (!IS_ERR_VALUE(size)) - size = -EINVAL; - req->error = size; + if (!IS_ERR_VALUE(size)) { + req->error = -EINVAL; + ret = -EINVAL; + } else { + req->error = size; + ret = 0; + } goto out; } @@ -238,14 +242,19 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, unsigned long id = 0; size_t n; int ret = 0; - XA_STATE(xas, &cache->reqs, 0); + XA_STATE(xas, &cache->reqs, cache->req_id_next); /* - * Search for a request that has not ever been processed, to prevent - * requests from being processed repeatedly. + * Cyclically search for a request that has not ever been processed, + * to prevent requests from being processed repeatedly, and make + * request distribution fair. */ xa_lock(&cache->reqs); req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW); + if (!req && cache->req_id_next > 0) { + xas_set(&xas, 0); + req = xas_find_marked(&xas, cache->req_id_next - 1, CACHEFILES_REQ_NEW); + } if (!req) { xa_unlock(&cache->reqs); return 0; @@ -260,6 +269,7 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, } xas_clear_mark(&xas, CACHEFILES_REQ_NEW); + cache->req_id_next = xas.xa_index + 1; xa_unlock(&cache->reqs); id = xas.xa_index; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 11fd85de7217..c05477e28cff 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -42,7 +42,7 @@ void cifs_dump_detail(void *buf, struct TCP_Server_Info *server) smb->Command, smb->Status.CifsError, smb->Flags, smb->Flags2, smb->Mid, smb->Pid); cifs_dbg(VFS, "smb buf %p len %u\n", smb, - server->ops->calc_smb_size(smb, server)); + server->ops->calc_smb_size(smb)); #endif /* CONFIG_CIFS_DEBUG2 */ } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 8f7835ccbca1..46f5718754f9 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -32,10 +32,9 @@ int __cifs_calc_signature(struct smb_rqst *rqst, int rc; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; - int is_smb2 = server->vals->header_preamble_size == 0; /* iov[0] is actual data and not the rfc1002 length for SMB2+ */ - if (is_smb2) { + if (!is_smb1(server)) { if (iov[0].iov_len <= 4) return -EIO; i = 0; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f54d8bf2732a..8042d7280dec 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1248,6 +1248,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, lock_two_nondirectories(target_inode, src_inode); cifs_dbg(FYI, "about to flush pages\n"); + + rc = filemap_write_and_wait_range(src_inode->i_mapping, off, + off + len - 1); + if (rc) + goto out; + /* should we flush first and last page first */ truncate_inode_pages(&target_inode->i_data, 0); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 81f4c15936d0..5b4a7a32bdc5 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 38 -#define CIFS_VERSION "2.38" +#define SMB3_PRODUCT_BUILD 39 +#define CIFS_VERSION "2.39" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index bc0ee2d4b47b..ae7f571a7dba 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -417,7 +417,7 @@ struct smb_version_operations { int (*close_dir)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); /* calculate a size of SMB message */ - unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi); + unsigned int (*calc_smb_size)(void *buf); /* check for STATUS_PENDING and process the response if yes */ bool (*is_status_pending)(char *buf, struct TCP_Server_Info *server); /* check for STATUS_NETWORK_SESSION_EXPIRED */ @@ -557,6 +557,8 @@ struct smb_version_values { #define HEADER_SIZE(server) (server->vals->header_size) #define MAX_HEADER_SIZE(server) (server->vals->max_header_size) +#define HEADER_PREAMBLE_SIZE(server) (server->vals->header_preamble_size) +#define MID_HEADER_SIZE(server) (HEADER_SIZE(server) - 1 - HEADER_PREAMBLE_SIZE(server)) /** * CIFS superblock mount flags (mnt_cifs_flags) to consider when @@ -750,6 +752,11 @@ struct TCP_Server_Info { #endif }; +static inline bool is_smb1(struct TCP_Server_Info *server) +{ + return HEADER_PREAMBLE_SIZE(server) != 0; +} + static inline void cifs_server_lock(struct TCP_Server_Info *server) { unsigned int nofs_flag = memalloc_nofs_save(); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 87a77a684339..3bc94bcc7177 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -151,7 +151,7 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, struct cifsFileInfo **ret_file); -extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server); +extern unsigned int smbCalcSize(void *buf); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); diff --git a/fs/cifs/cifsroot.c b/fs/cifs/cifsroot.c index 9e91a5a40aae..56ec1b233f52 100644 --- a/fs/cifs/cifsroot.c +++ b/fs/cifs/cifsroot.c @@ -59,7 +59,7 @@ static int __init cifs_root_setup(char *line) pr_err("Root-CIFS: UNC path too long\n"); return 1; } - strlcpy(root_dev, line, len); + strscpy(root_dev, line, len); srvaddr = parse_srvaddr(&line[2], s); if (*s) { int n = snprintf(root_opts, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9111c025bcb8..7ae6f2c08153 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -702,9 +702,6 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) int length = 0; int total_read; - smb_msg->msg_control = NULL; - smb_msg->msg_controllen = 0; - for (total_read = 0; msg_data_left(smb_msg); total_read += length) { try_to_freeze(); @@ -760,7 +757,7 @@ int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read) { - struct msghdr smb_msg; + struct msghdr smb_msg = {}; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read); @@ -770,15 +767,13 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) { - struct msghdr smb_msg; + struct msghdr smb_msg = {}; /* * iov_iter_discard already sets smb_msg.type and count and iov_offset * and cifs_readv_from_socket sets msg_control and msg_controllen * so little to initialize in struct msghdr */ - smb_msg.msg_name = NULL; - smb_msg.msg_namelen = 0; iov_iter_discard(&smb_msg.msg_iter, READ, to_read); return cifs_readv_from_socket(server, &smb_msg); @@ -788,7 +783,7 @@ int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, unsigned int to_read) { - struct msghdr smb_msg; + struct msghdr smb_msg = {}; struct bio_vec bv = { .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read); @@ -871,7 +866,7 @@ smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) /* * SMB1 does not use credits. */ - if (server->vals->header_preamble_size) + if (is_smb1(server)) return 0; return le16_to_cpu(shdr->CreditRequest); @@ -1050,7 +1045,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* make sure this will fit in a large buffer */ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - - server->vals->header_preamble_size) { + HEADER_PREAMBLE_SIZE(server)) { cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); cifs_reconnect(server, true); return -ECONNABORTED; @@ -1065,8 +1060,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* now read the rest */ length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, - pdu_length - HEADER_SIZE(server) + 1 - + server->vals->header_preamble_size); + pdu_length - MID_HEADER_SIZE(server)); if (length < 0) return length; @@ -1122,7 +1116,7 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) /* * SMB1 does not use credits. */ - if (server->vals->header_preamble_size) + if (is_smb1(server)) return; if (shdr->CreditRequest) { @@ -1180,10 +1174,10 @@ cifs_demultiplex_thread(void *p) if (length < 0) continue; - if (server->vals->header_preamble_size == 0) - server->total_read = 0; - else + if (is_smb1(server)) server->total_read = length; + else + server->total_read = 0; /* * The right amount was read from socket - 4 bytes, @@ -1198,8 +1192,7 @@ next_pdu: server->pdu_size = pdu_length; /* make sure we have enough to get to the MID */ - if (server->pdu_size < HEADER_SIZE(server) - 1 - - server->vals->header_preamble_size) { + if (server->pdu_size < MID_HEADER_SIZE(server)) { cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n", server->pdu_size); cifs_reconnect(server, true); @@ -1208,9 +1201,8 @@ next_pdu: /* read down to the MID */ length = cifs_read_from_socket(server, - buf + server->vals->header_preamble_size, - HEADER_SIZE(server) - 1 - - server->vals->header_preamble_size); + buf + HEADER_PREAMBLE_SIZE(server), + MID_HEADER_SIZE(server)); if (length < 0) continue; server->total_read += length; @@ -2353,7 +2345,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) ses = tcon->ses; cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count); spin_lock(&cifs_tcp_ses_lock); + spin_lock(&tcon->tc_lock); if (--tcon->tc_count > 0) { + spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); return; } @@ -2362,6 +2356,7 @@ cifs_put_tcon(struct cifs_tcon *tcon) WARN_ON(tcon->tc_count < 0); list_del_init(&tcon->tcon_list); + spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); /* cancel polling of interfaces */ @@ -3994,7 +3989,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, } bcc_ptr += length + 1; bytes_left -= (length + 1); - strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); + strscpy(tcon->treeName, tree, sizeof(tcon->treeName)); /* mostly informational -- no need to fail on error here */ kfree(tcon->nativeFileSystem); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index fa738adc031f..6f38b134a346 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3575,6 +3575,9 @@ static ssize_t __cifs_writev( ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from) { + struct file *file = iocb->ki_filp; + + cifs_revalidate_mapping(file->f_inode); return __cifs_writev(iocb, from, true); } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 34d990f06fd6..87f60f736731 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -354,7 +354,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) /* otherwise, there is enough to get to the BCC */ if (check_smb_hdr(smb)) return -EIO; - clc_len = smbCalcSize(smb, server); + clc_len = smbCalcSize(smb); if (4 + rfclen != total_read) { cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n", @@ -737,6 +737,8 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) list_for_each_entry(cfile, &cifs_inode->openFileList, flist) { if (delayed_work_pending(&cfile->deferred)) { if (cancel_delayed_work(&cfile->deferred)) { + cifs_del_deferred_close(cfile); + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) break; @@ -766,6 +768,8 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) list_for_each_entry(cfile, &tcon->openFileList, tlist) { if (delayed_work_pending(&cfile->deferred)) { if (cancel_delayed_work(&cfile->deferred)) { + cifs_del_deferred_close(cfile); + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) break; @@ -799,6 +803,8 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) if (strstr(full_path, path)) { if (delayed_work_pending(&cfile->deferred)) { if (cancel_delayed_work(&cfile->deferred)) { + cifs_del_deferred_close(cfile); + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) break; diff --git a/fs/cifs/netlink.c b/fs/cifs/netlink.c index 291cb606f149..147d9409252c 100644 --- a/fs/cifs/netlink.c +++ b/fs/cifs/netlink.c @@ -51,6 +51,7 @@ struct genl_family cifs_genl_family = { .policy = cifs_genl_policy, .ops = cifs_genl_ops, .n_ops = ARRAY_SIZE(cifs_genl_ops), + .resv_start_op = CIFS_GENL_CMD_SWN_NOTIFY + 1, .mcgrps = cifs_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(cifs_genl_mcgrps), }; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 28caae7aed1b..1b52e6ac431c 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -909,7 +909,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) * portion, the number of word parameters and the data portion of the message */ unsigned int -smbCalcSize(void *buf, struct TCP_Server_Info *server) +smbCalcSize(void *buf) { struct smb_hdr *ptr = buf; return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 2eece8a07c11..8e060c00c969 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -806,8 +806,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, end_of_smb = cfile->srch_inf.ntwrk_buf_start + server->ops->calc_smb_size( - cfile->srch_inf.ntwrk_buf_start, - server); + cfile->srch_inf.ntwrk_buf_start); cur_ent = cfile->srch_inf.srch_entries_start; first_entry_in_buffer = cfile->srch_inf.index_of_last_entry @@ -1161,8 +1160,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); max_len = tcon->ses->server->ops->calc_smb_size( - cifsFile->srch_inf.ntwrk_buf_start, - tcon->ses->server); + cifsFile->srch_inf.ntwrk_buf_start); end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index f5dcc4940b6d..9dfd2dd612c2 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -61,7 +61,6 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, nr_ioctl_req.Reserved = 0; rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid, fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, - true /* is_fsctl */, (char *)&nr_ioctl_req, sizeof(nr_ioctl_req), CIFSMaxBufSize, NULL, NULL /* no return info */); if (rc == -EOPNOTSUPP) { diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 6a6ec6efb45a..d73e5672aac4 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -222,7 +222,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) } } - calc_len = smb2_calc_size(buf, server); + calc_len = smb2_calc_size(buf); /* For SMB2_IOCTL, OutputOffset and OutputLength are optional, so might * be 0, and not a real miscalculation */ @@ -410,7 +410,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr) * portion, the number of word parameters and the data portion of the message. */ unsigned int -smb2_calc_size(void *buf, struct TCP_Server_Info *srvr) +smb2_calc_size(void *buf) { struct smb2_pdu *pdu = buf; struct smb2_hdr *shdr = &pdu->hdr; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f406af596887..421be43af425 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -387,7 +387,7 @@ smb2_dump_detail(void *buf, struct TCP_Server_Info *server) shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId, shdr->Id.SyncId.ProcessId); cifs_server_dbg(VFS, "smb buf %p len %u\n", buf, - server->ops->calc_smb_size(buf, server)); + server->ops->calc_smb_size(buf)); #endif } @@ -681,7 +681,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) struct cifs_ses *ses = tcon->ses; rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, - FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, + FSCTL_QUERY_NETWORK_INTERFACE_INFO, NULL /* no data input */, 0 /* no data input */, CIFSMaxBufSize, (char **)&out_buf, &ret_data_len); if (rc == -EOPNOTSUPP) { @@ -1323,9 +1323,8 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, struct resume_key_req *res_key; rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, - FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */, - NULL, 0 /* no input */, CIFSMaxBufSize, - (char **)&res_key, &ret_data_len); + FSCTL_SRV_REQUEST_RESUME_KEY, NULL, 0 /* no input */, + CIFSMaxBufSize, (char **)&res_key, &ret_data_len); if (rc == -EOPNOTSUPP) { pr_warn_once("Server share %s does not support copy range\n", tcon->treeName); @@ -1467,7 +1466,7 @@ smb2_ioctl_query_info(const unsigned int xid, rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID, - qi.info_type, true, buffer, qi.output_buffer_length, + qi.info_type, buffer, qi.output_buffer_length, CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - MAX_SMB2_CLOSE_RESPONSE_SIZE); free_req1_func = SMB2_ioctl_free; @@ -1601,17 +1600,8 @@ smb2_copychunk_range(const unsigned int xid, int chunks_copied = 0; bool chunk_sizes_updated = false; ssize_t bytes_written, total_bytes_written = 0; - struct inode *inode; pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); - - /* - * We need to flush all unwritten data before we can send the - * copychunk ioctl to the server. - */ - inode = d_inode(trgtfile->dentry); - filemap_write_and_wait(inode->i_mapping); - if (pcchunk == NULL) return -ENOMEM; @@ -1643,9 +1633,8 @@ smb2_copychunk_range(const unsigned int xid, retbuf = NULL; rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, - true /* is_fsctl */, (char *)pcchunk, - sizeof(struct copychunk_ioctl), CIFSMaxBufSize, - (char **)&retbuf, &ret_data_len); + (char *)pcchunk, sizeof(struct copychunk_ioctl), + CIFSMaxBufSize, (char **)&retbuf, &ret_data_len); if (rc == 0) { if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) { @@ -1805,7 +1794,6 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_SPARSE, - true /* is_fctl */, &setsparse, 1, CIFSMaxBufSize, NULL, NULL); if (rc) { tcon->broken_sparse_sup = true; @@ -1888,7 +1876,6 @@ smb2_duplicate_extents(const unsigned int xid, rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, - true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), CIFSMaxBufSize, NULL, @@ -1923,7 +1910,6 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_INTEGRITY_INFORMATION, - true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), CIFSMaxBufSize, NULL, @@ -1976,7 +1962,6 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SRV_ENUMERATE_SNAPSHOTS, - true /* is_fsctl */, NULL, 0 /* no input data */, max_response_size, (char **)&retbuf, &ret_data_len); @@ -2699,7 +2684,6 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, do { rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_DFS_GET_REFERRALS, - true /* is_fsctl */, (char *)dfs_req, dfs_req_size, CIFSMaxBufSize, (char **)&dfs_rsp, &dfs_rsp_size); if (!is_retryable_error(rc)) @@ -2906,8 +2890,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl_init(tcon, server, &rqst[1], fid.persistent_fid, - fid.volatile_fid, FSCTL_GET_REPARSE_POINT, - true /* is_fctl */, NULL, 0, + fid.volatile_fid, FSCTL_GET_REPARSE_POINT, NULL, 0, CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - MAX_SMB2_CLOSE_RESPONSE_SIZE); @@ -3087,8 +3070,7 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, - COMPOUND_FID, FSCTL_GET_REPARSE_POINT, - true /* is_fctl */, NULL, 0, + COMPOUND_FID, FSCTL_GET_REPARSE_POINT, NULL, 0, CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - MAX_SMB2_CLOSE_RESPONSE_SIZE); @@ -3316,26 +3298,43 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb, return pntsd; } +static long smb3_zero_data(struct file *file, struct cifs_tcon *tcon, + loff_t offset, loff_t len, unsigned int xid) +{ + struct cifsFileInfo *cfile = file->private_data; + struct file_zero_data_information fsctl_buf; + + cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len); + + fsctl_buf.FileOffset = cpu_to_le64(offset); + fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); + + return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + (char *)&fsctl_buf, + sizeof(struct file_zero_data_information), + 0, NULL, NULL); +} + static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len, bool keep_size) { struct cifs_ses *ses = tcon->ses; - struct inode *inode; - struct cifsInodeInfo *cifsi; + struct inode *inode = file_inode(file); + struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - struct file_zero_data_information fsctl_buf; long rc; unsigned int xid; __le64 eof; xid = get_xid(); - inode = d_inode(cfile->dentry); - cifsi = CIFS_I(inode); - trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid, ses->Suid, offset, len); + inode_lock(inode); + filemap_invalidate_lock(inode->i_mapping); + /* * We zero the range through ioctl, so we need remove the page caches * first, otherwise the data may be inconsistent with the server. @@ -3343,26 +3342,12 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, truncate_pagecache_range(inode, offset, offset + len - 1); /* if file not oplocked can't be sure whether asking to extend size */ - if (!CIFS_CACHE_READ(cifsi)) - if (keep_size == false) { - rc = -EOPNOTSUPP; - trace_smb3_zero_err(xid, cfile->fid.persistent_fid, - tcon->tid, ses->Suid, offset, len, rc); - free_xid(xid); - return rc; - } - - cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len); - - fsctl_buf.FileOffset = cpu_to_le64(offset); - fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); + rc = -EOPNOTSUPP; + if (keep_size == false && !CIFS_CACHE_READ(cifsi)) + goto zero_range_exit; - rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, true, - (char *)&fsctl_buf, - sizeof(struct file_zero_data_information), - 0, NULL, NULL); - if (rc) + rc = smb3_zero_data(file, tcon, offset, len, xid); + if (rc < 0) goto zero_range_exit; /* @@ -3375,6 +3360,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, } zero_range_exit: + filemap_invalidate_unlock(inode->i_mapping); + inode_unlock(inode); free_xid(xid); if (rc) trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, @@ -3388,7 +3375,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len) { - struct inode *inode; + struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; long rc; @@ -3397,14 +3384,12 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode = d_inode(cfile->dentry); - + inode_lock(inode); /* Need to make file sparse, if not already, before freeing range. */ /* Consider adding equivalent for compressed since it could also work */ if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) { rc = -EOPNOTSUPP; - free_xid(xid); - return rc; + goto out; } filemap_invalidate_lock(inode->i_mapping); @@ -3421,11 +3406,13 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, (char *)&fsctl_buf, + (char *)&fsctl_buf, sizeof(struct file_zero_data_information), CIFSMaxBufSize, NULL, NULL); - free_xid(xid); filemap_invalidate_unlock(inode->i_mapping); +out: + inode_unlock(inode); + free_xid(xid); return rc; } @@ -3481,7 +3468,7 @@ static int smb3_simple_fallocate_range(unsigned int xid, in_data.length = cpu_to_le64(len); rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, - FSCTL_QUERY_ALLOCATED_RANGES, true, + FSCTL_QUERY_ALLOCATED_RANGES, (char *)&in_data, sizeof(in_data), 1024 * sizeof(struct file_allocated_range_buffer), (char **)&out_data, &out_data_len); @@ -3682,39 +3669,50 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, { int rc; unsigned int xid; - struct inode *inode; + struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; - struct cifsInodeInfo *cifsi; + struct cifsInodeInfo *cifsi = CIFS_I(inode); __le64 eof; + loff_t old_eof; xid = get_xid(); - inode = d_inode(cfile->dentry); - cifsi = CIFS_I(inode); + inode_lock(inode); - if (off >= i_size_read(inode) || - off + len >= i_size_read(inode)) { + old_eof = i_size_read(inode); + if ((off >= old_eof) || + off + len >= old_eof) { rc = -EINVAL; goto out; } + filemap_invalidate_lock(inode->i_mapping); + rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof - 1); + if (rc < 0) + goto out_2; + + truncate_pagecache_range(inode, off, old_eof); + rc = smb2_copychunk_range(xid, cfile, cfile, off + len, - i_size_read(inode) - off - len, off); + old_eof - off - len, off); if (rc < 0) - goto out; + goto out_2; - eof = cpu_to_le64(i_size_read(inode) - len); + eof = cpu_to_le64(old_eof - len); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, &eof); if (rc < 0) - goto out; + goto out_2; rc = 0; cifsi->server_eof = i_size_read(inode) - len; truncate_setsize(inode, cifsi->server_eof); fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof); +out_2: + filemap_invalidate_unlock(inode->i_mapping); out: + inode_unlock(inode); free_xid(xid); return rc; } @@ -3725,34 +3723,47 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, int rc; unsigned int xid; struct cifsFileInfo *cfile = file->private_data; + struct inode *inode = file_inode(file); __le64 eof; - __u64 count; + __u64 count, old_eof; xid = get_xid(); - if (off >= i_size_read(file->f_inode)) { + inode_lock(inode); + + old_eof = i_size_read(inode); + if (off >= old_eof) { rc = -EINVAL; goto out; } - count = i_size_read(file->f_inode) - off; - eof = cpu_to_le64(i_size_read(file->f_inode) + len); + count = old_eof - off; + eof = cpu_to_le64(old_eof + len); + + filemap_invalidate_lock(inode->i_mapping); + rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof + len - 1); + if (rc < 0) + goto out_2; + truncate_pagecache_range(inode, off, old_eof); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, &eof); if (rc < 0) - goto out; + goto out_2; rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); if (rc < 0) - goto out; + goto out_2; - rc = smb3_zero_range(file, tcon, off, len, 1); + rc = smb3_zero_data(file, tcon, off, len, xid); if (rc < 0) - goto out; + goto out_2; rc = 0; +out_2: + filemap_invalidate_unlock(inode->i_mapping); out: + inode_unlock(inode); free_xid(xid); return rc; } @@ -3802,7 +3813,7 @@ static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offs rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, - FSCTL_QUERY_ALLOCATED_RANGES, true, + FSCTL_QUERY_ALLOCATED_RANGES, (char *)&in_data, sizeof(in_data), sizeof(struct file_allocated_range_buffer), (char **)&out_data, &out_data_len); @@ -3862,7 +3873,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, - FSCTL_QUERY_ALLOCATED_RANGES, true, + FSCTL_QUERY_ALLOCATED_RANGES, (char *)&in_data, sizeof(in_data), 1024 * sizeof(struct file_allocated_range_buffer), (char **)&out_data, &out_data_len); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 9b31ea946d45..6352ab32c7e7 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -965,16 +965,17 @@ SMB2_negotiate(const unsigned int xid, } else if (rc != 0) goto neg_exit; + rc = -EIO; if (strcmp(server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) { cifs_server_dbg(VFS, "SMB2 dialect returned but not requested\n"); - return -EIO; + goto neg_exit; } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { cifs_server_dbg(VFS, "SMB2.1 dialect returned but not requested\n"); - return -EIO; + goto neg_exit; } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { /* ops set to 3.0 by default for default so update */ server->ops = &smb311_operations; @@ -985,7 +986,7 @@ SMB2_negotiate(const unsigned int xid, if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) { cifs_server_dbg(VFS, "SMB2 dialect returned but not requested\n"); - return -EIO; + goto neg_exit; } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { /* ops set to 3.0 by default for default so update */ server->ops = &smb21_operations; @@ -999,7 +1000,7 @@ SMB2_negotiate(const unsigned int xid, /* if requested single dialect ensure returned dialect matched */ cifs_server_dbg(VFS, "Invalid 0x%x dialect returned: not requested\n", le16_to_cpu(rsp->DialectRevision)); - return -EIO; + goto neg_exit; } cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode); @@ -1017,9 +1018,10 @@ SMB2_negotiate(const unsigned int xid, else { cifs_server_dbg(VFS, "Invalid dialect returned by server 0x%x\n", le16_to_cpu(rsp->DialectRevision)); - rc = -EIO; goto neg_exit; } + + rc = 0; server->dialect = le16_to_cpu(rsp->DialectRevision); /* @@ -1173,7 +1175,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) } rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, - FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, + FSCTL_VALIDATE_NEGOTIATE_INFO, (char *)pneg_inbuf, inbuflen, CIFSMaxBufSize, (char **)&pneg_rsp, &rsplen); if (rc == -EOPNOTSUPP) { @@ -1928,7 +1930,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); - strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); + strscpy(tcon->treeName, tree, sizeof(tcon->treeName)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) @@ -2572,19 +2574,15 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, path_len = UniStrnlen((wchar_t *)path, PATH_MAX); - /* - * make room for one path separator between the treename and - * path - */ - *out_len = treename_len + 1 + path_len; + /* make room for one path separator only if @path isn't empty */ + *out_len = treename_len + (path[0] ? 1 : 0) + path_len; /* - * final path needs to be null-terminated UTF16 with a - * size aligned to 8 + * final path needs to be 8-byte aligned as specified in + * MS-SMB2 2.2.13 SMB2 CREATE Request. */ - - *out_size = roundup((*out_len+1)*2, 8); - *out_path = kzalloc(*out_size, GFP_KERNEL); + *out_size = roundup(*out_len * sizeof(__le16), 8); + *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL); if (!*out_path) return -ENOMEM; @@ -3056,7 +3054,7 @@ int SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, char *in_data, u32 indatalen, + char *in_data, u32 indatalen, __u32 max_response_size) { struct smb2_ioctl_req *req; @@ -3131,10 +3129,8 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size), SMB2_MAX_BUFFER_SIZE)); - if (is_fsctl) - req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); - else - req->Flags = 0; + /* always an FSCTL (for now) */ + req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) @@ -3161,9 +3157,9 @@ SMB2_ioctl_free(struct smb_rqst *rqst) */ int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 opcode, bool is_fsctl, - char *in_data, u32 indatalen, u32 max_out_data_len, - char **out_data, u32 *plen /* returned data len */) + u64 volatile_fid, u32 opcode, char *in_data, u32 indatalen, + u32 max_out_data_len, char **out_data, + u32 *plen /* returned data len */) { struct smb_rqst rqst; struct smb2_ioctl_rsp *rsp = NULL; @@ -3205,7 +3201,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, rc = SMB2_ioctl_init(tcon, server, &rqst, persistent_fid, volatile_fid, opcode, - is_fsctl, in_data, indatalen, max_out_data_len); + in_data, indatalen, max_out_data_len); if (rc) goto ioctl_exit; @@ -3297,7 +3293,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, - FSCTL_SET_COMPRESSION, true /* is_fsctl */, + FSCTL_SET_COMPRESSION, (char *)&fsctl_input /* data input */, 2 /* in data len */, CIFSMaxBufSize /* max out data */, &ret_data /* out data */, NULL); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 51c5bf4a338a..3f740f24b96a 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -23,7 +23,7 @@ struct smb_rqst; extern int map_smb2_to_linux_error(char *buf, bool log_err); extern int smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *server); -extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server); +extern unsigned int smb2_calc_size(void *buf); extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, @@ -137,13 +137,13 @@ extern int SMB2_open_init(struct cifs_tcon *tcon, extern void SMB2_open_free(struct smb_rqst *rqst); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen, + char *in_data, u32 indatalen, u32 maxoutlen, char **out_data, u32 *plen /* returned data len */); extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, char *in_data, u32 indatalen, + char *in_data, u32 indatalen, __u32 max_response_size); extern void SMB2_ioctl_free(struct smb_rqst *rqst); extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index de7aeced7e16..9a2753e21170 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -194,10 +194,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, *sent = 0; - smb_msg->msg_name = (struct sockaddr *) &server->dstaddr; - smb_msg->msg_namelen = sizeof(struct sockaddr); - smb_msg->msg_control = NULL; - smb_msg->msg_controllen = 0; if (server->noblocksnd) smb_msg->msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; else @@ -261,8 +257,8 @@ smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst) int nvec; unsigned long buflen = 0; - if (server->vals->header_preamble_size == 0 && - rqst->rq_nvec >= 2 && rqst->rq_iov[0].iov_len == 4) { + if (!is_smb1(server) && rqst->rq_nvec >= 2 && + rqst->rq_iov[0].iov_len == 4) { iov = &rqst->rq_iov[1]; nvec = rqst->rq_nvec - 1; } else { @@ -309,7 +305,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, sigset_t mask, oldmask; size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; - struct msghdr smb_msg; + struct msghdr smb_msg = {}; __be32 rfc1002_marker; if (cifs_rdma_enabled(server)) { @@ -346,7 +342,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, sigprocmask(SIG_BLOCK, &mask, &oldmask); /* Generate a rfc1002 marker for SMB2+ */ - if (server->vals->header_preamble_size == 0) { + if (!is_smb1(server)) { struct kvec hiov = { .iov_base = &rfc1002_marker, .iov_len = 4 @@ -1238,7 +1234,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, buf = (char *)midQ[i]->resp_buf; resp_iov[i].iov_base = buf; resp_iov[i].iov_len = midQ[i]->resp_buf_size + - server->vals->header_preamble_size; + HEADER_PREAMBLE_SIZE(server); if (midQ[i]->large_buf) resp_buf_type[i] = CIFS_LARGE_BUFFER; @@ -1643,7 +1639,7 @@ int cifs_discard_remaining_data(struct TCP_Server_Info *server) { unsigned int rfclen = server->pdu_size; - int remaining = rfclen + server->vals->header_preamble_size - + int remaining = rfclen + HEADER_PREAMBLE_SIZE(server) - server->total_read; while (remaining > 0) { @@ -1689,8 +1685,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) unsigned int data_offset, data_len; struct cifs_readdata *rdata = mid->callback_data; char *buf = server->smallbuf; - unsigned int buflen = server->pdu_size + - server->vals->header_preamble_size; + unsigned int buflen = server->pdu_size + HEADER_PREAMBLE_SIZE(server); bool use_rdma_mr = false; cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n", @@ -1724,10 +1719,10 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* set up first two iov for signature check and to get credits */ rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = server->vals->header_preamble_size; - rdata->iov[1].iov_base = buf + server->vals->header_preamble_size; + rdata->iov[0].iov_len = HEADER_PREAMBLE_SIZE(server); + rdata->iov[1].iov_base = buf + HEADER_PREAMBLE_SIZE(server); rdata->iov[1].iov_len = - server->total_read - server->vals->header_preamble_size; + server->total_read - HEADER_PREAMBLE_SIZE(server); cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", rdata->iov[0].iov_base, rdata->iov[0].iov_len); cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", @@ -1752,7 +1747,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) } data_offset = server->ops->read_data_offset(buf) + - server->vals->header_preamble_size; + HEADER_PREAMBLE_SIZE(server); if (data_offset < server->total_read) { /* * win2k8 sometimes sends an offset of 0 when the read diff --git a/fs/coredump.c b/fs/coredump.c index 9f4aae202109..3538f3a63965 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -832,6 +832,39 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) } } +static int dump_emit_page(struct coredump_params *cprm, struct page *page) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = 0, + .bv_len = PAGE_SIZE, + }; + struct iov_iter iter; + struct file *file = cprm->file; + loff_t pos; + ssize_t n; + + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + if (cprm->written + PAGE_SIZE > cprm->limit) + return 0; + if (dump_interrupted()) + return 0; + pos = file->f_pos; + iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE); + n = __kernel_write_iter(cprm->file, &iter, &pos); + if (n != PAGE_SIZE) + return 0; + file->f_pos = pos; + cprm->written += PAGE_SIZE; + cprm->pos += PAGE_SIZE; + + return 1; +} + int dump_emit(struct coredump_params *cprm, const void *addr, int nr) { if (cprm->to_skip) { @@ -863,7 +896,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page; - int stop; /* * To avoid having to allocate page tables for virtual address @@ -874,10 +906,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - void *kaddr = kmap_local_page(page); - - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap_local(kaddr); + int stop = !dump_emit_page(cprm, page); put_page(page); if (stop) return 0; diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 2217fe5ece6f..1b4403136d05 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -25,21 +25,25 @@ * then this function isn't applicable. This function may sleep, so it must be * called from a workqueue rather than from the bio's bi_end_io callback. * - * This function sets PG_error on any pages that contain any blocks that failed - * to be decrypted. The filesystem must not mark such pages uptodate. + * Return: %true on success; %false on failure. On failure, bio->bi_status is + * also set to an error status. */ -void fscrypt_decrypt_bio(struct bio *bio) +bool fscrypt_decrypt_bio(struct bio *bio) { struct bio_vec *bv; struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, + int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len, bv->bv_offset); - if (ret) - SetPageError(page); + + if (err) { + bio->bi_status = errno_to_blk_status(err); + return false; + } } + return true; } EXPORT_SYMBOL(fscrypt_decrypt_bio); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 3afdaa084773..d5f68a0c5d15 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -184,7 +184,7 @@ struct fscrypt_symlink_data { struct fscrypt_prepared_key { struct crypto_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - struct fscrypt_blk_crypto_key *blk_key; + struct blk_crypto_key *blk_key; #endif }; @@ -225,7 +225,7 @@ struct fscrypt_info { * will be NULL if the master key was found in a process-subscribed * keyring rather than in the filesystem-level keyring. */ - struct key *ci_master_key; + struct fscrypt_master_key *ci_master_key; /* * Link in list of inodes that were unlocked with the master key. @@ -344,7 +344,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci); -void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key); +void fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key); /* * Check whether the crypto transform or blk-crypto key has been allocated in @@ -390,7 +391,8 @@ fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, } static inline void -fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key) +fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { } @@ -437,6 +439,40 @@ struct fscrypt_master_key_secret { struct fscrypt_master_key { /* + * Back-pointer to the super_block of the filesystem to which this + * master key has been added. Only valid if ->mk_active_refs > 0. + */ + struct super_block *mk_sb; + + /* + * Link in ->mk_sb->s_master_keys->key_hashtable. + * Only valid if ->mk_active_refs > 0. + */ + struct hlist_node mk_node; + + /* Semaphore that protects ->mk_secret and ->mk_users */ + struct rw_semaphore mk_sem; + + /* + * Active and structural reference counts. An active ref guarantees + * that the struct continues to exist, continues to be in the keyring + * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g. + * ->mk_direct_keys) that have been prepared continue to exist. + * A structural ref only guarantees that the struct continues to exist. + * + * There is one active ref associated with ->mk_secret being present, + * and one active ref for each inode in ->mk_decrypted_inodes. + * + * There is one structural ref associated with the active refcount being + * nonzero. Finding a key in the keyring also takes a structural ref, + * which is then held temporarily while the key is operated on. + */ + refcount_t mk_active_refs; + refcount_t mk_struct_refs; + + struct rcu_head mk_rcu_head; + + /* * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is * executed, this is wiped and no new inodes can be unlocked with this * key; however, there may still be inodes in ->mk_decrypted_inodes @@ -444,7 +480,10 @@ struct fscrypt_master_key { * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again. * - * Locking: protected by this master key's key->sem. + * While ->mk_secret is present, one ref in ->mk_active_refs is held. + * + * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs + * associated with this field is protected by ->mk_sem as well. */ struct fscrypt_master_key_secret mk_secret; @@ -465,23 +504,13 @@ struct fscrypt_master_key { * * This is NULL for v1 policy keys; those can only be added by root. * - * Locking: in addition to this keyring's own semaphore, this is - * protected by this master key's key->sem, so we can do atomic - * search+insert. It can also be searched without taking any locks, but - * in that case the returned key may have already been removed. + * Locking: protected by ->mk_sem. (We don't just rely on the keyrings + * subsystem semaphore ->mk_users->sem, as we need support for atomic + * search+insert along with proper synchronization with ->mk_secret.) */ struct key *mk_users; /* - * Length of ->mk_decrypted_inodes, plus one if mk_secret is present. - * Once this goes to 0, the master key is removed from ->s_master_keys. - * The 'struct fscrypt_master_key' will continue to live as long as the - * 'struct key' whose payload it is, but we won't let this reference - * count rise again. - */ - refcount_t mk_refcount; - - /* * List of inodes that were unlocked using this key. This allows the * inodes to be evicted efficiently if the key is removed. */ @@ -506,10 +535,10 @@ static inline bool is_master_key_secret_present(const struct fscrypt_master_key_secret *secret) { /* - * The READ_ONCE() is only necessary for fscrypt_drop_inode() and - * fscrypt_key_describe(). These run in atomic context, so they can't - * take the key semaphore and thus 'secret' can change concurrently - * which would be a data race. But they only need to know whether the + * The READ_ONCE() is only necessary for fscrypt_drop_inode(). + * fscrypt_drop_inode() runs in atomic context, so it can't take the key + * semaphore and thus 'secret' can change concurrently which would be a + * data race. But fscrypt_drop_inode() only need to know whether the * secret *was* present at the time of check, so READ_ONCE() suffices. */ return READ_ONCE(secret->size) != 0; @@ -538,7 +567,11 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) return 0; } -struct key * +void fscrypt_put_master_key(struct fscrypt_master_key *mk); + +void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk); + +struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); @@ -569,7 +602,8 @@ extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci); -void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key); +void fscrypt_destroy_prepared_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 7c01025879b3..7b8c5a1104b5 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -5,8 +5,6 @@ * Encryption hooks for higher-level filesystem operations. */ -#include <linux/key.h> - #include "fscrypt_private.h" /** @@ -142,7 +140,6 @@ int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { struct fscrypt_info *ci; - struct key *key; struct fscrypt_master_key *mk; int err; @@ -158,14 +155,13 @@ int fscrypt_prepare_setflags(struct inode *inode, ci = inode->i_crypt_info; if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; - key = ci->ci_master_key; - mk = key->payload.data[0]; - down_read(&key->sem); + mk = ci->ci_master_key; + down_read(&mk->mk_sem); if (is_master_key_secret_present(&mk->mk_secret)) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; - up_read(&key->sem); + up_read(&mk->mk_sem); return err; } return 0; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 90f3e68f166e..cea8b14007e6 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -21,26 +21,22 @@ #include "fscrypt_private.h" -struct fscrypt_blk_crypto_key { - struct blk_crypto_key base; - int num_devs; - struct request_queue *devs[]; -}; - -static int fscrypt_get_num_devices(struct super_block *sb) +static struct block_device **fscrypt_get_devices(struct super_block *sb, + unsigned int *num_devs) { - if (sb->s_cop->get_num_devices) - return sb->s_cop->get_num_devices(sb); - return 1; -} + struct block_device **devs; -static void fscrypt_get_devices(struct super_block *sb, int num_devs, - struct request_queue **devs) -{ - if (num_devs == 1) - devs[0] = bdev_get_queue(sb->s_bdev); - else - sb->s_cop->get_devices(sb, devs); + if (sb->s_cop->get_devices) { + devs = sb->s_cop->get_devices(sb, num_devs); + if (devs) + return devs; + } + devs = kmalloc(sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); + devs[0] = sb->s_bdev; + *num_devs = 1; + return devs; } static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) @@ -74,15 +70,17 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) * helpful for debugging problems where the "wrong" implementation is used. */ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, - struct request_queue **devs, - int num_devs, + struct block_device **devs, + unsigned int num_devs, const struct blk_crypto_config *cfg) { - int i; + unsigned int i; for (i = 0; i < num_devs; i++) { + struct request_queue *q = bdev_get_queue(devs[i]); + if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(devs[i]->crypto_profile, cfg)) { + __blk_crypto_cfg_supported(q->crypto_profile, cfg)) { if (!xchg(&mode->logged_blk_crypto_native, 1)) pr_info("fscrypt: %s using blk-crypto (native)\n", mode->friendly_name); @@ -99,9 +97,9 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; struct blk_crypto_config crypto_cfg; - int num_devs; - struct request_queue **devs; - int i; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; /* The file must need contents encryption, not filenames encryption */ if (!S_ISREG(inode->i_mode)) @@ -129,20 +127,20 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) return 0; /* - * On all the filesystem's devices, blk-crypto must support the crypto - * configuration that the file would use. + * On all the filesystem's block devices, blk-crypto must support the + * crypto configuration that the file would use. */ crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; crypto_cfg.data_unit_size = sb->s_blocksize; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); - num_devs = fscrypt_get_num_devices(sb); - devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL); - if (!devs) - return -ENOMEM; - fscrypt_get_devices(sb, num_devs, devs); + + devs = fscrypt_get_devices(sb, &num_devs); + if (IS_ERR(devs)) + return PTR_ERR(devs); for (i = 0; i < num_devs; i++) { - if (!blk_crypto_config_supported(devs[i], &crypto_cfg)) + if (!blk_crypto_config_supported(bdev_get_queue(devs[i]), + &crypto_cfg)) goto out_free_devs; } @@ -162,49 +160,41 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode; - int num_devs = fscrypt_get_num_devices(sb); - int queue_refs = 0; - struct fscrypt_blk_crypto_key *blk_key; + struct blk_crypto_key *blk_key; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; int err; - int i; - blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL); + blk_key = kmalloc(sizeof(*blk_key), GFP_KERNEL); if (!blk_key) return -ENOMEM; - blk_key->num_devs = num_devs; - fscrypt_get_devices(sb, num_devs, blk_key->devs); - - err = blk_crypto_init_key(&blk_key->base, raw_key, crypto_mode, + err = blk_crypto_init_key(blk_key, raw_key, crypto_mode, fscrypt_get_dun_bytes(ci), sb->s_blocksize); if (err) { fscrypt_err(inode, "error %d initializing blk-crypto key", err); goto fail; } - /* - * We have to start using blk-crypto on all the filesystem's devices. - * We also have to save all the request_queue's for later so that the - * key can be evicted from them. This is needed because some keys - * aren't destroyed until after the filesystem was already unmounted - * (namely, the per-mode keys in struct fscrypt_master_key). - */ + /* Start using blk-crypto on all the filesystem's block devices. */ + devs = fscrypt_get_devices(sb, &num_devs); + if (IS_ERR(devs)) { + err = PTR_ERR(devs); + goto fail; + } for (i = 0; i < num_devs; i++) { - if (!blk_get_queue(blk_key->devs[i])) { - fscrypt_err(inode, "couldn't get request_queue"); - err = -EAGAIN; - goto fail; - } - queue_refs++; - - err = blk_crypto_start_using_key(&blk_key->base, - blk_key->devs[i]); - if (err) { - fscrypt_err(inode, - "error %d starting to use blk-crypto", err); - goto fail; - } + err = blk_crypto_start_using_key(blk_key, + bdev_get_queue(devs[i])); + if (err) + break; } + kfree(devs); + if (err) { + fscrypt_err(inode, "error %d starting to use blk-crypto", err); + goto fail; + } + /* * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared(). * I.e., here we publish ->blk_key with a RELEASE barrier so that @@ -215,24 +205,29 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, return 0; fail: - for (i = 0; i < queue_refs; i++) - blk_put_queue(blk_key->devs[i]); kfree_sensitive(blk_key); return err; } -void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key) +void fscrypt_destroy_inline_crypt_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { - struct fscrypt_blk_crypto_key *blk_key = prep_key->blk_key; - int i; + struct blk_crypto_key *blk_key = prep_key->blk_key; + struct block_device **devs; + unsigned int num_devs; + unsigned int i; - if (blk_key) { - for (i = 0; i < blk_key->num_devs; i++) { - blk_crypto_evict_key(blk_key->devs[i], &blk_key->base); - blk_put_queue(blk_key->devs[i]); - } - kfree_sensitive(blk_key); + if (!blk_key) + return; + + /* Evict the key from all the filesystem's block devices. */ + devs = fscrypt_get_devices(sb, &num_devs); + if (!IS_ERR(devs)) { + for (i = 0; i < num_devs; i++) + blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key); + kfree(devs); } + kfree_sensitive(blk_key); } bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) @@ -282,7 +277,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, ci = inode->i_crypt_info; fscrypt_generate_dun(ci, first_lblk, dun); - bio_crypt_set_ctx(bio, &ci->ci_enc_key.blk_key->base, dun, gfp_mask); + bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); } EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); @@ -369,7 +364,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. */ - if (bc->bc_key != &inode->i_crypt_info->ci_enc_key.blk_key->base) + if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) return false; fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); @@ -401,46 +396,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio, EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh); /** - * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is - * supported as far as encryption is concerned - * @iocb: the file and position the I/O is targeting - * @iter: the I/O data segment(s) + * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an + * inode, as far as encryption is concerned + * @inode: the inode in question * * Return: %true if there are no encryption constraints that prevent DIO from * being supported; %false if DIO is unsupported. (Note that in the * %true case, the filesystem might have other, non-encryption-related - * constraints that prevent DIO from actually being supported.) + * constraints that prevent DIO from actually being supported. Also, on + * encrypted files the filesystem is still responsible for only allowing + * DIO when requests are filesystem-block-aligned.) */ -bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter) +bool fscrypt_dio_supported(struct inode *inode) { - const struct inode *inode = file_inode(iocb->ki_filp); - const unsigned int blocksize = i_blocksize(inode); + int err; /* If the file is unencrypted, no veto from us. */ if (!fscrypt_needs_contents_encryption(inode)) return true; - /* We only support DIO with inline crypto, not fs-layer crypto. */ - if (!fscrypt_inode_uses_inline_crypto(inode)) - return false; - /* - * Since the granularity of encryption is filesystem blocks, the file - * position and total I/O length must be aligned to the filesystem block - * size -- not just to the block device's logical block size as is - * traditionally the case for DIO on many filesystems. + * We only support DIO with inline crypto, not fs-layer crypto. * - * We require that the user-provided memory buffers be filesystem block - * aligned too. It is simpler to have a single alignment value required - * for all properties of the I/O, as is normally the case for DIO. - * Also, allowing less aligned buffers would imply that data units could - * cross bvecs, which would greatly complicate the I/O stack, which - * assumes that bios can be split at any bvec boundary. + * To determine whether the inode is using inline crypto, we have to set + * up the key if it wasn't already done. This is because in the current + * design of fscrypt, the decision of whether to use inline crypto or + * not isn't made until the inode's encryption key is being set up. In + * the DIO read/write case, the key will always be set up already, since + * the file will be open. But in the case of statx(), the key might not + * be set up yet, as the file might not have been opened yet. */ - if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize)) + err = fscrypt_require_key(inode); + if (err) { + /* + * Key unavailable or couldn't be set up. This edge case isn't + * worth worrying about; just report that DIO is unsupported. + */ return false; - - return true; + } + return fscrypt_inode_uses_inline_crypto(inode); } EXPORT_SYMBOL_GPL(fscrypt_dio_supported); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index caee9f8620dd..1cca09aa43f8 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -18,6 +18,7 @@ * information about these ioctls. */ +#include <asm/unaligned.h> #include <crypto/skcipher.h> #include <linux/key-type.h> #include <linux/random.h> @@ -25,6 +26,18 @@ #include "fscrypt_private.h" +/* The master encryption keys for a filesystem (->s_master_keys) */ +struct fscrypt_keyring { + /* + * Lock that protects ->key_hashtable. It does *not* protect the + * fscrypt_master_key structs themselves. + */ + spinlock_t lock; + + /* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */ + struct hlist_head key_hashtable[128]; +}; + static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) { fscrypt_destroy_hkdf(&secret->hkdf); @@ -38,66 +51,81 @@ static void move_master_key_secret(struct fscrypt_master_key_secret *dst, memzero_explicit(src, sizeof(*src)); } -static void free_master_key(struct fscrypt_master_key *mk) +static void fscrypt_free_master_key(struct rcu_head *head) { - size_t i; - - wipe_master_key_secret(&mk->mk_secret); - - for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { - fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]); - fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]); - fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]); - } - - key_put(mk->mk_users); + struct fscrypt_master_key *mk = + container_of(head, struct fscrypt_master_key, mk_rcu_head); + /* + * The master key secret and any embedded subkeys should have already + * been wiped when the last active reference to the fscrypt_master_key + * struct was dropped; doing it here would be unnecessarily late. + * Nevertheless, use kfree_sensitive() in case anything was missed. + */ kfree_sensitive(mk); } -static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) +void fscrypt_put_master_key(struct fscrypt_master_key *mk) { - if (spec->__reserved) - return false; - return master_key_spec_len(spec) != 0; + if (!refcount_dec_and_test(&mk->mk_struct_refs)) + return; + /* + * No structural references left, so free ->mk_users, and also free the + * fscrypt_master_key struct itself after an RCU grace period ensures + * that concurrent keyring lookups can no longer find it. + */ + WARN_ON(refcount_read(&mk->mk_active_refs) != 0); + key_put(mk->mk_users); + mk->mk_users = NULL; + call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } -static int fscrypt_key_instantiate(struct key *key, - struct key_preparsed_payload *prep) +void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) { - key->payload.data[0] = (struct fscrypt_master_key *)prep->data; - return 0; -} + struct super_block *sb = mk->mk_sb; + struct fscrypt_keyring *keyring = sb->s_master_keys; + size_t i; -static void fscrypt_key_destroy(struct key *key) -{ - free_master_key(key->payload.data[0]); -} + if (!refcount_dec_and_test(&mk->mk_active_refs)) + return; + /* + * No active references left, so complete the full removal of this + * fscrypt_master_key struct by removing it from the keyring and + * destroying any subkeys embedded in it. + */ -static void fscrypt_key_describe(const struct key *key, struct seq_file *m) -{ - seq_puts(m, key->description); + spin_lock(&keyring->lock); + hlist_del_rcu(&mk->mk_node); + spin_unlock(&keyring->lock); - if (key_is_positive(key)) { - const struct fscrypt_master_key *mk = key->payload.data[0]; + /* + * ->mk_active_refs == 0 implies that ->mk_secret is not present and + * that ->mk_decrypted_inodes is empty. + */ + WARN_ON(is_master_key_secret_present(&mk->mk_secret)); + WARN_ON(!list_empty(&mk->mk_decrypted_inodes)); - if (!is_master_key_secret_present(&mk->mk_secret)) - seq_puts(m, ": secret removed"); + for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { + fscrypt_destroy_prepared_key( + sb, &mk->mk_direct_keys[i]); + fscrypt_destroy_prepared_key( + sb, &mk->mk_iv_ino_lblk_64_keys[i]); + fscrypt_destroy_prepared_key( + sb, &mk->mk_iv_ino_lblk_32_keys[i]); } + memzero_explicit(&mk->mk_ino_hash_key, + sizeof(mk->mk_ino_hash_key)); + mk->mk_ino_hash_key_initialized = false; + + /* Drop the structural ref associated with the active refs. */ + fscrypt_put_master_key(mk); } -/* - * Type of key in ->s_master_keys. Each key of this type represents a master - * key which has been added to the filesystem. Its payload is a - * 'struct fscrypt_master_key'. The "." prefix in the key type name prevents - * users from adding keys of this type via the keyrings syscalls rather than via - * the intended method of FS_IOC_ADD_ENCRYPTION_KEY. - */ -static struct key_type key_type_fscrypt = { - .name = "._fscrypt", - .instantiate = fscrypt_key_instantiate, - .destroy = fscrypt_key_destroy, - .describe = fscrypt_key_describe, -}; +static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) +{ + if (spec->__reserved) + return false; + return master_key_spec_len(spec) != 0; +} static int fscrypt_user_key_instantiate(struct key *key, struct key_preparsed_payload *prep) @@ -131,32 +159,6 @@ static struct key_type key_type_fscrypt_user = { .describe = fscrypt_user_key_describe, }; -/* Search ->s_master_keys or ->mk_users */ -static struct key *search_fscrypt_keyring(struct key *keyring, - struct key_type *type, - const char *description) -{ - /* - * We need to mark the keyring reference as "possessed" so that we - * acquire permission to search it, via the KEY_POS_SEARCH permission. - */ - key_ref_t keyref = make_key_ref(keyring, true /* possessed */); - - keyref = keyring_search(keyref, type, description, false); - if (IS_ERR(keyref)) { - if (PTR_ERR(keyref) == -EAGAIN || /* not found */ - PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ - keyref = ERR_PTR(-ENOKEY); - return ERR_CAST(keyref); - } - return key_ref_to_ptr(keyref); -} - -#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ - (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id)) - -#define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1) - #define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \ (CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \ CONST_STRLEN("-users") + 1) @@ -164,21 +166,6 @@ static struct key *search_fscrypt_keyring(struct key *keyring, #define FSCRYPT_MK_USER_DESCRIPTION_SIZE \ (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1) -static void format_fs_keyring_description( - char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE], - const struct super_block *sb) -{ - sprintf(description, "fscrypt-%s", sb->s_id); -} - -static void format_mk_description( - char description[FSCRYPT_MK_DESCRIPTION_SIZE], - const struct fscrypt_key_specifier *mk_spec) -{ - sprintf(description, "%*phN", - master_key_spec_len(mk_spec), (u8 *)&mk_spec->u); -} - static void format_mk_users_keyring_description( char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE], const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) @@ -199,20 +186,15 @@ static void format_mk_user_description( /* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */ static int allocate_filesystem_keyring(struct super_block *sb) { - char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE]; - struct key *keyring; + struct fscrypt_keyring *keyring; if (sb->s_master_keys) return 0; - format_fs_keyring_description(description, sb); - keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, - current_cred(), KEY_POS_SEARCH | - KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); - if (IS_ERR(keyring)) - return PTR_ERR(keyring); - + keyring = kzalloc(sizeof(*keyring), GFP_KERNEL); + if (!keyring) + return -ENOMEM; + spin_lock_init(&keyring->lock); /* * Pairs with the smp_load_acquire() in fscrypt_find_master_key(). * I.e., here we publish ->s_master_keys with a RELEASE barrier so that @@ -222,21 +204,75 @@ static int allocate_filesystem_keyring(struct super_block *sb) return 0; } -void fscrypt_sb_free(struct super_block *sb) +/* + * This is called at unmount time to release all encryption keys that have been + * added to the filesystem, along with the keyring that contains them. + * + * Note that besides clearing and freeing memory, this might need to evict keys + * from the keyslots of an inline crypto engine. Therefore, this must be called + * while the filesystem's underlying block device(s) are still available. + */ +void fscrypt_sb_delete(struct super_block *sb) { - key_put(sb->s_master_keys); + struct fscrypt_keyring *keyring = sb->s_master_keys; + size_t i; + + if (!keyring) + return; + + for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) { + struct hlist_head *bucket = &keyring->key_hashtable[i]; + struct fscrypt_master_key *mk; + struct hlist_node *tmp; + + hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) { + /* + * Since all inodes were already evicted, every key + * remaining in the keyring should have an empty inode + * list, and should only still be in the keyring due to + * the single active ref associated with ->mk_secret. + * There should be no structural refs beyond the one + * associated with the active ref. + */ + WARN_ON(refcount_read(&mk->mk_active_refs) != 1); + WARN_ON(refcount_read(&mk->mk_struct_refs) != 1); + WARN_ON(!is_master_key_secret_present(&mk->mk_secret)); + wipe_master_key_secret(&mk->mk_secret); + fscrypt_put_master_key_activeref(mk); + } + } + kfree_sensitive(keyring); sb->s_master_keys = NULL; } +static struct hlist_head * +fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring, + const struct fscrypt_key_specifier *mk_spec) +{ + /* + * Since key specifiers should be "random" values, it is sufficient to + * use a trivial hash function that just takes the first several bits of + * the key specifier. + */ + unsigned long i = get_unaligned((unsigned long *)&mk_spec->u); + + return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)]; +} + /* - * Find the specified master key in ->s_master_keys. - * Returns ERR_PTR(-ENOKEY) if not found. + * Find the specified master key struct in ->s_master_keys and take a structural + * ref to it. The structural ref guarantees that the key struct continues to + * exist, but it does *not* guarantee that ->s_master_keys continues to contain + * the key struct. The structural ref needs to be dropped by + * fscrypt_put_master_key(). Returns NULL if the key struct is not found. */ -struct key *fscrypt_find_master_key(struct super_block *sb, - const struct fscrypt_key_specifier *mk_spec) +struct fscrypt_master_key * +fscrypt_find_master_key(struct super_block *sb, + const struct fscrypt_key_specifier *mk_spec) { - struct key *keyring; - char description[FSCRYPT_MK_DESCRIPTION_SIZE]; + struct fscrypt_keyring *keyring; + struct hlist_head *bucket; + struct fscrypt_master_key *mk; /* * Pairs with the smp_store_release() in allocate_filesystem_keyring(). @@ -246,10 +282,38 @@ struct key *fscrypt_find_master_key(struct super_block *sb, */ keyring = smp_load_acquire(&sb->s_master_keys); if (keyring == NULL) - return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */ - - format_mk_description(description, mk_spec); - return search_fscrypt_keyring(keyring, &key_type_fscrypt, description); + return NULL; /* No keyring yet, so no keys yet. */ + + bucket = fscrypt_mk_hash_bucket(keyring, mk_spec); + rcu_read_lock(); + switch (mk_spec->type) { + case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: + hlist_for_each_entry_rcu(mk, bucket, mk_node) { + if (mk->mk_spec.type == + FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + memcmp(mk->mk_spec.u.descriptor, + mk_spec->u.descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 && + refcount_inc_not_zero(&mk->mk_struct_refs)) + goto out; + } + break; + case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: + hlist_for_each_entry_rcu(mk, bucket, mk_node) { + if (mk->mk_spec.type == + FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && + memcmp(mk->mk_spec.u.identifier, + mk_spec->u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 && + refcount_inc_not_zero(&mk->mk_struct_refs)) + goto out; + } + break; + } + mk = NULL; +out: + rcu_read_unlock(); + return mk; } static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) @@ -277,17 +341,30 @@ static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) static struct key *find_master_key_user(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; + key_ref_t keyref; format_mk_user_description(description, mk->mk_spec.u.identifier); - return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user, - description); + + /* + * We need to mark the keyring reference as "possessed" so that we + * acquire permission to search it, via the KEY_POS_SEARCH permission. + */ + keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/), + &key_type_fscrypt_user, description, false); + if (IS_ERR(keyref)) { + if (PTR_ERR(keyref) == -EAGAIN || /* not found */ + PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ + keyref = ERR_PTR(-ENOKEY); + return ERR_CAST(keyref); + } + return key_ref_to_ptr(keyref); } /* * Give the current user a "key" in ->mk_users. This charges the user's quota * and marks the master key as added by the current user, so that it cannot be - * removed by another user with the key. Either the master key's key->sem must - * be held for write, or the master key must be still undergoing initialization. + * removed by another user with the key. Either ->mk_sem must be held for + * write, or the master key must be still undergoing initialization. */ static int add_master_key_user(struct fscrypt_master_key *mk) { @@ -309,7 +386,7 @@ static int add_master_key_user(struct fscrypt_master_key *mk) /* * Remove the current user's "key" from ->mk_users. - * The master key's key->sem must be held for write. + * ->mk_sem must be held for write. * * Returns 0 if removed, -ENOKEY if not found, or another -errno code. */ @@ -327,63 +404,49 @@ static int remove_master_key_user(struct fscrypt_master_key *mk) } /* - * Allocate a new fscrypt_master_key which contains the given secret, set it as - * the payload of a new 'struct key' of type fscrypt, and link the 'struct key' - * into the given keyring. Synchronized by fscrypt_add_key_mutex. + * Allocate a new fscrypt_master_key, transfer the given secret over to it, and + * insert it into sb->s_master_keys. */ -static int add_new_master_key(struct fscrypt_master_key_secret *secret, - const struct fscrypt_key_specifier *mk_spec, - struct key *keyring) +static int add_new_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec) { + struct fscrypt_keyring *keyring = sb->s_master_keys; struct fscrypt_master_key *mk; - char description[FSCRYPT_MK_DESCRIPTION_SIZE]; - struct key *key; int err; mk = kzalloc(sizeof(*mk), GFP_KERNEL); if (!mk) return -ENOMEM; + mk->mk_sb = sb; + init_rwsem(&mk->mk_sem); + refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; - move_master_key_secret(&mk->mk_secret, secret); - - refcount_set(&mk->mk_refcount, 1); /* secret is present */ INIT_LIST_HEAD(&mk->mk_decrypted_inodes); spin_lock_init(&mk->mk_decrypted_inodes_lock); if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { err = allocate_master_key_users_keyring(mk); if (err) - goto out_free_mk; + goto out_put; err = add_master_key_user(mk); if (err) - goto out_free_mk; + goto out_put; } - /* - * Note that we don't charge this key to anyone's quota, since when - * ->mk_users is in use those keys are charged instead, and otherwise - * (when ->mk_users isn't in use) only root can add these keys. - */ - format_mk_description(description, mk_spec); - key = key_alloc(&key_type_fscrypt, description, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), - KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(key)) { - err = PTR_ERR(key); - goto out_free_mk; - } - err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL); - key_put(key); - if (err) - goto out_free_mk; + move_master_key_secret(&mk->mk_secret, secret); + refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */ + spin_lock(&keyring->lock); + hlist_add_head_rcu(&mk->mk_node, + fscrypt_mk_hash_bucket(keyring, mk_spec)); + spin_unlock(&keyring->lock); return 0; -out_free_mk: - free_master_key(mk); +out_put: + fscrypt_put_master_key(mk); return err; } @@ -392,42 +455,34 @@ out_free_mk: static int add_existing_master_key(struct fscrypt_master_key *mk, struct fscrypt_master_key_secret *secret) { - struct key *mk_user; - bool rekey; int err; /* * If the current user is already in ->mk_users, then there's nothing to - * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.) + * do. Otherwise, we need to add the user to ->mk_users. (Neither is + * applicable for v1 policy keys, which have NULL ->mk_users.) */ if (mk->mk_users) { - mk_user = find_master_key_user(mk); + struct key *mk_user = find_master_key_user(mk); + if (mk_user != ERR_PTR(-ENOKEY)) { if (IS_ERR(mk_user)) return PTR_ERR(mk_user); key_put(mk_user); return 0; } - } - - /* If we'll be re-adding ->mk_secret, try to take the reference. */ - rekey = !is_master_key_secret_present(&mk->mk_secret); - if (rekey && !refcount_inc_not_zero(&mk->mk_refcount)) - return KEY_DEAD; - - /* Add the current user to ->mk_users, if applicable. */ - if (mk->mk_users) { err = add_master_key_user(mk); - if (err) { - if (rekey && refcount_dec_and_test(&mk->mk_refcount)) - return KEY_DEAD; + if (err) return err; - } } /* Re-add the secret if needed. */ - if (rekey) + if (!is_master_key_secret_present(&mk->mk_secret)) { + if (!refcount_inc_not_zero(&mk->mk_active_refs)) + return KEY_DEAD; move_master_key_secret(&mk->mk_secret, secret); + } + return 0; } @@ -436,38 +491,36 @@ static int do_add_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); - struct key *key; + struct fscrypt_master_key *mk; int err; mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */ -retry: - key = fscrypt_find_master_key(sb, mk_spec); - if (IS_ERR(key)) { - err = PTR_ERR(key); - if (err != -ENOKEY) - goto out_unlock; + + mk = fscrypt_find_master_key(sb, mk_spec); + if (!mk) { /* Didn't find the key in ->s_master_keys. Add it. */ err = allocate_filesystem_keyring(sb); - if (err) - goto out_unlock; - err = add_new_master_key(secret, mk_spec, sb->s_master_keys); + if (!err) + err = add_new_master_key(sb, secret, mk_spec); } else { /* * Found the key in ->s_master_keys. Re-add the secret if * needed, and add the user to ->mk_users if needed. */ - down_write(&key->sem); - err = add_existing_master_key(key->payload.data[0], secret); - up_write(&key->sem); + down_write(&mk->mk_sem); + err = add_existing_master_key(mk, secret); + up_write(&mk->mk_sem); if (err == KEY_DEAD) { - /* Key being removed or needs to be removed */ - key_invalidate(key); - key_put(key); - goto retry; + /* + * We found a key struct, but it's already been fully + * removed. Ignore the old struct and add a new one. + * fscrypt_add_key_mutex means we don't need to worry + * about concurrent adds. + */ + err = add_new_master_key(sb, secret, mk_spec); } - key_put(key); + fscrypt_put_master_key(mk); } -out_unlock: mutex_unlock(&fscrypt_add_key_mutex); return err; } @@ -771,19 +824,19 @@ int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_key_specifier mk_spec; - struct key *key, *mk_user; struct fscrypt_master_key *mk; + struct key *mk_user; int err; mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); - key = fscrypt_find_master_key(sb, &mk_spec); - if (IS_ERR(key)) { - err = PTR_ERR(key); + mk = fscrypt_find_master_key(sb, &mk_spec); + if (!mk) { + err = -ENOKEY; goto out; } - mk = key->payload.data[0]; + down_read(&mk->mk_sem); mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) { err = PTR_ERR(mk_user); @@ -791,7 +844,8 @@ int fscrypt_verify_key_added(struct super_block *sb, key_put(mk_user); err = 0; } - key_put(key); + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); out: if (err == -ENOKEY && capable(CAP_FOWNER)) err = 0; @@ -953,11 +1007,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_remove_key_arg __user *uarg = _uarg; struct fscrypt_remove_key_arg arg; - struct key *key; struct fscrypt_master_key *mk; u32 status_flags = 0; int err; - bool dead; + bool inodes_remain; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; @@ -977,12 +1030,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) return -EACCES; /* Find the key being removed. */ - key = fscrypt_find_master_key(sb, &arg.key_spec); - if (IS_ERR(key)) - return PTR_ERR(key); - mk = key->payload.data[0]; - - down_write(&key->sem); + mk = fscrypt_find_master_key(sb, &arg.key_spec); + if (!mk) + return -ENOKEY; + down_write(&mk->mk_sem); /* If relevant, remove current user's (or all users) claim to the key */ if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { @@ -991,7 +1042,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) else err = remove_master_key_user(mk); if (err) { - up_write(&key->sem); + up_write(&mk->mk_sem); goto out_put_key; } if (mk->mk_users->keys.nr_leaves_on_tree != 0) { @@ -1003,26 +1054,22 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; err = 0; - up_write(&key->sem); + up_write(&mk->mk_sem); goto out_put_key; } } /* No user claims remaining. Go ahead and wipe the secret. */ - dead = false; + err = -ENOKEY; if (is_master_key_secret_present(&mk->mk_secret)) { wipe_master_key_secret(&mk->mk_secret); - dead = refcount_dec_and_test(&mk->mk_refcount); - } - up_write(&key->sem); - if (dead) { - /* - * No inodes reference the key, and we wiped the secret, so the - * key object is free to be removed from the keyring. - */ - key_invalidate(key); + fscrypt_put_master_key_activeref(mk); err = 0; - } else { + } + inodes_remain = refcount_read(&mk->mk_active_refs) > 0; + up_write(&mk->mk_sem); + + if (inodes_remain) { /* Some inodes still reference this key; try to evict them. */ err = try_to_lock_encrypted_files(sb, mk); if (err == -EBUSY) { @@ -1038,7 +1085,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) * has been fully removed including all files locked. */ out_put_key: - key_put(key); + fscrypt_put_master_key(mk); if (err == 0) err = put_user(status_flags, &uarg->removal_status_flags); return err; @@ -1085,7 +1132,6 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_get_key_status_arg arg; - struct key *key; struct fscrypt_master_key *mk; int err; @@ -1102,19 +1148,18 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) arg.user_count = 0; memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); - key = fscrypt_find_master_key(sb, &arg.key_spec); - if (IS_ERR(key)) { - if (key != ERR_PTR(-ENOKEY)) - return PTR_ERR(key); + mk = fscrypt_find_master_key(sb, &arg.key_spec); + if (!mk) { arg.status = FSCRYPT_KEY_STATUS_ABSENT; err = 0; goto out; } - mk = key->payload.data[0]; - down_read(&key->sem); + down_read(&mk->mk_sem); if (!is_master_key_secret_present(&mk->mk_secret)) { - arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED; + arg.status = refcount_read(&mk->mk_active_refs) > 0 ? + FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED : + FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */; err = 0; goto out_release_key; } @@ -1136,8 +1181,8 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) } err = 0; out_release_key: - up_read(&key->sem); - key_put(key); + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); out: if (!err && copy_to_user(uarg, &arg, sizeof(arg))) err = -EFAULT; @@ -1149,13 +1194,9 @@ int __init fscrypt_init_keyring(void) { int err; - err = register_key_type(&key_type_fscrypt); - if (err) - return err; - err = register_key_type(&key_type_fscrypt_user); if (err) - goto err_unregister_fscrypt; + return err; err = register_key_type(&key_type_fscrypt_provisioning); if (err) @@ -1165,7 +1206,5 @@ int __init fscrypt_init_keyring(void) err_unregister_fscrypt_user: unregister_key_type(&key_type_fscrypt_user); -err_unregister_fscrypt: - unregister_key_type(&key_type_fscrypt); return err; } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index fbc71abdabe3..f7407071a952 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -9,7 +9,6 @@ */ #include <crypto/skcipher.h> -#include <linux/key.h> #include <linux/random.h> #include "fscrypt_private.h" @@ -155,10 +154,12 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, } /* Destroy a crypto transform object and/or blk-crypto key. */ -void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key) +void fscrypt_destroy_prepared_key(struct super_block *sb, + struct fscrypt_prepared_key *prep_key) { crypto_free_skcipher(prep_key->tfm); - fscrypt_destroy_inline_crypt_key(prep_key); + fscrypt_destroy_inline_crypt_key(sb, prep_key); + memzero_explicit(prep_key, sizeof(*prep_key)); } /* Given a per-file encryption key, set up the file's crypto transform object */ @@ -412,20 +413,18 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, /* * Find the master key, then set up the inode's actual encryption key. * - * If the master key is found in the filesystem-level keyring, then the - * corresponding 'struct key' is returned in *master_key_ret with its semaphore - * read-locked. This is needed to ensure that only one task links the - * fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race to create - * an fscrypt_info for the same inode), and to synchronize the master key being - * removed with a new inode starting to use it. + * If the master key is found in the filesystem-level keyring, then it is + * returned in *mk_ret with its semaphore read-locked. This is needed to ensure + * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as + * multiple tasks may race to create an fscrypt_info for the same inode), and to + * synchronize the master key being removed with a new inode starting to use it. */ static int setup_file_encryption_key(struct fscrypt_info *ci, bool need_dirhash_key, - struct key **master_key_ret) + struct fscrypt_master_key **mk_ret) { - struct key *key; - struct fscrypt_master_key *mk = NULL; struct fscrypt_key_specifier mk_spec; + struct fscrypt_master_key *mk; int err; err = fscrypt_select_encryption_impl(ci); @@ -436,11 +435,10 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) return err; - key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); - if (IS_ERR(key)) { - if (key != ERR_PTR(-ENOKEY) || - ci->ci_policy.version != FSCRYPT_POLICY_V1) - return PTR_ERR(key); + mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); + if (!mk) { + if (ci->ci_policy.version != FSCRYPT_POLICY_V1) + return -ENOKEY; /* * As a legacy fallback for v1 policies, search for the key in @@ -450,9 +448,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, */ return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci); } - - mk = key->payload.data[0]; - down_read(&key->sem); + down_read(&mk->mk_sem); /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */ if (!is_master_key_secret_present(&mk->mk_secret)) { @@ -480,18 +476,18 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) goto out_release_key; - *master_key_ret = key; + *mk_ret = mk; return 0; out_release_key: - up_read(&key->sem); - key_put(key); + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); return err; } static void put_crypt_info(struct fscrypt_info *ci) { - struct key *key; + struct fscrypt_master_key *mk; if (!ci) return; @@ -499,26 +495,21 @@ static void put_crypt_info(struct fscrypt_info *ci) if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); else if (ci->ci_owns_key) - fscrypt_destroy_prepared_key(&ci->ci_enc_key); - - key = ci->ci_master_key; - if (key) { - struct fscrypt_master_key *mk = key->payload.data[0]; + fscrypt_destroy_prepared_key(ci->ci_inode->i_sb, + &ci->ci_enc_key); + mk = ci->ci_master_key; + if (mk) { /* * Remove this inode from the list of inodes that were unlocked - * with the master key. - * - * In addition, if we're removing the last inode from a key that - * already had its secret removed, invalidate the key so that it - * gets removed from ->s_master_keys. + * with the master key. In addition, if we're removing the last + * inode from a master key struct that already had its secret + * removed, then complete the full removal of the struct. */ spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); - if (refcount_dec_and_test(&mk->mk_refcount)) - key_invalidate(key); - key_put(key); + fscrypt_put_master_key_activeref(mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_info_cachep, ci); @@ -532,7 +523,7 @@ fscrypt_setup_encryption_info(struct inode *inode, { struct fscrypt_info *crypt_info; struct fscrypt_mode *mode; - struct key *master_key = NULL; + struct fscrypt_master_key *mk = NULL; int res; res = fscrypt_initialize(inode->i_sb->s_cop->flags); @@ -555,8 +546,7 @@ fscrypt_setup_encryption_info(struct inode *inode, WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; - res = setup_file_encryption_key(crypt_info, need_dirhash_key, - &master_key); + res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); if (res) goto out; @@ -571,12 +561,9 @@ fscrypt_setup_encryption_info(struct inode *inode, * We won the race and set ->i_crypt_info to our crypt_info. * Now link it into the master key's inode list. */ - if (master_key) { - struct fscrypt_master_key *mk = - master_key->payload.data[0]; - - refcount_inc(&mk->mk_refcount); - crypt_info->ci_master_key = key_get(master_key); + if (mk) { + crypt_info->ci_master_key = mk; + refcount_inc(&mk->mk_active_refs); spin_lock(&mk->mk_decrypted_inodes_lock); list_add(&crypt_info->ci_master_key_link, &mk->mk_decrypted_inodes); @@ -586,9 +573,9 @@ fscrypt_setup_encryption_info(struct inode *inode, } res = 0; out: - if (master_key) { - up_read(&master_key->sem); - key_put(master_key); + if (mk) { + up_read(&mk->mk_sem); + fscrypt_put_master_key(mk); } put_crypt_info(crypt_info); return res; @@ -753,7 +740,6 @@ EXPORT_SYMBOL(fscrypt_free_inode); int fscrypt_drop_inode(struct inode *inode) { const struct fscrypt_info *ci = fscrypt_get_info(inode); - const struct fscrypt_master_key *mk; /* * If ci is NULL, then the inode doesn't have an encryption key set up @@ -763,7 +749,6 @@ int fscrypt_drop_inode(struct inode *inode) */ if (!ci || !ci->ci_master_key) return 0; - mk = ci->ci_master_key->payload.data[0]; /* * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes @@ -782,6 +767,6 @@ int fscrypt_drop_inode(struct inode *inode) * then the thread removing the key will either evict the inode itself * or will correctly detect that it wasn't evicted due to the race. */ - return !is_master_key_secret_present(&mk->mk_secret); + return !is_master_key_secret_present(&ci->ci_master_key->mk_secret); } EXPORT_SYMBOL_GPL(fscrypt_drop_inode); diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index 2762c5350432..75dabd9b27f9 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -143,6 +143,7 @@ invalid: /* Master key referenced by DIRECT_KEY policy */ struct fscrypt_direct_key { + struct super_block *dk_sb; struct hlist_node dk_node; refcount_t dk_refcount; const struct fscrypt_mode *dk_mode; @@ -154,7 +155,7 @@ struct fscrypt_direct_key { static void free_direct_key(struct fscrypt_direct_key *dk) { if (dk) { - fscrypt_destroy_prepared_key(&dk->dk_key); + fscrypt_destroy_prepared_key(dk->dk_sb, &dk->dk_key); kfree_sensitive(dk); } } @@ -231,6 +232,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) dk = kzalloc(sizeof(*dk), GFP_KERNEL); if (!dk) return ERR_PTR(-ENOMEM); + dk->dk_sb = ci->ci_inode->i_sb; refcount_set(&dk->dk_refcount, 1); dk->dk_mode = ci->ci_mode; err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 80b8ca0f340b..46757c3052ef 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -744,12 +744,8 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) * delayed key setup that requires the inode number. */ if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && - (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { - const struct fscrypt_master_key *mk = - ci->ci_master_key->payload.data[0]; - - fscrypt_hash_inode_number(ci, mk); - } + (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) + fscrypt_hash_inode_number(ci, ci->ci_master_key); return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data); } @@ -833,19 +829,6 @@ bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, } EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal); -/* Deprecated, do not use */ -int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, - struct fscrypt_dummy_policy *dummy_policy) -{ - struct fs_parameter param = { - .type = fs_value_is_string, - .string = arg ? (char *)arg : "", - }; - return fscrypt_parse_test_dummy_encryption(¶m, dummy_policy) ?: - fscrypt_add_test_dummy_key(sb, dummy_policy); -} -EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); - /** * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' * @seq: the seq_file to print the option to diff --git a/fs/d_path.c b/fs/d_path.c index e4e0ebad1f15..56a6ee4c6331 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -34,7 +34,7 @@ static bool prepend_char(struct prepend_buffer *p, unsigned char c) } /* - * The source of the prepend data can be an optimistoc load + * The source of the prepend data can be an optimistic load * of a dentry name and length. And because we don't hold any * locks, the length and the pointer to the name may not be * in sync if a concurrent rename happens, and the kernel @@ -297,8 +297,7 @@ EXPORT_SYMBOL(d_path); /* * Helper function for dentry_operations.d_dname() members */ -char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, - const char *fmt, ...) +char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...) { va_list args; char temp[64]; @@ -1445,6 +1445,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t done = 0; int ret; + if (!iomi.len) + return 0; + if (iov_iter_rw(iter) == WRITE) { lockdep_assert_held_write(&iomi.inode->i_rwsem); iomi.flags |= IOMAP_WRITE; diff --git a/fs/dcache.c b/fs/dcache.c index c5dc32a59c76..bb0c4d0038db 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2270,6 +2270,48 @@ bool d_same_name(const struct dentry *dentry, const struct dentry *parent, } EXPORT_SYMBOL_GPL(d_same_name); +/* + * This is __d_lookup_rcu() when the parent dentry has + * DCACHE_OP_COMPARE, which makes things much nastier. + */ +static noinline struct dentry *__d_lookup_rcu_op_compare( + const struct dentry *parent, + const struct qstr *name, + unsigned *seqp) +{ + u64 hashlen = name->hash_len; + struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen)); + struct hlist_bl_node *node; + struct dentry *dentry; + + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + int tlen; + const char *tname; + unsigned seq; + +seqretry: + seq = raw_seqcount_begin(&dentry->d_seq); + if (dentry->d_parent != parent) + continue; + if (d_unhashed(dentry)) + continue; + if (dentry->d_name.hash != hashlen_hash(hashlen)) + continue; + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + /* we want a consistent (name,len) pair */ + if (read_seqcount_retry(&dentry->d_seq, seq)) { + cpu_relax(); + goto seqretry; + } + if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) + continue; + *seqp = seq; + return dentry; + } + return NULL; +} + /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry @@ -2316,6 +2358,9 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, * Keep the two functions in sync. */ + if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) + return __d_lookup_rcu_op_compare(parent, name, seqp); + /* * The hash list is protected using RCU. * @@ -2332,7 +2377,6 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; -seqretry: /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. @@ -2355,28 +2399,10 @@ seqretry: continue; if (d_unhashed(dentry)) continue; - - if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { - int tlen; - const char *tname; - if (dentry->d_name.hash != hashlen_hash(hashlen)) - continue; - tlen = dentry->d_name.len; - tname = dentry->d_name.name; - /* we want a consistent (name,len) pair */ - if (read_seqcount_retry(&dentry->d_seq, seq)) { - cpu_relax(); - goto seqretry; - } - if (parent->d_op->d_compare(dentry, - tlen, tname, name) != 0) - continue; - } else { - if (dentry->d_name.hash_len != hashlen) - continue; - if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) - continue; - } + if (dentry->d_name.hash_len != hashlen) + continue; + if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) + continue; *seqp = seq; return dentry; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 3dcf0b8b4e93..232cfdf095ae 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -745,6 +745,28 @@ void debugfs_remove(struct dentry *dentry) EXPORT_SYMBOL_GPL(debugfs_remove); /** + * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it + * @name: a pointer to a string containing the name of the item to look up. + * @parent: a pointer to the parent dentry of the item. + * + * This is the equlivant of doing something like + * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting + * handled for the directory being looked up. + */ +void debugfs_lookup_and_remove(const char *name, struct dentry *parent) +{ + struct dentry *dentry; + + dentry = debugfs_lookup(name, parent); + if (!dentry) + return; + + debugfs_remove(dentry); + dput(dentry); +} +EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); + +/** * debugfs_rename - rename a file/directory in the debugfs filesystem * @old_dir: a pointer to the parent dentry for the renamed object. This * should be a directory dentry. diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 19ef136f9e4f..d60a8d8f109d 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -200,13 +200,13 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, if (!prev_seq) { kref_get(&lkb->lkb_ref); + mutex_lock(&ls->ls_cb_mutex); if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) { - mutex_lock(&ls->ls_cb_mutex); list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay); - mutex_unlock(&ls->ls_cb_mutex); } else { queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); } + mutex_unlock(&ls->ls_cb_mutex); } out: mutex_unlock(&lkb->lkb_cb_mutex); @@ -288,10 +288,13 @@ void dlm_callback_stop(struct dlm_ls *ls) void dlm_callback_suspend(struct dlm_ls *ls) { - set_bit(LSFL_CB_DELAY, &ls->ls_flags); + if (ls->ls_callback_wq) { + mutex_lock(&ls->ls_cb_mutex); + set_bit(LSFL_CB_DELAY, &ls->ls_flags); + mutex_unlock(&ls->ls_cb_mutex); - if (ls->ls_callback_wq) flush_workqueue(ls->ls_callback_wq); + } } #define MAX_CB_QUEUE 25 @@ -302,11 +305,11 @@ void dlm_callback_resume(struct dlm_ls *ls) int count = 0, sum = 0; bool empty; - clear_bit(LSFL_CB_DELAY, &ls->ls_flags); - if (!ls->ls_callback_wq) return; + clear_bit(LSFL_CB_DELAY, &ls->ls_flags); + more: mutex_lock(&ls->ls_cb_mutex); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index 181ad7d20c4d..e5e05fcc5813 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -11,7 +11,6 @@ #ifndef __ASTD_DOT_H__ #define __ASTD_DOT_H__ -void dlm_del_ast(struct dlm_lkb *lkb); int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags, uint64_t seq); int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb, diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 8aca8085d24e..e34c3d2639a5 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -661,7 +661,7 @@ struct dlm_ls { spinlock_t ls_recover_idr_lock; wait_queue_head_t ls_wait_general; wait_queue_head_t ls_recover_lock_wait; - struct mutex ls_clear_proc_locks; + spinlock_t ls_clear_proc_locks; struct list_head ls_root_list; /* root resources */ struct rw_semaphore ls_root_sem; /* protect root_list */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index dac7eb75dba9..94a72ede5764 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -401,7 +401,7 @@ static int pre_rsb_struct(struct dlm_ls *ls) unlock any spinlocks, go back and call pre_rsb_struct again. Otherwise, take an rsb off the list and return it. */ -static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, +static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len, struct dlm_rsb **r_ret) { struct dlm_rsb *r; @@ -412,7 +412,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len, count = ls->ls_new_rsb_count; spin_unlock(&ls->ls_new_rsb_spin); log_debug(ls, "find_rsb retry %d %d %s", - count, dlm_config.ci_new_rsb_count, name); + count, dlm_config.ci_new_rsb_count, + (const char *)name); return -EAGAIN; } @@ -448,7 +449,7 @@ static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen) return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN); } -int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, +int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, struct dlm_rsb **r_ret) { struct rb_node *node = tree->rb_node; @@ -546,7 +547,7 @@ static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree) * while that rsb has a potentially stale master.) */ -static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, +static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, uint32_t hash, uint32_t b, int dir_nodeid, int from_nodeid, unsigned int flags, struct dlm_rsb **r_ret) @@ -724,7 +725,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, dlm_recover_locks) before we've made ourself master (in dlm_recover_masters). */ -static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len, +static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len, uint32_t hash, uint32_t b, int dir_nodeid, int from_nodeid, unsigned int flags, struct dlm_rsb **r_ret) @@ -818,8 +819,9 @@ static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len, return error; } -static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid, - unsigned int flags, struct dlm_rsb **r_ret) +static int find_rsb(struct dlm_ls *ls, const void *name, int len, + int from_nodeid, unsigned int flags, + struct dlm_rsb **r_ret) { uint32_t hash, b; int dir_nodeid; @@ -2864,17 +2866,9 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args) static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_args *args) { - int rv = -EINVAL; + int rv = -EBUSY; if (args->flags & DLM_LKF_CONVERT) { - if (lkb->lkb_flags & DLM_IFL_MSTCPY) - goto out; - - if (args->flags & DLM_LKF_QUECVT && - !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) - goto out; - - rv = -EBUSY; if (lkb->lkb_status != DLM_LKSTS_GRANTED) goto out; @@ -2884,6 +2878,14 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, if (is_overlap(lkb)) goto out; + + rv = -EINVAL; + if (lkb->lkb_flags & DLM_IFL_MSTCPY) + goto out; + + if (args->flags & DLM_LKF_QUECVT && + !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) + goto out; } lkb->lkb_exflags = args->flags; @@ -2900,11 +2902,25 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, #endif rv = 0; out: - if (rv) - log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s", + switch (rv) { + case 0: + break; + case -EINVAL: + /* annoy the user because dlm usage is wrong */ + WARN_ON(1); + log_error(ls, "%s %d %x %x %x %d %d %s", __func__, + rv, lkb->lkb_id, lkb->lkb_flags, args->flags, + lkb->lkb_status, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + break; + default: + log_debug(ls, "%s %d %x %x %x %d %d %s", __func__, rv, lkb->lkb_id, lkb->lkb_flags, args->flags, lkb->lkb_status, lkb->lkb_wait_type, lkb->lkb_resource->res_name); + break; + } + return rv; } @@ -2918,23 +2934,12 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - int rv = -EINVAL; - - if (lkb->lkb_flags & DLM_IFL_MSTCPY) { - log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); - dlm_print_lkb(lkb); - goto out; - } - - /* an lkb may still exist even though the lock is EOL'ed due to a - cancel, unlock or failed noqueue request; an app can't use these - locks; return same error as if the lkid had not been found at all */ + int rv = -EBUSY; - if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { - log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); - rv = -ENOENT; + /* normal unlock not allowed if there's any op in progress */ + if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) && + (lkb->lkb_wait_type || lkb->lkb_wait_count)) goto out; - } /* an lkb may be waiting for an rsb lookup to complete where the lookup was initiated by another lock */ @@ -2949,7 +2954,24 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) unhold_lkb(lkb); /* undoes create_lkb() */ } /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ - rv = -EBUSY; + goto out; + } + + rv = -EINVAL; + if (lkb->lkb_flags & DLM_IFL_MSTCPY) { + log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); + dlm_print_lkb(lkb); + goto out; + } + + /* an lkb may still exist even though the lock is EOL'ed due to a + * cancel, unlock or failed noqueue request; an app can't use these + * locks; return same error as if the lkid had not been found at all + */ + + if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); + rv = -ENOENT; goto out; } @@ -3022,14 +3044,8 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) goto out; } /* add_to_waiters() will set OVERLAP_UNLOCK */ - goto out_ok; } - /* normal unlock not allowed if there's any op in progress */ - rv = -EBUSY; - if (lkb->lkb_wait_type || lkb->lkb_wait_count) - goto out; - out_ok: /* an overlapping op shouldn't blow away exflags from other op */ lkb->lkb_exflags |= args->flags; @@ -3037,11 +3053,25 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) lkb->lkb_astparam = args->astparam; rv = 0; out: - if (rv) - log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv, + switch (rv) { + case 0: + break; + case -EINVAL: + /* annoy the user because dlm usage is wrong */ + WARN_ON(1); + log_error(ls, "%s %d %x %x %x %x %d %s", __func__, rv, + lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, + args->flags, lkb->lkb_wait_type, + lkb->lkb_resource->res_name); + break; + default: + log_debug(ls, "%s %d %x %x %x %x %d %s", __func__, rv, lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, args->flags, lkb->lkb_wait_type, lkb->lkb_resource->res_name); + break; + } + return rv; } @@ -3292,8 +3322,9 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) * request_lock(), convert_lock(), unlock_lock(), cancel_lock() */ -static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name, - int len, struct dlm_args *args) +static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, + const void *name, int len, + struct dlm_args *args) { struct dlm_rsb *r; int error; @@ -3392,7 +3423,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, int mode, struct dlm_lksb *lksb, uint32_t flags, - void *name, + const void *name, unsigned int namelen, uint32_t parent_lkid, void (*ast) (void *astarg), @@ -3438,7 +3469,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: - trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error); + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, true); if (convert || error) __put_lkb(ls, lkb); @@ -3623,7 +3654,7 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, case cpu_to_le32(DLM_MSG_REQUEST_REPLY): case cpu_to_le32(DLM_MSG_CONVERT_REPLY): case cpu_to_le32(DLM_MSG_GRANT): - if (!lkb->lkb_lvbptr) + if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK)) break; memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); break; @@ -5080,8 +5111,11 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) down_read(&ls->ls_recv_active); if (hd->h_cmd == DLM_MSG) dlm_receive_message(ls, &p->message, nodeid); - else + else if (hd->h_cmd == DLM_RCOM) dlm_receive_rcom(ls, &p->rcom, nodeid); + else + log_error(ls, "invalid h_cmd %d from %d lockspace %x", + hd->h_cmd, nodeid, le32_to_cpu(hd->u.h_lockspace)); up_read(&ls->ls_recv_active); dlm_put_lockspace(ls); @@ -5801,6 +5835,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, { struct dlm_lkb *lkb; struct dlm_args args; + bool do_put = true; int error; dlm_lock_recovery(ls); @@ -5811,13 +5846,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } + trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); + if (flags & DLM_LKF_VALBLK) { ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); if (!ua->lksb.sb_lvbptr) { kfree(ua); - __put_lkb(ls, lkb); error = -ENOMEM; - goto out; + goto out_put; } } #ifdef CONFIG_DLM_DEPRECATED_API @@ -5831,8 +5867,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, kfree(ua->lksb.sb_lvbptr); ua->lksb.sb_lvbptr = NULL; kfree(ua); - __put_lkb(ls, lkb); - goto out; + goto out_put; } /* After ua is attached to lkb it will be freed by dlm_free_lkb(). @@ -5851,8 +5886,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, error = 0; fallthrough; default: - __put_lkb(ls, lkb); - goto out; + goto out_put; } /* add this new lkb to the per-process list of locks */ @@ -5860,6 +5894,11 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, hold_lkb(lkb); list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); spin_unlock(&ua->proc->locks_spin); + do_put = false; + out_put: + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false); + if (do_put) + __put_lkb(ls, lkb); out: dlm_unlock_recovery(ls); return error; @@ -5885,6 +5924,8 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; + trace_dlm_lock_start(ls, lkb, NULL, 0, mode, flags); + /* user can change the params on its lock when it converts it, or add an lvb that didn't exist before */ @@ -5922,6 +5963,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK) error = 0; out_put: + trace_dlm_lock_end(ls, lkb, NULL, 0, mode, flags, error, false); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6014,6 +6056,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + ua = lkb->lkb_ua; if (lvb_in && ua->lksb.sb_lvbptr) @@ -6042,6 +6086,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking); spin_unlock(&ua->proc->locks_spin); out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6063,6 +6108,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + ua = lkb->lkb_ua; if (ua_tmp->castparam) ua->castparam = ua_tmp->castparam; @@ -6080,6 +6127,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, if (error == -EBUSY) error = 0; out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6101,6 +6149,8 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) if (error) goto out; + trace_dlm_unlock_start(ls, lkb, flags); + ua = lkb->lkb_ua; error = set_unlock_args(flags, ua, &args); @@ -6129,6 +6179,7 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) if (error == -EBUSY) error = 0; out_put: + trace_dlm_unlock_end(ls, lkb, flags, error); dlm_put_lkb(lkb); out: dlm_unlock_recovery(ls); @@ -6184,7 +6235,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, { struct dlm_lkb *lkb = NULL; - mutex_lock(&ls->ls_clear_proc_locks); + spin_lock(&ls->ls_clear_proc_locks); if (list_empty(&proc->locks)) goto out; @@ -6196,7 +6247,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, else lkb->lkb_flags |= DLM_IFL_DEAD; out: - mutex_unlock(&ls->ls_clear_proc_locks); + spin_unlock(&ls->ls_clear_proc_locks); return lkb; } @@ -6233,7 +6284,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } - mutex_lock(&ls->ls_clear_proc_locks); + spin_lock(&ls->ls_clear_proc_locks); /* in-progress unlocks */ list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { @@ -6249,7 +6300,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) dlm_put_lkb(lkb); } - mutex_unlock(&ls->ls_clear_proc_locks); + spin_unlock(&ls->ls_clear_proc_locks); dlm_unlock_recovery(ls); } diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index a7b6474f009d..40c76b5544da 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -36,7 +36,7 @@ static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { } int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, unsigned int flags, int *r_nodeid, int *result); -int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, +int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, struct dlm_rsb **r_ret); void dlm_recover_purge(struct dlm_ls *ls); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 3972f4d86c75..bae050df7abf 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -416,7 +416,7 @@ static int new_lockspace(const char *name, const char *cluster, if (namelen > DLM_LOCKSPACE_LEN || namelen == 0) return -EINVAL; - if (!lvblen || (lvblen % 8)) + if (lvblen % 8) return -EINVAL; if (!try_module_get(THIS_MODULE)) @@ -584,7 +584,7 @@ static int new_lockspace(const char *name, const char *cluster, atomic_set(&ls->ls_requestqueue_cnt, 0); init_waitqueue_head(&ls->ls_requestqueue_wait); mutex_init(&ls->ls_requestqueue_mutex); - mutex_init(&ls->ls_clear_proc_locks); + spin_lock_init(&ls->ls_clear_proc_locks); /* Due backwards compatibility with 3.1 we need to use maximum * possible dlm message size to be sure the message will fit and @@ -703,10 +703,11 @@ static int new_lockspace(const char *name, const char *cluster, return error; } -int dlm_new_lockspace(const char *name, const char *cluster, - uint32_t flags, int lvblen, - const struct dlm_lockspace_ops *ops, void *ops_arg, - int *ops_result, dlm_lockspace_t **lockspace) +static int __dlm_new_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) { int error = 0; @@ -732,6 +733,25 @@ int dlm_new_lockspace(const char *name, const char *cluster, return error; } +int dlm_new_lockspace(const char *name, const char *cluster, uint32_t flags, + int lvblen, const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) +{ + return __dlm_new_lockspace(name, cluster, flags | DLM_LSFL_FS, lvblen, + ops, ops_arg, ops_result, lockspace); +} + +int dlm_new_user_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace) +{ + return __dlm_new_lockspace(name, cluster, flags, lvblen, ops, + ops_arg, ops_result, lockspace); +} + static int lkb_idr_is_local(int id, void *p, void *data) { struct dlm_lkb *lkb = p; diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h index 306fc4f4ea15..03f4a4a3a871 100644 --- a/fs/dlm/lockspace.h +++ b/fs/dlm/lockspace.h @@ -12,6 +12,14 @@ #ifndef __LOCKSPACE_DOT_H__ #define __LOCKSPACE_DOT_H__ +/* DLM_LSFL_FS + * The lockspace user is in the kernel (i.e. filesystem). Enables + * direct bast/cast callbacks. + * + * internal lockspace flag - will be removed in future + */ +#define DLM_LSFL_FS 0x00000004 + int dlm_lockspace_init(void); void dlm_lockspace_exit(void); struct dlm_ls *dlm_find_lockspace_global(uint32_t id); @@ -20,6 +28,11 @@ struct dlm_ls *dlm_find_lockspace_device(int minor); void dlm_put_lockspace(struct dlm_ls *ls); void dlm_stop_lockspaces(void); void dlm_stop_lockspaces_check(void); +int dlm_new_user_lockspace(const char *name, const char *cluster, + uint32_t flags, int lvblen, + const struct dlm_lockspace_ops *ops, + void *ops_arg, int *ops_result, + dlm_lockspace_t **lockspace); #endif /* __LOCKSPACE_DOT_H__ */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a4e84e8d94c8..59f64c596233 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1336,6 +1336,8 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, return NULL; } + /* for dlm_lowcomms_commit_msg() */ + kref_get(&msg->ref); /* we assume if successful commit must called */ msg->idx = idx; return msg; @@ -1375,6 +1377,8 @@ void dlm_lowcomms_commit_msg(struct dlm_msg *msg) { _dlm_lowcomms_commit_msg(msg); srcu_read_unlock(&connections_srcu, msg->idx); + /* because dlm_lowcomms_new_msg() */ + kref_put(&msg->ref, dlm_msg_release); } #endif diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 67f68d48d60c..4de4b8651c6c 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -75,6 +75,7 @@ static struct genl_family family __ro_after_init = { .version = DLM_GENL_VERSION, .small_ops = dlm_nl_ops, .n_small_ops = ARRAY_SIZE(dlm_nl_ops), + .resv_start_op = DLM_CMD_HELLO + 1, .module = THIS_MODULE, }; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 99e8f0744513..c5d27bccc3dc 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -16,6 +16,8 @@ #include <linux/slab.h> #include <linux/sched/signal.h> +#include <trace/events/dlm.h> + #include "dlm_internal.h" #include "lockspace.h" #include "lock.h" @@ -184,7 +186,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, return; ls = lkb->lkb_resource->res_ls; - mutex_lock(&ls->ls_clear_proc_locks); + spin_lock(&ls->ls_clear_proc_locks); /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed @@ -230,7 +232,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, spin_unlock(&proc->locks_spin); } out: - mutex_unlock(&ls->ls_clear_proc_locks); + spin_unlock(&ls->ls_clear_proc_locks); } static int device_user_lock(struct dlm_user_proc *proc, @@ -421,9 +423,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags, - DLM_USER_LVB_LEN, NULL, NULL, NULL, - &lockspace); + error = dlm_new_user_lockspace(params->name, dlm_config.ci_cluster_name, + params->flags, DLM_USER_LVB_LEN, NULL, + NULL, NULL, &lockspace); if (error) return error; @@ -882,7 +884,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, goto try_another; } - if (cb.flags & DLM_CB_CAST) { + if (cb.flags & DLM_CB_BAST) { + trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb.mode); + } else if (cb.flags & DLM_CB_CAST) { new_mode = cb.mode; if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr && @@ -891,6 +895,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, lkb->lkb_lksb->sb_status = cb.sb_status; lkb->lkb_lksb->sb_flags = cb.sb_flags; + trace_dlm_ast(lkb->lkb_resource->res_ls, lkb); } rv = copy_result_to_user(lkb->lkb_ua, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 195f7c2a16e8..268b74499c28 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -53,7 +53,7 @@ struct ecryptfs_getdents_callback { }; /* Inspired by generic filldir in fs/readdir.c */ -static int +static bool ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, int lower_namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -61,18 +61,19 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, container_of(ctx, struct ecryptfs_getdents_callback, ctx); size_t name_size; char *name; - int rc; + int err; + bool res; buf->filldir_called++; - rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, - buf->sb, lower_name, - lower_namelen); - if (rc) { - if (rc != -EINVAL) { + err = ecryptfs_decode_and_decrypt_filename(&name, &name_size, + buf->sb, lower_name, + lower_namelen); + if (err) { + if (err != -EINVAL) { ecryptfs_printk(KERN_DEBUG, "%s: Error attempting to decode and decrypt filename [%s]; rc = [%d]\n", - __func__, lower_name, rc); - return rc; + __func__, lower_name, err); + return false; } /* Mask -EINVAL errors as these are most likely due a plaintext @@ -81,16 +82,15 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, * the "lost+found" dentry in the root directory of an Ext4 * filesystem. */ - return 0; + return true; } buf->caller->pos = buf->ctx.pos; - rc = !dir_emit(buf->caller, name, name_size, ino, d_type); + res = dir_emit(buf->caller, name, name_size, ino, d_type); kfree(name); - if (!rc) + if (res) buf->entries_written++; - - return rc; + return res; } /** @@ -111,14 +111,8 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) lower_file = ecryptfs_file_to_lower(file); rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; - if (rc < 0) - goto out; - if (buf.filldir_called && !buf.entries_written) - goto out; - if (rc >= 0) - fsstack_copy_attr_atime(inode, - file_inode(lower_file)); -out: + if (rc >= 0 && (buf.entries_written || !buf.filldir_called)) + fsstack_copy_attr_atime(inode, file_inode(lower_file)); return rc; } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2d55569f96ac..51b7ac7166d9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -317,52 +317,61 @@ dstmap_out: return ret; } -static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, - struct page **pagepool) +static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, + struct page **pagepool) { - const unsigned int nrpages_out = + const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + const unsigned int outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; const unsigned int righthalf = min_t(unsigned int, rq->outputsize, PAGE_SIZE - rq->pageofs_out); const unsigned int lefthalf = rq->outputsize - righthalf; + const unsigned int interlaced_offset = + rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; unsigned char *src, *dst; - if (nrpages_out > 2) { + if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { DBG_BUGON(1); - return -EIO; + return -EFSCORRUPTED; } if (rq->out[0] == *rq->in) { - DBG_BUGON(nrpages_out != 1); + DBG_BUGON(rq->pageofs_out); return 0; } - src = kmap_atomic(*rq->in) + rq->pageofs_in; + src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; if (rq->out[0]) { - dst = kmap_atomic(rq->out[0]); - memcpy(dst + rq->pageofs_out, src, righthalf); - kunmap_atomic(dst); + dst = kmap_local_page(rq->out[0]); + memcpy(dst + rq->pageofs_out, src + interlaced_offset, + righthalf); + kunmap_local(dst); } - if (nrpages_out == 2) { - DBG_BUGON(!rq->out[1]); - if (rq->out[1] == *rq->in) { + if (outpages > inpages) { + DBG_BUGON(!rq->out[outpages - 1]); + if (rq->out[outpages - 1] != rq->in[inpages - 1]) { + dst = kmap_local_page(rq->out[outpages - 1]); + memcpy(dst, interlaced_offset ? src : + (src + righthalf), lefthalf); + kunmap_local(dst); + } else if (!interlaced_offset) { memmove(src, src + righthalf, lefthalf); - } else { - dst = kmap_atomic(rq->out[1]); - memcpy(dst, src + righthalf, lefthalf); - kunmap_atomic(dst); } } - kunmap_atomic(src); + kunmap_local(src); return 0; } static struct z_erofs_decompressor decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { - .decompress = z_erofs_shifted_transform, + .decompress = z_erofs_transform_plain, .name = "shifted" }, + [Z_EROFS_COMPRESSION_INTERLACED] = { + .decompress = z_erofs_transform_plain, + .name = "interlaced" + }, [Z_EROFS_COMPRESSION_LZ4] = { .decompress = z_erofs_lz4_decompress, .name = "lz4" diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 5e59b3f523eb..091fd5adf818 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -217,6 +217,9 @@ again: strm->buf.out_size = min_t(u32, outlen, PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; + if (!rq->out[no] && rq->fillgaps) /* deduped */ + rq->out[no] = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); if (rq->out[no]) strm->buf.out = kmap(rq->out[no]) + pageofs; pageofs = 0; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 2b48373f690b..dbcd24371002 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -25,6 +25,8 @@ #define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 #define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 +#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 +#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ @@ -32,7 +34,9 @@ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING) + EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ + EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ + EROFS_FEATURE_INCOMPAT_DEDUPE) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -71,7 +75,9 @@ struct erofs_super_block { } __packed u1; __le16 extra_devices; /* # of devices besides the primary device */ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ - __u8 reserved2[38]; + __u8 reserved[6]; + __le64 packed_nid; /* nid of the special packed inode */ + __u8 reserved2[24]; }; /* @@ -295,16 +301,27 @@ struct z_erofs_lzma_cfgs { * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) + * bit 4 : interlaced plain pcluster (0 - off; 1 - on) + * bit 5 : fragment pcluster (0 - off; 1 - on) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 +#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 +#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020 +#define Z_EROFS_FRAGMENT_INODE_BIT 7 struct z_erofs_map_header { - __le16 h_reserved1; - /* indicates the encoded size of tailpacking data */ - __le16 h_idata_size; + union { + /* fragment data offset in the packed inode */ + __le32 h_fragmentoff; + struct { + __le16 h_reserved1; + /* indicates the encoded size of tailpacking data */ + __le16 h_idata_size; + }; + }; __le16 h_advise; /* * bit 0-3 : algorithm type of head 1 (logical cluster type 01); @@ -313,7 +330,8 @@ struct z_erofs_map_header { __u8 h_algorithmtype; /* * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-7 : reserved. + * bit 3-6 : reserved; + * bit 7 : move the whole file into packed inode or not. */ __u8 h_clusterbits; }; @@ -355,6 +373,9 @@ enum { #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 +/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ +#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15) + /* * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the * compressed block count of a compressed extent (in logical clusters, aka. @@ -402,6 +423,10 @@ struct erofs_dirent { /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) { + const __le64 fmh = *(__le64 *)&(struct z_erofs_map_header) { + .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT + }; + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32); BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); @@ -419,6 +444,9 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); + /* exclude old compiler versions like gcc 7.5.0 */ + BUILD_BUG_ON(__builtin_constant_p(fmh) ? + fmh != cpu_to_le64(1ULL << 63) : 0); } #endif diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 8e01d89c3319..998cd26a1b3b 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -1,10 +1,16 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2022, Alibaba Cloud + * Copyright (C) 2022, Bytedance Inc. All rights reserved. */ #include <linux/fscache.h> #include "internal.h" +static DEFINE_MUTEX(erofs_domain_list_lock); +static DEFINE_MUTEX(erofs_domain_cookies_lock); +static LIST_HEAD(erofs_domain_list); +static struct vfsmount *erofs_pseudo_mnt; + static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping, loff_t start, size_t len) { @@ -222,8 +228,10 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) rreq = erofs_fscache_alloc_request(folio_mapping(folio), folio_pos(folio), folio_size(folio)); - if (IS_ERR(rreq)) + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); goto out; + } return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, rreq, mdev.m_pa); @@ -232,111 +240,111 @@ out: return ret; } -static int erofs_fscache_read_folio_inline(struct folio *folio, - struct erofs_map_blocks *map) -{ - struct super_block *sb = folio_mapping(folio)->host->i_sb; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - erofs_blk_t blknr; - size_t offset, len; - void *src, *dst; - - /* For tail packing layout, the offset may be non-zero. */ - offset = erofs_blkoff(map->m_pa); - blknr = erofs_blknr(map->m_pa); - len = map->m_llen; - - src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); - if (IS_ERR(src)) - return PTR_ERR(src); - - dst = kmap_local_folio(folio, 0); - memcpy(dst, src + offset, len); - memset(dst + len, 0, PAGE_SIZE - len); - kunmap_local(dst); - - erofs_put_metabuf(&buf); - return 0; -} - -static int erofs_fscache_read_folio(struct file *file, struct folio *folio) +/* + * Read into page cache in the range described by (@pos, @len). + * + * On return, the caller is responsible for page unlocking if the output @unlock + * is true, or the callee will take this responsibility through netfs_io_request + * interface. + * + * The return value is the number of bytes successfully handled, or negative + * error code on failure. The only exception is that, the length of the range + * instead of the error code is returned on failure after netfs_io_request is + * allocated, so that .readahead() could advance rac accordingly. + */ +static int erofs_fscache_data_read(struct address_space *mapping, + loff_t pos, size_t len, bool *unlock) { - struct inode *inode = folio_mapping(folio)->host; + struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; + struct netfs_io_request *rreq; struct erofs_map_blocks map; struct erofs_map_dev mdev; - struct netfs_io_request *rreq; - erofs_off_t pos; - loff_t pstart; + struct iov_iter iter; + size_t count; int ret; - DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + *unlock = true; - pos = folio_pos(folio); map.m_la = pos; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); if (ret) - goto out_unlock; + return ret; - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - folio_zero_range(folio, 0, folio_size(folio)); - goto out_uptodate; + if (map.m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_blk_t blknr; + size_t offset, size; + void *src; + + /* For tail packing layout, the offset may be non-zero. */ + offset = erofs_blkoff(map.m_pa); + blknr = erofs_blknr(map.m_pa); + size = map.m_llen; + + src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); + if (IS_ERR(src)) + return PTR_ERR(src); + + iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE); + if (copy_to_iter(src + offset, size, &iter) != size) + return -EFAULT; + iov_iter_zero(PAGE_SIZE - size, &iter); + erofs_put_metabuf(&buf); + return PAGE_SIZE; } - if (map.m_flags & EROFS_MAP_META) { - ret = erofs_fscache_read_folio_inline(folio, &map); - goto out_uptodate; + count = min_t(size_t, map.m_llen - (pos - map.m_la), len); + DBG_BUGON(!count || count % PAGE_SIZE); + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count); + iov_iter_zero(count, &iter); + return count; } mdev = (struct erofs_map_dev) { .m_deviceid = map.m_deviceid, .m_pa = map.m_pa, }; - ret = erofs_map_dev(sb, &mdev); if (ret) - goto out_unlock; - + return ret; - rreq = erofs_fscache_alloc_request(folio_mapping(folio), - folio_pos(folio), folio_size(folio)); + rreq = erofs_fscache_alloc_request(mapping, pos, count); if (IS_ERR(rreq)) - goto out_unlock; - - pstart = mdev.m_pa + (pos - map.m_la); - return erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, pstart); + return PTR_ERR(rreq); -out_uptodate: - if (!ret) - folio_mark_uptodate(folio); -out_unlock: - folio_unlock(folio); - return ret; + *unlock = false; + erofs_fscache_read_folios_async(mdev.m_fscache->cookie, + rreq, mdev.m_pa + (pos - map.m_la)); + return count; } -static void erofs_fscache_advance_folios(struct readahead_control *rac, - size_t len, bool unlock) +static int erofs_fscache_read_folio(struct file *file, struct folio *folio) { - while (len) { - struct folio *folio = readahead_folio(rac); - len -= folio_size(folio); - if (unlock) { + bool unlock; + int ret; + + DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ); + + ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio), + folio_size(folio), &unlock); + if (unlock) { + if (ret > 0) folio_mark_uptodate(folio); - folio_unlock(folio); - } + folio_unlock(folio); } + return ret < 0 ? ret : 0; } static void erofs_fscache_readahead(struct readahead_control *rac) { - struct inode *inode = rac->mapping->host; - struct super_block *sb = inode->i_sb; - size_t len, count, done = 0; - erofs_off_t pos; - loff_t start, offset; - int ret; + struct folio *folio; + size_t len, done = 0; + loff_t start, pos; + bool unlock; + int ret, size; if (!readahead_count(rac)) return; @@ -345,67 +353,22 @@ static void erofs_fscache_readahead(struct readahead_control *rac) len = readahead_length(rac); do { - struct erofs_map_blocks map; - struct erofs_map_dev mdev; - struct netfs_io_request *rreq; - pos = start + done; - map.m_la = pos; - - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); - if (ret) + ret = erofs_fscache_data_read(rac->mapping, pos, + len - done, &unlock); + if (ret <= 0) return; - offset = start + done; - count = min_t(size_t, map.m_llen - (pos - map.m_la), - len - done); - - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - struct iov_iter iter; - - iov_iter_xarray(&iter, READ, &rac->mapping->i_pages, - offset, count); - iov_iter_zero(count, &iter); - - erofs_fscache_advance_folios(rac, count, true); - ret = count; - continue; - } - - if (map.m_flags & EROFS_MAP_META) { - struct folio *folio = readahead_folio(rac); - - ret = erofs_fscache_read_folio_inline(folio, &map); - if (!ret) { + size = ret; + while (size) { + folio = readahead_folio(rac); + size -= folio_size(folio); + if (unlock) { folio_mark_uptodate(folio); - ret = folio_size(folio); + folio_unlock(folio); } - - folio_unlock(folio); - continue; } - - mdev = (struct erofs_map_dev) { - .m_deviceid = map.m_deviceid, - .m_pa = map.m_pa, - }; - ret = erofs_map_dev(sb, &mdev); - if (ret) - return; - - rreq = erofs_fscache_alloc_request(rac->mapping, offset, count); - if (IS_ERR(rreq)) - return; - /* - * Drop the ref of folios here. Unlock them in - * rreq_unlock_folios() when rreq complete. - */ - erofs_fscache_advance_folios(rac, count, false); - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - rreq, mdev.m_pa + (pos - map.m_la)); - if (!ret) - ret = count; - } while (ret > 0 && ((done += ret) < len)); + } while ((done += ret) < len); } static const struct address_space_operations erofs_fscache_meta_aops = { @@ -417,9 +380,114 @@ const struct address_space_operations erofs_fscache_access_aops = { .readahead = erofs_fscache_readahead, }; -int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode) +static void erofs_fscache_domain_put(struct erofs_domain *domain) +{ + if (!domain) + return; + mutex_lock(&erofs_domain_list_lock); + if (refcount_dec_and_test(&domain->ref)) { + list_del(&domain->list); + if (list_empty(&erofs_domain_list)) { + kern_unmount(erofs_pseudo_mnt); + erofs_pseudo_mnt = NULL; + } + mutex_unlock(&erofs_domain_list_lock); + fscache_relinquish_volume(domain->volume, NULL, false); + kfree(domain->domain_id); + kfree(domain); + return; + } + mutex_unlock(&erofs_domain_list_lock); +} + +static int erofs_fscache_register_volume(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + char *domain_id = sbi->opt.domain_id; + struct fscache_volume *volume; + char *name; + int ret = 0; + + name = kasprintf(GFP_KERNEL, "erofs,%s", + domain_id ? domain_id : sbi->opt.fsid); + if (!name) + return -ENOMEM; + + volume = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR_OR_NULL(volume)) { + erofs_err(sb, "failed to register volume for %s", name); + ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; + volume = NULL; + } + + sbi->volume = volume; + kfree(name); + return ret; +} + +static int erofs_fscache_init_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL); + if (!domain) + return -ENOMEM; + + domain->domain_id = kstrdup(sbi->opt.domain_id, GFP_KERNEL); + if (!domain->domain_id) { + kfree(domain); + return -ENOMEM; + } + + err = erofs_fscache_register_volume(sb); + if (err) + goto out; + + if (!erofs_pseudo_mnt) { + erofs_pseudo_mnt = kern_mount(&erofs_fs_type); + if (IS_ERR(erofs_pseudo_mnt)) { + err = PTR_ERR(erofs_pseudo_mnt); + goto out; + } + } + + domain->volume = sbi->volume; + refcount_set(&domain->ref, 1); + list_add(&domain->list, &erofs_domain_list); + sbi->domain = domain; + return 0; +out: + kfree(domain->domain_id); + kfree(domain); + return err; +} + +static int erofs_fscache_register_domain(struct super_block *sb) +{ + int err; + struct erofs_domain *domain; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + mutex_lock(&erofs_domain_list_lock); + list_for_each_entry(domain, &erofs_domain_list, list) { + if (!strcmp(domain->domain_id, sbi->opt.domain_id)) { + sbi->domain = domain; + sbi->volume = domain->volume; + refcount_inc(&domain->ref); + mutex_unlock(&erofs_domain_list_lock); + return 0; + } + } + err = erofs_fscache_init_domain(sb); + mutex_unlock(&erofs_domain_list_lock); + return err; +} + +static +struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, bool need_inode) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; @@ -428,7 +496,7 @@ int erofs_fscache_register_cookie(struct super_block *sb, ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) - return -ENOMEM; + return ERR_PTR(-ENOMEM); cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, name, strlen(name), NULL, 0, 0); @@ -458,63 +526,146 @@ int erofs_fscache_register_cookie(struct super_block *sb, ctx->inode = inode; } - *fscache = ctx; - return 0; + return ctx; err_cookie: fscache_unuse_cookie(ctx->cookie, NULL, NULL); fscache_relinquish_cookie(ctx->cookie, false); - ctx->cookie = NULL; err: kfree(ctx); - return ret; + return ERR_PTR(ret); } -void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache) +static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) { - struct erofs_fscache *ctx = *fscache; - - if (!ctx) - return; - fscache_unuse_cookie(ctx->cookie, NULL, NULL); fscache_relinquish_cookie(ctx->cookie, false); - ctx->cookie = NULL; - iput(ctx->inode); - ctx->inode = NULL; - + kfree(ctx->name); kfree(ctx); - *fscache = NULL; +} + +static +struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, + char *name, bool need_inode) +{ + int err; + struct inode *inode; + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + + ctx = erofs_fscache_acquire_cookie(sb, name, need_inode); + if (IS_ERR(ctx)) + return ctx; + + ctx->name = kstrdup(name, GFP_KERNEL); + if (!ctx->name) { + err = -ENOMEM; + goto out; + } + + inode = new_inode(erofs_pseudo_mnt->mnt_sb); + if (!inode) { + err = -ENOMEM; + goto out; + } + + ctx->domain = domain; + ctx->anon_inode = inode; + inode->i_private = ctx; + refcount_inc(&domain->ref); + return ctx; +out: + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(err); +} + +static +struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, bool need_inode) +{ + struct inode *inode; + struct erofs_fscache *ctx; + struct erofs_domain *domain = EROFS_SB(sb)->domain; + struct super_block *psb = erofs_pseudo_mnt->mnt_sb; + + mutex_lock(&erofs_domain_cookies_lock); + list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { + ctx = inode->i_private; + if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) + continue; + igrab(inode); + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; + } + ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode); + mutex_unlock(&erofs_domain_cookies_lock); + return ctx; +} + +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, bool need_inode) +{ + if (EROFS_SB(sb)->opt.domain_id) + return erofs_domain_register_cookie(sb, name, need_inode); + return erofs_fscache_acquire_cookie(sb, name, need_inode); +} + +void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) +{ + bool drop; + struct erofs_domain *domain; + + if (!ctx) + return; + domain = ctx->domain; + if (domain) { + mutex_lock(&erofs_domain_cookies_lock); + drop = atomic_read(&ctx->anon_inode->i_count) == 1; + iput(ctx->anon_inode); + mutex_unlock(&erofs_domain_cookies_lock); + if (!drop) + return; + } + + erofs_fscache_relinquish_cookie(ctx); + erofs_fscache_domain_put(domain); } int erofs_fscache_register_fs(struct super_block *sb) { + int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); - struct fscache_volume *volume; - char *name; - int ret = 0; + struct erofs_fscache *fscache; - name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid); - if (!name) - return -ENOMEM; + if (sbi->opt.domain_id) + ret = erofs_fscache_register_domain(sb); + else + ret = erofs_fscache_register_volume(sb); + if (ret) + return ret; - volume = fscache_acquire_volume(name, NULL, NULL, 0); - if (IS_ERR_OR_NULL(volume)) { - erofs_err(sb, "failed to register volume for %s", name); - ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP; - volume = NULL; - } + /* acquired domain/volume will be relinquished in kill_sb() on error */ + fscache = erofs_fscache_register_cookie(sb, sbi->opt.fsid, true); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); - sbi->volume = volume; - kfree(name); - return ret; + sbi->s_fscache = fscache; + return 0; } void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - fscache_relinquish_volume(sbi->volume, NULL, false); + erofs_fscache_unregister_cookie(sbi->s_fscache); + + if (sbi->domain) + erofs_fscache_domain_put(sbi->domain); + else + fscache_relinquish_volume(sbi->volume, NULL, false); + + sbi->s_fscache = NULL; sbi->volume = NULL; + sbi->domain = NULL; } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 95a403720e8c..ad2a82f2eb4c 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -214,7 +214,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= EROFS_BLKSIZ) { + inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) { inode->i_op = &erofs_symlink_iops; return 0; } @@ -241,7 +241,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, return 0; } -static int erofs_fill_inode(struct inode *inode, int isdir) +static int erofs_fill_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; @@ -249,7 +249,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir) unsigned int ofs; int err = 0; - trace_erofs_fill_inode(inode, isdir); + trace_erofs_fill_inode(inode); /* read inode base data from disk */ kaddr = erofs_read_inode(&buf, inode, &ofs); @@ -324,21 +324,13 @@ static int erofs_iget_set_actor(struct inode *inode, void *opaque) return 0; } -static inline struct inode *erofs_iget_locked(struct super_block *sb, - erofs_nid_t nid) +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) { const unsigned long hashval = erofs_inode_hash(nid); + struct inode *inode; - return iget5_locked(sb, hashval, erofs_ilookup_test_actor, + inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor, erofs_iget_set_actor, &nid); -} - -struct inode *erofs_iget(struct super_block *sb, - erofs_nid_t nid, - bool isdir) -{ - struct inode *inode = erofs_iget_locked(sb, nid); - if (!inode) return ERR_PTR(-ENOMEM); @@ -348,10 +340,10 @@ struct inode *erofs_iget(struct super_block *sb, vi->nid = nid; - err = erofs_fill_inode(inode, isdir); - if (!err) + err = erofs_fill_inode(inode); + if (!err) { unlock_new_inode(inode); - else { + } else { iget_failed(inode); inode = ERR_PTR(err); } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index cfee49d33b95..1701df48c446 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -76,6 +76,7 @@ struct erofs_mount_opts { #endif unsigned int mount_opt; char *fsid; + char *domain_id; }; struct erofs_dev_context { @@ -98,9 +99,19 @@ struct erofs_sb_lz4_info { u16 max_pclusterblks; }; +struct erofs_domain { + refcount_t ref; + struct list_head list; + struct fscache_volume *volume; + char *domain_id; +}; + struct erofs_fscache { struct fscache_cookie *cookie; struct inode *inode; + struct inode *anon_inode; + struct erofs_domain *domain; + char *name; }; struct erofs_sb_info { @@ -120,6 +131,7 @@ struct erofs_sb_info { struct inode *managed_cache; struct erofs_sb_lz4_info lz4; + struct inode *packed_inode; #endif /* CONFIG_EROFS_FS_ZIP */ struct erofs_dev_context *devs; struct dax_device *dax_dev; @@ -157,6 +169,7 @@ struct erofs_sb_info { /* fscache support */ struct fscache_volume *volume; struct erofs_fscache *s_fscache; + struct erofs_domain *domain; }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -183,7 +196,6 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -#ifdef CONFIG_EROFS_FS_ZIP #define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) /* basic unit of the workstation of a super_block */ @@ -195,7 +207,6 @@ struct erofs_workgroup { atomic_t refcount; }; -#if defined(CONFIG_SMP) static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, int val) { @@ -224,35 +235,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) return atomic_cond_read_relaxed(&grp->refcount, VAL != EROFS_LOCKED_MAGIC); } -#else -static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, - int val) -{ - preempt_disable(); - /* no need to spin on UP platforms, let's just disable preemption. */ - if (val != atomic_read(&grp->refcount)) { - preempt_enable(); - return false; - } - return true; -} - -static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, - int orig_val) -{ - preempt_enable(); -} - -static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) -{ - int v = atomic_read(&grp->refcount); - - /* workgroup is never freezed on uniprocessor systems */ - DBG_BUGON(v == EROFS_LOCKED_MAGIC); - return v; -} -#endif /* !CONFIG_SMP */ -#endif /* !CONFIG_EROFS_FS_ZIP */ /* we strictly follow PAGE_SIZE and no buffer head yet */ #define LOG_BLOCK_SIZE PAGE_SHIFT @@ -306,6 +288,8 @@ EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) +EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) +EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -341,8 +325,13 @@ struct erofs_inode { unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; unsigned long z_tailextent_headlcn; - erofs_off_t z_idataoff; - unsigned short z_idata_size; + union { + struct { + erofs_off_t z_idataoff; + unsigned short z_idata_size; + }; + erofs_off_t z_fragmentoff; + }; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -393,6 +382,7 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, } extern const struct super_operations erofs_sops; +extern struct file_system_type erofs_fs_type; extern const struct address_space_operations erofs_raw_access_aops; extern const struct address_space_operations z_erofs_aops; @@ -400,6 +390,8 @@ extern const struct address_space_operations z_erofs_aops; enum { BH_Encoded = BH_PrivateStart, BH_FullMapped, + BH_Fragment, + BH_Partialref, }; /* Has a disk mapping */ @@ -410,6 +402,10 @@ enum { #define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +/* Located in the special packed inode */ +#define EROFS_MAP_FRAGMENT (1 << BH_Fragment) +/* The extent refers to partial decompressed data */ +#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref) struct erofs_map_blocks { struct erofs_buf buf; @@ -431,11 +427,12 @@ struct erofs_map_blocks { #define EROFS_GET_BLOCKS_FIEMAP 0x0002 /* Used to map the whole extent if non-negligible data is requested for LZMA */ #define EROFS_GET_BLOCKS_READMORE 0x0004 -/* Used to map tail extent for tailpacking inline pcluster */ +/* Used to map tail extent for tailpacking inline or fragment pcluster */ #define EROFS_GET_BLOCKS_FINDTAIL 0x0008 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_INTERLACED, Z_EROFS_COMPRESSION_RUNTIME_MAX }; @@ -495,7 +492,7 @@ extern const struct inode_operations erofs_generic_iops; extern const struct inode_operations erofs_symlink_iops; extern const struct inode_operations erofs_fast_symlink_iops; -struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); @@ -610,27 +607,26 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); -int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode); -void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache); +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, bool need_inode); +void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); extern const struct address_space_operations erofs_fscache_access_aops; #else static inline int erofs_fscache_register_fs(struct super_block *sb) { - return 0; + return -EOPNOTSUPP; } static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} -static inline int erofs_fscache_register_cookie(struct super_block *sb, - struct erofs_fscache **fscache, - char *name, bool need_inode) +static inline +struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, + char *name, bool need_inode) { - return -EOPNOTSUPP; + return ERR_PTR(-EOPNOTSUPP); } -static inline void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache) +static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache) { } #endif diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index fd75506799c4..0dc34721080c 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -185,7 +185,6 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, if (IS_ERR(de)) return PTR_ERR(de); - /* the target page has been mapped */ if (ndirents) de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents); @@ -197,9 +196,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, return PTR_ERR_OR_ZERO(de); } -/* NOTE: i_mutex is already held by vfs */ -static struct dentry *erofs_lookup(struct inode *dir, - struct dentry *dentry, +static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int err; @@ -207,17 +204,11 @@ static struct dentry *erofs_lookup(struct inode *dir, unsigned int d_type; struct inode *inode; - DBG_BUGON(!d_really_is_negative(dentry)); - /* dentry must be unhashed in lookup, no need to worry about */ - DBG_BUGON(!d_unhashed(dentry)); - trace_erofs_lookup(dir, dentry, flags); - /* file name exceeds fs limit */ if (dentry->d_name.len > EROFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - /* false uninitialized warnings on gcc 4.8.x */ err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); if (err == -ENOENT) { @@ -228,7 +219,7 @@ static struct dentry *erofs_lookup(struct inode *dir, } else { erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__, dentry, nid, d_type); - inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR); + inode = erofs_iget(dir->i_sb, nid); } return d_splice_alias(inode, dentry); } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 3173debeaa5a..2cf96ce1c32e 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -224,10 +224,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_device_info *dif, erofs_off_t *pos) { struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_fscache *fscache; struct erofs_deviceslot *dis; struct block_device *bdev; void *ptr; - int ret; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP); if (IS_ERR(ptr)) @@ -245,10 +245,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, } if (erofs_is_fscache_mode(sb)) { - ret = erofs_fscache_register_cookie(sb, &dif->fscache, - dif->path, false); - if (ret) - return ret; + fscache = erofs_fscache_register_cookie(sb, dif->path, false); + if (IS_ERR(fscache)) + return PTR_ERR(fscache); + dif->fscache = fscache; } else { bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, sb->s_type); @@ -381,6 +381,17 @@ static int erofs_read_superblock(struct super_block *sb) #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); +#ifdef CONFIG_EROFS_FS_ZIP + sbi->packed_inode = NULL; + if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) { + sbi->packed_inode = + erofs_iget(sb, le64_to_cpu(dsb->packed_nid)); + if (IS_ERR(sbi->packed_inode)) { + ret = PTR_ERR(sbi->packed_inode); + goto out; + } + } +#endif sbi->inos = le64_to_cpu(dsb->inos); sbi->build_time = le64_to_cpu(dsb->build_time); @@ -411,6 +422,10 @@ static int erofs_read_superblock(struct super_block *sb) erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); + if (erofs_sb_has_fragments(sbi)) + erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!"); + if (erofs_sb_has_dedupe(sbi)) + erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; @@ -440,6 +455,7 @@ enum { Opt_dax_enum, Opt_device, Opt_fsid, + Opt_domain_id, Opt_err }; @@ -465,6 +481,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), + fsparam_string("domain_id", Opt_domain_id), {} }; @@ -570,6 +587,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "fsid option not supported"); #endif break; + case Opt_domain_id: +#ifdef CONFIG_EROFS_FS_ONDEMAND + kfree(ctx->opt.domain_id); + ctx->opt.domain_id = kstrdup(param->string, GFP_KERNEL); + if (!ctx->opt.domain_id) + return -ENOMEM; +#else + errorfc(fc, "domain_id option not supported"); +#endif + break; default: return -ENOPARAM; } @@ -641,7 +668,7 @@ static int erofs_init_managed_cache(struct super_block *sb) { return 0; } static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { - return erofs_iget(sb, ino, false); + return erofs_iget(sb, ino); } static struct dentry *erofs_fh_to_dentry(struct super_block *sb, @@ -667,7 +694,7 @@ static struct dentry *erofs_get_parent(struct dentry *child) err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type); if (err) return ERR_PTR(err); - return d_obtain_alias(erofs_iget(child->d_sb, nid, d_type == FT_DIR)); + return d_obtain_alias(erofs_iget(child->d_sb, nid)); } static const struct export_operations erofs_export_ops = { @@ -676,6 +703,13 @@ static const struct export_operations erofs_export_ops = { .get_parent = erofs_get_parent, }; +static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc) +{ + static const struct tree_descr empty_descr = {""}; + + return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr); +} + static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; @@ -695,6 +729,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_fs_info = sbi; sbi->opt = ctx->opt; ctx->opt.fsid = NULL; + ctx->opt.domain_id = NULL; sbi->devs = ctx->devs; ctx->devs = NULL; @@ -706,11 +741,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - err = erofs_fscache_register_cookie(sb, &sbi->s_fscache, - sbi->opt.fsid, true); - if (err) - return err; - err = super_setup_bdi(sb); if (err) return err; @@ -752,7 +782,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) #endif /* get the root inode */ - inode = erofs_iget(sb, ROOT_NID(sbi), true); + inode = erofs_iget(sb, ROOT_NID(sbi)); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -781,6 +811,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return 0; } +static int erofs_fc_anon_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, erofs_fc_fill_pseudo_super); +} + static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_fs_context *ctx = fc->fs_private; @@ -817,7 +852,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); - erofs_fscache_unregister_cookie(&dif->fscache); + erofs_fscache_unregister_cookie(dif->fscache); + dif->fscache = NULL; kfree(dif->path); kfree(dif); return 0; @@ -838,6 +874,7 @@ static void erofs_fc_free(struct fs_context *fc) erofs_free_dev_context(ctx->devs); kfree(ctx->opt.fsid); + kfree(ctx->opt.domain_id); kfree(ctx); } @@ -848,10 +885,21 @@ static const struct fs_context_operations erofs_context_ops = { .free = erofs_fc_free, }; +static const struct fs_context_operations erofs_anon_context_ops = { + .get_tree = erofs_fc_anon_get_tree, +}; + static int erofs_init_fs_context(struct fs_context *fc) { - struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + struct erofs_fs_context *ctx; + + /* pseudo mount for anon inodes */ + if (fc->sb_flags & SB_KERNMOUNT) { + fc->ops = &erofs_anon_context_ops; + return 0; + } + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); @@ -878,8 +926,14 @@ static void erofs_kill_sb(struct super_block *sb) WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); + /* pseudo mount for anon inodes */ + if (sb->s_flags & SB_KERNMOUNT) { + kill_anon_super(sb); + return; + } + if (erofs_is_fscache_mode(sb)) - generic_shutdown_super(sb); + kill_anon_super(sb); else kill_block_super(sb); @@ -889,9 +943,9 @@ static void erofs_kill_sb(struct super_block *sb) erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); - erofs_fscache_unregister_cookie(&sbi->s_fscache); erofs_fscache_unregister_fs(sb); kfree(sbi->opt.fsid); + kfree(sbi->opt.domain_id); kfree(sbi); sb->s_fs_info = NULL; } @@ -908,11 +962,13 @@ static void erofs_put_super(struct super_block *sb) #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); sbi->managed_cache = NULL; + iput(sbi->packed_inode); + sbi->packed_inode = NULL; #endif - erofs_fscache_unregister_cookie(&sbi->s_fscache); + erofs_fscache_unregister_fs(sb); } -static struct file_system_type erofs_fs_type = { +struct file_system_type erofs_fs_type = { .owner = THIS_MODULE, .name = "erofs", .init_fs_context = erofs_init_fs_context, @@ -1044,6 +1100,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) #ifdef CONFIG_EROFS_FS_ONDEMAND if (opt->fsid) seq_printf(seq, ",fsid=%s", opt->fsid); + if (opt->domain_id) + seq_printf(seq, ",domain_id=%s", opt->domain_id); #endif return 0; } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index c1383e508bbe..783bb7b21b51 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -76,6 +76,8 @@ EROFS_ATTR_FEATURE(device_table); EROFS_ATTR_FEATURE(compr_head2); EROFS_ATTR_FEATURE(sb_chksum); EROFS_ATTR_FEATURE(ztailpacking); +EROFS_ATTR_FEATURE(fragments); +EROFS_ATTR_FEATURE(dedupe); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), @@ -86,6 +88,8 @@ static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(compr_head2), ATTR_LIST(sb_chksum), ATTR_LIST(ztailpacking), + ATTR_LIST(fragments), + ATTR_LIST(dedupe), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); @@ -201,12 +205,27 @@ static struct kobject erofs_feat = { int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); + char *name; + char *str = NULL; int err; + if (erofs_is_fscache_mode(sb)) { + if (sbi->opt.domain_id) { + str = kasprintf(GFP_KERNEL, "%s,%s", sbi->opt.domain_id, + sbi->opt.fsid); + if (!str) + return -ENOMEM; + name = str; + } else { + name = sbi->opt.fsid; + } + } else { + name = sb->s_id; + } sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", - erofs_is_fscache_mode(sb) ? sbi->opt.fsid : sb->s_id); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name); + kfree(str); if (err) goto put_sb_kobj; return 0; diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 332462c59f11..0a43c9ee9f8f 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -39,9 +39,7 @@ static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, #ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; -#ifdef CONFIG_EROFS_FS_SECURITY extern const struct xattr_handler erofs_xattr_security_handler; -#endif static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5792ca9e0d5e..cce56dde135c 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -650,6 +650,35 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, la < fe->headoffset; } +static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, + struct page *page, unsigned int pageofs, + unsigned int len) +{ + struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + u8 *src, *dst; + unsigned int i, cnt; + + pos += EROFS_I(inode)->z_fragmentoff; + for (i = 0; i < len; i += cnt) { + cnt = min_t(unsigned int, len - i, + EROFS_BLKSIZ - erofs_blkoff(pos)); + src = erofs_bread(&buf, packed_inode, + erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(src)) { + erofs_put_metabuf(&buf); + return PTR_ERR(src); + } + + dst = kmap_local_page(page); + memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt); + kunmap_local(dst); + pos += cnt; + } + erofs_put_metabuf(&buf); + return 0; +} + static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct page *page, struct page **pagepool) { @@ -688,7 +717,8 @@ repeat: /* didn't get a valid pcluster previously (very rare) */ } - if (!(map->m_flags & EROFS_MAP_MAPPED)) + if (!(map->m_flags & EROFS_MAP_MAPPED) || + map->m_flags & EROFS_MAP_FRAGMENT) goto hitted; err = z_erofs_collector_begin(fe); @@ -735,6 +765,24 @@ hitted: zero_user_segment(page, cur, end); goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { + unsigned int pageofs, skip, len; + + if (offset > map->m_la) { + pageofs = 0; + skip = offset - map->m_la; + } else { + pageofs = map->m_la & ~PAGE_MASK; + skip = 0; + } + len = min_t(unsigned int, map->m_llen - skip, end - cur); + err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + if (err) + goto out; + ++spiltted; + tight = false; + goto next_part; + } exclusive = (!cur && (!spiltted || tight)); if (cur) @@ -766,6 +814,7 @@ retry: fe->pcl->multibases = true; if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + !(map->m_flags & EROFS_MAP_PARTIAL_REF) && fe->pcl->length == map->m_llen) fe->pcl->partial = false; if (fe->pcl->length < offset + end - map->m_la) { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 572f0b8151ba..44c27ef39c43 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -17,7 +17,7 @@ int z_erofs_fill_inode(struct inode *inode) struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!erofs_sb_has_big_pcluster(sbi) && - !erofs_sb_has_ztailpacking(sbi) && + !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) && vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { vi->z_advise = 0; vi->z_algorithmtype[0] = 0; @@ -55,10 +55,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; - DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && - !erofs_sb_has_ztailpacking(EROFS_SB(sb)) && - vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), @@ -69,6 +65,16 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) } h = kaddr + erofs_blkoff(pos); + /* + * if the highest bit of the 8-byte map header is set, the whole file + * is stored in the packed inode. The rest bits keeps z_fragmentoff. + */ + if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) { + vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63); + vi->z_tailextent_headlcn = 0; + goto unmap_done; + } vi->z_advise = le16_to_cpu(h->h_advise); vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; @@ -123,6 +129,20 @@ unmap_done: if (err < 0) goto out_unlock; } + + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && + !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { + struct erofs_map_blocks map = { + .buf = __EROFS_BUF_INITIALIZER + }; + + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + err = z_erofs_do_map_blocks(inode, &map, + EROFS_GET_BLOCKS_FINDTAIL); + erofs_put_metabuf(&map.buf); + if (err < 0) + goto out_unlock; + } /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); @@ -141,22 +161,11 @@ struct z_erofs_maprecorder { u8 type, headtype; u16 clusterofs; u16 delta[2]; - erofs_blk_t pblk, compressedlcs; + erofs_blk_t pblk, compressedblks; erofs_off_t nextpackoff; + bool partialref; }; -static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, - erofs_blk_t eblk) -{ - struct super_block *const sb = m->inode->i_sb; - - m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk, - EROFS_KMAP_ATOMIC); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - return 0; -} - static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned long lcn) { @@ -169,11 +178,11 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, lcn * sizeof(struct z_erofs_vle_decompressed_index); struct z_erofs_vle_decompressed_index *di; unsigned int advise, type; - int err; - err = z_erofs_reload_indexes(m, erofs_blknr(pos)); - if (err) - return err; + m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, + erofs_blknr(pos), EROFS_KMAP_ATOMIC); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index); m->lcn = lcn; @@ -192,7 +201,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedlcs = m->delta[0] & + m->compressedblks = m->delta[0] & ~Z_EROFS_VLE_DI_D0_CBLKCNT; m->delta[0] = 1; } @@ -201,6 +210,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + if (advise & Z_EROFS_VLE_DI_PARTIAL_REF) + m->partialref = true; m->clusterofs = le16_to_cpu(di->di_clusterofs); m->pblk = le32_to_cpu(di->di_u.blkaddr); break; @@ -293,7 +304,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->compressedblks = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; m->delta[0] = 1; return 0; } else if (i + 1 != (int)vcnt) { @@ -370,7 +381,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int compacted_4b_initial, compacted_2b; unsigned int amortizedshift; erofs_off_t pos; - int err; if (lclusterbits != 12) return -EOPNOTSUPP; @@ -407,9 +417,10 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, amortizedshift = 2; out: pos += lcn * (1 << amortizedshift); - err = z_erofs_reload_indexes(m, erofs_blknr(pos)); - if (err) - return err; + m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, + erofs_blknr(pos), EROFS_KMAP_ATOMIC); + if (IS_ERR(m->kaddr)) + return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); } @@ -497,7 +508,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, return 0; } lcn = m->lcn + 1; - if (m->compressedlcs) + if (m->compressedblks) goto out; err = z_erofs_load_cluster_from_disk(m, lcn, false); @@ -506,7 +517,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, /* * If the 1st NONHEAD lcluster has already been handled initially w/o - * valid compressedlcs, which means at least it mustn't be CBLKCNT, or + * valid compressedblks, which means at least it mustn't be CBLKCNT, or * an internal implemenatation error is detected. * * The following code can also handle it properly anyway, but let's @@ -523,12 +534,12 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. */ - m->compressedlcs = 1; + m->compressedblks = 1 << (lclusterbits - LOG_BLOCK_SIZE); break; case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: if (m->delta[0] != 1) goto err_bonus_cblkcnt; - if (m->compressedlcs) + if (m->compressedblks) break; fallthrough; default: @@ -539,7 +550,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, return -EFSCORRUPTED; } out: - map->m_plen = (u64)m->compressedlcs << lclusterbits; + map->m_plen = (u64)m->compressedblks << LOG_BLOCK_SIZE; return 0; err_bonus_cblkcnt: erofs_err(m->inode->i_sb, @@ -598,6 +609,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, { struct erofs_inode *const vi = EROFS_I(inode); bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; struct z_erofs_maprecorder m = { .inode = inode, .map = map, @@ -663,15 +675,23 @@ static int z_erofs_do_map_blocks(struct inode *inode, err = -EOPNOTSUPP; goto unmap_out; } - + if (m.partialref) + map->m_flags |= EROFS_MAP_PARTIAL_REF; map->m_llen = end - map->m_la; - if (flags & EROFS_GET_BLOCKS_FINDTAIL) + if (flags & EROFS_GET_BLOCKS_FINDTAIL) { vi->z_tailextent_headlcn = m.lcn; + /* for non-compact indexes, fragmentoff is 64 bits */ + if (fragment && + vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + vi->z_fragmentoff |= (u64)m.pblk << 32; + } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; map->m_pa = vi->z_idataoff; map->m_plen = vi->z_idata_size; + } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { + map->m_flags |= EROFS_MAP_FRAGMENT; } else { map->m_pa = blknr_to_addr(m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); @@ -679,12 +699,18 @@ static int z_erofs_do_map_blocks(struct inode *inode, goto out; } - if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) - map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) + if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) + map->m_algorithmformat = + Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = + Z_EROFS_COMPRESSION_SHIFTED; + } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { map->m_algorithmformat = vi->z_algorithmtype[1]; - else + } else { map->m_algorithmformat = vi->z_algorithmtype[0]; + } if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && @@ -705,10 +731,10 @@ out: return err; } -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { + struct erofs_inode *const vi = EROFS_I(inode); int err = 0; trace_z_erofs_map_blocks_iter_enter(inode, map, flags); @@ -725,6 +751,15 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto out; + if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | + EROFS_MAP_FRAGMENT; + goto out; + } + err = z_erofs_do_map_blocks(inode, map, flags); out: trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); @@ -751,7 +786,8 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, iomap->length = map.m_llen; if (map.m_flags & EROFS_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_pa; + iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ? + IOMAP_NULL_ADDR : map.m_pa; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; diff --git a/fs/exec.c b/fs/exec.c index f793221f4eb6..768843477a49 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,7 +65,6 @@ #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> -#include <linux/time_namespace.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -584,11 +583,11 @@ static int copy_strings(int argc, struct user_arg_ptr argv, if (kmapped_page) { flush_dcache_page(kmapped_page); - kunmap(kmapped_page); + kunmap_local(kaddr); put_arg_page(kmapped_page); } kmapped_page = page; - kaddr = kmap(kmapped_page); + kaddr = kmap_local_page(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } @@ -602,7 +601,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, out: if (kmapped_page) { flush_dcache_page(kmapped_page); - kunmap(kmapped_page); + kunmap_local(kaddr); put_arg_page(kmapped_page); } return ret; @@ -880,11 +879,11 @@ int transfer_args_to_stack(struct linux_binprm *bprm, for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; - char *src = kmap(bprm->page[index]) + offset; + char *src = kmap_local_page(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; - kunmap(bprm->page[index]); + kunmap_local(src); if (ret) goto out; } @@ -958,8 +957,7 @@ struct file *open_exec(const char *name) } EXPORT_SYMBOL(open_exec); -#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ - defined(CONFIG_BINFMT_ELF_FDPIC) +#if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); @@ -979,12 +977,10 @@ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; - bool vfork; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; - vfork = !!tsk->vfork_done; old_mm = current->mm; exec_mm_release(tsk, old_mm); if (old_mm) @@ -1029,10 +1025,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm->vmacache_seqnum = 0; vmacache_flush(tsk); task_unlock(tsk); - - if (vfork) - timens_on_fork(tsk->nsproxy, tsk); - if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); @@ -1595,7 +1587,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct user_namespace *mnt_userns; - struct inode *inode; + struct inode *inode = file_inode(file); unsigned int mode; kuid_t uid; kgid_t gid; @@ -1606,7 +1598,6 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) if (task_no_new_privs(current)) return; - inode = file->f_path.dentry->d_inode; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; @@ -1686,13 +1677,13 @@ int remove_arg_zero(struct linux_binprm *bprm) ret = -EFAULT; goto out; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index a27b55ec060a..0fc08fdcba73 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -212,9 +212,9 @@ static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb) /* skip iterating emit_dots when dir is empty */ #define ITER_POS_FILLED_DOTS (2) -static int exfat_iterate(struct file *filp, struct dir_context *ctx) +static int exfat_iterate(struct file *file, struct dir_context *ctx) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct inode *tmp; struct exfat_dir_entry de; @@ -228,7 +228,7 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx) mutex_lock(&EXFAT_SB(sb)->s_lock); cpos = ctx->pos; - if (!dir_emit_dots(filp, ctx)) + if (!dir_emit_dots(file, ctx)) goto unlock; if (ctx->pos == ITER_POS_FILLED_DOTS) { diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index ee0b7cf51157..41ae4cce1f42 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -270,8 +270,7 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu) struct super_block *sb = dir->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct buffer_head *bh; - sector_t blknr, last_blknr; - int i; + sector_t blknr, last_blknr, i; blknr = exfat_cluster_to_sector(sbi, clu); last_blknr = blknr + sbi->sect_per_clus; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 3ef80d000e13..c648a493faf2 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -248,21 +248,20 @@ struct getdents_callback { * A rather strange filldir function to capture * the name matching the specified inode number. */ -static int filldir_one(struct dir_context *ctx, const char *name, int len, +static bool filldir_one(struct dir_context *ctx, const char *name, int len, loff_t pos, u64 ino, unsigned int d_type) { struct getdents_callback *buf = container_of(ctx, struct getdents_callback, ctx); - int result = 0; buf->sequence++; if (buf->ino == ino && len <= NAME_MAX) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; - result = -1; + return false; // no more } - return result; + return true; } /** diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9bca5565547b..e5f2f5ca5120 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -167,8 +167,6 @@ enum SHIFT_DIRECTION { #define EXT4_MB_CR0_OPTIMIZED 0x8000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ #define EXT4_MB_CR1_OPTIMIZED 0x00010000 -/* Perform linear traversal for one group */ -#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; @@ -1600,8 +1598,8 @@ struct ext4_sb_info { struct list_head s_discard_list; struct work_struct s_discard_work; atomic_t s_retry_alloc_pending; - struct rb_root s_mb_avg_fragment_size_root; - rwlock_t s_mb_rb_lock; + struct list_head *s_mb_avg_fragment_size; + rwlock_t *s_mb_avg_fragment_size_locks; struct list_head *s_mb_largest_free_orders; rwlock_t *s_mb_largest_free_orders_locks; @@ -2979,6 +2977,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct user_namespace *, struct dentry *, struct iattr *); +extern u32 ext4_dio_alignment(struct inode *inode); extern int ext4_getattr(struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); @@ -3413,6 +3412,8 @@ struct ext4_group_info { ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + int bb_avg_fragment_size_order; /* order of average + fragment in BG */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; @@ -3420,7 +3421,7 @@ struct ext4_group_info { void *bb_bitmap; #endif struct rw_semaphore alloc_sem; - struct rb_node bb_avg_fragment_size_rb; + struct list_head bb_avg_fragment_size_node; struct list_head bb_largest_free_order_node; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c148bb97b527..5235974126bd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -460,6 +460,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid eh_entries"; goto corrupted; } + if (unlikely((eh->eh_entries == 0) && (depth > 0))) { + error_msg = "eh_entries is 0 but eh_depth is > 0"; + goto corrupted; + } if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 109d07629f81..8bb1c35fd6dd 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -36,19 +36,34 @@ #include "acl.h" #include "truncate.h" -static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter) +/* + * Returns %true if the given DIO request should be attempted with DIO, or + * %false if it should fall back to buffered I/O. + * + * DIO isn't well specified; when it's unsupported (either due to the request + * being misaligned, or due to the file not supporting DIO at all), filesystems + * either fall back to buffered I/O or return EINVAL. For files that don't use + * any special features like encryption or verity, ext4 has traditionally + * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. + * In this case, we should attempt the DIO, *not* fall back to buffered I/O. + * + * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 + * traditionally falls back to buffered I/O. + * + * This function implements the traditional ext4 behavior in all these cases. + */ +static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); + u32 dio_align = ext4_dio_alignment(inode); - if (!fscrypt_dio_supported(iocb, iter)) - return false; - if (fsverity_active(inode)) + if (dio_align == 0) return false; - if (ext4_should_journal_data(inode)) - return false; - if (ext4_has_inline_data(inode)) - return false; - return true; + + if (dio_align == 1) + return true; + + return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -63,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) inode_lock_shared(inode); } - if (!ext4_dio_supported(iocb, to)) { + if (!ext4_should_use_dio(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on @@ -511,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) } /* Fallback to buffered I/O if the inode does not support direct I/O. */ - if (!ext4_dio_supported(iocb, from)) { + if (!ext4_should_use_dio(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f73e5eb43eae..208b87ce8858 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -510,7 +510,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, goto fallback; } - max_dirs = ndirs / ngroups + inodes_per_group / 16; + max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 601214453c3a..364774230d87 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5550,6 +5550,22 @@ err_out: return error; } +u32 ext4_dio_alignment(struct inode *inode) +{ + if (fsverity_active(inode)) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + if (ext4_has_inline_data(inode)) + return 0; + if (IS_ENCRYPTED(inode)) { + if (!fscrypt_dio_supported(inode)) + return 0; + return i_blocksize(inode); + } + return 1; /* use the iomap defaults */ +} + int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -5565,6 +5581,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. + */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + u32 dio_align = ext4_dio_alignment(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (dio_align == 1) { + struct block_device *bdev = inode->i_sb->s_bdev; + + /* iomap defaults */ + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + } else { + stat->dio_mem_align = dio_align; + stat->dio_offset_align = dio_align; + } + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bd8f8b5c3d30..9dad93059945 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -140,13 +140,15 @@ * number of buddy bitmap orders possible) number of lists. Group-infos are * placed in appropriate lists. * - * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) + * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) * - * Locking: sbi->s_mb_rb_lock (rwlock) + * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) * - * This is a red black tree consisting of group infos and the tree is sorted - * by average fragment sizes (which is calculated as ext4_group_info->bb_free - * / ext4_group_info->bb_fragments). + * This is an array of lists where in the i-th list there are groups with + * average fragment size >= 2^i and < 2^(i+1). The average fragment size + * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. + * Note that we don't bother with a special list for completely empty groups + * so we only have MB_NUM_ORDERS(sb) lists. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for @@ -160,7 +162,8 @@ * * At CR = 1, we only consider groups where average fragment size > request * size. So, we lookup a group which has average fragment size just above or - * equal to request size using our rb tree (data structure 2) in O(log N) time. + * equal to request size using our average fragment size group lists (data + * structure 2) in O(1) time. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR 0 and CR 1 phase. @@ -802,65 +805,51 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } -static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, - int (*cmp)(struct rb_node *, struct rb_node *)) +static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) { - struct rb_node **iter = &root->rb_node, *parent = NULL; + int order; - while (*iter) { - parent = *iter; - if (cmp(new, *iter) > 0) - iter = &((*iter)->rb_left); - else - iter = &((*iter)->rb_right); - } - - rb_link_node(new, parent, iter); - rb_insert_color(new, root); -} - -static int -ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) -{ - struct ext4_group_info *grp1 = rb_entry(rb1, - struct ext4_group_info, - bb_avg_fragment_size_rb); - struct ext4_group_info *grp2 = rb_entry(rb2, - struct ext4_group_info, - bb_avg_fragment_size_rb); - int num_frags_1, num_frags_2; - - num_frags_1 = grp1->bb_fragments ? - grp1->bb_free / grp1->bb_fragments : 0; - num_frags_2 = grp2->bb_fragments ? - grp2->bb_free / grp2->bb_fragments : 0; - - return (num_frags_2 - num_frags_1); + /* + * We don't bother with a special lists groups with only 1 block free + * extents and for completely empty groups. + */ + order = fls(len) - 2; + if (order < 0) + return 0; + if (order == MB_NUM_ORDERS(sb)) + order--; + return order; } -/* - * Reinsert grpinfo into the avg_fragment_size tree with new average - * fragment size. - */ +/* Move group to appropriate avg_fragment_size list */ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); + int new_order; if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) return; - write_lock(&sbi->s_mb_rb_lock); - if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { - rb_erase(&grp->bb_avg_fragment_size_rb, - &sbi->s_mb_avg_fragment_size_root); - RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); - } + new_order = mb_avg_fragment_size_order(sb, + grp->bb_free / grp->bb_fragments); + if (new_order == grp->bb_avg_fragment_size_order) + return; - ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, - &grp->bb_avg_fragment_size_rb, - ext4_mb_avg_fragment_size_cmp); - write_unlock(&sbi->s_mb_rb_lock); + if (grp->bb_avg_fragment_size_order != -1) { + write_lock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); + list_del(&grp->bb_avg_fragment_size_node); + write_unlock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); + } + grp->bb_avg_fragment_size_order = new_order; + write_lock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); + list_add_tail(&grp->bb_avg_fragment_size_node, + &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); + write_unlock(&sbi->s_mb_avg_fragment_size_locks[ + grp->bb_avg_fragment_size_order]); } /* @@ -909,86 +898,55 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, *new_cr = 1; } else { *group = grp->bb_group; - ac->ac_last_optimal_group = *group; ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; } } /* - * Choose next group by traversing average fragment size tree. Updates *new_cr - * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that - * the linear search should continue for one iteration since there's lock - * contention on the rb tree lock. + * Choose next group by traversing average fragment size list of suitable + * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, int *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - int avg_fragment_size, best_so_far; - struct rb_node *node, *found; - struct ext4_group_info *grp; - - /* - * If there is contention on the lock, instead of waiting for the lock - * to become available, just continue searching lineraly. We'll resume - * our rb tree search later starting at ac->ac_last_optimal_group. - */ - if (!read_trylock(&sbi->s_mb_rb_lock)) { - ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; - return; - } + struct ext4_group_info *grp = NULL, *iter; + int i; if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { if (sbi->s_mb_stats) atomic_inc(&sbi->s_bal_cr1_bad_suggestions); - /* We have found something at CR 1 in the past */ - grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); - for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; - found = rb_next(found)) { - grp = rb_entry(found, struct ext4_group_info, - bb_avg_fragment_size_rb); + } + + for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); + i < MB_NUM_ORDERS(ac->ac_sb); i++) { + if (list_empty(&sbi->s_mb_avg_fragment_size[i])) + continue; + read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); + if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { + read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); + continue; + } + list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], + bb_avg_fragment_size_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); - if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) + if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { + grp = iter; break; - } - goto done; - } - - node = sbi->s_mb_avg_fragment_size_root.rb_node; - best_so_far = 0; - found = NULL; - - while (node) { - grp = rb_entry(node, struct ext4_group_info, - bb_avg_fragment_size_rb); - avg_fragment_size = 0; - if (ext4_mb_good_group(ac, grp->bb_group, 1)) { - avg_fragment_size = grp->bb_fragments ? - grp->bb_free / grp->bb_fragments : 0; - if (!best_so_far || avg_fragment_size < best_so_far) { - best_so_far = avg_fragment_size; - found = node; } } - if (avg_fragment_size > ac->ac_g_ex.fe_len) - node = node->rb_right; - else - node = node->rb_left; + read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); + if (grp) + break; } -done: - if (found) { - grp = rb_entry(found, struct ext4_group_info, - bb_avg_fragment_size_rb); + if (grp) { *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; } else { *new_cr = 2; } - - read_unlock(&sbi->s_mb_rb_lock); - ac->ac_last_optimal_group = *group; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) @@ -1017,11 +975,6 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) goto inc_and_return; } - if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { - ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; - goto inc_and_return; - } - return group; inc_and_return: /* @@ -1049,8 +1002,10 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, { *new_cr = ac->ac_criteria; - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) + if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { + *group = next_linear_group(ac, *group, ngroups); return; + } if (*new_cr == 0) { ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); @@ -1075,23 +1030,25 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) struct ext4_sb_info *sbi = EXT4_SB(sb); int i; - if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) + if (grp->bb_counters[i] > 0) + break; + /* No need to move between order lists? */ + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || + i == grp->bb_largest_free_order) { + grp->bb_largest_free_order = i; + return; + } + + if (grp->bb_largest_free_order >= 0) { write_lock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); list_del_init(&grp->bb_largest_free_order_node); write_unlock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); } - grp->bb_largest_free_order = -1; /* uninit */ - - for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { - if (grp->bb_counters[i] > 0) { - grp->bb_largest_free_order = i; - break; - } - } - if (test_opt2(sb, MB_OPTIMIZE_SCAN) && - grp->bb_largest_free_order >= 0 && grp->bb_free) { + grp->bb_largest_free_order = i; + if (grp->bb_largest_free_order >= 0 && grp->bb_free) { write_lock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); list_add_tail(&grp->bb_largest_free_order_node, @@ -1148,13 +1105,13 @@ void ext4_mb_generate_buddy(struct super_block *sb, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); + mb_update_avg_fragment_size(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; atomic_inc(&sbi->s_mb_buddies_generated); atomic64_add(period, &sbi->s_mb_generation_time); - mb_update_avg_fragment_size(sb, grp); } /* The buddy information is attached the buddy cache inode @@ -2636,7 +2593,7 @@ static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t prefetch_grp = 0, ngroups, group, i; - int cr = -1; + int cr = -1, new_cr; int err = 0, first_err = 0; unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; @@ -2707,17 +2664,14 @@ repeat: * from the goal value specified */ group = ac->ac_g_ex.fe_group; - ac->ac_last_optimal_group = group; ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; - for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), - i++) { - int ret = 0, new_cr; + for (i = 0, new_cr = cr; i < ngroups; i++, + ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { + int ret = 0; cond_resched(); - - ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); if (new_cr != cr) { cr = new_cr; goto repeat; @@ -2991,9 +2945,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock) struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; - read_lock(&EXT4_SB(sb)->s_mb_rb_lock); - - if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); @@ -3005,7 +2957,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof unsigned long position; ++*pos; - if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); @@ -3017,29 +2969,22 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; - struct rb_node *n; - unsigned int count, min, max; + unsigned int count; position--; if (position >= MB_NUM_ORDERS(sb)) { - seq_puts(seq, "fragment_size_tree:\n"); - n = rb_first(&sbi->s_mb_avg_fragment_size_root); - if (!n) { - seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); - return 0; - } - grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); - min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; - count = 1; - while (rb_next(n)) { - count++; - n = rb_next(n); - } - grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); - max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + position -= MB_NUM_ORDERS(sb); + if (position == 0) + seq_puts(seq, "avg_fragment_size_lists:\n"); - seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", - min, max, count); + count = 0; + read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); + list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], + bb_avg_fragment_size_node) + count++; + read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); return 0; } @@ -3049,9 +2994,11 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) seq_puts(seq, "max_free_order_lists:\n"); } count = 0; + read_lock(&sbi->s_mb_largest_free_orders_locks[position]); list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], bb_largest_free_order_node) count++; + read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); @@ -3059,11 +3006,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) -__releases(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = pde_data(file_inode(seq->file)); - - read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); } const struct seq_operations ext4_mb_seq_structs_summary_ops = { @@ -3176,8 +3119,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); - RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); + INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); @@ -3426,7 +3370,24 @@ int ext4_mb_init(struct super_block *sb) i++; } while (i < MB_NUM_ORDERS(sb)); - sbi->s_mb_avg_fragment_size_root = RB_ROOT; + sbi->s_mb_avg_fragment_size = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); + if (!sbi->s_mb_avg_fragment_size) { + ret = -ENOMEM; + goto out; + } + sbi->s_mb_avg_fragment_size_locks = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), + GFP_KERNEL); + if (!sbi->s_mb_avg_fragment_size_locks) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) { + INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); + rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); + } sbi->s_mb_largest_free_orders = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), GFP_KERNEL); @@ -3445,7 +3406,6 @@ int ext4_mb_init(struct super_block *sb) INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); } - rwlock_init(&sbi->s_mb_rb_lock); spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; @@ -3516,6 +3476,8 @@ out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out: + kfree(sbi->s_mb_avg_fragment_size); + kfree(sbi->s_mb_avg_fragment_size_locks); kfree(sbi->s_mb_largest_free_orders); kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); @@ -3582,6 +3544,8 @@ int ext4_mb_release(struct super_block *sb) kvfree(group_info); rcu_read_unlock(); } + kfree(sbi->s_mb_avg_fragment_size); + kfree(sbi->s_mb_avg_fragment_size_locks); kfree(sbi->s_mb_largest_free_orders); kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); @@ -5193,6 +5157,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits = ac->ac_sb->s_blocksize_bits; loff_t size, isize; + bool inode_pa_eligible, group_pa_eligible; if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; @@ -5200,25 +5165,27 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; + group_pa_eligible = sbi->s_mb_group_prealloc > 0; + inode_pa_eligible = true; size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; + /* No point in using inode preallocation for closed files */ if ((size == isize) && !ext4_fs_is_busy(sbi) && - !inode_is_open_for_write(ac->ac_inode)) { - ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; - return; - } + !inode_is_open_for_write(ac->ac_inode)) + inode_pa_eligible = false; - if (sbi->s_mb_group_prealloc <= 0) { - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; - return; - } - - /* don't use group allocation for large files */ size = max(size, isize); - if (size > sbi->s_mb_stream_request) { - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + /* Don't use group allocation for large files */ + if (size > sbi->s_mb_stream_request) + group_pa_eligible = false; + + if (!group_pa_eligible) { + if (inode_pa_eligible) + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + else + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } @@ -5565,6 +5532,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + int retries = 0; u64 seq; might_sleep(); @@ -5667,7 +5635,8 @@ repeat: ar->len = ac->ac_b_ex.fe_len; } } else { - if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) + if (++retries < 3 && + ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; /* * If block allocation fails then the pa allocated above diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 39da92ceabf8..dcda2a943cee 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -178,7 +178,6 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; - ext4_group_t ac_last_optimal_group; __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index e02a5f14e021..3d21eae267fc 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -75,7 +75,7 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; - /* PG_error was set if any post_read step failed */ + /* PG_error was set if verity failed. */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -96,10 +96,12 @@ static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - fscrypt_decrypt_bio(ctx->bio); - - bio_post_read_processing(ctx); + if (fscrypt_decrypt_bio(bio)) + bio_post_read_processing(ctx); + else + __read_end_io(bio); } static void verity_work(struct work_struct *work) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa3ccddfa037..93cc2ec51c2a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -139,7 +139,7 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) continue; } - /* PG_error was set if decryption or verity failed. */ + /* PG_error was set if verity failed. */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -185,7 +185,7 @@ static void f2fs_verify_bio(struct work_struct *work) struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && - !PageError(page) && !fsverity_verify_page(page)) + !fsverity_verify_page(page)) SetPageError(page); } } else { @@ -236,10 +236,9 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; - /* PG_error was set if decryption failed. */ if (f2fs_is_compressed_page(page)) - f2fs_end_read_compressed_page(page, PageError(page), - blkaddr, in_task); + f2fs_end_read_compressed_page(page, false, blkaddr, + in_task); else all_compressed = false; @@ -259,14 +258,17 @@ static void f2fs_post_read_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - if (ctx->enabled_steps & STEP_DECRYPT) - fscrypt_decrypt_bio(ctx->bio); + if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) { + f2fs_finish_read_bio(bio, true); + return; + } if (ctx->enabled_steps & STEP_DECOMPRESS) f2fs_handle_step_decompress(ctx, true); - f2fs_verify_and_finish_bio(ctx->bio, true); + f2fs_verify_and_finish_bio(bio, true); } static void f2fs_read_end_io(struct bio *bio) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3c7cdb70fe2e..aea816a133a8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4471,17 +4471,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode, f2fs_mark_inode_dirty_sync(inode, true); } -static inline int block_unaligned_IO(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - unsigned int i_blkbits = READ_ONCE(inode->i_blkbits); - unsigned int blocksize_mask = (1 << i_blkbits) - 1; - loff_t offset = iocb->ki_pos; - unsigned long align = offset | iov_iter_alignment(iter); - - return align & blocksize_mask; -} - static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, int flag) { @@ -4492,35 +4481,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, return sbi->aligned_blksize; } -static inline bool f2fs_force_buffered_io(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int rw = iov_iter_rw(iter); - - if (!fscrypt_dio_supported(iocb, iter)) - return true; - if (fsverity_active(inode)) - return true; - if (f2fs_compressed_file(inode)) - return true; - - /* disallow direct IO if any of devices has unaligned blksize */ - if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) - return true; - - if (f2fs_lfs_mode(sbi) && (rw == WRITE)) { - if (block_unaligned_IO(inode, iocb, iter)) - return true; - if (F2FS_IO_ALIGNED(sbi)) - return true; - } - if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) - return true; - - return false; -} - static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ce4905a073b3..791770507328 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -808,6 +808,29 @@ int f2fs_truncate(struct inode *inode) return 0; } +static bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!fscrypt_dio_supported(inode)) + return true; + if (fsverity_active(inode)) + return true; + if (f2fs_compressed_file(inode)) + return true; + + /* disallow direct IO if any of devices has unaligned blksize */ + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) + return true; + + if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi)) + return true; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + return true; + + return false; +} + int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -824,6 +847,24 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->btime.tv_nsec = fi->i_crtime.tv_nsec; } + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. + * + * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN + * cannot represent that, so in that case we report no DIO support. + */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + unsigned int bsize = i_blocksize(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (!f2fs_force_buffered_io(inode, WRITE)) { + stat->dio_mem_align = bsize; + stat->dio_offset_align = bsize; + } + } + flags = fi->i_flags; if (flags & F2FS_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; @@ -4182,7 +4223,7 @@ static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb, if (!(iocb->ki_flags & IOCB_DIRECT)) return false; - if (f2fs_force_buffered_io(inode, iocb, iter)) + if (f2fs_force_buffered_io(inode, iov_iter_rw(iter))) return false; /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2451623c05a7..26817b5aeac7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3039,23 +3039,24 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb, *lblk_bits_ret = 8 * sizeof(block_t); } -static int f2fs_get_num_devices(struct super_block *sb) +static struct block_device **f2fs_get_devices(struct super_block *sb, + unsigned int *num_devs) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct block_device **devs; + int i; - if (f2fs_is_multi_device(sbi)) - return sbi->s_ndevs; - return 1; -} + if (!f2fs_is_multi_device(sbi)) + return NULL; -static void f2fs_get_devices(struct super_block *sb, - struct request_queue **devs) -{ - struct f2fs_sb_info *sbi = F2FS_SB(sb); - int i; + devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); for (i = 0; i < sbi->s_ndevs; i++) - devs[i] = bdev_get_queue(FDEV(i).bdev); + devs[i] = FDEV(i).bdev; + *num_devs = sbi->s_ndevs; + return devs; } static const struct fscrypt_operations f2fs_cryptops = { @@ -3066,7 +3067,6 @@ static const struct fscrypt_operations f2fs_cryptops = { .empty_dir = f2fs_empty_dir, .has_stable_inodes = f2fs_has_stable_inodes, .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, - .get_num_devices = f2fs_get_num_devices, .get_devices = f2fs_get_devices, }; #endif diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 249825017da7..00235b8a1823 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -705,7 +705,7 @@ static int fat_readdir(struct file *file, struct dir_context *ctx) } #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ -static int func(struct dir_context *ctx, const char *name, int name_len, \ +static bool func(struct dir_context *ctx, const char *name, int name_len, \ loff_t offset, u64 ino, unsigned int d_type) \ { \ struct fat_ioctl_filldir_callback *buf = \ @@ -714,7 +714,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \ struct dirent_type __user *d2 = d1 + 1; \ \ if (buf->result) \ - return -EINVAL; \ + return false; \ buf->result++; \ \ if (name != NULL) { \ @@ -750,10 +750,10 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \ put_user(short_len, &d1->d_reclen)) \ goto efault; \ } \ - return 0; \ + return true; \ efault: \ buf->result = -EFAULT; \ - return -EFAULT; \ + return false; \ } FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) diff --git a/fs/fat/file.c b/fs/fat/file.c index 3e4eb3467cb4..8a6b493b5b5f 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -461,8 +461,9 @@ static int fat_allow_set_time(struct user_namespace *mnt_userns, { umode_t allow_utime = sbi->options.allow_utime; - if (!uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { - if (in_group_p(i_gid_into_mnt(mnt_userns, inode))) + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), + current_fsuid())) { + if (vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode))) allow_utime >>= 3; if (allow_utime & MAY_WRITE) return 1; diff --git a/fs/file_table.c b/fs/file_table.c index 99c6796c9f28..dd88701e54a9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -324,12 +324,7 @@ static void __fput(struct file *file) } fops_put(file->f_op); put_pid(file->f_owner.pid); - if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_dec(inode); - if (mode & FMODE_WRITER) { - put_write_access(inode); - __mnt_drop_write(mnt); - } + put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) dissolve_on_fput(mnt); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 05221366a16d..08a1993ab7fd 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -134,10 +134,10 @@ static bool inode_io_list_move_locked(struct inode *inode, static void wb_wakeup(struct bdi_writeback *wb) { - spin_lock_bh(&wb->work_lock); + spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) mod_delayed_work(bdi_wq, &wb->dwork, 0); - spin_unlock_bh(&wb->work_lock); + spin_unlock_irq(&wb->work_lock); } static void finish_writeback_work(struct bdi_writeback *wb, @@ -164,7 +164,7 @@ static void wb_queue_work(struct bdi_writeback *wb, if (work->done) atomic_inc(&work->done->cnt); - spin_lock_bh(&wb->work_lock); + spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) { list_add_tail(&work->list, &wb->work_list); @@ -172,7 +172,7 @@ static void wb_queue_work(struct bdi_writeback *wb, } else finish_writeback_work(wb, work); - spin_unlock_bh(&wb->work_lock); + spin_unlock_irq(&wb->work_lock); } /** @@ -2082,13 +2082,13 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; - spin_lock_bh(&wb->work_lock); + spin_lock_irq(&wb->work_lock); if (!list_empty(&wb->work_list)) { work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock_bh(&wb->work_lock); + spin_unlock_irq(&wb->work_lock); return work; } diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 756d05779200..cf40895233f5 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -66,7 +66,7 @@ struct get_name_filldir { char *name; }; -static int get_name_filldir(struct dir_context *ctx, const char *name, +static bool get_name_filldir(struct dir_context *ctx, const char *name, int length, loff_t offset, u64 inum, unsigned int type) { @@ -74,12 +74,12 @@ static int get_name_filldir(struct dir_context *ctx, const char *name, container_of(ctx, struct get_name_filldir, ctx); if (inum != gnfd->inum.no_addr) - return 0; + return true; memcpy(gnfd->name, name, length); gnfd->name[length] = 0; - return 1; + return false; } static int gfs2_get_name(struct dentry *parent, char *name, diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 6ce369b096d4..71911bf9ab34 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1302,7 +1302,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) memcpy(cluster, table, strlen(table) - strlen(fsname)); fsname++; - flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; + flags = DLM_LSFL_NEWEXCL; /* * create/join lockspace diff --git a/fs/inode.c b/fs/inode.c index 6462276dfdf0..b608528efd3a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -192,8 +192,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_wb_frn_history = 0; #endif - if (security_inode_alloc(inode)) - goto out; spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); @@ -228,11 +226,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; + + if (unlikely(security_inode_alloc(inode))) + return -ENOMEM; this_cpu_inc(nr_inodes); return 0; -out: - return -ENOMEM; } EXPORT_SYMBOL(inode_init_always); @@ -2018,23 +2017,25 @@ static int __file_remove_privs(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); - int error; + int error = 0; int kill; if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(dentry); - if (kill <= 0) + if (kill < 0) return kill; - if (flags & IOCB_NOWAIT) - return -EAGAIN; + if (kill) { + if (flags & IOCB_NOWAIT) + return -EAGAIN; + + error = __remove_privs(file_mnt_user_ns(file), dentry, kill); + } - error = __remove_privs(file_mnt_user_ns(file), dentry, kill); if (!error) inode_has_no_xattr(inode); - return error; } diff --git a/fs/internal.h b/fs/internal.h index c209b4838d68..6f0386b34fae 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -16,6 +16,7 @@ struct shrink_control; struct fs_context; struct user_namespace; struct pipe_inode_info; +struct iov_iter; /* * block/bdev.c @@ -101,6 +102,16 @@ extern void chroot_fs_refs(const struct path *, const struct path *); extern struct file *alloc_empty_file(int, const struct cred *); extern struct file *alloc_empty_file_noaccount(int, const struct cred *); +static inline void put_file_access(struct file *file) +{ + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { + i_readcount_dec(file->f_inode); + } else if (file->f_mode & FMODE_WRITER) { + put_write_access(file->f_inode); + __mnt_drop_write(file->f_path.mnt); + } +} + /* * super.c */ @@ -221,3 +232,5 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns, int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx); + +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index 52aa0adeb951..e0cbcfa98c7e 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -349,6 +349,7 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_SHARE_FLAG_STREAMS BIT(11) #define KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS BIT(12) #define KSMBD_SHARE_FLAG_ACL_XATTR BIT(13) +#define KSMBD_SHARE_FLAG_UPDATE BIT(14) /* * Tree connect request flags. @@ -364,6 +365,7 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_TREE_CONN_FLAG_READ_ONLY BIT(1) #define KSMBD_TREE_CONN_FLAG_WRITABLE BIT(2) #define KSMBD_TREE_CONN_FLAG_ADMIN_ACCOUNT BIT(3) +#define KSMBD_TREE_CONN_FLAG_UPDATE BIT(4) /* * RPC over IPC. diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/ksmbd/mgmt/share_config.c index 70655af93b44..c9bca1c2c834 100644 --- a/fs/ksmbd/mgmt/share_config.c +++ b/fs/ksmbd/mgmt/share_config.c @@ -51,12 +51,16 @@ static void kill_share(struct ksmbd_share_config *share) kfree(share); } -void __ksmbd_share_config_put(struct ksmbd_share_config *share) +void ksmbd_share_config_del(struct ksmbd_share_config *share) { down_write(&shares_table_lock); hash_del(&share->hlist); up_write(&shares_table_lock); +} +void __ksmbd_share_config_put(struct ksmbd_share_config *share) +{ + ksmbd_share_config_del(share); kill_share(share); } diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/ksmbd/mgmt/share_config.h index 28bf3511763f..902f2cb1963a 100644 --- a/fs/ksmbd/mgmt/share_config.h +++ b/fs/ksmbd/mgmt/share_config.h @@ -64,6 +64,7 @@ static inline int test_share_config_flag(struct ksmbd_share_config *share, return share->flags & flag; } +void ksmbd_share_config_del(struct ksmbd_share_config *share); void __ksmbd_share_config_put(struct ksmbd_share_config *share); static inline void ksmbd_share_config_put(struct ksmbd_share_config *share) diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c index b35ea6a6abc5..97ab7987df6e 100644 --- a/fs/ksmbd/mgmt/tree_connect.c +++ b/fs/ksmbd/mgmt/tree_connect.c @@ -19,7 +19,7 @@ struct ksmbd_tree_conn_status ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, char *share_name) { - struct ksmbd_tree_conn_status status = {-EINVAL, NULL}; + struct ksmbd_tree_conn_status status = {-ENOENT, NULL}; struct ksmbd_tree_connect_response *resp = NULL; struct ksmbd_share_config *sc; struct ksmbd_tree_connect *tree_conn = NULL; @@ -57,6 +57,20 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, goto out_error; tree_conn->flags = resp->connection_flags; + if (test_tree_conn_flag(tree_conn, KSMBD_TREE_CONN_FLAG_UPDATE)) { + struct ksmbd_share_config *new_sc; + + ksmbd_share_config_del(sc); + new_sc = ksmbd_share_config_get(share_name); + if (!new_sc) { + pr_err("Failed to update stale share config\n"); + status.ret = -ESTALE; + goto out_error; + } + ksmbd_share_config_put(sc); + sc = new_sc; + } + tree_conn->user = sess->user; tree_conn->share_conf = sc; status.tree_conn = tree_conn; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 9751cc92c111..bfa6b41d895b 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -1944,8 +1944,10 @@ out_err1: rsp->hdr.Status = STATUS_SUCCESS; rc = 0; break; + case -ESTALE: + case -ENOENT: case KSMBD_TREE_CONN_STATUS_NO_SHARE: - rsp->hdr.Status = STATUS_BAD_NETWORK_PATH; + rsp->hdr.Status = STATUS_BAD_NETWORK_NAME; break; case -ENOMEM: case KSMBD_TREE_CONN_STATUS_NOMEM: @@ -2328,15 +2330,15 @@ static int smb2_remove_smb_xattrs(struct path *path) name += strlen(name) + 1) { ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); - if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && - strncmp(&name[XATTR_USER_PREFIX_LEN], DOS_ATTRIBUTE_PREFIX, - DOS_ATTRIBUTE_PREFIX_LEN) && - strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) - continue; - - err = ksmbd_vfs_remove_xattr(user_ns, path->dentry, name); - if (err) - ksmbd_debug(SMB, "remove xattr failed : %s\n", name); + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, + STREAM_PREFIX_LEN)) { + err = ksmbd_vfs_remove_xattr(user_ns, path->dentry, + name); + if (err) + ksmbd_debug(SMB, "remove xattr failed : %s\n", + name); + } } out: kvfree(xattr_list); @@ -3042,12 +3044,6 @@ int smb2_open(struct ksmbd_work *work) list_add(&fp->node, &fp->f_ci->m_fp_list); write_unlock(&fp->f_ci->m_lock); - rc = ksmbd_vfs_getattr(&path, &stat); - if (rc) { - generic_fillattr(user_ns, d_inode(path.dentry), &stat); - rc = 0; - } - /* Check delete pending among previous fp before oplock break */ if (ksmbd_inode_pending_delete(fp)) { rc = -EBUSY; @@ -3134,6 +3130,10 @@ int smb2_open(struct ksmbd_work *work) } } + rc = ksmbd_vfs_getattr(&path, &stat); + if (rc) + goto err_out; + if (stat.result_mask & STATX_BTIME) fp->create_time = ksmbd_UnixTimeToNT(stat.btime); else @@ -3149,9 +3149,6 @@ int smb2_open(struct ksmbd_work *work) memcpy(fp->client_guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE); - generic_fillattr(user_ns, file_inode(fp->filp), - &stat); - rsp->StructureSize = cpu_to_le16(89); rcu_read_lock(); opinfo = rcu_dereference(fp->f_opinfo); @@ -3779,7 +3776,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, return 0; } -static int __query_dir(struct dir_context *ctx, const char *name, int namlen, +static bool __query_dir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct ksmbd_readdir_data *buf; @@ -3793,22 +3790,20 @@ static int __query_dir(struct dir_context *ctx, const char *name, int namlen, /* dot and dotdot entries are already reserved */ if (!strcmp(".", name) || !strcmp("..", name)) - return 0; + return true; if (ksmbd_share_veto_filename(priv->work->tcon->share_conf, name)) - return 0; + return true; if (!match_pattern(name, namlen, priv->search_pattern)) - return 0; + return true; d_info->name = name; d_info->name_len = namlen; rc = reserve_populate_dentry(d_info, priv->info_level); if (rc) - return rc; - if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) { + return false; + if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) d_info->out_buf_len = 0; - return 0; - } - return 0; + return true; } static void restart_ctx(struct dir_context *ctx) diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 7cb0eeb07c80..c9aca21637d5 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -197,6 +197,7 @@ static struct genl_family ksmbd_genl_family = { .module = THIS_MODULE, .ops = ksmbd_genl_ops, .n_ops = ARRAY_SIZE(ksmbd_genl_ops), + .resv_start_op = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE + 1, }; static void ksmbd_nl_init_fixup(void) diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 78d01033604c..48b2b901f6e5 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -1105,7 +1105,7 @@ int ksmbd_vfs_unlink(struct user_namespace *user_ns, return err; } -static int __dir_empty(struct dir_context *ctx, const char *name, int namlen, +static bool __dir_empty(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct ksmbd_readdir_data *buf; @@ -1113,9 +1113,7 @@ static int __dir_empty(struct dir_context *ctx, const char *name, int namlen, buf = container_of(ctx, struct ksmbd_readdir_data, ctx); buf->dirent_count++; - if (buf->dirent_count > 2) - return -ENOTEMPTY; - return 0; + return buf->dirent_count <= 2; } /** @@ -1142,7 +1140,7 @@ int ksmbd_vfs_empty_dir(struct ksmbd_file *fp) return err; } -static int __caseless_lookup(struct dir_context *ctx, const char *name, +static bool __caseless_lookup(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -1151,13 +1149,13 @@ static int __caseless_lookup(struct dir_context *ctx, const char *name, buf = container_of(ctx, struct ksmbd_readdir_data, ctx); if (buf->used != namlen) - return 0; + return true; if (!strncasecmp((char *)buf->private, name, namlen)) { memcpy((char *)buf->private, name, namlen); buf->dirent_count = 1; - return -EEXIST; + return false; } - return 0; + return true; } /** diff --git a/fs/lockd/host.c b/fs/lockd/host.c index f802223e71ab..cdc8e12cdac4 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -164,7 +164,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, host->h_addrbuf = nsm->sm_addrbuf; host->net = ni->net; host->h_cred = get_cred(ni->cred); - strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename)); + strscpy(host->nodename, utsname()->nodename, sizeof(host->nodename)); out: return host; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index bf274f23969b..284b019cb652 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -521,6 +521,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "NULL", @@ -530,6 +531,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_testargs, .pc_encode = nlm4svc_encode_testres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+2+No+Rg, .pc_name = "TEST", @@ -539,6 +541,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_lockargs, .pc_encode = nlm4svc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "LOCK", @@ -548,6 +551,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_cancargs, .pc_encode = nlm4svc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "CANCEL", @@ -557,6 +561,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_unlockargs, .pc_encode = nlm4svc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "UNLOCK", @@ -566,6 +571,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_testargs, .pc_encode = nlm4svc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "GRANTED", @@ -575,6 +581,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_testargs, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "TEST_MSG", @@ -584,6 +591,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_lockargs, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "LOCK_MSG", @@ -593,6 +601,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_cancargs, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "CANCEL_MSG", @@ -602,6 +611,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_unlockargs, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNLOCK_MSG", @@ -611,6 +621,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_testargs, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "GRANTED_MSG", @@ -620,6 +631,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "TEST_RES", @@ -629,6 +641,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "LOCK_RES", @@ -638,6 +651,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "CANCEL_RES", @@ -647,6 +661,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNLOCK_RES", @@ -656,6 +671,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_res, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "GRANTED_RES", @@ -665,6 +681,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_reboot, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_reboot), + .pc_argzero = sizeof(struct nlm_reboot), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "SM_NOTIFY", @@ -674,6 +691,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, .pc_name = "UNUSED", @@ -683,6 +701,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, .pc_name = "UNUSED", @@ -692,6 +711,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_void, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, .pc_name = "UNUSED", @@ -701,6 +721,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_shareargs, .pc_encode = nlm4svc_encode_shareres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, .pc_name = "SHARE", @@ -710,6 +731,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_shareargs, .pc_encode = nlm4svc_encode_shareres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, .pc_name = "UNSHARE", @@ -719,6 +741,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_lockargs, .pc_encode = nlm4svc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "NM_LOCK", @@ -728,6 +751,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_decode = nlm4svc_decode_notify, .pc_encode = nlm4svc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "FREE_ALL", diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index b09ca35b527c..e35c05e27806 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -555,6 +555,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "NULL", @@ -564,6 +565,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_testargs, .pc_encode = nlmsvc_encode_testres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+2+No+Rg, .pc_name = "TEST", @@ -573,6 +575,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_lockargs, .pc_encode = nlmsvc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "LOCK", @@ -582,6 +585,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_cancargs, .pc_encode = nlmsvc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "CANCEL", @@ -591,6 +595,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_unlockargs, .pc_encode = nlmsvc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "UNLOCK", @@ -600,6 +605,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_testargs, .pc_encode = nlmsvc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "GRANTED", @@ -609,6 +615,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_testargs, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "TEST_MSG", @@ -618,6 +625,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_lockargs, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "LOCK_MSG", @@ -627,6 +635,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_cancargs, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "CANCEL_MSG", @@ -636,6 +645,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_unlockargs, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNLOCK_MSG", @@ -645,6 +655,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_testargs, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "GRANTED_MSG", @@ -654,6 +665,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "TEST_RES", @@ -663,6 +675,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "LOCK_RES", @@ -672,6 +685,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "CANCEL_RES", @@ -681,6 +695,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNLOCK_RES", @@ -690,6 +705,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_res, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_res), + .pc_argzero = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "GRANTED_RES", @@ -699,6 +715,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_reboot, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_reboot), + .pc_argzero = sizeof(struct nlm_reboot), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "SM_NOTIFY", @@ -708,6 +725,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNUSED", @@ -717,6 +735,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNUSED", @@ -726,6 +745,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_void, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_void), + .pc_argzero = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, .pc_name = "UNUSED", @@ -735,6 +755,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_shareargs, .pc_encode = nlmsvc_encode_shareres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, .pc_name = "SHARE", @@ -744,6 +765,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_shareargs, .pc_encode = nlmsvc_encode_shareres, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, .pc_name = "UNSHARE", @@ -753,6 +775,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_lockargs, .pc_encode = nlmsvc_encode_res, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, .pc_name = "NM_LOCK", @@ -762,6 +785,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_decode = nlmsvc_decode_notify, .pc_encode = nlmsvc_encode_void, .pc_argsize = sizeof(struct nlm_args), + .pc_argzero = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, .pc_name = "FREE_ALL", diff --git a/fs/locks.c b/fs/locks.c index c266cfdc3291..607f94a0e789 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2129,6 +2129,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) else error = locks_lock_file_wait(f.file, &fl); + locks_release_private(&fl); out_putf: fdput(f); diff --git a/fs/namespace.c b/fs/namespace.c index 68789f896f08..df137ba19d37 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4238,6 +4238,13 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, err = -EPERM; goto out_fput; } + + /* We're not controlling the target namespace. */ + if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) { + err = -EPERM; + goto out_fput; + } + kattr->mnt_userns = get_user_ns(mnt_userns); out_fput: diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 8dcb08e1a885..d0cccddb7d08 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -1065,6 +1065,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = { .pc_func = nfs4_callback_compound, .pc_encode = nfs4_encode_void, .pc_argsize = 256, + .pc_argzero = 256, .pc_ressize = 256, .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, .pc_name = "COMPOUND", diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index dbab3caa15ed..58036f657126 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2022,7 +2022,7 @@ static int nfs_finish_open(struct nfs_open_context *ctx, err = finish_open(file, dentry, do_open); if (err) goto out; - if (S_ISREG(file->f_path.dentry->d_inode->i_mode)) + if (S_ISREG(file_inode(file)->i_mode)) nfs_file_set_open_context(file, ctx); else err = -EOPENSTALE; @@ -2382,7 +2382,8 @@ static void nfs_dentry_remove_handle_error(struct inode *dir, { switch (error) { case -ENOENT: - d_delete(dentry); + if (d_really_is_positive(dentry)) + d_delete(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); break; case 0: @@ -2484,8 +2485,10 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) */ error = -ETXTBSY; if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) || - WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) + WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) { + spin_unlock(&dentry->d_lock); goto out; + } if (dentry->d_fsdata) /* old devname */ kfree(dentry->d_fsdata); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d2bcd4834c0e..e032fe201a36 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -221,8 +221,10 @@ nfs_file_fsync_commit(struct file *file, int datasync) int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = file_inode(file); + struct nfs_inode *nfsi = NFS_I(inode); + long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages); + long nredirtied; int ret; trace_nfs_fsync_enter(inode); @@ -237,15 +239,10 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = pnfs_sync_inode(inode, !!datasync); if (ret != 0) break; - if (!test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags)) + nredirtied = atomic_long_read(&nfsi->redirtied_pages); + if (nredirtied == save_nredirtied) break; - /* - * If nfs_file_fsync_commit detected a server reboot, then - * resend all dirty pages that might have been covered by - * the NFS_CONTEXT_RESEND_WRITES flag - */ - start = 0; - end = LLONG_MAX; + save_nredirtied = nredirtied; } trace_nfs_fsync_exit(inode, ret); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b4e46b0ffa2d..bea7c005119c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -426,6 +426,7 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh) static void nfs_inode_init_regular(struct nfs_inode *nfsi) { atomic_long_set(&nfsi->nrequests, 0); + atomic_long_set(&nfsi->redirtied_pages, 0); INIT_LIST_HEAD(&nfsi->commit_info.list); atomic_long_set(&nfsi->commit_info.ncommit, 0); atomic_set(&nfsi->commit_info.rpcs_out, 0); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 27c720d71b4e..898dd95bc7a7 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -606,6 +606,31 @@ static inline gfp_t nfs_io_gfp_mask(void) return GFP_KERNEL; } +/* + * Special version of should_remove_suid() that ignores capabilities. + */ +static inline int nfs_should_remove_suid(const struct inode *inode) +{ + umode_t mode = inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && S_ISREG(mode))) + return kill; + + return 0; +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 068c45b3bc1a..d37e4a5401b1 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -78,10 +78,15 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, status = nfs4_call_sync(server->client, server, msg, &args.seq_args, &res.seq_res, 0); - if (status == 0) + if (status == 0) { + if (nfs_should_remove_suid(inode)) { + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); + spin_unlock(&inode->i_lock); + } status = nfs_post_op_update_inode_force_wcc(inode, res.falloc_fattr); - + } if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE]) trace_nfs4_fallocate(inode, &args, status); else @@ -336,7 +341,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, return status; } } - status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, + status = nfs_filemap_write_and_wait_range(src->f_mapping, pos_src, pos_src + (loff_t)count - 1); if (status) return status; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e88f6b18445e..9eb181287879 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -340,6 +340,11 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, goto out; } + if (!S_ISREG(fattr->mode)) { + res = ERR_PTR(-EBADF); + goto out; + } + res = ERR_PTR(-ENOMEM); len = strlen(SSC_READ_NAME_BODY) + 16; read_name = kzalloc(len, GFP_KERNEL); @@ -357,6 +362,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, r_ino->i_fop); if (IS_ERR(filep)) { res = ERR_CAST(filep); + iput(r_ino); goto out_free_name; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 41a9b6b58fb9..2613b7e36eb9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2817,7 +2817,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, hdr->completion_ops); - set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags); return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 82944e14fcea..ee66ffdb985e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1051,22 +1051,31 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) if (ctx->bsize) sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits); - if (server->nfs_client->rpc_ops->version != 2) { - /* The VFS shouldn't apply the umask to mode bits. We will do - * so ourselves when necessary. + switch (server->nfs_client->rpc_ops->version) { + case 2: + sb->s_time_gran = 1000; + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; + break; + case 3: + /* + * The VFS shouldn't apply the umask to mode bits. + * We will do so ourselves when necessary. */ sb->s_flags |= SB_POSIXACL; sb->s_time_gran = 1; - sb->s_export_op = &nfs_export_ops; - } else - sb->s_time_gran = 1000; - - if (server->nfs_client->rpc_ops->version != 4) { sb->s_time_min = 0; sb->s_time_max = U32_MAX; - } else { + sb->s_export_op = &nfs_export_ops; + break; + case 4: + sb->s_flags |= SB_POSIXACL; + sb->s_time_gran = 1; sb->s_time_min = S64_MIN; sb->s_time_max = S64_MAX; + if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) + sb->s_export_op = &nfs_export_ops; + break; } sb->s_magic = NFS_SUPER_MAGIC; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 51a7e202d6e5..f41d24b54fd1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1420,10 +1420,12 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, */ static void nfs_redirty_request(struct nfs_page *req) { + struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host); + /* Bump the transmission count */ req->wb_nio++; nfs_mark_request_dirty(req); - set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags); + atomic_long_inc(&nfsi->redirtied_pages); nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1494,31 +1496,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -/* - * Special version of should_remove_suid() that ignores capabilities. - */ -static int nfs_should_remove_suid(const struct inode *inode) -{ - umode_t mode = inode->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && S_ISREG(mode))) - return kill; - - return 0; -} - static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, struct nfs_fattr *fattr) { @@ -1904,7 +1881,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a mismatch. Write the page again */ dprintk_cont(" mismatch\n"); nfs_mark_request_dirty(req); - set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags); + atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: nfs_unlock_and_release_request(req); /* Latency breaker */ diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 65c331f75e9c..f21259ead64b 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -84,6 +84,6 @@ int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); int nfsd_cache_lookup(struct svc_rqst *); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); -int nfsd_reply_cache_stats_open(struct inode *, struct file *); +int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); #endif /* NFSCACHE_H */ diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index eeed4ae5b4ad..d5c57360b418 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1212,7 +1212,7 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, * scraping this file for info should test the labels to ensure they're * getting the correct field. */ -static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) +int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { unsigned long releases = 0, pages_flushed = 0, evictions = 0; unsigned long hits = 0, acquisitions = 0; @@ -1259,8 +1259,3 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "pages flushed: %lu\n", pages_flushed); return 0; } - -int nfsd_file_cache_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, nfsd_file_cache_stats_show, NULL); -} diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 8e8c0c47d67d..357832bac736 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -60,5 +60,5 @@ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); -int nfsd_file_cache_stats_open(struct inode *, struct file *); +int nfsd_file_cache_stats_show(struct seq_file *m, void *v); #endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ffe17743cc74..8c854ba3285b 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -192,6 +192,10 @@ struct nfsd_net { atomic_t nfs4_client_count; int nfs4_max_clients; + + atomic_t nfsd_courtesy_clients; + struct shrinker nfsd_client_shrinker; + struct delayed_work nfsd_shrinker_work; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 9edd3c1a30fb..13e6e6897f6c 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -331,6 +331,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, @@ -342,6 +343,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_encode = nfsaclsvc_encode_getaclres, .pc_release = nfsaclsvc_release_getacl, .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_argzero = sizeof(struct nfsd3_getaclargs), .pc_ressize = sizeof(struct nfsd3_getaclres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+2*(1+ACL), @@ -353,6 +355,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_argzero = sizeof(struct nfsd3_setaclargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, @@ -364,6 +367,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, @@ -375,6 +379,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_encode = nfsaclsvc_encode_accessres, .pc_release = nfsaclsvc_release_access, .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_argzero = sizeof(struct nfsd3_accessargs), .pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9446c6743664..2fb9ee356455 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -252,6 +252,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, @@ -263,6 +264,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_encode = nfs3svc_encode_getaclres, .pc_release = nfs3svc_release_getacl, .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_argzero = sizeof(struct nfsd3_getaclargs), .pc_ressize = sizeof(struct nfsd3_getaclres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+2*(1+ACL), @@ -274,6 +276,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_encode = nfs3svc_encode_setaclres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_argzero = sizeof(struct nfsd3_setaclargs), .pc_ressize = sizeof(struct nfsd3_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index a41cca619338..923d9a80df92 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -150,7 +150,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp) { struct nfsd3_readargs *argp = rqstp->rq_argp; struct nfsd3_readres *resp = rqstp->rq_resp; - u32 max_blocksize = svc_max_payload(rqstp); unsigned int len; int v; @@ -159,7 +158,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp) (unsigned long) argp->count, (unsigned long long) argp->offset); - argp->count = min_t(u32, argp->count, max_blocksize); + argp->count = min_t(u32, argp->count, svc_max_payload(rqstp)); + argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); if (argp->offset > (u64)OFFSET_MAX) argp->offset = (u64)OFFSET_MAX; if (argp->offset + argp->count > (u64)OFFSET_MAX) @@ -563,25 +563,18 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, { struct xdr_buf *buf = &resp->dirlist; struct xdr_stream *xdr = &resp->xdr; - - count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp)); + unsigned int sendbuf = min_t(unsigned int, rqstp->rq_res.buflen, + svc_max_payload(rqstp)); memset(buf, 0, sizeof(*buf)); /* Reserve room for the NULL ptr & eof flag (-2 words) */ - buf->buflen = count - XDR_UNIT * 2; + buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), sendbuf); + buf->buflen -= XDR_UNIT * 2; buf->pages = rqstp->rq_next_page; rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT; - /* This is xdr_init_encode(), but it assumes that - * the head kvec has already been consumed. */ - xdr_set_scratch_buffer(xdr, NULL, 0); - xdr->buf = buf; - xdr->page_ptr = buf->pages; - xdr->iov = NULL; - xdr->p = page_address(*buf->pages); - xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); - xdr->rqst = NULL; + xdr_init_encode_pages(xdr, buf, buf->pages, NULL); } /* @@ -808,6 +801,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, @@ -819,6 +813,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_getattrres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_attrstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, @@ -830,6 +825,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_wccstatres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_sattrargs), + .pc_argzero = sizeof(struct nfsd3_sattrargs), .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, @@ -841,6 +837,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_lookupres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_argzero = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+pAT+pAT, @@ -852,6 +849,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_accessres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_argzero = sizeof(struct nfsd3_accessargs), .pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1, @@ -863,6 +861,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readlinkres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, @@ -874,6 +873,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_readargs), + .pc_argzero = sizeof(struct nfsd3_readargs), .pc_ressize = sizeof(struct nfsd3_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4, @@ -885,6 +885,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_writeres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_writeargs), + .pc_argzero = sizeof(struct nfsd3_writeargs), .pc_ressize = sizeof(struct nfsd3_writeres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+4, @@ -896,6 +897,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_createargs), + .pc_argzero = sizeof(struct nfsd3_createargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, @@ -907,6 +909,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_mkdirargs), + .pc_argzero = sizeof(struct nfsd3_mkdirargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, @@ -918,6 +921,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_symlinkargs), + .pc_argzero = sizeof(struct nfsd3_symlinkargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, @@ -929,6 +933,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_mknodargs), + .pc_argzero = sizeof(struct nfsd3_mknodargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, @@ -940,6 +945,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_wccstatres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_argzero = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, @@ -951,6 +957,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_wccstatres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_argzero = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, @@ -962,6 +969,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_renameres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_renameargs), + .pc_argzero = sizeof(struct nfsd3_renameargs), .pc_ressize = sizeof(struct nfsd3_renameres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+WC, @@ -973,6 +981,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_linkres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_linkargs), + .pc_argzero = sizeof(struct nfsd3_linkargs), .pc_ressize = sizeof(struct nfsd3_linkres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+pAT+WC, @@ -984,6 +993,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readdirres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_readdirargs), + .pc_argzero = sizeof(struct nfsd3_readdirargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, .pc_name = "READDIR", @@ -994,6 +1004,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readdirres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_readdirplusargs), + .pc_argzero = sizeof(struct nfsd3_readdirplusargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, .pc_name = "READDIRPLUS", @@ -1003,6 +1014,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_fsstatres, .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argzero = sizeof(struct nfsd3_fhandleargs), .pc_ressize = sizeof(struct nfsd3_fsstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+2*6+1, @@ -1013,6 +1025,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_fsinfores, .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argzero = sizeof(struct nfsd3_fhandleargs), .pc_ressize = sizeof(struct nfsd3_fsinfores), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+12, @@ -1023,6 +1036,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_pathconfres, .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argzero = sizeof(struct nfsd3_fhandleargs), .pc_ressize = sizeof(struct nfsd3_pathconfres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+6, @@ -1034,6 +1048,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_commitres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_commitargs), + .pc_argzero = sizeof(struct nfsd3_commitargs), .pc_ressize = sizeof(struct nfsd3_commitres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+WC+2, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 0293b8d65f10..3308dd671ef0 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -571,10 +571,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) args->count = max_blocksize; args->len = max_blocksize; } - if (!xdr_stream_subsegment(xdr, &args->payload, args->count)) - return false; - return true; + return xdr_stream_subsegment(xdr, &args->payload, args->count); } bool @@ -616,8 +614,6 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_symlinkargs *args = rqstp->rq_argp; struct kvec *head = rqstp->rq_arg.head; - struct kvec *tail = rqstp->rq_arg.tail; - size_t remaining; if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen)) return false; @@ -626,16 +622,10 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) if (xdr_stream_decode_u32(xdr, &args->tlen) < 0) return false; - /* request sanity */ - remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; - remaining -= xdr_stream_pos(xdr); - if (remaining < xdr_align_size(args->tlen)) - return false; - - args->first.iov_base = xdr->p; + /* symlink_data */ args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); - - return true; + args->first.iov_base = xdr_inline_decode(xdr, args->tlen); + return args->first.iov_base != NULL; } bool diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4ce328209f61..f0e69edf5f0f 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1371,11 +1371,21 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_holds_slot = false; } -void nfsd4_run_cb(struct nfsd4_callback *cb) +/** + * nfsd4_run_cb - queue up a callback job to run + * @cb: callback to queue + * + * Kick off a callback to do its thing. Returns false if it was already + * on a queue, true otherwise. + */ +bool nfsd4_run_cb(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; + bool queued; nfsd41_cb_inflight_begin(clp); - if (!nfsd4_queue_cb(cb)) + queued = nfsd4_queue_cb(cb); + if (!queued) nfsd41_cb_inflight_end(clp); + return queued; } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index f92161ce1f97..e70a1a2999b7 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -82,8 +82,8 @@ ent_init(struct cache_head *cnew, struct cache_head *citm) new->id = itm->id; new->type = itm->type; - strlcpy(new->name, itm->name, sizeof(new->name)); - strlcpy(new->authname, itm->authname, sizeof(new->authname)); + strscpy(new->name, itm->name, sizeof(new->name)); + strscpy(new->authname, itm->authname, sizeof(new->authname)); } static void @@ -548,7 +548,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen return nfserr_badowner; memcpy(key.name, name, namelen); key.name[namelen] = '\0'; - strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item); if (ret == -ENOENT) return nfserr_badowner; @@ -584,7 +584,7 @@ static __be32 idmap_id_to_name(struct xdr_stream *xdr, int ret; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) return encode_ascii_id(xdr, id); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 2c05692a9abf..3564d1c6f610 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -658,7 +658,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) ktime_t now, cutoff; const struct nfsd4_layout_ops *ops; - + trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task); switch (task->tk_status) { case 0: case -NFS4ERR_DELAY: diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a72ab97f77ef..8beb2bc4c328 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -141,7 +141,6 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src) static __be32 do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode) { - __be32 status; if (open->op_truncate && !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) @@ -156,9 +155,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs if (open->op_share_deny & NFS4_SHARE_DENY_READ) accmode |= NFSD_MAY_WRITE; - status = fh_verify(rqstp, current_fh, S_IFREG, accmode); - - return status; + return fh_verify(rqstp, current_fh, S_IFREG, accmode); } static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) @@ -454,7 +451,6 @@ static __be32 do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { struct svc_fh *current_fh = &cstate->current_fh; - __be32 status; int accmode = 0; /* We don't know the target directory, and therefore can not @@ -479,9 +475,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, str if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH) accmode = NFSD_MAY_OWNER_OVERRIDE; - status = do_open_permission(rqstp, current_fh, open, accmode); - - return status; + return do_open_permission(rqstp, current_fh, open, accmode); } static void @@ -668,11 +662,9 @@ static __be32 nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - __be32 status; - fh_put(&cstate->current_fh); - status = exp_pseudoroot(rqstp, &cstate->current_fh); - return status; + + return exp_pseudoroot(rqstp, &cstate->current_fh); } static __be32 @@ -1343,7 +1335,7 @@ try_again: return 0; } if (work) { - strlcpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1); + strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1); refcount_set(&work->nsui_refcnt, 2); work->nsui_busy = true; list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list); @@ -1621,6 +1613,10 @@ static void nfsd4_cb_offload_release(struct nfsd4_callback *cb) static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, struct rpc_task *task) { + struct nfsd4_cb_offload *cbo = + container_of(cb, struct nfsd4_cb_offload, co_cb); + + trace_nfsd_cb_offload_done(&cbo->co_res.cb_stateid, task); return 1; } @@ -1768,7 +1764,13 @@ static int nfsd4_do_async_copy(void *data) filp = nfs42_ssc_open(copy->ss_mnt, ©->c_fh, ©->stateid); if (IS_ERR(filp)) { - nfserr = nfserr_offload_denied; + switch (PTR_ERR(filp)) { + case -EBADF: + nfserr = nfserr_wrong_type; + break; + default: + nfserr = nfserr_offload_denied; + } nfsd4_interssc_disconnect(copy->ss_mnt); goto do_callback; } @@ -1826,7 +1828,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!nfs4_init_copy_state(nn, copy)) goto out_err; refcount_set(&async_copy->refcount, 1); - memcpy(©->cp_res.cb_stateid, ©->cp_stateid.stid, + memcpy(©->cp_res.cb_stateid, ©->cp_stateid.cs_stid, sizeof(copy->cp_res.cb_stateid)); dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, @@ -1862,7 +1864,7 @@ find_async_copy(struct nfs4_client *clp, stateid_t *stateid) spin_lock(&clp->async_lock); list_for_each_entry(copy, &clp->async_copies, copies) { - if (memcmp(©->cp_stateid.stid, stateid, NFS4_STATEID_SIZE)) + if (memcmp(©->cp_stateid.cs_stid, stateid, NFS4_STATEID_SIZE)) continue; refcount_inc(©->refcount); spin_unlock(&clp->async_lock); @@ -1916,7 +1918,7 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cps = nfs4_alloc_init_cpntf_state(nn, stid); if (!cps) goto out; - memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.stid, sizeof(stateid_t)); + memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.cs_stid, sizeof(stateid_t)); memcpy(&cps->cp_p_stateid, &stid->sc_stateid, sizeof(stateid_t)); memcpy(&cps->cp_p_clid, &clp->cl_clientid, sizeof(clientid_t)); @@ -2633,9 +2635,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) status = nfserr_minor_vers_mismatch; if (nfsd_minorversion(nn, args->minorversion, NFSD_TEST) <= 0) goto out; - status = nfserr_resource; - if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND) - goto out; status = nfs41_check_op_ordering(args); if (status) { @@ -2648,10 +2647,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) rqstp->rq_lease_breaker = (void **)&cstate->clp; - trace_nfsd_compound(rqstp, args->opcnt); + trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt); while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; + if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) { + /* If there are still more operations to process, + * stop here and report NFS4ERR_RESOURCE. */ + if (cstate->minorversion == 0 && + args->client_opcnt > resp->opcnt) { + op->status = nfserr_resource; + goto encode_op; + } + } + /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -2727,8 +2736,8 @@ encode_op: status = op->status; } - trace_nfsd_compound_status(args->opcnt, resp->opcnt, status, - nfsd4_op_name(op->opnum)); + trace_nfsd_compound_status(args->client_opcnt, resp->opcnt, + status, nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); nfsd4_increment_op_stats(op->opnum); @@ -2762,28 +2771,49 @@ out: #define op_encode_channel_attrs_maxsz (6 + 1 + 1) -static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +/* + * The _rsize() helpers are invoked by the NFSv4 COMPOUND decoder, which + * is called before sunrpc sets rq_res.buflen. Thus we have to compute + * the maximum payload size here, based on transport limits and the size + * of the remaining space in the rq_pages array. + */ +static u32 nfsd4_max_payload(const struct svc_rqst *rqstp) +{ + u32 buflen; + + buflen = (rqstp->rq_page_end - rqstp->rq_next_page) * PAGE_SIZE; + buflen -= rqstp->rq_auth_slack; + buflen -= rqstp->rq_res.head[0].iov_len; + return min_t(u32, buflen, svc_max_payload(rqstp)); +} + +static u32 nfsd4_only_status_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size) * sizeof(__be32); } -static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_status_stateid_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); } -static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_access_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { /* ac_supported, ac_resp_access */ return (op_encode_hdr_size + 2)* sizeof(__be32); } -static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_commit_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_create_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); @@ -2794,17 +2824,17 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op * the op prematurely if the estimate is too large. We may turn off splice * reads unnecessarily. */ -static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_getattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 *bmap = op->u.getattr.ga_bmval; + const u32 *bmap = op->u.getattr.ga_bmval; u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2]; u32 ret = 0; if (bmap0 & FATTR4_WORD0_ACL) - return svc_max_payload(rqstp); + return nfsd4_max_payload(rqstp); if (bmap0 & FATTR4_WORD0_FS_LOCATIONS) - return svc_max_payload(rqstp); + return nfsd4_max_payload(rqstp); if (bmap1 & FATTR4_WORD1_OWNER) { ret += IDMAP_NAMESZ + 4; @@ -2832,24 +2862,28 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, return ret; } -static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_getfh_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE; } -static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_link_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_lock_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_lock_denied_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_open_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_stateid_maxsz + op_encode_change_info_maxsz + 1 @@ -2857,20 +2891,18 @@ static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) + op_encode_delegation_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_read_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = 0, rlen = 0; - - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.read.rd_length, maxcount); + u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_read_plus_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = svc_max_payload(rqstp); - u32 rlen = min(op->u.read.rd_length, maxcount); + u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp)); /* * If we detect that the file changed during hole encoding, then we * recover by encoding the remaining reply as data. This means we need @@ -2881,70 +2913,77 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op return (op_encode_hdr_size + 2 + seg_len + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_readdir_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = 0, rlen = 0; - - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.readdir.rd_maxcount, maxcount); + u32 rlen = min(op->u.readdir.rd_maxcount, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + op_encode_verifier_maxsz + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_readlink_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE; } -static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_remove_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_rename_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_sequence_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32); } -static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_test_stateid_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids) * sizeof(__be32); } -static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_setattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_secinfo_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR * (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32); } -static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_setclientid_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * sizeof(__be32); } -static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_write_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_exchange_id_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ 1 + 1 + /* eir_flags, spr_how */\ @@ -2958,14 +2997,16 @@ static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_o 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); } -static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_bind_conn_to_session_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); } -static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_create_session_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ @@ -2974,7 +3015,8 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_copy_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* wr_callback */ + @@ -2986,16 +3028,16 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1 /* cr_synchronous */) * sizeof(__be32); } -static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_offload_status_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 /* osr_count */ + 1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32); } -static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 3 /* cnr_lease_time */ + @@ -3010,12 +3052,10 @@ static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp, } #ifdef CONFIG_NFSD_PNFS -static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = 0, rlen = 0; - - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount); + u32 rlen = min(op->u.getdeviceinfo.gd_maxcount, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + 1 /* gd_layout_type*/ + @@ -3028,7 +3068,8 @@ static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4 * so we need to define an arbitrary upper bound here. */ #define MAX_LAYOUT_SIZE 128 -static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_layoutget_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* logr_return_on_close */ + @@ -3037,14 +3078,16 @@ static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op MAX_LAYOUT_SIZE) * sizeof(__be32); } -static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_layoutcommit_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* locr_newsize */ + 2 /* ns_size */) * sizeof(__be32); } -static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_layoutreturn_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* lrs_stateid */ + @@ -3053,41 +3096,36 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_ #endif /* CONFIG_NFSD_PNFS */ -static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_seek_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 3) * sizeof(__be32); } -static inline u32 nfsd4_getxattr_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_getxattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount, rlen; - - maxcount = svc_max_payload(rqstp); - rlen = min_t(u32, XATTR_SIZE_MAX, maxcount); + u32 rlen = min_t(u32, XATTR_SIZE_MAX, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + 1 + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_setxattr_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_setxattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_listxattrs_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_listxattrs_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount, rlen; - - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.listxattrs.lsxa_maxcount, maxcount); + u32 rlen = min(op->u.listxattrs.lsxa_maxcount, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + 4 + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_removexattr_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_removexattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) * sizeof(__be32); @@ -3576,6 +3614,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 1, @@ -3586,6 +3625,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_decode = nfs4svc_decode_compoundargs, .pc_encode = nfs4svc_encode_compoundres, .pc_argsize = sizeof(struct nfsd4_compoundargs), + .pc_argzero = offsetof(struct nfsd4_compoundargs, iops), .pc_ressize = sizeof(struct nfsd4_compoundres), .pc_release = nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index c634483d85d2..78b8cd9651d5 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -266,7 +266,7 @@ struct nfs4_dir_ctx { struct list_head names; }; -static int +static bool nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -275,14 +275,14 @@ nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, struct name_list *entry; if (namlen != HEXDIR_LEN - 1) - return 0; + return true; entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); if (entry == NULL) - return -ENOMEM; + return false; memcpy(entry->name, name, HEXDIR_LEN - 1); entry->name[HEXDIR_LEN - 1] = '\0'; list_add(&entry->list, &ctx->names); - return 0; + return true; } static int @@ -807,16 +807,18 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, if (get_user(namelen, &ci->cc_name.cn_len)) return -EFAULT; name.data = memdup_user(&ci->cc_name.cn_id, namelen); - if (IS_ERR_OR_NULL(name.data)) - return -EFAULT; + if (IS_ERR(name.data)) + return PTR_ERR(name.data); name.len = namelen; get_user(princhashlen, &ci->cc_princhash.cp_len); if (princhashlen > 0) { princhash.data = memdup_user( &ci->cc_princhash.cp_data, princhashlen); - if (IS_ERR_OR_NULL(princhash.data)) - return -EFAULT; + if (IS_ERR(princhash.data)) { + kfree(name.data); + return PTR_ERR(princhash.data); + } princhash.len = princhashlen; } else princhash.len = 0; @@ -827,8 +829,8 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, if (get_user(namelen, &cnm->cn_len)) return -EFAULT; name.data = memdup_user(&cnm->cn_id, namelen); - if (IS_ERR_OR_NULL(name.data)) - return -EFAULT; + if (IS_ERR(name.data)) + return PTR_ERR(name.data); name.len = namelen; } if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c5d199d7e6b4..198d7abf34e4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -160,6 +160,13 @@ static bool is_client_expired(struct nfs4_client *clp) return clp->cl_time == 0; } +static void nfsd4_dec_courtesy_client_count(struct nfsd_net *nn, + struct nfs4_client *clp) +{ + if (clp->cl_state != NFSD4_ACTIVE) + atomic_add_unless(&nn->nfsd_courtesy_clients, -1, 0); +} + static __be32 get_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); @@ -169,6 +176,7 @@ static __be32 get_client_locked(struct nfs4_client *clp) if (is_client_expired(clp)) return nfserr_expired; atomic_inc(&clp->cl_rpc_users); + nfsd4_dec_courtesy_client_count(nn, clp); clp->cl_state = NFSD4_ACTIVE; return nfs_ok; } @@ -190,6 +198,7 @@ renew_client_locked(struct nfs4_client *clp) list_move_tail(&clp->cl_lru, &nn->client_lru); clp->cl_time = ktime_get_boottime_seconds(); + nfsd4_dec_courtesy_client_count(nn, clp); clp->cl_state = NFSD4_ACTIVE; } @@ -357,6 +366,8 @@ nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb) static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { + trace_nfsd_cb_notify_lock_done(&zero_stateid, task); + /* * Since this is just an optimization, we don't try very hard if it * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and @@ -963,19 +974,19 @@ out_free: * Create a unique stateid_t to represent each COPY. */ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, - unsigned char sc_type) + unsigned char cs_type) { int new_id; - stid->stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time; - stid->stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; - stid->sc_type = sc_type; + stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time; + stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; + stid->cs_type = cs_type; idr_preload(GFP_KERNEL); spin_lock(&nn->s2s_cp_lock); new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT); - stid->stid.si_opaque.so_id = new_id; - stid->stid.si_generation = 1; + stid->cs_stid.si_opaque.so_id = new_id; + stid->cs_stid.si_generation = 1; spin_unlock(&nn->s2s_cp_lock); idr_preload_end(); if (new_id < 0) @@ -997,7 +1008,7 @@ struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, if (!cps) return NULL; cps->cpntf_time = ktime_get_boottime_seconds(); - refcount_set(&cps->cp_stateid.sc_count, 1); + refcount_set(&cps->cp_stateid.cs_count, 1); if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID)) goto out_free; spin_lock(&nn->s2s_cp_lock); @@ -1013,11 +1024,11 @@ void nfs4_free_copy_state(struct nfsd4_copy *copy) { struct nfsd_net *nn; - WARN_ON_ONCE(copy->cp_stateid.sc_type != NFS4_COPY_STID); + WARN_ON_ONCE(copy->cp_stateid.cs_type != NFS4_COPY_STID); nn = net_generic(copy->cp_clp->net, nfsd_net_id); spin_lock(&nn->s2s_cp_lock); idr_remove(&nn->s2s_cp_stateids, - copy->cp_stateid.stid.si_opaque.so_id); + copy->cp_stateid.cs_stid.si_opaque.so_id); spin_unlock(&nn->s2s_cp_lock); } @@ -1049,6 +1060,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) static void nfs4_free_deleg(struct nfs4_stid *stid) { + struct nfs4_delegation *dp = delegstateid(stid); + + WARN_ON_ONCE(!list_empty(&stid->sc_cp_list)); + WARN_ON_ONCE(!list_empty(&dp->dl_perfile)); + WARN_ON_ONCE(!list_empty(&dp->dl_perclnt)); + WARN_ON_ONCE(!list_empty(&dp->dl_recall_lru)); kmem_cache_free(deleg_slab, stid); atomic_long_dec(&num_delegations); } @@ -1462,6 +1479,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid) release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); + WARN_ON(!list_empty(&stid->sc_cp_list)); kmem_cache_free(stateid_slab, stid); } @@ -2233,6 +2251,7 @@ __destroy_client(struct nfs4_client *clp) if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); atomic_add_unless(&nn->nfs4_client_count, -1, 0); + nfsd4_dec_courtesy_client_count(nn, clp); free_client(clp); wake_up_all(&expiry_wq); } @@ -2478,7 +2497,7 @@ static const char *cb_state2str(int state) static int client_info_show(struct seq_file *m, void *v) { - struct inode *inode = m->private; + struct inode *inode = file_inode(m->file); struct nfs4_client *clp; u64 clid; @@ -2518,17 +2537,7 @@ static int client_info_show(struct seq_file *m, void *v) return 0; } -static int client_info_open(struct inode *inode, struct file *file) -{ - return single_open(file, client_info_show, inode); -} - -static const struct file_operations client_info_fops = { - .open = client_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(client_info); static void *states_start(struct seq_file *s, loff_t *pos) __acquires(&clp->cl_lock) @@ -4337,7 +4346,27 @@ out: return -ENOMEM; } -void nfsd4_init_leases_net(struct nfsd_net *nn) +static unsigned long +nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cnt; + struct nfsd_net *nn = container_of(shrink, + struct nfsd_net, nfsd_client_shrinker); + + cnt = atomic_read(&nn->nfsd_courtesy_clients); + if (cnt > 0) + mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); + return (unsigned long)cnt; +} + +static unsigned long +nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + return SHRINK_STOP; +} + +int +nfsd4_init_leases_net(struct nfsd_net *nn) { struct sysinfo si; u64 max_clients; @@ -4356,6 +4385,18 @@ void nfsd4_init_leases_net(struct nfsd_net *nn) max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024); max_clients *= NFS4_CLIENTS_PER_GB; nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); + + atomic_set(&nn->nfsd_courtesy_clients, 0); + nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan; + nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count; + nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; + return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"); +} + +void +nfsd4_leases_net_shutdown(struct nfsd_net *nn) +{ + unregister_shrinker(&nn->nfsd_client_shrinker); } static void init_nfs4_replay(struct nfs4_replay *rp) @@ -4715,6 +4756,35 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) return ret; } +static bool nfsd4_deleg_present(const struct inode *inode) +{ + struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx); + + return ctx && !list_empty_careful(&ctx->flc_lease); +} + +/** + * nfsd_wait_for_delegreturn - wait for delegations to be returned + * @rqstp: the RPC transaction being executed + * @inode: in-core inode of the file being waited for + * + * The timeout prevents deadlock if all nfsd threads happen to be + * tied up waiting for returning delegations. + * + * Return values: + * %true: delegation was returned + * %false: timed out waiting for delegreturn + */ +bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode) +{ + long __maybe_unused timeo; + + timeo = wait_var_event_timeout(inode, !nfsd4_deleg_present(inode), + NFSD_DELEGRETURN_TIMEOUT); + trace_nfsd_delegret_wakeup(rqstp, inode, timeo); + return timeo > 0; +} + static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) { struct nfs4_delegation *dp = cb_to_delegation(cb); @@ -4743,6 +4813,8 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, { struct nfs4_delegation *dp = cb_to_delegation(cb); + trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task); + if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID || dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) return 1; @@ -4788,18 +4860,17 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the - * i_lock) we know the server hasn't removed the lease yet, and + * flc_lock) we know the server hasn't removed the lease yet, and * we know it's safe to take a reference. */ refcount_inc(&dp->dl_stid.sc_count); - nfsd4_run_cb(&dp->dl_recall); + WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall)); } -/* Called from break_lease() with i_lock held. */ +/* Called from break_lease() with flc_lock held. */ static bool nfsd_break_deleg_cb(struct file_lock *fl) { - bool ret = false; struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; struct nfs4_client *clp = dp->dl_stid.sc_client; @@ -4825,7 +4896,7 @@ nfsd_break_deleg_cb(struct file_lock *fl) fp->fi_had_conflict = true; nfsd_break_one_deleg(dp); spin_unlock(&fp->fi_lock); - return ret; + return false; } /** @@ -5878,8 +5949,11 @@ nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, goto exp_client; if (!state_expired(lt, clp->cl_time)) break; - if (!atomic_read(&clp->cl_rpc_users)) + if (!atomic_read(&clp->cl_rpc_users)) { + if (clp->cl_state == NFSD4_ACTIVE) + atomic_inc(&nn->nfsd_courtesy_clients); clp->cl_state = NFSD4_COURTESY; + } if (!client_has_state(clp)) goto exp_client; if (!nfs4_anylock_blockers(clp)) @@ -5894,10 +5968,49 @@ exp_client: spin_unlock(&nn->client_lock); } +static void +nfs4_get_courtesy_client_reaplist(struct nfsd_net *nn, + struct list_head *reaplist) +{ + unsigned int maxreap = 0, reapcnt = 0; + struct list_head *pos, *next; + struct nfs4_client *clp; + + maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN; + INIT_LIST_HEAD(reaplist); + + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (clp->cl_state == NFSD4_ACTIVE) + break; + if (reapcnt >= maxreap) + break; + if (!mark_client_expired_locked(clp)) { + list_add(&clp->cl_lru, reaplist); + reapcnt++; + } + } + spin_unlock(&nn->client_lock); +} + +static void +nfs4_process_client_reaplist(struct list_head *reaplist) +{ + struct list_head *pos, *next; + struct nfs4_client *clp; + + list_for_each_safe(pos, next, reaplist) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + trace_nfsd_clid_purged(&clp->cl_clientid); + list_del_init(&clp->cl_lru); + expire_client(clp); + } +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { - struct nfs4_client *clp; struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct nfs4_ol_stateid *stp; @@ -5920,18 +6033,14 @@ nfs4_laundromat(struct nfsd_net *nn) spin_lock(&nn->s2s_cp_lock); idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); - if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID && + if (cps->cp_stateid.cs_type == NFS4_COPYNOTIFY_STID && state_expired(<, cps->cpntf_time)) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); nfs4_get_client_reaplist(nn, &reaplist, <); - list_for_each_safe(pos, next, &reaplist) { - clp = list_entry(pos, struct nfs4_client, cl_lru); - trace_nfsd_clid_purged(&clp->cl_clientid); - list_del_init(&clp->cl_lru); - expire_client(clp); - } + nfs4_process_client_reaplist(&reaplist); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); @@ -6014,6 +6123,18 @@ laundromat_main(struct work_struct *laundry) queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); } +static void +courtesy_client_reaper(struct work_struct *reaper) +{ + struct list_head reaplist; + struct delayed_work *dwork = to_delayed_work(reaper); + struct nfsd_net *nn = container_of(dwork, struct nfsd_net, + nfsd_shrinker_work); + + nfs4_get_courtesy_client_reaplist(nn, &reaplist); + nfs4_process_client_reaplist(&reaplist); +} + static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) @@ -6149,6 +6270,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; + struct nfs4_stid *stid; bool return_revoked = false; /* @@ -6171,15 +6293,16 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, } if (status) return status; - *s = find_stateid_by_type(cstate->clp, stateid, typemask); - if (!*s) + stid = find_stateid_by_type(cstate->clp, stateid, typemask); + if (!stid) return nfserr_bad_stateid; - if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { - nfs4_put_stid(*s); + if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + nfs4_put_stid(stid); if (cstate->minorversion) return nfserr_deleg_revoked; return nfserr_bad_stateid; } + *s = stid; return nfs_ok; } @@ -6244,12 +6367,12 @@ out: static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) { - WARN_ON_ONCE(cps->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID); - if (!refcount_dec_and_test(&cps->cp_stateid.sc_count)) + WARN_ON_ONCE(cps->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID); + if (!refcount_dec_and_test(&cps->cp_stateid.cs_count)) return; list_del(&cps->cp_list); idr_remove(&nn->s2s_cp_stateids, - cps->cp_stateid.stid.si_opaque.so_id); + cps->cp_stateid.cs_stid.si_opaque.so_id); kfree(cps); } /* @@ -6271,12 +6394,12 @@ __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st, if (cps_t) { state = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); - if (state->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID) { + if (state->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID) { state = NULL; goto unlock; } if (!clp) - refcount_inc(&state->cp_stateid.sc_count); + refcount_inc(&state->cp_stateid.cs_count); else _free_cpntf_state_locked(nn, state); } @@ -6684,6 +6807,7 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) struct nfs4_client *clp = s->st_stid.sc_client; bool unhashed; LIST_HEAD(reaplist); + struct nfs4_ol_stateid *stp; spin_lock(&clp->cl_lock); unhashed = unhash_open_stateid(s, &reaplist); @@ -6692,6 +6816,8 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) if (unhashed) put_ol_stateid_locked(s, &reaplist); spin_unlock(&clp->cl_lock); + list_for_each_entry(stp, &reaplist, st_locks) + nfs4_free_cpntf_statelist(clp->net, &stp->st_stid); free_ol_stateid_reaplist(&reaplist); } else { spin_unlock(&clp->cl_lock); @@ -6775,6 +6901,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto put_stateid; + wake_up_var(d_inode(cstate->current_fh.fh_dentry)); destroy_delegation(dp); put_stateid: nfs4_put_stid(&dp->dl_stid); @@ -7830,6 +7957,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->blocked_locks_lru); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); + INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, courtesy_client_reaper); get_net(net); return 0; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1e9690a061ec..bcfeb1a922c0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -42,6 +42,8 @@ #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/addr.h> #include <linux/xattr.h> +#include <linux/vmalloc.h> + #include <uapi/linux/xattr.h> #include "idmap.h" @@ -791,6 +793,7 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit return nfserr_bad_xdr; if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0) return nfserr_bad_xdr; + memset(&commit->co_verf, 0, sizeof(commit->co_verf)); return nfs_ok; } @@ -799,6 +802,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create { __be32 *p, status; + memset(create, 0, sizeof(*create)); if (xdr_stream_decode_u32(argp->xdr, &create->cr_type) < 0) return nfserr_bad_xdr; switch (create->cr_type) { @@ -848,6 +852,7 @@ nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegretu static inline __be32 nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) { + memset(getattr, 0, sizeof(*getattr)); return nfsd4_decode_bitmap4(argp, getattr->ga_bmval, ARRAY_SIZE(getattr->ga_bmval)); } @@ -855,6 +860,7 @@ nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *geta static __be32 nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) { + memset(link, 0, sizeof(*link)); return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen); } @@ -903,6 +909,7 @@ nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) static __be32 nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) { + memset(lock, 0, sizeof(*lock)); if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0) return nfserr_bad_xdr; if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) @@ -919,6 +926,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) static __be32 nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) { + memset(lockt, 0, sizeof(*lockt)); if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0) return nfserr_bad_xdr; if ((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) @@ -1140,11 +1148,8 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) __be32 status; u32 dummy; - memset(open->op_bmval, 0, sizeof(open->op_bmval)); - open->op_iattr.ia_valid = 0; - open->op_openowner = NULL; + memset(open, 0, sizeof(*open)); - open->op_xdr_error = 0; if (xdr_stream_decode_u32(argp->xdr, &open->op_seqid) < 0) return nfserr_bad_xdr; /* deleg_want is ignored */ @@ -1179,6 +1184,8 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con if (xdr_stream_decode_u32(argp->xdr, &open_conf->oc_seqid) < 0) return nfserr_bad_xdr; + memset(&open_conf->oc_resp_stateid, 0, + sizeof(open_conf->oc_resp_stateid)); return nfs_ok; } @@ -1187,6 +1194,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d { __be32 status; + memset(open_down, 0, sizeof(*open_down)); status = nfsd4_decode_stateid4(argp, &open_down->od_stateid); if (status) return status; @@ -1216,6 +1224,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) if (!putfh->pf_fhval) return nfserr_jukebox; + putfh->no_verify = false; return nfs_ok; } @@ -1232,6 +1241,7 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) { __be32 status; + memset(read, 0, sizeof(*read)); status = nfsd4_decode_stateid4(argp, &read->rd_stateid); if (status) return status; @@ -1248,6 +1258,7 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read { __be32 status; + memset(readdir, 0, sizeof(*readdir)); if (xdr_stream_decode_u64(argp->xdr, &readdir->rd_cookie) < 0) return nfserr_bad_xdr; status = nfsd4_decode_verifier4(argp, &readdir->rd_verf); @@ -1267,6 +1278,7 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read static __be32 nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) { + memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo)); return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen); } @@ -1275,6 +1287,7 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename { __be32 status; + memset(rename, 0, sizeof(*rename)); status = nfsd4_decode_component4(argp, &rename->rn_sname, &rename->rn_snamelen); if (status) return status; @@ -1291,6 +1304,7 @@ static __be32 nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, struct nfsd4_secinfo *secinfo) { + secinfo->si_exp = NULL; return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen); } @@ -1299,6 +1313,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta { __be32 status; + memset(setattr, 0, sizeof(*setattr)); status = nfsd4_decode_stateid4(argp, &setattr->sa_stateid); if (status) return status; @@ -1313,6 +1328,8 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient { __be32 *p, status; + memset(setclientid, 0, sizeof(*setclientid)); + if (argp->minorversion >= 1) return nfserr_notsupp; @@ -1369,6 +1386,8 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify { __be32 *p, status; + memset(verify, 0, sizeof(*verify)); + status = nfsd4_decode_bitmap4(argp, verify->ve_bmval, ARRAY_SIZE(verify->ve_bmval)); if (status) @@ -1408,6 +1427,9 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen)) return nfserr_bad_xdr; + write->wr_bytes_written = 0; + write->wr_how_written = 0; + memset(&write->wr_verifier, 0, sizeof(write->wr_verifier)); return nfs_ok; } @@ -1432,6 +1454,7 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) { + memset(bc, 0, sizeof(*bc)); if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0) return nfserr_bad_xdr; return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); @@ -1442,6 +1465,7 @@ static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, u32 use_conn_in_rdma_mode; __be32 status; + memset(bcts, 0, sizeof(*bcts)); status = nfsd4_decode_sessionid4(argp, &bcts->sessionid); if (status) return status; @@ -1583,6 +1607,7 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, { __be32 status; + memset(exid, 0, sizeof(*exid)); status = nfsd4_decode_verifier4(argp, &exid->verifier); if (status) return status; @@ -1635,6 +1660,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, { __be32 status; + memset(sess, 0, sizeof(*sess)); status = nfsd4_decode_clientid4(argp, &sess->clientid); if (status) return status; @@ -1650,11 +1676,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, return status; if (xdr_stream_decode_u32(argp->xdr, &sess->callback_prog) < 0) return nfserr_bad_xdr; - status = nfsd4_decode_cb_sec(argp, &sess->cb_sec); - if (status) - return status; - - return nfs_ok; + return nfsd4_decode_cb_sec(argp, &sess->cb_sec); } static __be32 @@ -1678,6 +1700,7 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, { __be32 status; + memset(gdev, 0, sizeof(*gdev)); status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid); if (status) return status; @@ -1698,6 +1721,7 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, { __be32 *p, status; + memset(lcp, 0, sizeof(*lcp)); if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.offset) < 0) return nfserr_bad_xdr; if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.length) < 0) @@ -1733,6 +1757,7 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, { __be32 status; + memset(lgp, 0, sizeof(*lgp)); if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_signal) < 0) return nfserr_bad_xdr; if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_layout_type) < 0) @@ -1758,6 +1783,7 @@ static __be32 nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, struct nfsd4_layoutreturn *lrp) { + memset(lrp, 0, sizeof(*lrp)); if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0) return nfserr_bad_xdr; if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_layout_type) < 0) @@ -1773,6 +1799,8 @@ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, { if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0) return nfserr_bad_xdr; + + sin->sin_exp = NULL; return nfs_ok; } @@ -1793,6 +1821,7 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, seq->maxslots = be32_to_cpup(p++); seq->cachethis = be32_to_cpup(p); + seq->status_flags = 0; return nfs_ok; } @@ -1803,6 +1832,7 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta __be32 status; u32 i; + memset(test_stateid, 0, sizeof(*test_stateid)); if (xdr_stream_decode_u32(argp->xdr, &test_stateid->ts_num_ids) < 0) return nfserr_bad_xdr; @@ -1900,6 +1930,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) struct nl4_server *ns_dummy; __be32 status; + memset(copy, 0, sizeof(*copy)); status = nfsd4_decode_stateid4(argp, ©->cp_src_stateid); if (status) return status; @@ -1955,6 +1986,7 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, { __be32 status; + memset(cn, 0, sizeof(*cn)); cn->cpn_src = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_src)); if (cn->cpn_src == NULL) return nfserr_jukebox; @@ -1972,6 +2004,8 @@ static __be32 nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, struct nfsd4_offload_status *os) { + os->count = 0; + os->status = 0; return nfsd4_decode_stateid4(argp, &os->stateid); } @@ -1988,6 +2022,8 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) if (xdr_stream_decode_u32(argp->xdr, &seek->seek_whence) < 0) return nfserr_bad_xdr; + seek->seek_eof = 0; + seek->seek_pos = 0; return nfs_ok; } @@ -2123,6 +2159,7 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, __be32 status; u32 maxcount; + memset(getxattr, 0, sizeof(*getxattr)); status = nfsd4_decode_xattr_name(argp, &getxattr->getxa_name); if (status) return status; @@ -2131,8 +2168,7 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount); getxattr->getxa_len = maxcount; - - return status; + return nfs_ok; } static __be32 @@ -2142,6 +2178,8 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, u32 flags, maxcount, size; __be32 status; + memset(setxattr, 0, sizeof(*setxattr)); + if (xdr_stream_decode_u32(argp->xdr, &flags) < 0) return nfserr_bad_xdr; @@ -2180,6 +2218,8 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, { u32 maxcount; + memset(listxattrs, 0, sizeof(*listxattrs)); + if (xdr_stream_decode_u64(argp->xdr, &listxattrs->lsxa_cookie) < 0) return nfserr_bad_xdr; @@ -2207,6 +2247,7 @@ static __be32 nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp, struct nfsd4_removexattr *removexattr) { + memset(removexattr, 0, sizeof(*removexattr)); return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name); } @@ -2357,22 +2398,15 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0) return false; - if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0) + if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0) return false; - - /* - * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS - * here, so we return success at the xdr level so that - * nfsd4_proc can handle this is an NFS-level error. - */ - if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND) - return true; + argp->opcnt = min_t(u32, argp->client_opcnt, + NFSD_MAX_OPS_PER_COMPOUND); if (argp->opcnt > ARRAY_SIZE(argp->iops)) { - argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); + argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops)); if (!argp->ops) { argp->ops = argp->iops; - dprintk("nfsd: couldn't allocate room for COMPOUND\n"); return false; } } @@ -2774,9 +2808,10 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 } -static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) +static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino) { struct path path = exp->ex_path; + struct kstat stat; int err; path_get(&path); @@ -2784,8 +2819,10 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) if (path.dentry != path.mnt->mnt_root) break; } - err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + err = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); path_put(&path); + if (!err) + *pino = stat.ino; return err; } @@ -3282,22 +3319,21 @@ out_acl: *p++ = cpu_to_be32(stat.btime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - struct kstat parent_stat; u64 ino = stat.ino; p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; /* - * Get parent's attributes if not ignoring crossmount - * and this is the root of a cross-mounted filesystem. + * Get ino of mountpoint in parent filesystem, if not ignoring + * crossmount and this is the root of a cross-mounted + * filesystem. */ if (ignore_crossmnt == 0 && dentry == exp->ex_path.mnt->mnt_root) { - err = get_parent_attributes(exp, &parent_stat); + err = nfsd4_get_mounted_on_ino(exp, &ino); if (err) goto out_nfserr; - ino = parent_stat.ino; } p = xdr_encode_hyper(p, ino); } @@ -3994,7 +4030,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, } if (resp->xdr->buf->page_len && splice_ok) { WARN_ON_ONCE(1); - return nfserr_resource; + return nfserr_serverfault; } xdr_commit_encode(xdr); @@ -5394,7 +5430,7 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp) struct nfsd4_compoundargs *args = rqstp->rq_argp; if (args->ops != args->iops) { - kfree(args->ops); + vfree(args->ops); args->ops = args->iops; } while (args->to_free) { @@ -5423,12 +5459,8 @@ bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd4_compoundres *resp = rqstp->rq_resp; - struct xdr_buf *buf = xdr->buf; __be32 *p; - WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + - buf->tail[0].iov_len); - /* * Send buffer space for the following items is reserved * at the top of nfsd4_proc_compound(). diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 9b31e1103e7b..3e64a3d50a1c 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -604,9 +604,10 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) * scraping this file for info should test the labels to ensure they're * getting the correct field. */ -static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) +int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) { - struct nfsd_net *nn = m->private; + struct nfsd_net *nn = net_generic(file_inode(m->file)->i_sb->s_fs_info, + nfsd_net_id); seq_printf(m, "max entries: %u\n", nn->max_drc_entries); seq_printf(m, "num entries: %u\n", @@ -626,11 +627,3 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); return 0; } - -int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file) -{ - struct nfsd_net *nn = net_generic(file_inode(file)->i_sb->s_fs_info, - nfsd_net_id); - - return single_open(file, nfsd_reply_cache_stats_show, nn); -} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 917fa1892fd2..6a29bcfc9390 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -185,17 +185,7 @@ static int export_features_show(struct seq_file *m, void *v) return 0; } -static int export_features_open(struct inode *inode, struct file *file) -{ - return single_open(file, export_features_show, NULL); -} - -static const struct file_operations export_features_operations = { - .open = export_features_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(export_features); #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) static int supported_enctypes_show(struct seq_file *m, void *v) @@ -204,17 +194,7 @@ static int supported_enctypes_show(struct seq_file *m, void *v) return 0; } -static int supported_enctypes_open(struct inode *inode, struct file *file) -{ - return single_open(file, supported_enctypes_show, NULL); -} - -static const struct file_operations supported_enctypes_ops = { - .open = supported_enctypes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(supported_enctypes); #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ static const struct file_operations pool_stats_operations = { @@ -224,19 +204,9 @@ static const struct file_operations pool_stats_operations = { .release = nfsd_pool_stats_release, }; -static const struct file_operations reply_cache_stats_operations = { - .open = nfsd_reply_cache_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats); -static const struct file_operations filecache_ops = { - .open = nfsd_file_cache_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(nfsd_file_cache_stats); /*----------------------------------------------------------------------------*/ /* @@ -1365,7 +1335,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) /* Per-export io stats use same ops as exports file */ [NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", - &export_features_operations, S_IRUGO}, + &export_features_fops, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_FO_UnlockFS] = {"unlock_filesystem", @@ -1374,14 +1344,16 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, - [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO}, + [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", + &nfsd_reply_cache_stats_fops, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, - [NFSD_Filecache] = {"filecache", &filecache_ops, S_IRUGO}, + [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO}, #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) - [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, + [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", + &supported_enctypes_fops, S_IRUGO}, #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, @@ -1481,11 +1453,12 @@ static __net_init int nfsd_init_net(struct net *net) goto out_idmap_error; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; + retval = nfsd4_init_leases_net(nn); + if (retval) + goto out_drc_error; retval = nfsd_reply_cache_init(nn); if (retval) goto out_drc_error; - nfsd4_init_leases_net(nn); - get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); @@ -1507,6 +1480,7 @@ static __net_exit void nfsd_exit_net(struct net *net) nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); + nfsd4_leases_net_shutdown(nn); } static struct pernet_operations nfsd_net_ops = { diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 57a468ed85c3..09726c5b9a31 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -164,6 +164,7 @@ char * nfs4_recoverydir(void); bool nfsd4_spo_must_allow(struct svc_rqst *rqstp); int nfsd4_create_laundry_wq(void); void nfsd4_destroy_laundry_wq(void); +bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode); #else static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } @@ -179,6 +180,11 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) } static inline int nfsd4_create_laundry_wq(void) { return 0; }; static inline void nfsd4_destroy_laundry_wq(void) {}; +static inline bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, + struct inode *inode) +{ + return false; +} #endif /* @@ -343,6 +349,7 @@ void nfsd_lockd_shutdown(void); #define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */ #define NFSD_CLIENT_MAX_TRIM_PER_RUN 128 #define NFS4_CLIENTS_PER_GB 1024 +#define NFSD_DELEGRETURN_TIMEOUT (HZ / 34) /* 30ms */ /* * The following attributes are currently not supported by the NFSv4 server: @@ -498,7 +505,8 @@ extern void unregister_cld_notifier(void); extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn); #endif -extern void nfsd4_init_leases_net(struct nfsd_net *nn); +extern int nfsd4_init_leases_net(struct nfsd_net *nn); +extern void nfsd4_leases_net_shutdown(struct nfsd_net *nn); #else /* CONFIG_NFSD_V4 */ static inline int nfsd4_is_junction(struct dentry *dentry) @@ -506,7 +514,8 @@ static inline int nfsd4_is_junction(struct dentry *dentry) return 0; } -static inline void nfsd4_init_leases_net(struct nfsd_net *nn) {}; +static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; }; +static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) {}; #define register_cld_notifier() 0 #define unregister_cld_notifier() do { } while(0) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index a5b71526cee0..d73434200df9 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -392,13 +392,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) skip_pseudoflavor_check: /* Finally, check access permissions. */ error = nfsd_permission(rqstp, exp, dentry, access); - - if (error) { - dprintk("fh_verify: %pd2 permission failure, " - "acc=%x, error=%d\n", - dentry, - access, ntohl(error)); - } + trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); out: if (error == nfserr_stale) nfsd_stats_fh_stale_inc(exp); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 7381972f1677..82b3ddeacc33 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -185,6 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) argp->count, argp->offset); argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); + argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); v = 0; len = argp->count; @@ -390,9 +391,8 @@ nfsd_proc_create(struct svc_rqst *rqstp) resp->status = nfs_ok; if (!inode) { /* File doesn't exist. Create it and set attrs */ - resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name, - argp->len, &attrs, type, rdev, - newfhp); + resp->status = nfsd_create_locked(rqstp, dirfhp, &attrs, type, + rdev, newfhp); } else if (type == S_IFREG) { dprintk("nfsd: existing %s, valid=%x, size=%ld\n", argp->name, attr->ia_valid, (long) attr->ia_size); @@ -567,24 +567,15 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, struct xdr_buf *buf = &resp->dirlist; struct xdr_stream *xdr = &resp->xdr; - count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp)); - memset(buf, 0, sizeof(*buf)); /* Reserve room for the NULL ptr & eof flag (-2 words) */ - buf->buflen = count - XDR_UNIT * 2; + buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), (u32)PAGE_SIZE); + buf->buflen -= XDR_UNIT * 2; buf->pages = rqstp->rq_next_page; rqstp->rq_next_page++; - /* This is xdr_init_encode(), but it assumes that - * the head kvec has already been consumed. */ - xdr_set_scratch_buffer(xdr, NULL, 0); - xdr->buf = buf; - xdr->page_ptr = buf->pages; - xdr->iov = NULL; - xdr->p = page_address(*buf->pages); - xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); - xdr->rqst = NULL; + xdr_init_encode_pages(xdr, buf, buf->pages, NULL); } /* @@ -646,6 +637,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, @@ -657,6 +649,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, @@ -668,6 +661,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_sattrargs), + .pc_argzero = sizeof(struct nfsd_sattrargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, @@ -678,6 +672,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, @@ -689,6 +684,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_diropres, .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_argzero = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+AT, @@ -699,6 +695,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_readlinkres, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, @@ -710,6 +707,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_readres, .pc_release = nfssvc_release_readres, .pc_argsize = sizeof(struct nfsd_readargs), + .pc_argzero = sizeof(struct nfsd_readargs), .pc_ressize = sizeof(struct nfsd_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, @@ -720,6 +718,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_voidarg, .pc_encode = nfssvc_encode_voidres, .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, @@ -731,6 +730,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_writeargs), + .pc_argzero = sizeof(struct nfsd_writeargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, @@ -742,6 +742,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_diropres, .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_createargs), + .pc_argzero = sizeof(struct nfsd_createargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, @@ -752,6 +753,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_diropargs, .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_argzero = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, @@ -762,6 +764,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_renameargs, .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_renameargs), + .pc_argzero = sizeof(struct nfsd_renameargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, @@ -772,6 +775,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_linkargs, .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_linkargs), + .pc_argzero = sizeof(struct nfsd_linkargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, @@ -782,6 +786,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_symlinkargs, .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_symlinkargs), + .pc_argzero = sizeof(struct nfsd_symlinkargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, @@ -793,6 +798,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_encode = nfssvc_encode_diropres, .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_createargs), + .pc_argzero = sizeof(struct nfsd_createargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, @@ -803,6 +809,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_diropargs, .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_argzero = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, @@ -813,6 +820,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_readdirargs, .pc_encode = nfssvc_encode_readdirres, .pc_argsize = sizeof(struct nfsd_readdirargs), + .pc_argzero = sizeof(struct nfsd_readdirargs), .pc_ressize = sizeof(struct nfsd_readdirres), .pc_cachetype = RC_NOCACHE, .pc_name = "READDIR", @@ -822,6 +830,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_statfsres, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_statfsres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+5, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 4bb5baa17040..bfbd9f672f59 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -799,7 +799,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) if (nrservs == 0 && nn->nfsd_serv == NULL) goto out; - strlcpy(nn->nfsd_name, utsname()->nodename, + strscpy(nn->nfsd_name, utsname()->nodename, sizeof(nn->nfsd_name)); error = nfsd_create_serv(net); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index aba8520b4b8b..caf6355b18fa 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -338,10 +338,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (args->len > NFSSVC_MAXBLKSIZE_V2) return false; - if (!xdr_stream_subsegment(xdr, &args->payload, args->len)) - return false; - return true; + return xdr_stream_subsegment(xdr, &args->payload, args->len); } bool diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ae596dbf8667..e2daef3cc003 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -57,11 +57,11 @@ typedef struct { } stateid_t; typedef struct { - stateid_t stid; + stateid_t cs_stid; #define NFS4_COPY_STID 1 #define NFS4_COPYNOTIFY_STID 2 - unsigned char sc_type; - refcount_t sc_count; + unsigned char cs_type; + refcount_t cs_count; } copy_stateid_t; struct nfsd4_callback { @@ -175,7 +175,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) /* Maximum number of slots per session. 160 is useful for long haul TCP */ #define NFSD_MAX_SLOTS_PER_SESSION 160 /* Maximum number of operations per session compound */ -#define NFSD_MAX_OPS_PER_COMPOUND 16 +#define NFSD_MAX_OPS_PER_COMPOUND 50 /* Maximum session per slot cache size */ #define NFSD_SLOT_CACHE_SIZE 2048 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ @@ -692,12 +692,11 @@ extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); -extern void nfsd4_run_cb(struct nfsd4_callback *cb); +extern bool nfsd4_run_cb(struct nfsd4_callback *cb); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); -extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index a8c5a02a84f0..777e24e5da33 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -32,7 +32,7 @@ struct svc_stat nfsd_svcstats = { .program = &nfsd_program, }; -static int nfsd_proc_show(struct seq_file *seq, void *v) +static int nfsd_show(struct seq_file *seq, void *v) { int i; @@ -72,17 +72,7 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) return 0; } -static int nfsd_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, nfsd_proc_show, NULL); -} - -static const struct proc_ops nfsd_proc_ops = { - .proc_open = nfsd_proc_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = single_release, -}; +DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); int nfsd_percpu_counters_init(struct percpu_counter counters[], int num) { diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 9ebd67d461f9..06a96e955bd0 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -84,19 +84,26 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode); { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }) TRACE_EVENT(nfsd_compound, - TP_PROTO(const struct svc_rqst *rqst, - u32 args_opcnt), - TP_ARGS(rqst, args_opcnt), + TP_PROTO( + const struct svc_rqst *rqst, + const char *tag, + u32 taglen, + u32 opcnt + ), + TP_ARGS(rqst, tag, taglen, opcnt), TP_STRUCT__entry( __field(u32, xid) - __field(u32, args_opcnt) + __field(u32, opcnt) + __string_len(tag, tag, taglen) ), TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); - __entry->args_opcnt = args_opcnt; + __entry->opcnt = opcnt; + __assign_str_len(tag, tag, taglen); ), - TP_printk("xid=0x%08x opcnt=%u", - __entry->xid, __entry->args_opcnt) + TP_printk("xid=0x%08x opcnt=%u tag=%s", + __entry->xid, __entry->opcnt, __get_str(tag) + ) ) TRACE_EVENT(nfsd_compound_status, @@ -195,7 +202,7 @@ TRACE_EVENT(nfsd_fh_verify, __sockaddr(client, rqstp->rq_xprt->xpt_remotelen) __field(u32, xid) __field(u32, fh_hash) - __field(void *, inode) + __field(const void *, inode) __field(unsigned long, type) __field(unsigned long, access) ), @@ -211,13 +218,55 @@ TRACE_EVENT(nfsd_fh_verify, __entry->type = type; __entry->access = access; ), - TP_printk("xid=0x%08x fh_hash=0x%08x inode=%p type=%s access=%s", - __entry->xid, __entry->fh_hash, __entry->inode, + TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s", + __entry->xid, __entry->fh_hash, show_fs_file_type(__entry->type), show_nfsd_may_flags(__entry->access) ) ); +TRACE_EVENT_CONDITION(nfsd_fh_verify_err, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + umode_t type, + int access, + __be32 error + ), + TP_ARGS(rqstp, fhp, type, access, error), + TP_CONDITION(error), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __sockaddr(server, rqstp->rq_xprt->xpt_remotelen) + __sockaddr(client, rqstp->rq_xprt->xpt_remotelen) + __field(u32, xid) + __field(u32, fh_hash) + __field(const void *, inode) + __field(unsigned long, type) + __field(unsigned long, access) + __field(int, error) + ), + TP_fast_assign( + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; + __assign_sockaddr(server, &rqstp->rq_xprt->xpt_local, + rqstp->rq_xprt->xpt_locallen); + __assign_sockaddr(client, &rqstp->rq_xprt->xpt_remote, + rqstp->rq_xprt->xpt_remotelen); + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->inode = d_inode(fhp->fh_dentry); + __entry->type = type; + __entry->access = access; + __entry->error = be32_to_cpu(error); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s error=%d", + __entry->xid, __entry->fh_hash, + show_fs_file_type(__entry->type), + show_nfsd_may_flags(__entry->access), + __entry->error + ) +); + DECLARE_EVENT_CLASS(nfsd_fh_err_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -489,6 +538,29 @@ DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err); #include "filecache.h" #include "vfs.h" +TRACE_EVENT(nfsd_delegret_wakeup, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct inode *inode, + long timeo + ), + TP_ARGS(rqstp, inode, timeo), + TP_STRUCT__entry( + __field(u32, xid) + __field(const void *, inode) + __field(long, timeo) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->inode = inode; + __entry->timeo = timeo; + ), + TP_printk("xid=0x%08x inode=%p%s", + __entry->xid, __entry->inode, + __entry->timeo == 0 ? " (timed out)" : "" + ) +); + DECLARE_EVENT_CLASS(nfsd_stateid_class, TP_PROTO(stateid_t *stp), TP_ARGS(stp), @@ -1399,6 +1471,45 @@ TRACE_EVENT(nfsd_cb_offload, __entry->fh_hash, __entry->count, __entry->status) ); +DECLARE_EVENT_CLASS(nfsd_cb_done_class, + TP_PROTO( + const stateid_t *stp, + const struct rpc_task *task + ), + TP_ARGS(stp, task), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + __field(int, status) + ), + TP_fast_assign( + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + __entry->status = task->tk_status; + ), + TP_printk("client %08x:%08x stateid %08x:%08x status=%d", + __entry->cl_boot, __entry->cl_id, __entry->si_id, + __entry->si_generation, __entry->status + ) +); + +#define DEFINE_NFSD_CB_DONE_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_done_class, name, \ + TP_PROTO( \ + const stateid_t *stp, \ + const struct rpc_task *task \ + ), \ + TP_ARGS(stp, task)) + +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_recall_done); +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done); +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done); +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9f486b788ed0..f650afedd67f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -300,6 +300,10 @@ commit_metadata(struct svc_fh *fhp) static void nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) { + /* Ignore mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; + /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { iap->ia_mode &= S_IALLUGO; @@ -339,8 +343,61 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, return nfserrno(get_write_access(inode)); } -/* - * Set various file attributes. After this call fhp needs an fh_put. +static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) +{ + int host_err; + + if (iap->ia_valid & ATTR_SIZE) { + /* + * RFC5661, Section 18.30.4: + * Changing the size of a file with SETATTR indirectly + * changes the time_modify and change attributes. + * + * (and similar for the older RFCs) + */ + struct iattr size_attr = { + .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, + .ia_size = iap->ia_size, + }; + + if (iap->ia_size < 0) + return -EFBIG; + + host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); + if (host_err) + return host_err; + iap->ia_valid &= ~ATTR_SIZE; + + /* + * Avoid the additional setattr call below if the only other + * attribute that the client sends is the mtime, as we update + * it as part of the size change above. + */ + if ((iap->ia_valid & ~ATTR_MTIME) == 0) + return 0; + } + + if (!iap->ia_valid) + return 0; + + iap->ia_valid |= ATTR_CTIME; + return notify_change(&init_user_ns, dentry, iap, NULL); +} + +/** + * nfsd_setattr - Set various file attributes. + * @rqstp: controlling RPC transaction + * @fhp: filehandle of target + * @attr: attributes to set + * @check_guard: set to 1 if guardtime is a valid timestamp + * @guardtime: do not act if ctime.tv_sec does not match this timestamp + * + * This call may adjust the contents of @attr (in particular, this + * call may change the bits in the na_iattr.ia_valid field). + * + * Returns nfs_ok on success, otherwise an NFS status code is + * returned. Caller must release @fhp by calling fh_put in either + * case. */ __be32 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -356,6 +413,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int host_err; bool get_write_count; bool size_change = (iap->ia_valid & ATTR_SIZE); + int retries; if (iap->ia_valid & ATTR_SIZE) { accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; @@ -391,13 +449,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; inode = d_inode(dentry); - /* Ignore any mode updates on symlinks */ - if (S_ISLNK(inode->i_mode)) - iap->ia_valid &= ~ATTR_MODE; - - if (!iap->ia_valid) - return 0; - nfsd_sanitize_attrs(inode, iap); if (check_guard && guardtime != inode->i_ctime.tv_sec) @@ -417,41 +468,13 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, } inode_lock(inode); - if (size_change) { - /* - * RFC5661, Section 18.30.4: - * Changing the size of a file with SETATTR indirectly - * changes the time_modify and change attributes. - * - * (and similar for the older RFCs) - */ - struct iattr size_attr = { - .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, - .ia_size = iap->ia_size, - }; - - host_err = -EFBIG; - if (iap->ia_size < 0) - goto out_unlock; - - host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); - if (host_err) - goto out_unlock; - iap->ia_valid &= ~ATTR_SIZE; - - /* - * Avoid the additional setattr call below if the only other - * attribute that the client sends is the mtime, as we update - * it as part of the size change above. - */ - if ((iap->ia_valid & ~ATTR_MTIME) == 0) - goto out_unlock; + for (retries = 1;;) { + host_err = __nfsd_setattr(dentry, iap); + if (host_err != -EAGAIN || !retries--) + break; + if (!nfsd_wait_for_delegreturn(rqstp, inode)) + break; } - - iap->ia_valid |= ATTR_CTIME; - host_err = notify_change(&init_user_ns, dentry, iap, NULL); - -out_unlock: if (attr->na_seclabel && attr->na_seclabel->len) attr->na_labelerr = security_inode_setsecctx(dentry, attr->na_seclabel->data, attr->na_seclabel->len); @@ -846,10 +869,14 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct svc_rqst *rqstp = sd->u.data; - - svc_rqst_replace_page(rqstp, buf->page); - if (rqstp->rq_res.page_len == 0) - rqstp->rq_res.page_base = buf->offset; + struct page *page = buf->page; // may be a compound one + unsigned offset = buf->offset; + + page += offset / PAGE_SIZE; + for (int i = sd->len; i > 0; i -= PAGE_SIZE) + svc_rqst_replace_page(rqstp, page++); + if (rqstp->rq_res.page_len == 0) // first call + rqstp->rq_res.page_base = offset % PAGE_SIZE; rqstp->rq_res.page_len += sd->len; return sd->len; } @@ -1252,7 +1279,7 @@ nfsd_check_ignore_resizing(struct iattr *iap) /* The parent directory should already be locked: */ __be32 nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, - char *fname, int flen, struct nfsd_attrs *attrs, + struct nfsd_attrs *attrs, int type, dev_t rdev, struct svc_fh *resfhp) { struct dentry *dentry, *dchild; @@ -1379,8 +1406,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) goto out_unlock; fh_fill_pre_attrs(fhp); - err = nfsd_create_locked(rqstp, fhp, fname, flen, attrs, type, - rdev, resfhp); + err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp); fh_fill_post_attrs(fhp); out_unlock: inode_unlock(dentry->d_inode); @@ -1670,7 +1696,15 @@ retry: .new_dir = tdir, .new_dentry = ndentry, }; - host_err = vfs_rename(&rd); + int retries; + + for (retries = 1;;) { + host_err = vfs_rename(&rd); + if (host_err != -EAGAIN || !retries--) + break; + if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry))) + break; + } if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) @@ -1754,9 +1788,18 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, fh_fill_pre_attrs(fhp); if (type != S_IFDIR) { + int retries; + if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) nfsd_close_cached_files(rdentry); - host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL); + + for (retries = 1;;) { + host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL); + if (host_err != -EAGAIN || !retries--) + break; + if (!nfsd_wait_for_delegreturn(rqstp, rinode)) + break; + } } else { host_err = vfs_rmdir(&init_user_ns, dirp, rdentry); } @@ -1811,7 +1854,7 @@ struct readdir_data { int full; }; -static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, +static bool nfsd_buffered_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -1823,7 +1866,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64)); if (buf->used + reclen > PAGE_SIZE) { buf->full = 1; - return -EINVAL; + return false; } de->namlen = namlen; @@ -1833,7 +1876,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, memcpy(de->name, name, namlen); buf->used += reclen; - return 0; + return true; } static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp, diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index c95cd414b4bb..120521bc7b24 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -79,8 +79,8 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, u64 count, bool sync); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *, - char *name, int len, struct nfsd_attrs *attrs, - int type, dev_t rdev, struct svc_fh *res); + struct nfsd_attrs *attrs, int type, dev_t rdev, + struct svc_fh *res); __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct nfsd_attrs *attrs, int type, dev_t rdev, struct svc_fh *res); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 96267258e629..0eb00105d845 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -717,13 +717,13 @@ struct nfsd4_compoundargs { struct svcxdr_tmpbuf *to_free; struct svc_rqst *rqstp; - u32 taglen; char * tag; + u32 taglen; u32 minorversion; + u32 client_opcnt; u32 opcnt; struct nfsd4_op *ops; struct nfsd4_op iops[8]; - int cachetype; }; struct nfsd4_compoundres { @@ -732,8 +732,8 @@ struct nfsd4_compoundres { struct svc_rqst * rqstp; __be32 *statusp; - u32 taglen; char * tag; + u32 taglen; u32 opcnt; struct nfsd4_compound_state cstate; @@ -888,7 +888,8 @@ struct nfsd4_operation { u32 op_flags; char *op_name; /* Try to get response size before operation */ - u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *); + u32 (*op_rsize_bop)(const struct svc_rqst *rqstp, + const struct nfsd4_op *op); void (*op_get_currentstateid)(struct nfsd4_compound_state *, union nfsd4_op_u *); void (*op_set_currentstateid)(struct nfsd4_compound_state *, diff --git a/fs/nsfs.c b/fs/nsfs.c index 800c1d0eb0d0..3506f6074288 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -28,7 +28,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = dentry->d_fsdata; - return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + return dynamic_dname(buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 5ae8de09b271..001f4e053c85 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2092,7 +2092,8 @@ get_ctx_vol_failed: // TODO: Initialize security. /* Get the extended system files' directory inode. */ vol->extend_ino = ntfs_iget(sb, FILE_Extend); - if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) { + if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) || + !S_ISDIR(vol->extend_ino->i_mode)) { if (!IS_ERR(vol->extend_ino)) iput(vol->extend_ino); ntfs_error(sb, "Failed to load $Extend."); diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index e8c00dda42ad..71f870d497ae 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -84,8 +84,8 @@ static inline bool attr_must_be_resident(struct ntfs_sb_info *sbi, /* * attr_load_runs - Load all runs stored in @attr. */ -int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni, - struct runs_tree *run, const CLST *vcn) +static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni, + struct runs_tree *run, const CLST *vcn) { int err; CLST svcn = le64_to_cpu(attr->nres.svcn); @@ -140,7 +140,10 @@ failed: } if (lcn != SPARSE_LCN) { - mark_as_free_ex(sbi, lcn, clen, trim); + if (sbi) { + /* mark bitmap range [lcn + clen) as free and trim clusters. */ + mark_as_free_ex(sbi, lcn, clen, trim); + } dn += clen; } @@ -173,7 +176,6 @@ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, { int err; CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0; - struct wnd_bitmap *wnd = &sbi->used.bitmap; size_t cnt = run->count; for (;;) { @@ -196,9 +198,7 @@ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, /* Add new fragment into run storage. */ if (!run_add_entry(run, vcn, lcn, flen, opt == ALLOCATE_MFT)) { /* Undo last 'ntfs_look_for_free_space' */ - down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); - wnd_set_free(wnd, lcn, flen); - up_write(&wnd->rw_lock); + mark_as_free_ex(sbi, lcn, len, false); err = -ENOMEM; goto out; } @@ -320,7 +320,7 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, err = ni_insert_nonresident(ni, attr_s->type, attr_name(attr_s), attr_s->name_len, run, 0, alen, - attr_s->flags, &attr, NULL); + attr_s->flags, &attr, NULL, NULL); if (err) goto out3; @@ -419,40 +419,44 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, struct mft_inode *mi, *mi_b; CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn; CLST next_svcn, pre_alloc = -1, done = 0; - bool is_ext; + bool is_ext, is_bad = false; u32 align; struct MFT_REC *rec; again: + alen = 0; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL, &mi_b); if (!attr_b) { err = -ENOENT; - goto out; + goto bad_inode; } if (!attr_b->non_res) { err = attr_set_size_res(ni, attr_b, le_b, mi_b, new_size, run, &attr_b); - if (err || !attr_b->non_res) - goto out; + if (err) + return err; + + /* Return if file is still resident. */ + if (!attr_b->non_res) + goto ok1; /* Layout of records may be changed, so do a full search. */ goto again; } is_ext = is_attr_ext(attr_b); - -again_1: align = sbi->cluster_size; - if (is_ext) align <<= attr_b->nres.c_unit; old_valid = le64_to_cpu(attr_b->nres.valid_size); old_size = le64_to_cpu(attr_b->nres.data_size); old_alloc = le64_to_cpu(attr_b->nres.alloc_size); + +again_1: old_alen = old_alloc >> cluster_bits; new_alloc = (new_size + align - 1) & ~(u64)(align - 1); @@ -475,24 +479,27 @@ again_1: mi = mi_b; } else if (!le_b) { err = -EINVAL; - goto out; + goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, &vcn, &mi); if (!attr) { err = -EINVAL; - goto out; + goto bad_inode; } next_le_1: svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); } - + /* + * Here we have: + * attr,mi,le - last attribute segment (containing 'vcn'). + * attr_b,mi_b,le_b - base (primary) attribute segment. + */ next_le: rec = mi->mrec; - err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; @@ -507,6 +514,13 @@ next_le: goto ok; } + /* + * Add clusters. In simple case we have to: + * - allocate space (vcn, lcn, len) + * - update packed run in 'mi' + * - update attr->nres.evcn + * - update attr_b->nres.data_size/attr_b->nres.alloc_size + */ to_allocate = new_alen - old_alen; add_alloc_in_same_attr_seg: lcn = 0; @@ -520,9 +534,11 @@ add_alloc_in_same_attr_seg: pre_alloc = 0; if (type == ATTR_DATA && !name_len && sbi->options->prealloc) { - CLST new_alen2 = bytes_to_cluster( - sbi, get_pre_allocated(new_size)); - pre_alloc = new_alen2 - new_alen; + pre_alloc = + bytes_to_cluster( + sbi, + get_pre_allocated(new_size)) - + new_alen; } /* Get the last LCN to allocate from. */ @@ -580,7 +596,7 @@ add_alloc_in_same_attr_seg: pack_runs: err = mi_pack_runs(mi, attr, run, vcn - svcn); if (err) - goto out; + goto undo_1; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; new_alloc_tmp = (u64)next_svcn << cluster_bits; @@ -614,7 +630,7 @@ pack_runs: if (type == ATTR_LIST) { err = ni_expand_list(ni); if (err) - goto out; + goto undo_2; if (next_svcn < vcn) goto pack_runs; @@ -624,8 +640,9 @@ pack_runs: if (!ni->attr_list.size) { err = ni_create_attr_list(ni); + /* In case of error layout of records is not changed. */ if (err) - goto out; + goto undo_2; /* Layout of records is changed. */ } @@ -637,48 +654,57 @@ pack_runs: /* Insert new attribute segment. */ err = ni_insert_nonresident(ni, type, name, name_len, run, next_svcn, vcn - next_svcn, - attr_b->flags, &attr, &mi); - if (err) - goto out; - - if (!is_mft) - run_truncate_head(run, evcn + 1); - - svcn = le64_to_cpu(attr->nres.svcn); - evcn = le64_to_cpu(attr->nres.evcn); + attr_b->flags, &attr, &mi, NULL); - le_b = NULL; /* * Layout of records maybe changed. * Find base attribute to update. */ + le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL, &mi_b); if (!attr_b) { - err = -ENOENT; - goto out; + err = -EINVAL; + goto bad_inode; } - attr_b->nres.alloc_size = cpu_to_le64((u64)vcn << cluster_bits); - attr_b->nres.data_size = attr_b->nres.alloc_size; - attr_b->nres.valid_size = attr_b->nres.alloc_size; + if (err) { + /* ni_insert_nonresident failed. */ + attr = NULL; + goto undo_2; + } + + if (!is_mft) + run_truncate_head(run, evcn + 1); + + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + + /* + * Attribute is in consistency state. + * Save this point to restore to if next steps fail. + */ + old_valid = old_size = old_alloc = (u64)vcn << cluster_bits; + attr_b->nres.valid_size = attr_b->nres.data_size = + attr_b->nres.alloc_size = cpu_to_le64(old_size); mi_b->dirty = true; goto again_1; } if (new_size != old_size || (new_alloc != old_alloc && !keep_prealloc)) { + /* + * Truncate clusters. In simple case we have to: + * - update packed run in 'mi' + * - update attr->nres.evcn + * - update attr_b->nres.data_size/attr_b->nres.alloc_size + * - mark and trim clusters as free (vcn, lcn, len) + */ + CLST dlen = 0; + vcn = max(svcn, new_alen); new_alloc_tmp = (u64)vcn << cluster_bits; - alen = 0; - err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &alen, - true); - if (err) - goto out; - - run_truncate(run, vcn); - if (vcn > svcn) { err = mi_pack_runs(mi, attr, run, vcn - svcn); if (err) @@ -697,7 +723,7 @@ pack_runs: if (!al_remove_le(ni, le)) { err = -EINVAL; - goto out; + goto bad_inode; } le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz); @@ -723,12 +749,20 @@ pack_runs: attr_b->nres.valid_size = attr_b->nres.alloc_size; } + mi_b->dirty = true; - if (is_ext) + err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &dlen, + true); + if (err) + goto out; + + if (is_ext) { + /* dlen - really deallocated clusters. */ le64_sub_cpu(&attr_b->nres.total_size, - ((u64)alen << cluster_bits)); + ((u64)dlen << cluster_bits)); + } - mi_b->dirty = true; + run_truncate(run, vcn); if (new_alloc_tmp <= new_alloc) goto ok; @@ -747,7 +781,7 @@ pack_runs: if (le->type != type || le->name_len != name_len || memcmp(le_name(le), name, name_len * sizeof(short))) { err = -EINVAL; - goto out; + goto bad_inode; } err = ni_load_mi(ni, le, &mi); @@ -757,7 +791,7 @@ pack_runs: attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); if (!attr) { err = -EINVAL; - goto out; + goto bad_inode; } goto next_le_1; } @@ -772,13 +806,13 @@ ok: } } -out: - if (!err && attr_b && ret) +ok1: + if (ret) *ret = attr_b; /* Update inode_set_bytes. */ - if (!err && ((type == ATTR_DATA && !name_len) || - (type == ATTR_ALLOC && name == I30_NAME))) { + if (((type == ATTR_DATA && !name_len) || + (type == ATTR_ALLOC && name == I30_NAME))) { bool dirty = false; if (ni->vfs_inode.i_size != new_size) { @@ -786,7 +820,7 @@ out: dirty = true; } - if (attr_b && attr_b->non_res) { + if (attr_b->non_res) { new_alloc = le64_to_cpu(attr_b->nres.alloc_size); if (inode_get_bytes(&ni->vfs_inode) != new_alloc) { inode_set_bytes(&ni->vfs_inode, new_alloc); @@ -800,6 +834,47 @@ out: } } + return 0; + +undo_2: + vcn -= alen; + attr_b->nres.data_size = cpu_to_le64(old_size); + attr_b->nres.valid_size = cpu_to_le64(old_valid); + attr_b->nres.alloc_size = cpu_to_le64(old_alloc); + + /* Restore 'attr' and 'mi'. */ + if (attr) + goto restore_run; + + if (le64_to_cpu(attr_b->nres.svcn) <= svcn && + svcn <= le64_to_cpu(attr_b->nres.evcn)) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto bad_inode; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, + &svcn, &mi); + if (!attr) + goto bad_inode; + } + +restore_run: + if (mi_pack_runs(mi, attr, run, evcn - svcn + 1)) + is_bad = true; + +undo_1: + run_deallocate_ex(sbi, run, vcn, alen, NULL, false); + + run_truncate(run, vcn); +out: + if (is_bad) { +bad_inode: + _ntfs_bad_inode(&ni->vfs_inode); + } return err; } @@ -855,7 +930,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, goto out; } - asize = le64_to_cpu(attr_b->nres.alloc_size) >> sbi->cluster_bits; + asize = le64_to_cpu(attr_b->nres.alloc_size) >> cluster_bits; if (vcn >= asize) { err = -EINVAL; goto out; @@ -1047,7 +1122,7 @@ ins_ext: if (evcn1 > next_svcn) { err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - next_svcn, - attr_b->flags, &attr, &mi); + attr_b->flags, &attr, &mi, NULL); if (err) goto out; } @@ -1173,7 +1248,7 @@ int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type, { struct ntfs_sb_info *sbi = ni->mi.sbi; u8 cluster_bits = sbi->cluster_bits; - CLST vcn = from >> cluster_bits; + CLST vcn; CLST vcn_last = (to - 1) >> cluster_bits; CLST lcn, clen; int err; @@ -1647,7 +1722,7 @@ ins_ext: if (evcn1 > next_svcn) { err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - next_svcn, - attr_b->flags, &attr, &mi); + attr_b->flags, &attr, &mi, NULL); if (err) goto out; } @@ -1812,18 +1887,12 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) err = ni_insert_nonresident( ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - eat - next_svcn, a_flags, &attr, - &mi); + &mi, &le); if (err) goto out; /* Layout of records maybe changed. */ attr_b = NULL; - le = al_find_ex(ni, NULL, ATTR_DATA, NULL, 0, - &next_svcn); - if (!le) { - err = -EINVAL; - goto out; - } } /* Free all allocated memory. */ @@ -1918,7 +1987,7 @@ next_attr: out: up_write(&ni->file.run_lock); if (err) - make_bad_inode(&ni->vfs_inode); + _ntfs_bad_inode(&ni->vfs_inode); return err; } @@ -1936,9 +2005,11 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; - CLST svcn, evcn1, vcn, len, end, alen, dealloc; + CLST svcn, evcn1, vcn, len, end, alen, hole, next_svcn; u64 total_size, alloc_size; u32 mask; + __le16 a_flags; + struct runs_tree run2; if (!bytes) return 0; @@ -1990,6 +2061,9 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) } down_write(&ni->file.run_lock); + run_init(&run2); + run_truncate(run, 0); + /* * Enumerate all attribute segments and punch hole where necessary. */ @@ -1997,10 +2071,11 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) vcn = vbo >> sbi->cluster_bits; len = bytes >> sbi->cluster_bits; end = vcn + len; - dealloc = 0; + hole = 0; svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + a_flags = attr_b->flags; if (svcn <= vcn && vcn < evcn1) { attr = attr_b; @@ -2008,14 +2083,14 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) mi = mi_b; } else if (!le_b) { err = -EINVAL; - goto out; + goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; - goto out; + goto bad_inode; } svcn = le64_to_cpu(attr->nres.svcn); @@ -2023,49 +2098,91 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) } while (svcn < end) { - CLST vcn1, zero, dealloc2; + CLST vcn1, zero, hole2 = hole; err = attr_load_runs(attr, ni, run, &svcn); if (err) - goto out; + goto done; vcn1 = max(vcn, svcn); zero = min(end, evcn1) - vcn1; - dealloc2 = dealloc; - err = run_deallocate_ex(sbi, run, vcn1, zero, &dealloc, true); + /* + * Check range [vcn1 + zero). + * Calculate how many clusters there are. + * Don't do any destructive actions. + */ + err = run_deallocate_ex(NULL, run, vcn1, zero, &hole2, false); if (err) - goto out; + goto done; - if (dealloc2 == dealloc) { - /* Looks like the required range is already sparsed. */ - } else { - if (!run_add_entry(run, vcn1, SPARSE_LCN, zero, - false)) { - err = -ENOMEM; - goto out; - } + /* Check if required range is already hole. */ + if (hole2 == hole) + goto next_attr; + + /* Make a clone of run to undo. */ + err = run_clone(run, &run2); + if (err) + goto done; + + /* Make a hole range (sparse) [vcn1 + zero). */ + if (!run_add_entry(run, vcn1, SPARSE_LCN, zero, false)) { + err = -ENOMEM; + goto done; + } - err = mi_pack_runs(mi, attr, run, evcn1 - svcn); + /* Update run in attribute segment. */ + err = mi_pack_runs(mi, attr, run, evcn1 - svcn); + if (err) + goto done; + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + if (next_svcn < evcn1) { + /* Insert new attribute segment. */ + err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, + next_svcn, + evcn1 - next_svcn, a_flags, + &attr, &mi, &le); if (err) - goto out; + goto undo_punch; + + /* Layout of records maybe changed. */ + attr_b = NULL; } + + /* Real deallocate. Should not fail. */ + run_deallocate_ex(sbi, &run2, vcn1, zero, &hole, true); + +next_attr: /* Free all allocated memory. */ run_truncate(run, 0); if (evcn1 >= alen) break; + /* Get next attribute segment. */ attr = ni_enum_attr_ex(ni, attr, &le, &mi); if (!attr) { err = -EINVAL; - goto out; + goto bad_inode; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } - total_size -= (u64)dealloc << sbi->cluster_bits; +done: + if (!hole) + goto out; + + if (!attr_b) { + attr_b = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, + &mi_b); + if (!attr_b) { + err = -EINVAL; + goto bad_inode; + } + } + + total_size -= (u64)hole << sbi->cluster_bits; attr_b->nres.total_size = cpu_to_le64(total_size); mi_b->dirty = true; @@ -2075,9 +2192,263 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) mark_inode_dirty(&ni->vfs_inode); out: + run_close(&run2); up_write(&ni->file.run_lock); + return err; + +bad_inode: + _ntfs_bad_inode(&ni->vfs_inode); + goto out; + +undo_punch: + /* + * Restore packed runs. + * 'mi_pack_runs' should not fail, cause we restore original. + */ + if (mi_pack_runs(mi, attr, &run2, evcn1 - svcn)) + goto bad_inode; + + goto done; +} + +/* + * attr_insert_range - Insert range (hole) in file. + * Not for normal files. + */ +int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST vcn, svcn, evcn1, len, next_svcn; + u64 data_size, alloc_size; + u32 mask; + __le16 a_flags; + + if (!bytes) + return 0; + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); + if (!attr_b) + return -ENOENT; + + if (!is_attr_ext(attr_b)) { + /* It was checked above. See fallocate. */ + return -EOPNOTSUPP; + } + + if (!attr_b->non_res) { + data_size = le32_to_cpu(attr_b->res.data_size); + alloc_size = data_size; + mask = sbi->cluster_mask; /* cluster_size - 1 */ + } else { + data_size = le64_to_cpu(attr_b->nres.data_size); + alloc_size = le64_to_cpu(attr_b->nres.alloc_size); + mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; + } + + if (vbo > data_size) { + /* Insert range after the file size is not allowed. */ + return -EINVAL; + } + + if ((vbo & mask) || (bytes & mask)) { + /* Allow to insert only frame aligned ranges. */ + return -EINVAL; + } + + /* + * valid_size <= data_size <= alloc_size + * Check alloc_size for maximum possible. + */ + if (bytes > sbi->maxbytes_sparse - alloc_size) + return -EFBIG; + + vcn = vbo >> sbi->cluster_bits; + len = bytes >> sbi->cluster_bits; + + down_write(&ni->file.run_lock); + + if (!attr_b->non_res) { + err = attr_set_size(ni, ATTR_DATA, NULL, 0, run, + data_size + bytes, NULL, false, NULL); + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, + &mi_b); + if (!attr_b) { + err = -EINVAL; + goto bad_inode; + } + + if (err) + goto out; + + if (!attr_b->non_res) { + /* Still resident. */ + char *data = Add2Ptr(attr_b, attr_b->res.data_off); + + memmove(data + bytes, data, bytes); + memset(data, 0, bytes); + goto done; + } + + /* Resident files becomes nonresident. */ + data_size = le64_to_cpu(attr_b->nres.data_size); + alloc_size = le64_to_cpu(attr_b->nres.alloc_size); + } + + /* + * Enumerate all attribute segments and shift start vcn. + */ + a_flags = attr_b->flags; + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + + if (svcn <= vcn && vcn < evcn1) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto bad_inode; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto bad_inode; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + run_truncate(run, 0); /* clear cached values. */ + err = attr_load_runs(attr, ni, run, NULL); + if (err) + goto out; + + if (!run_insert_range(run, vcn, len)) { + err = -ENOMEM; + goto out; + } + + /* Try to pack in current record as much as possible. */ + err = mi_pack_runs(mi, attr, run, evcn1 + len - svcn); if (err) - make_bad_inode(&ni->vfs_inode); + goto out; + + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + + while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) && + attr->type == ATTR_DATA && !attr->name_len) { + le64_add_cpu(&attr->nres.svcn, len); + le64_add_cpu(&attr->nres.evcn, len); + if (le) { + le->vcn = attr->nres.svcn; + ni->attr_list.dirty = true; + } + mi->dirty = true; + } + + if (next_svcn < evcn1 + len) { + err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, + next_svcn, evcn1 + len - next_svcn, + a_flags, NULL, NULL, NULL); + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, + &mi_b); + if (!attr_b) { + err = -EINVAL; + goto bad_inode; + } + + if (err) { + /* ni_insert_nonresident failed. Try to undo. */ + goto undo_insert_range; + } + } + + /* + * Update primary attribute segment. + */ + if (vbo <= ni->i_valid) + ni->i_valid += bytes; + + attr_b->nres.data_size = le64_to_cpu(data_size + bytes); + attr_b->nres.alloc_size = le64_to_cpu(alloc_size + bytes); + + /* ni->valid may be not equal valid_size (temporary). */ + if (ni->i_valid > data_size + bytes) + attr_b->nres.valid_size = attr_b->nres.data_size; + else + attr_b->nres.valid_size = cpu_to_le64(ni->i_valid); + mi_b->dirty = true; + +done: + ni->vfs_inode.i_size += bytes; + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + mark_inode_dirty(&ni->vfs_inode); + +out: + run_truncate(run, 0); /* clear cached values. */ + + up_write(&ni->file.run_lock); return err; + +bad_inode: + _ntfs_bad_inode(&ni->vfs_inode); + goto out; + +undo_insert_range: + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + + if (svcn <= vcn && vcn < evcn1) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + goto bad_inode; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + goto bad_inode; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + if (attr_load_runs(attr, ni, run, NULL)) + goto bad_inode; + + if (!run_collapse_range(run, vcn, len)) + goto bad_inode; + + if (mi_pack_runs(mi, attr, run, evcn1 + len - svcn)) + goto bad_inode; + + while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) && + attr->type == ATTR_DATA && !attr->name_len) { + le64_sub_cpu(&attr->nres.svcn, len); + le64_sub_cpu(&attr->nres.evcn, len); + if (le) { + le->vcn = attr->nres.svcn; + ni->attr_list.dirty = true; + } + mi->dirty = true; + } + + goto out; } diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index aa184407520f..5d44ceac855b 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -51,11 +51,6 @@ void ntfs3_exit_bitmap(void) kmem_cache_destroy(ntfs_enode_cachep); } -static inline u32 wnd_bits(const struct wnd_bitmap *wnd, size_t i) -{ - return i + 1 == wnd->nwnd ? wnd->bits_last : wnd->sb->s_blocksize * 8; -} - /* * wnd_scan * @@ -1333,9 +1328,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) if (!new_free) return -ENOMEM; - if (new_free != wnd->free_bits) - memcpy(new_free, wnd->free_bits, - wnd->nwnd * sizeof(short)); + memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short)); memset(new_free + wnd->nwnd, 0, (new_wnd - wnd->nwnd) * sizeof(short)); kfree(wnd->free_bits); @@ -1395,9 +1388,8 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) void wnd_zone_set(struct wnd_bitmap *wnd, size_t lcn, size_t len) { - size_t zlen; + size_t zlen = wnd->zone_end - wnd->zone_bit; - zlen = wnd->zone_end - wnd->zone_bit; if (zlen) wnd_add_free_ext(wnd, wnd->zone_bit, zlen, false); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 4a21745711fe..4f2ffc7ef296 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -530,21 +530,35 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size) static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) { struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni = ntfs_i(inode); loff_t end = vbo + len; loff_t vbo_down = round_down(vbo, PAGE_SIZE); - loff_t i_size; + bool is_supported_holes = is_sparsed(ni) || is_compressed(ni); + loff_t i_size, new_size; + bool map_locked; int err; /* No support for dir. */ if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - /* Return error if mode is not supported. */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE)) { + /* + * vfs_fallocate checks all possible combinations of mode. + * Do additional checks here before ntfs_set_state(dirty). + */ + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (!is_supported_holes) + return -EOPNOTSUPP; + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + } else if (mode & FALLOC_FL_INSERT_RANGE) { + if (!is_supported_holes) + return -EOPNOTSUPP; + } else if (mode & + ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)) { ntfs_inode_warn(inode, "fallocate(0x%x) is not supported", mode); return -EOPNOTSUPP; @@ -554,6 +568,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) inode_lock(inode); i_size = inode->i_size; + new_size = max(end, i_size); + map_locked = false; if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) { /* Should never be here, see ntfs_file_open. */ @@ -561,38 +577,27 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) goto out; } + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_INSERT_RANGE)) { + inode_dio_wait(inode); + filemap_invalidate_lock(mapping); + map_locked = true; + } + if (mode & FALLOC_FL_PUNCH_HOLE) { u32 frame_size; loff_t mask, vbo_a, end_a, tmp; - if (!(mode & FALLOC_FL_KEEP_SIZE)) { - err = -EINVAL; - goto out; - } - - err = filemap_write_and_wait_range(inode->i_mapping, vbo, - end - 1); + err = filemap_write_and_wait_range(mapping, vbo, end - 1); if (err) goto out; - err = filemap_write_and_wait_range(inode->i_mapping, end, - LLONG_MAX); + err = filemap_write_and_wait_range(mapping, end, LLONG_MAX); if (err) goto out; - inode_dio_wait(inode); - truncate_pagecache(inode, vbo_down); - if (!is_sparsed(ni) && !is_compressed(ni)) { - /* - * Normal file, can't make hole. - * TODO: Try to find way to save info about hole. - */ - err = -EOPNOTSUPP; - goto out; - } - ni_lock(ni); err = attr_punch_hole(ni, vbo, len, &frame_size); ni_unlock(ni); @@ -624,17 +629,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_unlock(ni); } } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - if (mode & ~FALLOC_FL_COLLAPSE_RANGE) { - err = -EINVAL; - goto out; - } - /* * Write tail of the last page before removed range since * it will get removed from the page cache below. */ - err = filemap_write_and_wait_range(inode->i_mapping, vbo_down, - vbo); + err = filemap_write_and_wait_range(mapping, vbo_down, vbo); if (err) goto out; @@ -642,34 +641,58 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) * Write data that will be shifted to preserve them * when discarding page cache below. */ - err = filemap_write_and_wait_range(inode->i_mapping, end, - LLONG_MAX); + err = filemap_write_and_wait_range(mapping, end, LLONG_MAX); if (err) goto out; - /* Wait for existing dio to complete. */ - inode_dio_wait(inode); - truncate_pagecache(inode, vbo_down); ni_lock(ni); err = attr_collapse_range(ni, vbo, len); ni_unlock(ni); + } else if (mode & FALLOC_FL_INSERT_RANGE) { + /* Check new size. */ + err = inode_newsize_ok(inode, new_size); + if (err) + goto out; + + /* Write out all dirty pages. */ + err = filemap_write_and_wait_range(mapping, vbo_down, + LLONG_MAX); + if (err) + goto out; + truncate_pagecache(inode, vbo_down); + + ni_lock(ni); + err = attr_insert_range(ni, vbo, len); + ni_unlock(ni); } else { - /* - * Normal file: Allocate clusters, do not change 'valid' size. - */ - loff_t new_size = max(end, i_size); + /* Check new size. */ + + /* generic/213: expected -ENOSPC instead of -EFBIG. */ + if (!is_supported_holes) { + loff_t to_alloc = new_size - inode_get_bytes(inode); + + if (to_alloc > 0 && + (to_alloc >> sbi->cluster_bits) > + wnd_zeroes(&sbi->used.bitmap)) { + err = -ENOSPC; + goto out; + } + } err = inode_newsize_ok(inode, new_size); if (err) goto out; + /* + * Allocate clusters, do not change 'valid' size. + */ err = ntfs_set_size(inode, new_size); if (err) goto out; - if (is_sparsed(ni) || is_compressed(ni)) { + if (is_supported_holes) { CLST vcn_v = ni->i_valid >> sbi->cluster_bits; CLST vcn = vbo >> sbi->cluster_bits; CLST cend = bytes_to_cluster(sbi, end); @@ -717,8 +740,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) } out: - if (err == -EFBIG) - err = -ENOSPC; + if (map_locked) + filemap_invalidate_unlock(mapping); if (!err) { inode->i_ctime = inode->i_mtime = current_time(inode); @@ -989,7 +1012,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) if (bytes > count) bytes = count; - frame = pos >> frame_bits; frame_vbo = pos & ~(frame_size - 1); index = frame_vbo >> PAGE_SHIFT; diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 18842998c8fa..381a38a06ec2 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -7,6 +7,7 @@ #include <linux/fiemap.h> #include <linux/fs.h> +#include <linux/minmax.h> #include <linux/vmalloc.h> #include "debug.h" @@ -468,7 +469,7 @@ ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi, &ref, &le); if (err) { /* No memory or no space. */ - return NULL; + return ERR_PTR(err); } le_added = true; @@ -649,6 +650,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) struct mft_inode *mi; u32 asize, free; struct MFT_REF ref; + struct MFT_REC *mrec; __le16 id; if (!ni->attr_list.dirty) @@ -692,11 +694,17 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) free -= asize; } + /* Make a copy of primary record to restore if error. */ + mrec = kmemdup(ni->mi.mrec, sbi->record_size, GFP_NOFS); + if (!mrec) + return 0; /* Not critical. */ + /* It seems that attribute list can be removed from primary record. */ mi_remove_attr(NULL, &ni->mi, attr_list); /* - * Repeat the cycle above and move all attributes to primary record. + * Repeat the cycle above and copy all attributes to primary record. + * Do not remove original attributes from subrecords! * It should be success! */ le = NULL; @@ -707,14 +715,14 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) mi = ni_find_mi(ni, ino_get(&le->ref)); if (!mi) { /* Should never happened, 'cause already checked. */ - goto bad; + goto out; } attr = mi_find_attr(mi, NULL, le->type, le_name(le), le->name_len, &le->id); if (!attr) { /* Should never happened, 'cause already checked. */ - goto bad; + goto out; } asize = le32_to_cpu(attr->size); @@ -724,18 +732,33 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) le16_to_cpu(attr->name_off)); if (!attr_ins) { /* - * Internal error. - * Either no space in primary record (already checked). - * Either tried to insert another - * non indexed attribute (logic error). + * No space in primary record (already checked). */ - goto bad; + goto out; } /* Copy all except id. */ id = attr_ins->id; memcpy(attr_ins, attr, asize); attr_ins->id = id; + } + + /* + * Repeat the cycle above and remove all attributes from subrecords. + */ + le = NULL; + while ((le = al_enumerate(ni, le))) { + if (!memcmp(&le->ref, &ref, sizeof(ref))) + continue; + + mi = ni_find_mi(ni, ino_get(&le->ref)); + if (!mi) + continue; + + attr = mi_find_attr(mi, NULL, le->type, le_name(le), + le->name_len, &le->id); + if (!attr) + continue; /* Remove from original record. */ mi_remove_attr(NULL, mi, attr); @@ -748,11 +771,13 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) ni->attr_list.le = NULL; ni->attr_list.dirty = false; + kfree(mrec); + return 0; +out: + /* Restore primary record. */ + swap(mrec, ni->mi.mrec); + kfree(mrec); return 0; -bad: - ntfs_inode_err(&ni->vfs_inode, "Internal error"); - make_bad_inode(&ni->vfs_inode); - return -EINVAL; } /* @@ -986,6 +1011,8 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, name_off, svcn, ins_le); if (!attr) continue; + if (IS_ERR(attr)) + return PTR_ERR(attr); if (ins_attr) *ins_attr = attr; @@ -1007,8 +1034,15 @@ insert_ext: attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, name_off, svcn, ins_le); - if (!attr) + if (!attr) { + err = -EINVAL; goto out2; + } + + if (IS_ERR(attr)) { + err = PTR_ERR(attr); + goto out2; + } if (ins_attr) *ins_attr = attr; @@ -1020,10 +1054,9 @@ insert_ext: out2: ni_remove_mi(ni, mi); mi_put(mi); - err = -EINVAL; out1: - ntfs_mark_rec_free(sbi, rno); + ntfs_mark_rec_free(sbi, rno, is_mft); out: return err; @@ -1076,6 +1109,11 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, if (asize <= free) { attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len, asize, name_off, svcn, ins_le); + if (IS_ERR(attr)) { + err = PTR_ERR(attr); + goto out; + } + if (attr) { if (ins_attr) *ins_attr = attr; @@ -1173,6 +1211,11 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, goto out; } + if (IS_ERR(attr)) { + err = PTR_ERR(attr); + goto out; + } + if (ins_attr) *ins_attr = attr; if (ins_mi) @@ -1218,7 +1261,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni) mft_min = mft_new; mi_min = mi_new; } else { - ntfs_mark_rec_free(sbi, mft_new); + ntfs_mark_rec_free(sbi, mft_new, true); mft_new = 0; ni_remove_mi(ni, mi_new); } @@ -1262,7 +1305,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni) done = asize - run_size - SIZEOF_NONRESIDENT; le32_sub_cpu(&ni->mi.mrec->used, done); - /* Estimate the size of second part: run_buf=NULL. */ + /* Estimate packed size (run_buf=NULL). */ err = run_pack(run, svcn, evcn + 1 - svcn, NULL, sbi->record_size, &plen); if (err < 0) @@ -1288,10 +1331,16 @@ static int ni_expand_mft_list(struct ntfs_inode *ni) goto out; } + if (IS_ERR(attr)) { + err = PTR_ERR(attr); + goto out; + } + attr->non_res = 1; attr->name_off = SIZEOF_NONRESIDENT_LE; attr->flags = 0; + /* This function can't fail - cause already checked above. */ run_pack(run, svcn, evcn + 1 - svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT), run_size, &plen); @@ -1301,7 +1350,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni) out: if (mft_new) { - ntfs_mark_rec_free(sbi, mft_new); + ntfs_mark_rec_free(sbi, mft_new, true); ni_remove_mi(ni, mi_new); } @@ -1367,8 +1416,6 @@ int ni_expand_list(struct ntfs_inode *ni) /* Split MFT data as much as possible. */ err = ni_expand_mft_list(ni); - if (err) - goto out; out: return !err && !done ? -EOPNOTSUPP : err; @@ -1381,7 +1428,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const struct runs_tree *run, CLST svcn, CLST len, __le16 flags, struct ATTRIB **new_attr, - struct mft_inode **mi) + struct mft_inode **mi, struct ATTR_LIST_ENTRY **le) { int err; CLST plen; @@ -1394,6 +1441,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, u32 run_size, asize; struct ntfs_sb_info *sbi = ni->mi.sbi; + /* Estimate packed size (run_buf=NULL). */ err = run_pack(run, svcn, len, NULL, sbi->max_bytes_per_attr - run_off, &plen); if (err < 0) @@ -1414,7 +1462,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, } err = ni_insert_attr(ni, type, name, name_len, asize, name_off, svcn, - &attr, mi, NULL); + &attr, mi, le); if (err) goto out; @@ -1423,12 +1471,12 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, attr->name_off = cpu_to_le16(name_off); attr->flags = flags; + /* This function can't fail - cause already checked above. */ run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size, &plen); attr->nres.svcn = cpu_to_le64(svcn); attr->nres.evcn = cpu_to_le64((u64)svcn + len - 1); - err = 0; if (new_attr) *new_attr = attr; @@ -1560,7 +1608,7 @@ int ni_delete_all(struct ntfs_inode *ni) mi->dirty = true; mi_write(mi, 0); - ntfs_mark_rec_free(sbi, mi->rno); + ntfs_mark_rec_free(sbi, mi->rno, false); ni_remove_mi(ni, mi); mi_put(mi); node = next; @@ -1571,7 +1619,7 @@ int ni_delete_all(struct ntfs_inode *ni) ni->mi.dirty = true; err = mi_write(&ni->mi, 0); - ntfs_mark_rec_free(sbi, ni->mi.rno); + ntfs_mark_rec_free(sbi, ni->mi.rno, false); return err; } @@ -1589,7 +1637,8 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, struct ATTRIB *attr = NULL; struct ATTR_FILE_NAME *fname; - *le = NULL; + if (le) + *le = NULL; /* Enumerate all names. */ next: @@ -1605,7 +1654,7 @@ next: goto next; if (!uni) - goto next; + return fname; if (uni->len != fname->name_len) goto next; @@ -2302,10 +2351,8 @@ remove_wof: out: kfree(pages); - if (err) { - make_bad_inode(inode); - ntfs_set_state(sbi, NTFS_DIRTY_ERROR); - } + if (err) + _ntfs_bad_inode(inode); return err; } @@ -2944,7 +2991,7 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, } /* - * ni_add_name - Add new name in MFT and in directory. + * ni_add_name - Add new name into MFT and into directory. */ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de) @@ -2953,13 +3000,20 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le; struct mft_inode *mi; + struct ATTR_FILE_NAME *fname; struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1); u16 de_key_size = le16_to_cpu(de->key_size); mi_get_ref(&ni->mi, &de->ref); mi_get_ref(&dir_ni->mi, &de_name->home); - /* Insert new name in MFT. */ + /* Fill duplicate from any ATTR_NAME. */ + fname = ni_fname_name(ni, NULL, NULL, NULL, NULL); + if (fname) + memcpy(&de_name->dup, &fname->dup, sizeof(fname->dup)); + de_name->dup.fa = ni->std_fa; + + /* Insert new name into MFT. */ err = ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, &attr, &mi, &le); if (err) @@ -2967,7 +3021,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size); - /* Insert new name in directory. */ + /* Insert new name into directory. */ err = indx_insert_entry(&dir_ni->dir, dir_ni, de, ni->mi.sbi, NULL, 0); if (err) ni_remove_attr_le(ni, attr, mi, le); @@ -2991,7 +3045,7 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, * 1) Add new name and remove old name. * 2) Remove old name and add new name. * - * In most cases (not all!) adding new name in MFT and in directory can + * In most cases (not all!) adding new name into MFT and into directory can * allocate additional cluster(s). * Second way may result to bad inode if we can't add new name * and then can't restore (add) old name. @@ -3261,7 +3315,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) err = err2; if (is_empty) { - ntfs_mark_rec_free(sbi, mi->rno); + ntfs_mark_rec_free(sbi, mi->rno, false); rb_erase(node, &ni->mi_tree); mi_put(mi); } diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 49b7df616778..e7c494005122 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -3843,6 +3843,8 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) memset(&rst_info2, 0, sizeof(struct restart_info)); err = log_read_rst(log, l_size, false, &rst_info2); + if (err) + goto out; /* Determine which restart area to use. */ if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn) @@ -5057,7 +5059,7 @@ undo_action_next: goto add_allocated_vcns; vcn = le64_to_cpu(lrh->target_vcn); - vcn &= ~(log->clst_per_page - 1); + vcn &= ~(u64)(log->clst_per_page - 1); add_allocated_vcns: for (i = 0, vcn = le64_to_cpu(lrh->target_vcn), diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 1835e35199c2..4ed15f64b17f 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -703,12 +703,14 @@ out: /* * ntfs_mark_rec_free - Mark record as free. + * is_mft - true if we are changing MFT */ -void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno) +void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft) { struct wnd_bitmap *wnd = &sbi->mft.bitmap; - down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT); + if (!is_mft) + down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT); if (rno >= wnd->nbits) goto out; @@ -727,7 +729,8 @@ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno) sbi->mft.next_free = rno; out: - up_write(&wnd->rw_lock); + if (!is_mft) + up_write(&wnd->rw_lock); } /* @@ -780,7 +783,7 @@ out: */ int ntfs_refresh_zone(struct ntfs_sb_info *sbi) { - CLST zone_limit, zone_max, lcn, vcn, len; + CLST lcn, vcn, len; size_t lcn_s, zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; struct ntfs_inode *ni = sbi->mft.ni; @@ -789,16 +792,6 @@ int ntfs_refresh_zone(struct ntfs_sb_info *sbi) if (wnd_zone_len(wnd)) return 0; - /* - * Compute the MFT zone at two steps. - * It would be nice if we are able to allocate 1/8 of - * total clusters for MFT but not more then 512 MB. - */ - zone_limit = (512 * 1024 * 1024) >> sbi->cluster_bits; - zone_max = wnd->nbits >> 3; - if (zone_max > zone_limit) - zone_max = zone_limit; - vcn = bytes_to_cluster(sbi, (u64)sbi->mft.bitmap.nbits << sbi->record_bits); @@ -812,13 +805,7 @@ int ntfs_refresh_zone(struct ntfs_sb_info *sbi) lcn_s = lcn + 1; /* Try to allocate clusters after last MFT run. */ - zlen = wnd_find(wnd, zone_max, lcn_s, 0, &lcn_s); - if (!zlen) { - ntfs_notice(sbi->sb, "MftZone: unavailable"); - return 0; - } - - /* Truncate too large zone. */ + zlen = wnd_find(wnd, sbi->zone_max, lcn_s, 0, &lcn_s); wnd_zone_set(wnd, lcn_s, zlen); return 0; @@ -827,16 +814,21 @@ int ntfs_refresh_zone(struct ntfs_sb_info *sbi) /* * ntfs_update_mftmirr - Update $MFTMirr data. */ -int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) +void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) { int err; struct super_block *sb = sbi->sb; - u32 blocksize = sb->s_blocksize; + u32 blocksize; sector_t block1, block2; u32 bytes; + if (!sb) + return; + + blocksize = sb->s_blocksize; + if (!(sbi->flags & NTFS_FLAGS_MFTMIRR)) - return 0; + return; err = 0; bytes = sbi->mft.recs_mirr << sbi->record_bits; @@ -847,16 +839,13 @@ int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) struct buffer_head *bh1, *bh2; bh1 = sb_bread(sb, block1++); - if (!bh1) { - err = -EIO; - goto out; - } + if (!bh1) + return; bh2 = sb_getblk(sb, block2++); if (!bh2) { put_bh(bh1); - err = -EIO; - goto out; + return; } if (buffer_locked(bh2)) @@ -876,13 +865,24 @@ int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) put_bh(bh2); if (err) - goto out; + return; } sbi->flags &= ~NTFS_FLAGS_MFTMIRR; +} -out: - return err; +/* + * ntfs_bad_inode + * + * Marks inode as bad and marks fs as 'dirty' + */ +void ntfs_bad_inode(struct inode *inode, const char *hint) +{ + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + + ntfs_inode_err(inode, "%s", hint); + make_bad_inode(inode); + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } /* @@ -1395,7 +1395,7 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, if (buffer_locked(bh)) __wait_on_buffer(bh); - lock_buffer(nb->bh[idx]); + lock_buffer(bh); bh_data = bh->b_data + off; end_data = Add2Ptr(bh_data, op); @@ -2424,7 +2424,7 @@ static inline void ntfs_unmap_and_discard(struct ntfs_sb_info *sbi, CLST lcn, void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) { - CLST end, i; + CLST end, i, zone_len, zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); @@ -2459,6 +2459,28 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) ntfs_unmap_and_discard(sbi, lcn, len); wnd_set_free(wnd, lcn, len); + /* append to MFT zone, if possible. */ + zone_len = wnd_zone_len(wnd); + zlen = min(zone_len + len, sbi->zone_max); + + if (zlen == zone_len) { + /* MFT zone already has maximum size. */ + } else if (!zone_len) { + /* Create MFT zone only if 'zlen' is large enough. */ + if (zlen == sbi->zone_max) + wnd_zone_set(wnd, lcn, zlen); + } else { + CLST zone_lcn = wnd_zone_bit(wnd); + + if (lcn + len == zone_lcn) { + /* Append into head MFT zone. */ + wnd_zone_set(wnd, lcn, zlen); + } else if (zone_lcn + zone_len == lcn) { + /* Append into tail MFT zone. */ + wnd_zone_set(wnd, zone_lcn, zlen); + } + } + out: up_write(&wnd->rw_lock); } diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 6f81e3a49abf..440328147e7e 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1042,19 +1042,16 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, { int err; struct NTFS_DE *e; - const struct INDEX_HDR *hdr; struct indx_node *node; if (!root) root = indx_get_root(&ni->dir, ni, NULL, NULL); if (!root) { - err = -EINVAL; - goto out; + /* Should not happen. */ + return -EINVAL; } - hdr = &root->ihdr; - /* Check cache. */ e = fnd->level ? fnd->de[fnd->level - 1] : fnd->root_de; if (e && !de_is_last(e) && @@ -1068,39 +1065,35 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, fnd_clear(fnd); /* Lookup entry that is <= to the search value. */ - e = hdr_find_e(indx, hdr, key, key_len, ctx, diff); + e = hdr_find_e(indx, &root->ihdr, key, key_len, ctx, diff); if (!e) return -EINVAL; fnd->root_de = e; - err = 0; for (;;) { node = NULL; - if (*diff >= 0 || !de_has_vcn_ex(e)) { - *entry = e; - goto out; - } + if (*diff >= 0 || !de_has_vcn_ex(e)) + break; /* Read next level. */ err = indx_read(indx, ni, de_get_vbn(e), &node); if (err) - goto out; + return err; /* Lookup entry that is <= to the search value. */ e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx, diff); if (!e) { - err = -EINVAL; put_indx_node(node); - goto out; + return -EINVAL; } fnd_push(fnd, node, e); } -out: - return err; + *entry = e; + return 0; } int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni, @@ -1354,7 +1347,7 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, goto out; err = ni_insert_nonresident(ni, ATTR_ALLOC, in->name, in->name_len, - &run, 0, len, 0, &alloc, NULL); + &run, 0, len, 0, &alloc, NULL, NULL); if (err) goto out1; @@ -1685,8 +1678,8 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, { int err; const struct NTFS_DE *sp; - struct NTFS_DE *e, *de_t, *up_e = NULL; - struct indx_node *n2 = NULL; + struct NTFS_DE *e, *de_t, *up_e; + struct indx_node *n2; struct indx_node *n1 = fnd->nodes[level]; struct INDEX_HDR *hdr1 = &n1->index->ihdr; struct INDEX_HDR *hdr2; @@ -1994,7 +1987,7 @@ static int indx_free_children(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *e, bool trim) { int err; - struct indx_node *n; + struct indx_node *n = NULL; struct INDEX_HDR *hdr; CLST vbn = de_get_vbn(e); size_t i; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 80104afeb2cd..26a76ebfe58f 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -430,6 +430,7 @@ end_enum: } else if (fname && fname->home.low == cpu_to_le32(MFT_REC_EXTEND) && fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) { /* Records in $Extend are not a files or general directories. */ + inode->i_op = &ntfs_file_inode_operations; } else { err = -EINVAL; goto out; @@ -500,7 +501,7 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, inode = ntfs_read_mft(inode, name, ref); else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) { /* Inode overlaps? */ - make_bad_inode(inode); + _ntfs_bad_inode(inode); } return inode; @@ -1632,7 +1633,7 @@ out4: ni->mi.dirty = false; discard_new_inode(inode); out3: - ntfs_mark_rec_free(sbi, ino); + ntfs_mark_rec_free(sbi, ino, false); out2: __putname(new_de); @@ -1655,7 +1656,6 @@ int ntfs_link_inode(struct inode *inode, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; struct NTFS_DE *de; - struct ATTR_FILE_NAME *de_name; /* Allocate PATH_MAX bytes. */ de = __getname(); @@ -1670,15 +1670,6 @@ int ntfs_link_inode(struct inode *inode, struct dentry *dentry) if (err) goto out; - de_name = (struct ATTR_FILE_NAME *)(de + 1); - /* Fill duplicate info. */ - de_name->dup.cr_time = de_name->dup.m_time = de_name->dup.c_time = - de_name->dup.a_time = kernel2nt(&inode->i_ctime); - de_name->dup.alloc_size = de_name->dup.data_size = - cpu_to_le64(inode->i_size); - de_name->dup.fa = ni->std_fa; - de_name->dup.ea_size = de_name->dup.reparse = 0; - err = ni_add_name(ntfs_i(d_inode(dentry->d_parent)), ni, de); out: __putname(de); @@ -1731,9 +1722,7 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry) if (inode->i_nlink) mark_inode_dirty(inode); } else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) { - make_bad_inode(inode); - ntfs_inode_err(inode, "failed to undo unlink"); - ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(inode); } else { if (ni_is_dirty(dir)) mark_inode_dirty(dir); @@ -1938,8 +1927,6 @@ const struct inode_operations ntfs_link_inode_operations = { .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, .permission = ntfs_permission, - .get_acl = ntfs_get_acl, - .set_acl = ntfs_set_acl, }; const struct address_space_operations ntfs_aops = { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index bc741213ad84..bc22cc321a74 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -208,7 +208,7 @@ static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, } /* - * ntfs_rmdir - inode_operations::rm_dir + * ntfs_rmdir - inode_operations::rmdir */ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -308,9 +308,7 @@ static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir, err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad); if (is_bad) { /* Restore after failed rename failed too. */ - make_bad_inode(inode); - ntfs_inode_err(inode, "failed to undo rename"); - ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + _ntfs_bad_inode(inode); } else if (!err) { inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(dir); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 8dbdca03e1af..2c791222c4e2 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -220,6 +220,7 @@ struct ntfs_sb_info { u32 flags; // See NTFS_FLAGS_XXX. + CLST zone_max; // Maximum MFT zone length in clusters CLST bad_clusters; // The count of marked bad clusters. u16 max_bytes_per_attr; // Maximum attribute size in record. @@ -408,8 +409,6 @@ enum REPARSE_SIGN { }; /* Functions from attrib.c */ -int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni, - struct runs_tree *run, const CLST *vcn); int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, @@ -440,6 +439,7 @@ int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, u64 new_valid); int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); +int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size); /* Functions from attrlist.c */ @@ -528,7 +528,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const struct runs_tree *run, CLST svcn, CLST len, __le16 flags, struct ATTRIB **new_attr, - struct mft_inode **mi); + struct mft_inode **mi, struct ATTR_LIST_ENTRY **le); int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct ATTRIB **new_attr, struct mft_inode **mi, @@ -589,10 +589,12 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, enum ALLOCATE_OPT opt); int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, struct ntfs_inode *ni, struct mft_inode **mi); -void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno); +void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft); int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to); int ntfs_refresh_zone(struct ntfs_sb_info *sbi); -int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait); +void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait); +void ntfs_bad_inode(struct inode *inode, const char *hint); +#define _ntfs_bad_inode(i) ntfs_bad_inode(i, __func__) enum NTFS_DIRTY_FLAGS { NTFS_DIRTY_CLEAR = 0, NTFS_DIRTY_DIRTY = 1, @@ -738,7 +740,6 @@ static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec, int mi_write(struct mft_inode *mi, int wait); int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, __le16 flags, bool is_mft); -void mi_mark_free(struct mft_inode *mi); struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, const __le16 *name, u8 name_len, u32 asize, u16 name_off); @@ -780,10 +781,10 @@ bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn, void run_truncate(struct runs_tree *run, CLST vcn); void run_truncate_head(struct runs_tree *run, CLST vcn); void run_truncate_around(struct runs_tree *run, CLST vcn); -bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *Index); bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len, bool is_mft); bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len); +bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len); bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn, CLST *lcn, CLST *len); bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn); @@ -802,6 +803,7 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, #define run_unpack_ex run_unpack #endif int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn); +int run_clone(const struct runs_tree *run, struct runs_tree *new_run); /* Globals from super.c */ void *ntfs_set_shared(void *ptr, u32 bytes); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 861e35791506..7d2fac5ee215 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -395,28 +395,6 @@ int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, } /* - * mi_mark_free - Mark record as unused and marks it as free in bitmap. - */ -void mi_mark_free(struct mft_inode *mi) -{ - CLST rno = mi->rno; - struct ntfs_sb_info *sbi = mi->sbi; - - if (rno >= MFT_REC_RESERVED && rno < MFT_REC_FREE) { - ntfs_clear_mft_tail(sbi, rno, rno + 1); - mi->dirty = false; - return; - } - - if (mi->mrec) { - clear_rec_inuse(mi->mrec); - mi->dirty = true; - mi_write(mi, 0); - } - ntfs_mark_rec_free(sbi, rno); -} - -/* * mi_insert_attr - Reserve space for new attribute. * * Return: Not full constructed attribute or NULL if not possible to create. @@ -445,12 +423,11 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, attr = NULL; while ((attr = mi_enum_attr(mi, attr))) { diff = compare_attr(attr, type, name, name_len, upcase); - if (diff > 0) - break; + if (diff < 0) continue; - if (!is_attr_indexed(attr)) + if (!diff && !is_attr_indexed(attr)) return NULL; break; } diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index a8fec651f973..aaaa0d3d35a2 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -31,7 +31,7 @@ struct ntfs_run { * Case of entry missing from list 'index' will be set to * point to insertion position for the entry question. */ -bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index) +static bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index) { size_t min_idx, max_idx, mid_idx; struct ntfs_run *r; @@ -547,6 +547,48 @@ bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len) return true; } +/* run_insert_range + * + * Helper for attr_insert_range(), + * which is helper for fallocate(insert_range). + */ +bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len) +{ + size_t index; + struct ntfs_run *r, *e; + + if (WARN_ON(!run_lookup(run, vcn, &index))) + return false; /* Should never be here. */ + + e = run->runs + run->count; + r = run->runs + index; + + if (vcn > r->vcn) + r += 1; + + for (; r < e; r++) + r->vcn += len; + + r = run->runs + index; + + if (vcn > r->vcn) { + /* split fragment. */ + CLST len1 = vcn - r->vcn; + CLST len2 = r->len - len1; + CLST lcn2 = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len1); + + r->len = len1; + + if (!run_add_entry(run, vcn + len, lcn2, len2, false)) + return false; + } + + if (!run_add_entry(run, vcn, SPARSE_LCN, len, false)) + return false; + + return true; +} + /* * run_get_entry - Return index-th mapped region. */ @@ -778,26 +820,36 @@ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, CLST next_vcn, vcn, lcn; CLST prev_lcn = 0; CLST evcn1 = svcn + len; + const struct ntfs_run *r, *r_end; int packed_size = 0; size_t i; - bool ok; s64 dlcn; int offset_size, size_size, tmp; - next_vcn = vcn = svcn; - *packed_vcns = 0; if (!len) goto out; - ok = run_lookup_entry(run, vcn, &lcn, &len, &i); + /* Check all required entries [svcn, encv1) available. */ + if (!run_lookup(run, svcn, &i)) + return -ENOENT; + + r_end = run->runs + run->count; + r = run->runs + i; - if (!ok) - goto error; + for (next_vcn = r->vcn + r->len; next_vcn < evcn1; + next_vcn = r->vcn + r->len) { + if (++r >= r_end || r->vcn != next_vcn) + return -ENOENT; + } - if (next_vcn != vcn) - goto error; + /* Repeat cycle above and pack runs. Assume no errors. */ + r = run->runs + i; + len = svcn - r->vcn; + vcn = svcn; + lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len); + len = r->len - len; for (;;) { next_vcn = vcn + len; @@ -846,12 +898,10 @@ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, if (packed_size + 1 >= run_buf_size || next_vcn >= evcn1) goto out; - ok = run_get_entry(run, ++i, &vcn, &lcn, &len); - if (!ok) - goto error; - - if (next_vcn != vcn) - goto error; + r += 1; + vcn = r->vcn; + lcn = r->lcn; + len = r->len; } out: @@ -860,9 +910,6 @@ out: run_buf[0] = 0; return packed_size + 1; - -error: - return -EOPNOTSUPP; } /* @@ -1109,3 +1156,28 @@ int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn) *highest_vcn = vcn64 - 1; return 0; } + +/* + * run_clone + * + * Make a copy of run + */ +int run_clone(const struct runs_tree *run, struct runs_tree *new_run) +{ + size_t bytes = run->count * sizeof(struct ntfs_run); + + if (bytes > new_run->allocated) { + struct ntfs_run *new_ptr = kvmalloc(bytes, GFP_KERNEL); + + if (!new_ptr) + return -ENOMEM; + + kvfree(new_run->runs); + new_run->runs = new_ptr; + new_run->allocated = bytes; + } + + memcpy(new_run->runs, run->runs, bytes); + new_run->count = run->count; + return 0; +} diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 0c6de6287737..47012c9bf505 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -30,6 +30,7 @@ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/log2.h> +#include <linux/minmax.h> #include <linux/module.h> #include <linux/nls.h> #include <linux/seq_file.h> @@ -390,7 +391,7 @@ static int ntfs_fs_reconfigure(struct fs_context *fc) return -EINVAL; } - memcpy(sbi->options, new_opts, sizeof(*new_opts)); + swap(sbi->options, fc->fs_private); return 0; } @@ -870,6 +871,13 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; #endif + /* + * Compute the MFT zone at two steps. + * It would be nice if we are able to allocate 1/8 of + * total clusters for MFT but not more then 512 MB. + */ + sbi->zone_max = min_t(CLST, 0x20000000 >> sbi->cluster_bits, clusters >> 3); + err = 0; out: @@ -900,6 +908,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.high = 0; sbi->sb = sb; + sbi->options = fc->fs_private; + fc->fs_private = NULL; sb->s_flags |= SB_NODIRATIME; sb->s_magic = 0x7366746e; // "ntfs" sb->s_op = &ntfs_sops; @@ -1262,8 +1272,6 @@ load_root: goto put_inode_out; } - fc->fs_private = NULL; - return 0; put_inode_out: @@ -1378,7 +1386,7 @@ static const struct fs_context_operations ntfs_context_ops = { /* * ntfs_init_fs_context - Initialize spi and opts * - * This will called when mount/remount. We will first initiliaze + * This will called when mount/remount. We will first initialize * options so that if remount we can use just that. */ static int ntfs_init_fs_context(struct fs_context *fc) @@ -1416,7 +1424,6 @@ static int ntfs_init_fs_context(struct fs_context *fc) mutex_init(&sbi->compress.mtx_lzx); #endif - sbi->options = opts; fc->s_fs_info = sbi; ok: fc->fs_private = opts; diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 5e0e0280e70d..7de8718c68a9 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -118,7 +118,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, run_init(&run); - err = attr_load_runs(attr_ea, ni, &run, NULL); + err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &run, 0, size); if (!err) err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL); run_close(&run); @@ -444,6 +444,11 @@ update_ea: /* Delete xattr, ATTR_EA */ ni_remove_attr_le(ni, attr, mi, le); } else if (attr->non_res) { + err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &ea_run, 0, + size); + if (err) + goto out; + err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0); if (err) goto out; @@ -478,8 +483,7 @@ out: } #ifdef CONFIG_NTFS3_FS_POSIX_ACL -static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns, - struct inode *inode, int type, +static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type, int locked) { struct ntfs_inode *ni = ntfs_i(inode); @@ -514,7 +518,7 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns, /* Translate extended attribute to acl. */ if (err >= 0) { - acl = posix_acl_from_xattr(mnt_userns, buf, err); + acl = posix_acl_from_xattr(&init_user_ns, buf, err); } else if (err == -ENODATA) { acl = NULL; } else { @@ -537,8 +541,7 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) if (rcu) return ERR_PTR(-ECHILD); - /* TODO: init_user_ns? */ - return ntfs_get_acl_ex(&init_user_ns, inode, type, 0); + return ntfs_get_acl_ex(inode, type, 0); } static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, @@ -547,28 +550,23 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, { const char *name; size_t size, name_len; - void *value = NULL; - int err = 0; + void *value; + int err; int flags; + umode_t mode; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; + mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: /* Do not change i_mode if we are in init_acl */ if (acl && !init_acl) { - umode_t mode; - err = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); if (err) - goto out; - - if (inode->i_mode != mode) { - inode->i_mode = mode; - mark_inode_dirty(inode); - } + return err; } name = XATTR_NAME_POSIX_ACL_ACCESS; name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; @@ -595,7 +593,7 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, value = kmalloc(size, GFP_NOFS); if (!value) return -ENOMEM; - err = posix_acl_to_xattr(mnt_userns, acl, value, size); + err = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (err < 0) goto out; flags = 0; @@ -604,8 +602,13 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); if (err == -ENODATA && !size) err = 0; /* Removing non existed xattr. */ - if (!err) + if (!err) { set_cached_acl(inode, type, acl); + if (inode->i_mode != mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); + } + } out: kfree(value); @@ -622,67 +625,6 @@ int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false); } -static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, void *buffer, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - acl = ntfs_get_acl(inode, type, false); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) - return -ENODATA; - - err = posix_acl_to_xattr(mnt_userns, acl, buffer, size); - posix_acl_release(acl); - - return err; -} - -static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, const void *value, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EPERM; - - if (!value) { - acl = NULL; - } else { - acl = posix_acl_from_xattr(mnt_userns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - err = posix_acl_valid(mnt_userns, acl); - if (err) - goto release_and_out; - } - } - - err = ntfs_set_acl(mnt_userns, inode, acl, type); - -release_and_out: - posix_acl_release(acl); - return err; -} - /* * ntfs_init_acl - Initialize the ACLs of a new inode. * @@ -706,13 +648,13 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, inode->i_default_acl = NULL; } - if (!acl) - inode->i_acl = NULL; - else { + if (acl) { if (!err) err = ntfs_set_acl_ex(mnt_userns, inode, acl, ACL_TYPE_ACCESS, true); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } return err; @@ -849,23 +791,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - /* TODO: init_user_ns? */ - err = ntfs_xattr_get_acl( - &init_user_ns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - buffer, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); @@ -978,22 +903,6 @@ set_new_fa: goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - err = ntfs_xattr_set_acl( - mnt_userns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - value, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); @@ -1083,7 +992,7 @@ static bool ntfs_xattr_user_list(struct dentry *dentry) } // clang-format off -static const struct xattr_handler ntfs_xattr_handler = { +static const struct xattr_handler ntfs_other_xattr_handler = { .prefix = "", .get = ntfs_getxattr, .set = ntfs_setxattr, @@ -1091,7 +1000,11 @@ static const struct xattr_handler ntfs_xattr_handler = { }; const struct xattr_handler *ntfs_xattr_handlers[] = { - &ntfs_xattr_handler, +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &ntfs_other_xattr_handler, NULL, }; // clang-format on diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 81c3d65d68fe..694471fc46b8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2032,7 +2032,7 @@ struct ocfs2_empty_dir_priv { unsigned seen_other; unsigned dx_dir; }; -static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, +static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, int name_len, loff_t pos, u64 ino, unsigned type) { @@ -2052,7 +2052,7 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, */ if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) { p->seen_dot = 1; - return 0; + return true; } if (name_len == 2 && !strncmp("..", name, 2) && @@ -2060,13 +2060,13 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, p->seen_dot_dot = 1; if (p->dx_dir && p->seen_dot) - return 1; + return false; - return 0; + return true; } p->seen_other = 1; - return 1; + return false; } static int ocfs2_empty_dir_dx(struct inode *inode, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 801e60bab955..c28bc983a7b1 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3403,10 +3403,12 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb, ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres); - ocfs2_cluster_disconnect(osb->cconn, hangup_pending); - osb->cconn = NULL; + if (osb->cconn) { + ocfs2_cluster_disconnect(osb->cconn, hangup_pending); + osb->cconn = NULL; - ocfs2_dlm_shutdown_debug(osb); + ocfs2_dlm_shutdown_debug(osb); + } } static int ocfs2_drop_lock(struct ocfs2_super *osb, diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index fa87d89cf754..126671e6caed 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2057,7 +2057,7 @@ struct ocfs2_orphan_filldir_priv { enum ocfs2_orphan_reco_type orphan_reco_type; }; -static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, +static bool ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, int name_len, loff_t pos, u64 ino, unsigned type) { @@ -2066,21 +2066,21 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, struct inode *iter; if (name_len == 1 && !strncmp(".", name, 1)) - return 0; + return true; if (name_len == 2 && !strncmp("..", name, 2)) - return 0; + return true; /* do not include dio entry in case of orphan scan */ if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) && (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, OCFS2_DIO_ORPHAN_PREFIX_LEN))) - return 0; + return true; /* Skip bad inodes so that recovery can continue */ iter = ocfs2_iget(p->osb, ino, OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); if (IS_ERR(iter)) - return 0; + return true; if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, OCFS2_DIO_ORPHAN_PREFIX_LEN)) @@ -2090,7 +2090,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, * happen concurrently with unlink/rename */ if (OCFS2_I(iter)->ip_next_orphan) { iput(iter); - return 0; + return true; } trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); @@ -2099,7 +2099,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, OCFS2_I(iter)->ip_next_orphan = p->head; p->head = iter; - return 0; + return true; } static int ocfs2_queue_orphans(struct ocfs2_super *osb, diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index a75e2b7d67f5..64e6ddcfe329 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -991,7 +991,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) lc->oc_type = NO_CONTROLD; rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, - DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN, + DLM_LSFL_NEWEXCL, DLM_LVB_LEN, &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); if (rc) { if (rc == -EEXIST || rc == -EPROTO) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 013a727bd7c8..e2cc9eec287c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1914,8 +1914,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) !ocfs2_is_hard_readonly(osb)) hangup_needed = 1; - if (osb->cconn) - ocfs2_dlm_shutdown(osb, hangup_needed); + ocfs2_dlm_shutdown(osb, hangup_needed); ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); debugfs_remove_recursive(osb->osb_debug_root); diff --git a/fs/open.c b/fs/open.c index 8a813fa5ca56..a81319b6177f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -716,6 +716,8 @@ int chown_common(const struct path *path, uid_t user, gid_t group) fs_userns = i_user_ns(inode); retry_deleg: + newattrs.ia_vfsuid = INVALID_VFSUID; + newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) return -EINVAL; @@ -840,7 +842,9 @@ static int do_dentry_open(struct file *f, return 0; } - if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { + i_readcount_inc(inode); + } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) goto cleanup_file; @@ -880,8 +884,6 @@ static int do_dentry_open(struct file *f, goto cleanup_all; } f->f_mode |= FMODE_OPENED; - if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_inc(inode); if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; @@ -935,10 +937,7 @@ cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); - if (f->f_mode & FMODE_WRITER) { - put_write_access(inode); - __mnt_drop_write(f->f_path.mnt); - } + put_file_access(f); cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 86810e5d7914..732661aa2680 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -417,9 +417,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file) * readahead cache (if any); this forces an expensive refresh of * data for the next caller of mmap (or 'get_block' accesses) */ - if (file_inode(file) && - file_inode(file)->i_mapping && - mapping_nrpages(&file_inode(file)->i_data)) { + if (mapping_nrpages(file->f_mapping)) { if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { gossip_debug(GOSSIP_INODE_DEBUG, "calling flush_racache on %pU\n", diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 19bd9af8b8a8..9e61511de7a7 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -460,9 +460,12 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) * of the POSIX ACLs retrieved from the lower layer to this function to not * alter the POSIX ACLs for the underlying filesystem. */ -static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns, +static void ovl_idmap_posix_acl(struct inode *realinode, + struct user_namespace *mnt_userns, struct posix_acl *acl) { + struct user_namespace *fs_userns = i_user_ns(realinode); + for (unsigned int i = 0; i < acl->a_count; i++) { vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -470,11 +473,11 @@ static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns, struct posix_acl_entry *e = &acl->a_entries[i]; switch (e->e_tag) { case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, &init_user_ns, e->e_uid); + vfsuid = make_vfsuid(mnt_userns, fs_userns, e->e_uid); e->e_uid = vfsuid_into_kuid(vfsuid); break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, &init_user_ns, e->e_gid); + vfsgid = make_vfsgid(mnt_userns, fs_userns, e->e_gid); e->e_gid = vfsgid_into_kgid(vfsgid); break; } @@ -536,7 +539,7 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) if (!clone) clone = ERR_PTR(-ENOMEM); else - ovl_idmap_posix_acl(mnt_user_ns(realpath.mnt), clone); + ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone); /* * Since we're not in RCU path walk we always need to release the * original ACLs. diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 1ac96eaeda54..59624521eeb2 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -250,7 +250,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, size_t size, int flags) { int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, - (void *)value, size, flags); + value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", dentry, name, min((int)size, 48), value, size, flags, err); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 268ed415577a..2b210640036c 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -170,7 +170,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, return p; } -static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, +static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { @@ -179,22 +179,22 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, struct ovl_cache_entry *p; if (ovl_cache_entry_find_link(name, len, &newp, &parent)) - return 0; + return true; p = ovl_cache_entry_new(rdd, name, len, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; - return -ENOMEM; + return false; } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, rdd->root); - return 0; + return true; } -static int ovl_fill_lowest(struct ovl_readdir_data *rdd, +static bool ovl_fill_lowest(struct ovl_readdir_data *rdd, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -211,7 +211,7 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd, list_add_tail(&p->l_node, &rdd->middle); } - return rdd->err; + return rdd->err == 0; } void ovl_cache_free(struct list_head *list) @@ -250,7 +250,7 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) } } -static int ovl_fill_merge(struct dir_context *ctx, const char *name, +static bool ovl_fill_merge(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -528,7 +528,7 @@ fail: goto out; } -static int ovl_fill_plain(struct dir_context *ctx, const char *name, +static bool ovl_fill_plain(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -540,11 +540,11 @@ static int ovl_fill_plain(struct dir_context *ctx, const char *name, p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; - return -ENOMEM; + return false; } list_add_tail(&p->l_node, rdd->list); - return 0; + return true; } static int ovl_dir_read_impure(const struct path *path, struct list_head *list, @@ -648,7 +648,7 @@ struct ovl_readdir_translate { bool xinowarn; }; -static int ovl_fill_real(struct dir_context *ctx, const char *name, +static bool ovl_fill_real(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -1027,7 +1027,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, inode_unlock(upper->d_inode); } -static int ovl_check_d_type(struct dir_context *ctx, const char *name, +static bool ovl_check_d_type(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -1036,12 +1036,12 @@ static int ovl_check_d_type(struct dir_context *ctx, const char *name, /* Even if d_type is not supported, DT_DIR is returned for . and .. */ if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen)) - return 0; + return true; if (d_type != DT_UNKNOWN) rdd->d_type_supported = true; - return 0; + return true; } /* diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d51a02d555e0..9ca98bea8e18 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1022,7 +1022,20 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, /* Check that everything is OK before copy-up */ if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); + /* The above comment can be understood in two ways: + * + * 1. We just want to check whether the basic POSIX ACL format + * is ok. For example, if the header is correct and the size + * is sane. + * 2. We want to know whether the ACL_{GROUP,USER} entries can + * be mapped according to the underlying filesystem. + * + * Currently, we only check 1. If we wanted to check 2. we + * would need to pass the mnt_userns and the fs_userns of the + * underlying filesystem. But frankly, I think checking 1. is + * enough to start the copy-up. + */ + acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); } diff --git a/fs/pipe.c b/fs/pipe.c index 74ae9fafd25a..42c7ff41c2db 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -860,7 +860,7 @@ static struct vfsmount *pipe_mnt __read_mostly; */ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", + return dynamic_dname(buffer, buflen, "pipe:[%lu]", d_inode(dentry)->i_ino); } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 1d17d7b13dcd..b4f109875e79 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -361,6 +361,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, const struct posix_acl *acl, int want) { const struct posix_acl_entry *pa, *pe, *mask_obj; + struct user_namespace *fs_userns = i_user_ns(inode); int found = 0; vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -376,7 +377,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, goto check_perm; break; case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, &init_user_ns, + vfsuid = make_vfsuid(mnt_userns, fs_userns, pa->e_uid); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; @@ -390,7 +391,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, } break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, &init_user_ns, + vfsgid = make_vfsgid(mnt_userns, fs_userns, pa->e_gid); if (vfsgid_in_group_p(vfsgid)) { found = 1; @@ -709,9 +710,9 @@ EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. */ -static int posix_acl_fix_xattr_common(void *value, size_t size) +static int posix_acl_fix_xattr_common(const void *value, size_t size) { - struct posix_acl_xattr_header *header = value; + const struct posix_acl_xattr_header *header = value; int count; if (!header) @@ -719,13 +720,13 @@ static int posix_acl_fix_xattr_common(void *value, size_t size) if (size < sizeof(struct posix_acl_xattr_header)) return -EINVAL; if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return -EINVAL; + return -EOPNOTSUPP; count = posix_acl_xattr_count(size); if (count < 0) return -EINVAL; if (count == 0) - return -EINVAL; + return 0; return count; } @@ -736,6 +737,7 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + struct user_namespace *fs_userns = i_user_ns(inode); int count; vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -746,20 +748,20 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, return; count = posix_acl_fix_xattr_common(value, size); - if (count < 0) + if (count <= 0) return; for (end = entry + count; entry != end; entry++) { switch (le16_to_cpu(entry->e_tag)) { case ACL_USER: uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid); + vfsuid = make_vfsuid(mnt_userns, fs_userns, uid); entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid))); break; case ACL_GROUP: gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid); + vfsgid = make_vfsgid(mnt_userns, fs_userns, gid); entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid))); break; @@ -769,45 +771,6 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, } } -void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, - const struct inode *inode, - void *value, size_t size) -{ - struct posix_acl_xattr_header *header = value; - struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; - int count; - vfsuid_t vfsuid; - vfsgid_t vfsgid; - kuid_t uid; - kgid_t gid; - - if (no_idmapping(mnt_userns, i_user_ns(inode))) - return; - - count = posix_acl_fix_xattr_common(value, size); - if (count < 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch (le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsuid = VFSUIDT_INIT(uid); - uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid); - entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid)); - break; - case ACL_GROUP: - gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsgid = VFSGIDT_INIT(gid); - gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid); - entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid)); - break; - default: - break; - } - } -} - static void posix_acl_fix_xattr_userns( struct user_namespace *to, struct user_namespace *from, void *value, size_t size) @@ -819,7 +782,7 @@ static void posix_acl_fix_xattr_userns( kgid_t gid; count = posix_acl_fix_xattr_common(value, size); - if (count < 0) + if (count <= 0) return; for (end = entry + count; entry != end; entry++) { @@ -854,12 +817,32 @@ void posix_acl_fix_xattr_to_user(void *value, size_t size) posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); } -/* - * Convert from extended attribute to in-memory representation. +/** + * make_posix_acl - convert POSIX ACLs from uapi to VFS format using the + * provided callbacks to map ACL_{GROUP,USER} entries into the + * appropriate format + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @value: the uapi representation of POSIX ACLs + * @size: the size of @void + * @uid_cb: callback to use for mapping the uid stored in ACL_USER entries + * @gid_cb: callback to use for mapping the gid stored in ACL_GROUP entries + * + * The make_posix_acl() helper is an abstraction to translate from uapi format + * into the VFS format allowing the caller to specific callbacks to map + * ACL_{GROUP,USER} entries into the expected format. This is used in + * posix_acl_from_xattr() and vfs_set_acl_prepare() and avoids pointless code + * duplication. + * + * Return: Allocated struct posix_acl on success, NULL for a valid header but + * without actual POSIX ACL entries, or ERR_PTR() encoded error code. */ -struct posix_acl * -posix_acl_from_xattr(struct user_namespace *user_ns, - const void *value, size_t size) +static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, const void *value, size_t size, + kuid_t (*uid_cb)(struct user_namespace *, struct user_namespace *, + const struct posix_acl_xattr_entry *), + kgid_t (*gid_cb)(struct user_namespace *, struct user_namespace *, + const struct posix_acl_xattr_entry *)) { const struct posix_acl_xattr_header *header = value; const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end; @@ -867,16 +850,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns, struct posix_acl *acl; struct posix_acl_entry *acl_e; - if (!value) - return NULL; - if (size < sizeof(struct posix_acl_xattr_header)) - return ERR_PTR(-EINVAL); - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return ERR_PTR(-EOPNOTSUPP); - - count = posix_acl_xattr_count(size); + count = posix_acl_fix_xattr_common(value, size); if (count < 0) - return ERR_PTR(-EINVAL); + return ERR_PTR(count); if (count == 0) return NULL; @@ -897,16 +873,12 @@ posix_acl_from_xattr(struct user_namespace *user_ns, break; case ACL_USER: - acl_e->e_uid = - make_kuid(user_ns, - le32_to_cpu(entry->e_id)); + acl_e->e_uid = uid_cb(mnt_userns, fs_userns, entry); if (!uid_valid(acl_e->e_uid)) goto fail; break; case ACL_GROUP: - acl_e->e_gid = - make_kgid(user_ns, - le32_to_cpu(entry->e_id)); + acl_e->e_gid = gid_cb(mnt_userns, fs_userns, entry); if (!gid_valid(acl_e->e_gid)) goto fail; break; @@ -921,6 +893,181 @@ fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } + +/** + * vfs_set_acl_prepare_kuid - map ACL_USER uid according to mount- and + * filesystem idmapping + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @e: a ACL_USER entry in POSIX ACL uapi format + * + * The uid stored as ACL_USER entry in @e is a kuid_t stored as a raw {g,u}id + * value. The vfs_set_acl_prepare_kuid() will recover the kuid_t through + * KUIDT_INIT() and then map it according to the idmapped mount. The resulting + * kuid_t is the value which the filesystem can map up into a raw backing store + * id in the filesystem's idmapping. + * + * This is used in vfs_set_acl_prepare() to generate the proper VFS + * representation of POSIX ACLs with ACL_USER entries during setxattr(). + * + * Return: A kuid in @fs_userns for the uid stored in @e. + */ +static inline kuid_t +vfs_set_acl_prepare_kuid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const struct posix_acl_xattr_entry *e) +{ + kuid_t kuid = KUIDT_INIT(le32_to_cpu(e->e_id)); + return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid)); +} + +/** + * vfs_set_acl_prepare_kgid - map ACL_GROUP gid according to mount- and + * filesystem idmapping + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @e: a ACL_GROUP entry in POSIX ACL uapi format + * + * The gid stored as ACL_GROUP entry in @e is a kgid_t stored as a raw {g,u}id + * value. The vfs_set_acl_prepare_kgid() will recover the kgid_t through + * KGIDT_INIT() and then map it according to the idmapped mount. The resulting + * kgid_t is the value which the filesystem can map up into a raw backing store + * id in the filesystem's idmapping. + * + * This is used in vfs_set_acl_prepare() to generate the proper VFS + * representation of POSIX ACLs with ACL_GROUP entries during setxattr(). + * + * Return: A kgid in @fs_userns for the gid stored in @e. + */ +static inline kgid_t +vfs_set_acl_prepare_kgid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const struct posix_acl_xattr_entry *e) +{ + kgid_t kgid = KGIDT_INIT(le32_to_cpu(e->e_id)); + return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid)); +} + +/** + * vfs_set_acl_prepare - convert POSIX ACLs from uapi to VFS format taking + * mount and filesystem idmappings into account + * @mnt_userns: the mount's idmapping + * @fs_userns: the filesystem's idmapping + * @value: the uapi representation of POSIX ACLs + * @size: the size of @void + * + * When setting POSIX ACLs with ACL_{GROUP,USER} entries they need to be + * mapped according to the relevant mount- and filesystem idmapping. It is + * important that the ACL_{GROUP,USER} entries in struct posix_acl will be + * mapped into k{g,u}id_t that are supposed to be mapped up in the filesystem + * idmapping. This is crucial since the resulting struct posix_acl might be + * cached filesystem wide. The vfs_set_acl_prepare() function will take care to + * perform all necessary idmappings. + * + * Note, that since basically forever the {g,u}id values encoded as + * ACL_{GROUP,USER} entries in the uapi POSIX ACLs passed via @value contain + * values that have been mapped according to the caller's idmapping. In other + * words, POSIX ACLs passed in uapi format as @value during setxattr() contain + * {g,u}id values in their ACL_{GROUP,USER} entries that should actually have + * been stored as k{g,u}id_t. + * + * This means, vfs_set_acl_prepare() needs to first recover the k{g,u}id_t by + * calling K{G,U}IDT_INIT(). Afterwards they can be interpreted as vfs{g,u}id_t + * through from_vfs{g,u}id() to account for any idmapped mounts. The + * vfs_set_acl_prepare_k{g,u}id() helpers will take care to generate the + * correct k{g,u}id_t. + * + * The filesystem will then receive the POSIX ACLs ready to be cached + * filesystem wide and ready to be written to the backing store taking the + * filesystem's idmapping into account. + * + * Return: Allocated struct posix_acl on success, NULL for a valid header but + * without actual POSIX ACL entries, or ERR_PTR() encoded error code. + */ +struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const void *value, size_t size) +{ + return make_posix_acl(mnt_userns, fs_userns, value, size, + vfs_set_acl_prepare_kuid, + vfs_set_acl_prepare_kgid); +} +EXPORT_SYMBOL(vfs_set_acl_prepare); + +/** + * posix_acl_from_xattr_kuid - map ACL_USER uid into filesystem idmapping + * @mnt_userns: unused + * @fs_userns: the filesystem's idmapping + * @e: a ACL_USER entry in POSIX ACL uapi format + * + * Map the uid stored as ACL_USER entry in @e into the filesystem's idmapping. + * This is used in posix_acl_from_xattr() to generate the proper VFS + * representation of POSIX ACLs with ACL_USER entries. + * + * Return: A kuid in @fs_userns for the uid stored in @e. + */ +static inline kuid_t +posix_acl_from_xattr_kuid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const struct posix_acl_xattr_entry *e) +{ + return make_kuid(fs_userns, le32_to_cpu(e->e_id)); +} + +/** + * posix_acl_from_xattr_kgid - map ACL_GROUP gid into filesystem idmapping + * @mnt_userns: unused + * @fs_userns: the filesystem's idmapping + * @e: a ACL_GROUP entry in POSIX ACL uapi format + * + * Map the gid stored as ACL_GROUP entry in @e into the filesystem's idmapping. + * This is used in posix_acl_from_xattr() to generate the proper VFS + * representation of POSIX ACLs with ACL_GROUP entries. + * + * Return: A kgid in @fs_userns for the gid stored in @e. + */ +static inline kgid_t +posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns, + struct user_namespace *fs_userns, + const struct posix_acl_xattr_entry *e) +{ + return make_kgid(fs_userns, le32_to_cpu(e->e_id)); +} + +/** + * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format + * @fs_userns: the filesystem's idmapping + * @value: the uapi representation of POSIX ACLs + * @size: the size of @void + * + * Filesystems that store POSIX ACLs in the unaltered uapi format should use + * posix_acl_from_xattr() when reading them from the backing store and + * converting them into the struct posix_acl VFS format. The helper is + * specifically intended to be called from the ->get_acl() inode operation. + * + * The posix_acl_from_xattr() function will map the raw {g,u}id values stored + * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The + * posix_acl_from_xattr_k{g,u}id() helpers will take care to generate the + * correct k{g,u}id_t. The returned struct posix_acl can be cached. + * + * Note that posix_acl_from_xattr() does not take idmapped mounts into account. + * If it did it calling is from the ->get_acl() inode operation would return + * POSIX ACLs mapped according to an idmapped mount which would mean that the + * value couldn't be cached for the filesystem. Idmapped mounts are taken into + * account on the fly during permission checking or right at the VFS - + * userspace boundary before reporting them to the user. + * + * Return: Allocated struct posix_acl on success, NULL for a valid header but + * without actual POSIX ACL entries, or ERR_PTR() encoded error code. + */ +struct posix_acl * +posix_acl_from_xattr(struct user_namespace *fs_userns, + const void *value, size_t size) +{ + return make_posix_acl(&init_user_ns, fs_userns, value, size, + posix_acl_from_xattr_kuid, + posix_acl_from_xattr_kgid); +} EXPORT_SYMBOL (posix_acl_from_xattr); /* @@ -1024,7 +1171,17 @@ posix_acl_xattr_set(const struct xattr_handler *handler, int ret; if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); + /* + * By the time we end up here the {g,u}ids stored in + * ACL_{GROUP,USER} have already been mapped according to the + * caller's idmapping. The vfs_set_acl_prepare() helper will + * recover them and take idmapped mounts into account. The + * filesystem will receive the POSIX ACLs in the correct + * format ready to be cached or written to the backing store + * taking the filesystem idmapping into account. + */ + acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode), + value, size); if (IS_ERR(acl)) return PTR_ERR(acl); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a3398d0f1927..4e0023643f8b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -527,10 +527,12 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; - bool migration = false; + bool migration = false, young = false, dirty = false; if (pte_present(*pte)) { page = vm_normal_page(vma, addr, *pte); + young = pte_young(*pte); + dirty = pte_dirty(*pte); } else if (is_swap_pte(*pte)) { swp_entry_t swpent = pte_to_swp_entry(*pte); @@ -560,8 +562,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), - locked, migration); + smaps_account(mss, page, false, young, dirty, locked, migration); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b2fd3c20e7c2..0c034ea39954 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,14 +28,11 @@ #include <linux/crypto.h> #include <linux/string.h> #include <linux/timer.h> -#include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/jiffies.h> #include <linux/workqueue.h> -#include <crypto/acompress.h> - #include "internal.h" /* @@ -93,8 +90,7 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); /* Compression parameters */ -static struct crypto_acomp *tfm; -static struct acomp_req *creq; +static struct crypto_comp *tfm; struct pstore_zbackend { int (*zbufsize)(size_t size); @@ -272,21 +268,12 @@ static const struct pstore_zbackend zbackends[] = { static int pstore_compress(const void *in, void *out, unsigned int inlen, unsigned int outlen) { - struct scatterlist src, dst; int ret; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS)) return -EINVAL; - sg_init_table(&src, 1); - sg_set_buf(&src, in, inlen); - - sg_init_table(&dst, 1); - sg_set_buf(&dst, out, outlen); - - acomp_request_set_params(creq, &src, &dst, inlen, outlen); - - ret = crypto_acomp_compress(creq); + ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); if (ret) { pr_err("crypto_comp_compress failed, ret = %d!\n", ret); return ret; @@ -297,7 +284,7 @@ static int pstore_compress(const void *in, void *out, static void allocate_buf_for_compression(void) { - struct crypto_acomp *acomp; + struct crypto_comp *ctx; int size; char *buf; @@ -309,7 +296,7 @@ static void allocate_buf_for_compression(void) if (!psinfo || tfm) return; - if (!crypto_has_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC)) { + if (!crypto_has_comp(zbackend->name, 0, 0)) { pr_err("Unknown compression: %s\n", zbackend->name); return; } @@ -328,24 +315,16 @@ static void allocate_buf_for_compression(void) return; } - acomp = crypto_alloc_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(acomp)) { + ctx = crypto_alloc_comp(zbackend->name, 0, 0); + if (IS_ERR_OR_NULL(ctx)) { kfree(buf); pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, - PTR_ERR(acomp)); - return; - } - - creq = acomp_request_alloc(acomp); - if (!creq) { - crypto_free_acomp(acomp); - kfree(buf); - pr_err("acomp_request_alloc('%s') failed\n", zbackend->name); + PTR_ERR(ctx)); return; } /* A non-NULL big_oops_buf indicates compression is available. */ - tfm = acomp; + tfm = ctx; big_oops_buf_sz = size; big_oops_buf = buf; @@ -355,8 +334,7 @@ static void allocate_buf_for_compression(void) static void free_buf_for_compression(void) { if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) { - acomp_request_free(creq); - crypto_free_acomp(tfm); + crypto_free_comp(tfm); tfm = NULL; } kfree(big_oops_buf); @@ -693,8 +671,6 @@ static void decompress_record(struct pstore_record *record) int ret; int unzipped_len; char *unzipped, *workspace; - struct acomp_req *dreq; - struct scatterlist src, dst; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed) return; @@ -718,30 +694,16 @@ static void decompress_record(struct pstore_record *record) if (!workspace) return; - dreq = acomp_request_alloc(tfm); - if (!dreq) { - kfree(workspace); - return; - } - - sg_init_table(&src, 1); - sg_set_buf(&src, record->buf, record->size); - - sg_init_table(&dst, 1); - sg_set_buf(&dst, workspace, unzipped_len); - - acomp_request_set_params(dreq, &src, &dst, record->size, unzipped_len); - /* After decompression "unzipped_len" is almost certainly smaller. */ - ret = crypto_acomp_decompress(dreq); + ret = crypto_comp_decompress(tfm, record->buf, record->size, + workspace, &unzipped_len); if (ret) { - pr_err("crypto_acomp_decompress failed, ret = %d!\n", ret); + pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); kfree(workspace); return; } /* Append ECC notice to decompressed buffer. */ - unzipped_len = dreq->dlen; memcpy(workspace + unzipped_len, record->buf + record->size, record->ecc_notice_size); @@ -749,7 +711,6 @@ static void decompress_record(struct pstore_record *record) unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size, GFP_KERNEL); kfree(workspace); - acomp_request_free(dreq); if (!unzipped) return; diff --git a/fs/read_write.c b/fs/read_write.c index 1a261dcf1778..328ce8cf9a85 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t } /* caller is responsible for file_start_write/file_end_write */ -ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { - struct kvec iov = { - .iov_base = (void *)buf, - .iov_len = min_t(size_t, count, MAX_RW_COUNT), - }; struct kiocb kiocb; - struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) @@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; - iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); - ret = file->f_op->write_iter(&kiocb, &iter); + ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; @@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t inc_syscw(current); return ret; } + +/* caller is responsible for file_start_write/file_end_write */ +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +{ + struct kvec iov = { + .iov_base = (void *)buf, + .iov_len = min_t(size_t, count, MAX_RW_COUNT), + }; + struct iov_iter iter; + iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); + return __kernel_write_iter(file, &iter, pos); +} /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually diff --git a/fs/readdir.c b/fs/readdir.c index 09e8ed7d4161..9c53edb60c03 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -140,7 +140,7 @@ struct readdir_callback { int result; }; -static int fillonedir(struct dir_context *ctx, const char *name, int namlen, +static bool fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct readdir_callback *buf = @@ -149,14 +149,14 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, unsigned long d_ino; if (buf->result) - return -EINVAL; + return false; buf->result = verify_dirent_name(name, namlen); - if (buf->result < 0) - return buf->result; + if (buf->result) + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; - return -EOVERFLOW; + return false; } buf->result++; dirent = buf->dirent; @@ -169,12 +169,12 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, unsafe_put_user(namlen, &dirent->d_namlen, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); - return 0; + return true; efault_end: user_write_access_end(); efault: buf->result = -EFAULT; - return -EFAULT; + return false; } SYSCALL_DEFINE3(old_readdir, unsigned int, fd, @@ -219,7 +219,7 @@ struct getdents_callback { int error; }; -static int filldir(struct dir_context *ctx, const char *name, int namlen, +static bool filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent __user *dirent, *prev; @@ -232,18 +232,18 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) - return buf->error; + return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) - return -EINVAL; + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; - return -EOVERFLOW; + return false; } prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) - return -EINTR; + return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) @@ -260,12 +260,12 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; buf->count -= reclen; - return 0; + return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; - return -EFAULT; + return false; } SYSCALL_DEFINE3(getdents, unsigned int, fd, @@ -307,7 +307,7 @@ struct getdents_callback64 { int error; }; -static int filldir64(struct dir_context *ctx, const char *name, int namlen, +static bool filldir64(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent64 __user *dirent, *prev; @@ -319,13 +319,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) - return buf->error; + return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) - return -EINVAL; + return false; prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) - return -EINTR; + return false; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) @@ -342,13 +342,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; - return 0; + return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; - return -EFAULT; + return false; } SYSCALL_DEFINE3(getdents64, unsigned int, fd, @@ -397,7 +397,7 @@ struct compat_readdir_callback { int result; }; -static int compat_fillonedir(struct dir_context *ctx, const char *name, +static bool compat_fillonedir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -407,14 +407,14 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, compat_ulong_t d_ino; if (buf->result) - return -EINVAL; + return false; buf->result = verify_dirent_name(name, namlen); - if (buf->result < 0) - return buf->result; + if (buf->result) + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->result = -EOVERFLOW; - return -EOVERFLOW; + return false; } buf->result++; dirent = buf->dirent; @@ -427,12 +427,12 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, unsafe_put_user(namlen, &dirent->d_namlen, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); user_write_access_end(); - return 0; + return true; efault_end: user_write_access_end(); efault: buf->result = -EFAULT; - return -EFAULT; + return false; } COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, @@ -471,7 +471,7 @@ struct compat_getdents_callback { int error; }; -static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, +static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct compat_linux_dirent __user *dirent, *prev; @@ -484,18 +484,18 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) - return buf->error; + return false; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) - return -EINVAL; + return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { buf->error = -EOVERFLOW; - return -EOVERFLOW; + return false; } prev_reclen = buf->prev_reclen; if (prev_reclen && signal_pending(current)) - return -EINTR; + return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; if (!user_write_access_begin(prev, reclen + prev_reclen)) @@ -511,12 +511,12 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; - return 0; + return true; efault_end: user_write_access_end(); efault: buf->error = -EFAULT; - return -EFAULT; + return false; } COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 436641369283..8b2d52443f41 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -189,7 +189,7 @@ struct reiserfs_dentry_buf { struct dentry *dentries[8]; }; -static int +static bool fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -200,16 +200,16 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir))); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) - return -ENOSPC; + return false; if (name[0] == '.' && (namelen < 2 || (namelen == 2 && name[1] == '.'))) - return 0; + return true; dentry = lookup_one_len(name, dbuf->xadir, namelen); if (IS_ERR(dentry)) { dbuf->err = PTR_ERR(dentry); - return PTR_ERR(dentry); + return false; } else if (d_really_is_negative(dentry)) { /* A directory entry exists, but no file? */ reiserfs_error(dentry->d_sb, "xattr-20003", @@ -218,11 +218,11 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, dentry, dbuf->xadir); dput(dentry); dbuf->err = -EIO; - return -EIO; + return false; } dbuf->dentries[dbuf->count++] = dentry; - return 0; + return true; } static void @@ -797,7 +797,7 @@ struct listxattr_buf { struct dentry *dentry; }; -static int listxattr_filler(struct dir_context *ctx, const char *name, +static bool listxattr_filler(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { @@ -813,19 +813,19 @@ static int listxattr_filler(struct dir_context *ctx, const char *name, name); if (!handler /* Unsupported xattr name */ || (handler->list && !handler->list(b->dentry))) - return 0; + return true; size = namelen + 1; if (b->buf) { if (b->pos + size > b->size) { b->pos = -ERANGE; - return -ERANGE; + return false; } memcpy(b->buf + b->pos, name, namelen); b->buf[b->pos + namelen] = 0; } b->pos += size; } - return 0; + return true; } /* diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 98e64fec75b7..e56510964b22 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -593,7 +593,7 @@ static void squashfs_readahead(struct readahead_control *ractl) res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - kfree(actor); + squashfs_page_actor_free(actor); if (res == expected) { int bytes; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index be4b12d31e0c..f1ccad519e28 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -74,7 +74,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, /* Decompress directly into the page cache buffers */ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - kfree(actor); + squashfs_page_actor_free(actor); if (res < 0) goto mark_errored; diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index b23b780d8f42..54b93bf4a25c 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -52,6 +52,7 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, actor->buffer = buffer; actor->pages = pages; actor->next_page = 0; + actor->tmp_buffer = NULL; actor->squashfs_first_page = cache_first_page; actor->squashfs_next_page = cache_next_page; actor->squashfs_finish_page = cache_finish_page; @@ -68,20 +69,9 @@ static void *handle_next_page(struct squashfs_page_actor *actor) if ((actor->next_page == actor->pages) || (actor->next_index != actor->page[actor->next_page]->index)) { - if (actor->alloc_buffer) { - void *tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); - - if (tmp_buffer) { - actor->tmp_buffer = tmp_buffer; - actor->next_index++; - actor->returned_pages++; - return tmp_buffer; - } - } - actor->next_index++; actor->returned_pages++; - return ERR_PTR(-ENOMEM); + return actor->alloc_buffer ? actor->tmp_buffer : ERR_PTR(-ENOMEM); } actor->next_index++; @@ -96,11 +86,10 @@ static void *direct_first_page(struct squashfs_page_actor *actor) static void *direct_next_page(struct squashfs_page_actor *actor) { - if (actor->pageaddr) + if (actor->pageaddr) { kunmap_local(actor->pageaddr); - - kfree(actor->tmp_buffer); - actor->pageaddr = actor->tmp_buffer = NULL; + actor->pageaddr = NULL; + } return handle_next_page(actor); } @@ -109,8 +98,6 @@ static void direct_finish_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) kunmap_local(actor->pageaddr); - - kfree(actor->tmp_buffer); } struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk, @@ -121,6 +108,16 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_ if (actor == NULL) return NULL; + if (msblk->decompressor->alloc_buffer) { + actor->tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + + if (actor->tmp_buffer == NULL) { + kfree(actor); + return NULL; + } + } else + actor->tmp_buffer = NULL; + actor->length = length ? : pages * PAGE_SIZE; actor->page = page; actor->pages = pages; @@ -128,7 +125,6 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_ actor->returned_pages = 0; actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; - actor->tmp_buffer = NULL; actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 24841d28bc0f..95ffbb543d91 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -29,6 +29,11 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, extern struct squashfs_page_actor *squashfs_page_actor_init_special( struct squashfs_sb_info *msblk, struct page **page, int pages, int length); +static inline void squashfs_page_actor_free(struct squashfs_page_actor *actor) +{ + kfree(actor->tmp_buffer); + kfree(actor); +} static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { return actor->squashfs_first_page(actor); diff --git a/fs/stat.c b/fs/stat.c index 9ced8860e0f3..ef50573c72a2 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -5,6 +5,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ +#include <linux/blkdev.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/errno.h> @@ -230,11 +231,22 @@ retry: goto out; error = vfs_getattr(&path, stat, request_mask, flags); + stat->mnt_id = real_mount(path.mnt)->mnt_id; stat->result_mask |= STATX_MNT_ID; + if (path.mnt->mnt_root == path.dentry) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; + + /* Handle STATX_DIOALIGN for block devices. */ + if (request_mask & STATX_DIOALIGN) { + struct inode *inode = d_backing_inode(path.dentry); + + if (S_ISBLK(inode->i_mode)) + bdev_statx_dioalign(inode, stat); + } + path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -611,6 +623,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); tmp.stx_mnt_id = stat->mnt_id; + tmp.stx_dio_mem_align = stat->dio_mem_align; + tmp.stx_dio_offset_align = stat->dio_offset_align; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } diff --git a/fs/super.c b/fs/super.c index 734ed584a946..6a82660e1adb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -291,7 +291,6 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); - fscrypt_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); @@ -480,6 +479,7 @@ void generic_shutdown_super(struct super_block *sb) evict_inodes(sb); /* only nonzero refcount inodes can have marks */ fsnotify_sb_delete(sb); + fscrypt_sb_delete(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 81d26abf486f..da85b3979195 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -141,6 +141,8 @@ struct tracefs_mount_opts { kuid_t uid; kgid_t gid; umode_t mode; + /* Opt_* bitfield. */ + unsigned int opts; }; enum { @@ -241,6 +243,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) kgid_t gid; char *p; + opts->opts = 0; opts->mode = TRACEFS_DEFAULT_MODE; while ((p = strsep(&data, ",")) != NULL) { @@ -275,24 +278,36 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) * but traditionally tracefs has ignored all mount options */ } + + opts->opts |= BIT(token); } return 0; } -static int tracefs_apply_options(struct super_block *sb) +static int tracefs_apply_options(struct super_block *sb, bool remount) { struct tracefs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct tracefs_mount_opts *opts = &fsi->mount_opts; - inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= opts->mode; + /* + * On remount, only reset mode/uid/gid if they were provided as mount + * options. + */ + + if (!remount || opts->opts & BIT(Opt_mode)) { + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= opts->mode; + } - inode->i_uid = opts->uid; + if (!remount || opts->opts & BIT(Opt_uid)) + inode->i_uid = opts->uid; - /* Set all the group ids to the mount option */ - set_gid(sb->s_root, opts->gid); + if (!remount || opts->opts & BIT(Opt_gid)) { + /* Set all the group ids to the mount option */ + set_gid(sb->s_root, opts->gid); + } return 0; } @@ -307,7 +322,7 @@ static int tracefs_remount(struct super_block *sb, int *flags, char *data) if (err) goto fail; - tracefs_apply_options(sb); + tracefs_apply_options(sb, true); fail: return err; @@ -359,7 +374,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &tracefs_super_operations; - tracefs_apply_options(sb); + tracefs_apply_options(sb, false); return 0; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1c44bf75f916..0c1d33c4f74c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -991,7 +991,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new, int fd; fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new, - O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); + O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; @@ -1601,6 +1601,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } + /* Reset ptes for the whole vma range if wr-protected */ + if (userfaultfd_wp(vma)) + uffd_wp_range(mm, vma, start, vma_end - start, false); + new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, @@ -2090,7 +2094,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) mmgrab(ctx->mm); fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); + O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); if (fd < 0) { mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 6ee849dc7bc1..2aefc5565152 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -53,14 +53,14 @@ static int fsverity_read_merkle_tree(struct inode *inode, break; } - virt = kmap(page); + virt = kmap_local_page(page); if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) { - kunmap(page); + kunmap_local(virt); put_page(page); err = -EFAULT; break; } - kunmap(page); + kunmap_local(virt); put_page(page); retval += bytes_to_copy; diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 14e2fb49cff5..bde8c9b7d25f 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -39,16 +39,6 @@ static void hash_at_level(const struct merkle_tree_params *params, (params->log_blocksize - params->log_arity); } -/* Extract a hash from a hash page */ -static void extract_hash(struct page *hpage, unsigned int hoffset, - unsigned int hsize, u8 *out) -{ - void *virt = kmap_atomic(hpage); - - memcpy(out, virt + hoffset, hsize); - kunmap_atomic(virt); -} - static inline int cmp_hashes(const struct fsverity_info *vi, const u8 *want_hash, const u8 *real_hash, pgoff_t index, int level) @@ -129,7 +119,7 @@ static bool verify_page(struct inode *inode, const struct fsverity_info *vi, } if (PageChecked(hpage)) { - extract_hash(hpage, hoffset, hsize, _want_hash); + memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n", @@ -158,7 +148,7 @@ descend: if (err) goto out; SetPageChecked(hpage); - extract_hash(hpage, hoffset, hsize, _want_hash); + memcpy_from_page(_want_hash, hpage, hoffset, hsize); want_hash = _want_hash; put_page(hpage); pr_debug("Verified hash page at level %d, now want %s:%*phN\n", diff --git a/fs/xattr.c b/fs/xattr.c index a1f4998bc6be..61107b6bbed2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -290,7 +290,7 @@ static inline bool is_posix_acl_xattr(const char *name) int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, - const char *name, void *value, size_t size, int flags) + const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -298,16 +298,12 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, - (const void **)&value, size); + error = cap_convert_nscap(mnt_userns, dentry, &value, size); if (error < 0) return error; size = error; } - if (size && is_posix_acl_xattr(name)) - posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size); - retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, @@ -587,9 +583,7 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) static void setxattr_convert(struct user_namespace *mnt_userns, struct dentry *d, struct xattr_ctx *ctx) { - if (ctx->size && - ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) + if (ctx->size && is_posix_acl_xattr(ctx->kname->name)) posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); } @@ -705,8 +699,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); if (error > 0) { - if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) + if (is_posix_acl_xattr(kname)) posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 5abb5fdb71d9..b594f02a52c4 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -99,7 +99,7 @@ out: * we check the inode number to make sure it's sane, then we check that * we can look up this filename. Finally, we check the ftype. */ -STATIC int +STATIC bool xchk_dir_actor( struct dir_context *dir_iter, const char *name, @@ -124,7 +124,7 @@ xchk_dir_actor( xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); if (xchk_should_terminate(sdc->sc, &error)) - return error; + return !error; /* Does this inode number make sense? */ if (!xfs_verify_dir_ino(mp, ino)) { @@ -191,8 +191,8 @@ out: * and return zero to xchk_directory. */ if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return -EFSCORRUPTED; - return error; + return false; + return !error; } /* Scrub a directory btree record. */ diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index ab182a5cd0c0..d8dff3fd8053 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -38,7 +38,7 @@ struct xchk_parent_ctx { }; /* Look for a single entry in a directory pointing to an inode. */ -STATIC int +STATIC bool xchk_parent_actor( struct dir_context *dc, const char *name, @@ -62,7 +62,7 @@ xchk_parent_actor( if (xchk_should_terminate(spc->sc, &error)) spc->cancelled = true; - return error; + return !error; } /* Count the number of dentries in the parent dir that point to this inode. */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 45518b8c613c..f51c60d7e205 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -604,6 +604,16 @@ xfs_vn_getattr( stat->blksize = BLKDEV_IOSIZE; stat->rdev = inode->i_rdev; break; + case S_IFREG: + if (request_mask & STATX_DIOALIGN) { + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct block_device *bdev = target->bt_bdev; + + stat->result_mask |= STATX_DIOALIGN; + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + } + fallthrough; default: stat->blksize = xfs_stat_blksize(ip); stat->rdev = 0; diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 69d9c83ea4b2..5b1f9a24ed59 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -175,13 +175,13 @@ xfs_dax_notify_failure( u64 ddev_start; u64 ddev_end; - if (!(mp->m_sb.sb_flags & SB_BORN)) { + if (!(mp->m_super->s_flags & SB_BORN)) { xfs_warn(mp, "filesystem is not ready for notify_failure()!"); return -EIO; } if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { - xfs_warn(mp, + xfs_debug(mp, "notify_failure() not supported on realtime device!"); return -EOPNOTSUPP; } @@ -194,7 +194,7 @@ xfs_dax_notify_failure( } if (!xfs_has_rmapbt(mp)) { - xfs_warn(mp, "notify_failure() needs rmapbt enabled!"); + xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); return -EOPNOTSUPP; } |