diff options
Diffstat (limited to 'fs')
211 files changed, 8730 insertions, 2431 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 47db55aee7f2..60fb47469c86 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -257,36 +257,12 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) return v9fs_fid_lookup_with_uid(dentry, uid, any); } -struct p9_fid *v9fs_fid_clone(struct dentry *dentry) -{ - struct p9_fid *fid, *ret; - - fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) - return fid; - - ret = p9_client_walk(fid, 0, NULL, 1); - return ret; -} - -static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid) -{ - struct p9_fid *fid, *ret; - - fid = v9fs_fid_lookup_with_uid(dentry, uid, 0); - if (IS_ERR(fid)) - return fid; - - ret = p9_client_walk(fid, 0, NULL, 1); - return ret; -} - struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) { int err; struct p9_fid *fid; - fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID); + fid = clone_fid(v9fs_fid_lookup_with_uid(dentry, GLOBAL_ROOT_UID, 0)); if (IS_ERR(fid)) goto error_out; /* diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 12700df0bb51..4491bcaf42b8 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -28,7 +28,14 @@ static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry) { return v9fs_fid_lookup(dentry->d_parent); } -struct p9_fid *v9fs_fid_clone(struct dentry *dentry); void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); +static inline struct p9_fid *clone_fid(struct p9_fid *fid) +{ + return IS_ERR(fid) ? 
fid : p9_client_walk(fid, 0, NULL, 1); +} +static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry) +{ + return clone_fid(v9fs_fid_lookup(dentry)); +} #endif diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7da9a8354fad..8b1999b528e9 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -661,7 +661,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, } /* clone a fid to use for creation */ - ofid = p9_client_walk(dfid, 0, NULL, 1); + ofid = clone_fid(dfid); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); @@ -975,13 +975,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_ERR(oldfid)) return PTR_ERR(oldfid); - olddirfid = v9fs_parent_fid(old_dentry); + olddirfid = clone_fid(v9fs_parent_fid(old_dentry)); if (IS_ERR(olddirfid)) { retval = PTR_ERR(olddirfid); goto done; } - newdirfid = v9fs_parent_fid(new_dentry); + newdirfid = clone_fid(v9fs_parent_fid(new_dentry)); if (IS_ERR(newdirfid)) { retval = PTR_ERR(newdirfid); goto clunk_olddir; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 2ed04c2fe7af..eeabcb0bad12 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -281,7 +281,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } /* clone a fid to use for creation */ - ofid = p9_client_walk(dfid, 0, NULL, 1); + ofid = clone_fid(dfid); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index a6bd349bab23..f329eee6dc93 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -97,8 +97,6 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t value_len, int flags) { struct p9_fid *fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) - return PTR_ERR(fid); return v9fs_fid_xattr_set(fid, name, value, value_len, flags); } @@ -115,7 +113,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char 
*name, name, value_len, flags); /* Clone it */ - fid = p9_client_walk(fid, 0, NULL, 1); + fid = clone_fid(fid); if (IS_ERR(fid)) return PTR_ERR(fid); diff --git a/fs/Kconfig b/fs/Kconfig index 4524916fa200..2bc7ad775842 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -70,6 +70,12 @@ config FS_POSIX_ACL config EXPORTFS tristate +config EXPORTFS_BLOCK_OPS + bool "Enable filesystem export operations for block IO" + help + This option enables the export operations for a filesystem to support + external block IO. + config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT default y diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 72c03354c14b..c7efddf6e038 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -89,7 +89,8 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" - depends on !MMU && (!FRV || BROKEN) + depends on !MMU || M68K + depends on !FRV || BROKEN help Support uClinux FLAT format binaries. diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index bec25f7017c0..29444c83da48 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -101,7 +101,7 @@ out: } static int -adfs_match(struct qstr *name, struct object_info *obj) +adfs_match(const struct qstr *name, struct object_info *obj) { int i; @@ -126,7 +126,7 @@ adfs_match(struct qstr *name, struct object_info *obj) } static int -adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj) +adfs_dir_lookup_byname(struct inode *inode, const struct qstr *name, struct object_info *obj) { struct super_block *sb = inode->i_sb; const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; @@ -227,7 +227,7 @@ adfs_hash(const struct dentry *parent, struct qstr *qstr) * requirements of the underlying filesystem. 
*/ static int -adfs_compare(const struct dentry *parent, const struct dentry *dentry, +adfs_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { int i; diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index d6c7a51c93e4..d8f217c711d3 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -472,9 +472,7 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) bool affs_nofilenametruncate(const struct dentry *dentry) { - struct inode *inode = d_inode(dentry); - - return affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_NO_TRUNCATE); + return affs_test_opt(AFFS_SB(dentry->d_sb)->s_flags, SF_NO_TRUNCATE); } /* Check if the name is valid for a affs object. */ diff --git a/fs/affs/namei.c b/fs/affs/namei.c index eb32029bc776..a2d68f828d53 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -14,11 +14,11 @@ typedef int (*toupper_t)(int); static int affs_toupper(int ch); static int affs_hash_dentry(const struct dentry *, struct qstr *); -static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, +static int affs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); static int affs_intl_toupper(int ch); static int affs_intl_hash_dentry(const struct dentry *, struct qstr *); -static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, +static int affs_intl_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); const struct dentry_operations affs_dentry_operations = { @@ -131,20 +131,20 @@ static inline int __affs_compare_dentry(unsigned int len, } static int -affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, +affs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return __affs_compare_dentry(len, str, name, affs_toupper, - 
affs_nofilenametruncate(parent)); + affs_nofilenametruncate(dentry)); } static int -affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, +affs_intl_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return __affs_compare_dentry(len, str, name, affs_intl_toupper, - affs_nofilenametruncate(parent)); + affs_nofilenametruncate(dentry)); } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 3767f6641af1..fa84bb8832e0 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -159,7 +159,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; @@ -172,7 +172,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) list_for_each(p, head) { struct autofs_info *ino; struct dentry *active; - struct qstr *qstr; + const struct qstr *qstr; ino = list_entry(p, struct autofs_info, active); active = ino->dentry; @@ -214,7 +214,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; @@ -227,7 +227,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, list_for_each(p, head) { struct autofs_info *ino; struct dentry *expiring; - struct qstr *qstr; + const struct qstr *qstr; if (rcu_walk) { spin_unlock(&sbi->lookup_lock); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 708214457d16..431fd7ee3488 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -225,7 
+225,7 @@ rename_retry: } static struct autofs_wait_queue * -autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) +autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { struct autofs_wait_queue *wq; @@ -249,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) */ static int validate_request(struct autofs_wait_queue **wait, struct autofs_sb_info *sbi, - struct qstr *qstr, + const struct qstr *qstr, struct dentry *dentry, enum autofs_notify notify) { struct autofs_wait_queue *wq; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 203589311bf8..464a972e88c1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -67,8 +67,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, struct elf_fdpic_params *); #ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, - unsigned long *); static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, struct file *, struct mm_struct *); @@ -515,8 +513,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp = mm->start_stack; /* stack the program arguments and environment */ - if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0) + if (transfer_args_to_stack(bprm, &sp) < 0) return -EFAULT; + sp &= ~15; #endif /* @@ -711,39 +710,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, /*****************************************************************************/ /* - * transfer the program arguments and environment from the holding pages onto - * the stack - */ -#ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, - unsigned long *_sp) -{ - unsigned long index, stop, sp; - char *src; - int ret = 0; - - stop = bprm->p >> PAGE_SHIFT; - sp = *_sp; - - for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { - src = kmap(bprm->page[index]); - sp -= PAGE_SIZE; - if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0) - ret = 
-EFAULT; - kunmap(bprm->page[index]); - if (ret < 0) - goto out; - } - - *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; - -out: - return ret; -} -#endif - -/*****************************************************************************/ -/* * load the appropriate binary image (executable or interpreter) into memory * - we assume no MMU is available * - if no other PIC bits are set in params->hdr->e_flags diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index caf9e39bb82b..9b2917a30294 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -15,7 +15,8 @@ * JAN/99 -- coded full program relocation (gerg@snapgear.com) */ -#include <linux/export.h> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> @@ -25,8 +26,6 @@ #include <linux/string.h> #include <linux/fs.h> #include <linux/file.h> -#include <linux/stat.h> -#include <linux/fcntl.h> #include <linux/ptrace.h> #include <linux/user.h> #include <linux/slab.h> @@ -34,26 +33,16 @@ #include <linux/personality.h> #include <linux/init.h> #include <linux/flat.h> -#include <linux/syscalls.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> #include <asm/byteorder.h> -#include <asm/uaccess.h> #include <asm/unaligned.h> #include <asm/cacheflush.h> #include <asm/page.h> /****************************************************************************/ -#if 0 -#define DEBUG 1 -#endif - -#ifdef DEBUG -#define DBG_FLT(a...) printk(a) -#else -#define DBG_FLT(a...) -#endif - /* * User data (data section and bss) needs to be aligned. * We pick 0x20 here because it is the max value elf2flt has always @@ -80,7 +69,7 @@ struct lib_info { unsigned long text_len; /* Length of text segment */ unsigned long entry; /* Start address for this module */ unsigned long build_date; /* When this one was compiled */ - short loaded; /* Has this library been loaded? */ + bool loaded; /* Has this library been loaded? 
*/ } lib_list[MAX_SHARED_LIBS]; }; @@ -106,59 +95,67 @@ static struct linux_binfmt flat_format = { static int flat_core_dump(struct coredump_params *cprm) { - printk("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, (int) cprm->siginfo->si_signo); - return(1); + pr_warn("Process %s:%d received signr %d and should have core dumped\n", + current->comm, current->pid, cprm->siginfo->si_signo); + return 1; } /****************************************************************************/ /* * create_flat_tables() parses the env- and arg-strings in new user * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. + * addresses on the "stack", recording the new stack pointer value. */ -static unsigned long create_flat_tables( - unsigned long pp, - struct linux_binprm * bprm) +static int create_flat_tables(struct linux_binprm *bprm, unsigned long arg_start) { - unsigned long *argv,*envp; - unsigned long * sp; - char * p = (char*)pp; - int argc = bprm->argc; - int envc = bprm->envc; - char uninitialized_var(dummy); - - sp = (unsigned long *)p; - sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); - sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN); - argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); - envp = argv + (argc + 1); + char __user *p; + unsigned long __user *sp; + long i, len; + + p = (char __user *)arg_start; + sp = (unsigned long __user *)current->mm->start_stack; + + sp -= bprm->envc + 1; + sp -= bprm->argc + 1; + sp -= flat_argvp_envp_on_stack() ? 
2 : 0; + sp -= 1; /* &argc */ + current->mm->start_stack = (unsigned long)sp & -FLAT_STACK_ALIGN; + sp = (unsigned long __user *)current->mm->start_stack; + + __put_user(bprm->argc, sp++); if (flat_argvp_envp_on_stack()) { - put_user((unsigned long) envp, sp + 2); - put_user((unsigned long) argv, sp + 1); - } - - put_user(argc, sp); - current->mm->arg_start = (unsigned long) p; - while (argc-->0) { - put_user((unsigned long) p, argv++); - do { - get_user(dummy, p); p++; - } while (dummy); - } - put_user((unsigned long) NULL, argv); - current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-->0) { - put_user((unsigned long)p, envp); envp++; - do { - get_user(dummy, p); p++; - } while (dummy); - } - put_user((unsigned long) NULL, envp); - current->mm->env_end = (unsigned long) p; - return (unsigned long)sp; + unsigned long argv, envp; + argv = (unsigned long)(sp + 2); + envp = (unsigned long)(sp + 2 + bprm->argc + 1); + __put_user(argv, sp++); + __put_user(envp, sp++); + } + + current->mm->arg_start = (unsigned long)p; + for (i = bprm->argc; i > 0; i--) { + __put_user((unsigned long)p, sp++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(0, sp++); + current->mm->arg_end = (unsigned long)p; + + current->mm->env_start = (unsigned long) p; + for (i = bprm->envc; i > 0; i--) { + __put_user((unsigned long)p, sp++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(0, sp++); + current->mm->env_end = (unsigned long)p; + + return 0; } /****************************************************************************/ @@ -190,17 +187,17 @@ static int decompress_exec( loff_t fpos; int ret, retval; - DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len); + pr_debug("decompress_exec(offset=%lx,buf=%p,len=%lx)\n", offset, dst, len); memset(&strm, 0, sizeof(strm)); strm.workspace = 
kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); if (strm.workspace == NULL) { - DBG_FLT("binfmt_flat: no memory for decompress workspace\n"); + pr_debug("no memory for decompress workspace\n"); return -ENOMEM; } buf = kmalloc(LBUFSIZE, GFP_KERNEL); if (buf == NULL) { - DBG_FLT("binfmt_flat: no memory for read buffer\n"); + pr_debug("no memory for read buffer\n"); retval = -ENOMEM; goto out_free; } @@ -218,49 +215,49 @@ static int decompress_exec( /* Check minimum size -- gzip header */ if (ret < 10) { - DBG_FLT("binfmt_flat: file too small?\n"); + pr_debug("file too small?\n"); goto out_free_buf; } /* Check gzip magic number */ if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) { - DBG_FLT("binfmt_flat: unknown compression magic?\n"); + pr_debug("unknown compression magic?\n"); goto out_free_buf; } /* Check gzip method */ if (buf[2] != 8) { - DBG_FLT("binfmt_flat: unknown compression method?\n"); + pr_debug("unknown compression method?\n"); goto out_free_buf; } /* Check gzip flags */ if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) || (buf[3] & RESERVED)) { - DBG_FLT("binfmt_flat: unknown flags?\n"); + pr_debug("unknown flags?\n"); goto out_free_buf; } ret = 10; if (buf[3] & EXTRA_FIELD) { ret += 2 + buf[10] + (buf[11] << 8); - if (unlikely(LBUFSIZE <= ret)) { - DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); + if (unlikely(ret >= LBUFSIZE)) { + pr_debug("buffer overflow (EXTRA)?\n"); goto out_free_buf; } } if (buf[3] & ORIG_NAME) { while (ret < LBUFSIZE && buf[ret++] != 0) ; - if (unlikely(LBUFSIZE == ret)) { - DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); + if (unlikely(ret == LBUFSIZE)) { + pr_debug("buffer overflow (ORIG_NAME)?\n"); goto out_free_buf; } } if (buf[3] & COMMENT) { while (ret < LBUFSIZE && buf[ret++] != 0) ; - if (unlikely(LBUFSIZE == ret)) { - DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); + if (unlikely(ret == LBUFSIZE)) { + pr_debug("buffer overflow (COMMENT)?\n"); goto out_free_buf; } } @@ -273,7 +270,7 
@@ static int decompress_exec( strm.total_out = 0; if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) { - DBG_FLT("binfmt_flat: zlib init failed?\n"); + pr_debug("zlib init failed?\n"); goto out_free_buf; } @@ -290,7 +287,7 @@ static int decompress_exec( } if (ret < 0) { - DBG_FLT("binfmt_flat: decompression failed (%d), %s\n", + pr_debug("decompression failed (%d), %s\n", ret, strm.msg); goto out_zlib; } @@ -327,24 +324,23 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) r &= 0x00ffffff; /* Trim ID off here */ } if (id >= MAX_SHARED_LIBS) { - printk("BINFMT_FLAT: reference 0x%x to shared library %d", - (unsigned) r, id); + pr_err("reference 0x%lx to shared library %d", r, id); goto failed; } if (curid != id) { if (internalp) { - printk("BINFMT_FLAT: reloc address 0x%x not in same module " - "(%d != %d)", (unsigned) r, curid, id); + pr_err("reloc address 0x%lx not in same module " + "(%d != %d)", r, curid, id); goto failed; - } else if ( ! p->lib_list[id].loaded && - load_flat_shared_library(id, p) < 0) { - printk("BINFMT_FLAT: failed to load library %d", id); + } else if (!p->lib_list[id].loaded && + load_flat_shared_library(id, p) < 0) { + pr_err("failed to load library %d", id); goto failed; } /* Check versioning information (i.e. 
time stamps) */ if (p->lib_list[id].build_date && p->lib_list[curid].build_date && p->lib_list[curid].build_date < p->lib_list[id].build_date) { - printk("BINFMT_FLAT: library %d is younger than %d", id, curid); + pr_err("library %d is younger than %d", id, curid); goto failed; } } @@ -358,8 +354,8 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) text_len = p->lib_list[id].text_len; if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { - printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", - (int) r,(int)(start_brk-start_data+text_len),(int)text_len); + pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)", + r, start_brk-start_data+text_len, text_len); goto failed; } @@ -369,10 +365,10 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) addr = r - text_len + start_data; /* Range checked already above so doing the range tests is redundant...*/ - return(addr); + return addr; failed: - printk(", killing %s!\n", current->comm); + pr_cont(", killing %s!\n", current->comm); send_sig(SIGSEGV, current, 0); return RELOC_FAILED; @@ -382,62 +378,57 @@ failed: static void old_reloc(unsigned long rl) { -#ifdef DEBUG - char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; -#endif + static const char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; flat_v2_reloc_t r; - unsigned long *ptr; - + unsigned long __user *ptr; + unsigned long val; + r.value = rl; #if defined(CONFIG_COLDFIRE) - ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset); + ptr = (unsigned long __user *)(current->mm->start_code + r.reloc.offset); #else - ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset); + ptr = (unsigned long __user *)(current->mm->start_data + r.reloc.offset); #endif + get_user(val, ptr); + + pr_debug("Relocation of variable at DATASEG+%x " + "(address %p, currently %lx) into segment %s\n", + r.reloc.offset, ptr, val, segment[r.reloc.type]); -#ifdef DEBUG - printk("Relocation of 
variable at DATASEG+%x " - "(address %p, currently %x) into segment %s\n", - r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]); -#endif - switch (r.reloc.type) { case OLD_FLAT_RELOC_TYPE_TEXT: - *ptr += current->mm->start_code; + val += current->mm->start_code; break; case OLD_FLAT_RELOC_TYPE_DATA: - *ptr += current->mm->start_data; + val += current->mm->start_data; break; case OLD_FLAT_RELOC_TYPE_BSS: - *ptr += current->mm->end_data; + val += current->mm->end_data; break; default: - printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type); + pr_err("Unknown relocation type=%x\n", r.reloc.type); break; } + put_user(val, ptr); -#ifdef DEBUG - printk("Relocation became %x\n", (int)*ptr); -#endif -} + pr_debug("Relocation became %lx\n", val); +} /****************************************************************************/ -static int load_flat_file(struct linux_binprm * bprm, +static int load_flat_file(struct linux_binprm *bprm, struct lib_info *libinfo, int id, unsigned long *extra_stack) { - struct flat_hdr * hdr; - unsigned long textpos = 0, datapos = 0, result; - unsigned long realdatastart = 0; - unsigned long text_len, data_len, bss_len, stack_len, flags; - unsigned long full_data; - unsigned long len, memp = 0; - unsigned long memp_size, extra, rlim; - unsigned long *reloc = 0, *rp; + struct flat_hdr *hdr; + unsigned long textpos, datapos, realdatastart; + unsigned long text_len, data_len, bss_len, stack_len, full_data, flags; + unsigned long len, memp, memp_size, extra, rlim; + unsigned long __user *reloc, *rp; struct inode *inode; - int i, rev, relocs = 0; + int i, rev, relocs; loff_t fpos; unsigned long start_code, end_code; + ssize_t result; int ret; hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ @@ -469,20 +460,30 @@ static int load_flat_file(struct linux_binprm * bprm, } if (flags & FLAT_FLAG_KTRACE) - printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); + pr_info("Loading file: %s\n", bprm->filename); if (rev != 
FLAT_VERSION && rev != OLD_FLAT_VERSION) { - printk("BINFMT_FLAT: bad flat file version 0x%x (supported " - "0x%lx and 0x%lx)\n", - rev, FLAT_VERSION, OLD_FLAT_VERSION); + pr_err("bad flat file version 0x%x (supported 0x%lx and 0x%lx)\n", + rev, FLAT_VERSION, OLD_FLAT_VERSION); ret = -ENOEXEC; goto err; } - + /* Don't allow old format executables to use shared libraries */ if (rev == OLD_FLAT_VERSION && id != 0) { - printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n", - (int) FLAT_VERSION); + pr_err("shared libraries are not available before rev 0x%lx\n", + FLAT_VERSION); + ret = -ENOEXEC; + goto err; + } + + /* + * Make sure the header params are sane. + * 28 bits (256 MB) is way more than reasonable in this case. + * If some top bits are set we have probable binary corruption. + */ + if ((text_len | data_len | bss_len | stack_len | full_data) >> 28) { + pr_err("bad header\n"); ret = -ENOEXEC; goto err; } @@ -496,7 +497,7 @@ static int load_flat_file(struct linux_binprm * bprm, #ifndef CONFIG_BINFMT_ZFLAT if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) { - printk("Support for ZFLAT executables is not enabled.\n"); + pr_err("Support for ZFLAT executables is not enabled.\n"); ret = -ENOEXEC; goto err; } @@ -517,11 +518,9 @@ static int load_flat_file(struct linux_binprm * bprm, /* Flush all traces of the currently running executable */ if (id == 0) { - result = flush_old_exec(bprm); - if (result) { - ret = result; + ret = flush_old_exec(bprm); + if (ret) goto err; - } /* OK, This is the point of no return */ set_personality(PER_LINUX_32BIT); @@ -539,48 +538,48 @@ static int load_flat_file(struct linux_binprm * bprm, * case, and then the fully copied to RAM case which lumps * it all together. 
*/ - if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) { + if (!IS_ENABLED(CONFIG_MMU) && !(flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) { /* * this should give us a ROM ptr, but if it doesn't we don't * really care */ - DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); + pr_debug("ROM mapping of file (we hope)\n"); textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_EXECUTABLE, 0); if (!textpos || IS_ERR_VALUE(textpos)) { - if (!textpos) - textpos = (unsigned long) -ENOMEM; - printk("Unable to mmap process text, errno %d\n", (int)-textpos); ret = textpos; + if (!textpos) + ret = -ENOMEM; + pr_err("Unable to mmap process text, errno %d\n", ret); goto err; } len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - realdatastart = vm_mmap(0, 0, len, + realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { + ret = realdatastart; if (!realdatastart) - realdatastart = (unsigned long) -ENOMEM; - printk("Unable to allocate RAM for process data, errno %d\n", - (int)-realdatastart); + ret = -ENOMEM; + pr_err("Unable to allocate RAM for process data, " + "errno %d\n", ret); vm_munmap(textpos, text_len); - ret = realdatastart; goto err; } datapos = ALIGN(realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long), FLAT_DATA_ALIGN); - DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n", - (int)(data_len + bss_len + stack_len), (int)datapos); + pr_debug("Allocated data+bss+stack (%ld bytes): %lx\n", + data_len + bss_len + stack_len, datapos); fpos = ntohl(hdr->data_start); #ifdef CONFIG_BINFMT_ZFLAT if (flags & FLAT_FLAG_GZDATA) { - result = decompress_exec(bprm, fpos, (char *) datapos, + result = decompress_exec(bprm, fpos, (char *)datapos, full_data, 0); } else #endif @@ -589,29 +588,30 @@ static int load_flat_file(struct linux_binprm * bprm, full_data); } if (IS_ERR_VALUE(result)) { - printk("Unable to read 
data+bss, errno %d\n", (int)-result); + ret = result; + pr_err("Unable to read data+bss, errno %d\n", ret); vm_munmap(textpos, text_len); vm_munmap(realdatastart, len); - ret = result; goto err; } - reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); + reloc = (unsigned long __user *) + (datapos + (ntohl(hdr->reloc_start) - text_len)); memp = realdatastart; memp_size = len; } else { len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - textpos = vm_mmap(0, 0, len, + textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); if (!textpos || IS_ERR_VALUE(textpos)) { - if (!textpos) - textpos = (unsigned long) -ENOMEM; - printk("Unable to allocate RAM for process text/data, errno %d\n", - (int)-textpos); ret = textpos; + if (!textpos) + ret = -ENOMEM; + pr_err("Unable to allocate RAM for process text/data, " + "errno %d\n", ret); goto err; } @@ -620,7 +620,7 @@ static int load_flat_file(struct linux_binprm * bprm, MAX_SHARED_LIBS * sizeof(unsigned long), FLAT_DATA_ALIGN); - reloc = (unsigned long *) + reloc = (unsigned long __user *) (datapos + (ntohl(hdr->reloc_start) - text_len)); memp = textpos; memp_size = len; @@ -629,21 +629,59 @@ static int load_flat_file(struct linux_binprm * bprm, * load it all in and treat it like a RAM load from now on */ if (flags & FLAT_FLAG_GZIP) { - result = decompress_exec(bprm, sizeof (struct flat_hdr), - (((char *) textpos) + sizeof (struct flat_hdr)), +#ifndef CONFIG_MMU + result = decompress_exec(bprm, sizeof(struct flat_hdr), + (((char *)textpos) + sizeof(struct flat_hdr)), (text_len + full_data - - sizeof (struct flat_hdr)), + - sizeof(struct flat_hdr)), 0); memmove((void *) datapos, (void *) realdatastart, full_data); +#else + /* + * This is used on MMU systems mainly for testing. + * Let's use a kernel buffer to simplify things. 
+ */ + long unz_text_len = text_len - sizeof(struct flat_hdr); + long unz_len = unz_text_len + full_data; + char *unz_data = vmalloc(unz_len); + if (!unz_data) { + result = -ENOMEM; + } else { + result = decompress_exec(bprm, sizeof(struct flat_hdr), + unz_data, unz_len, 0); + if (result == 0 && + (copy_to_user((void __user *)textpos + sizeof(struct flat_hdr), + unz_data, unz_text_len) || + copy_to_user((void __user *)datapos, + unz_data + unz_text_len, full_data))) + result = -EFAULT; + vfree(unz_data); + } +#endif } else if (flags & FLAT_FLAG_GZDATA) { result = read_code(bprm->file, textpos, 0, text_len); - if (!IS_ERR_VALUE(result)) + if (!IS_ERR_VALUE(result)) { +#ifndef CONFIG_MMU result = decompress_exec(bprm, text_len, (char *) datapos, full_data, 0); - } - else +#else + char *unz_data = vmalloc(full_data); + if (!unz_data) { + result = -ENOMEM; + } else { + result = decompress_exec(bprm, text_len, + unz_data, full_data, 0); + if (result == 0 && + copy_to_user((void __user *)datapos, + unz_data, full_data)) + result = -EFAULT; + vfree(unz_data); + } #endif + } + } else +#endif /* CONFIG_BINFMT_ZFLAT */ { result = read_code(bprm->file, textpos, 0, text_len); if (!IS_ERR_VALUE(result)) @@ -652,21 +690,19 @@ static int load_flat_file(struct linux_binprm * bprm, full_data); } if (IS_ERR_VALUE(result)) { - printk("Unable to read code+data+bss, errno %d\n",(int)-result); + ret = result; + pr_err("Unable to read code+data+bss, errno %d\n", ret); vm_munmap(textpos, text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long)); - ret = result; goto err; } } - if (flags & FLAT_FLAG_KTRACE) - printk("Mapping is %x, Entry point is %x, data_start is %x\n", - (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); + start_code = textpos + sizeof(struct flat_hdr); + end_code = textpos + text_len; + text_len -= sizeof(struct flat_hdr); /* the real code len */ /* The main program needs a little extra setup in the task structure */ - start_code = textpos 
+ sizeof (struct flat_hdr); - end_code = textpos + text_len; if (id == 0) { current->mm->start_code = start_code; current->mm->end_code = end_code; @@ -681,19 +717,19 @@ static int load_flat_file(struct linux_binprm * bprm, */ current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; +#ifndef CONFIG_MMU current->mm->context.end_brk = memp + memp_size - stack_len; +#endif } - if (flags & FLAT_FLAG_KTRACE) - printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n", + if (flags & FLAT_FLAG_KTRACE) { + pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n", + textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); + pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n", id ? "Lib" : "Load", bprm->filename, - (int) start_code, (int) end_code, - (int) datapos, - (int) (datapos + data_len), - (int) (datapos + data_len), - (int) (((datapos + data_len + bss_len) + 3) & ~3)); - - text_len -= sizeof(struct flat_hdr); /* the real code len */ + start_code, end_code, datapos, datapos + data_len, + datapos + data_len, (datapos + data_len + bss_len + 3) & ~3); + } /* Store the current module values into the global library structure */ libinfo->lib_list[id].start_code = start_code; @@ -703,7 +739,7 @@ static int load_flat_file(struct linux_binprm * bprm, libinfo->lib_list[id].loaded = 1; libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; libinfo->lib_list[id].build_date = ntohl(hdr->build_date); - + /* * We just load the allocations into some temporary memory to * help simplify all this mumbo jumbo @@ -717,15 +753,20 @@ static int load_flat_file(struct linux_binprm * bprm, * image. 
*/ if (flags & FLAT_FLAG_GOTPIC) { - for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) { - unsigned long addr; - if (*rp) { - addr = calc_reloc(*rp, libinfo, id, 0); + for (rp = (unsigned long __user *)datapos; ; rp++) { + unsigned long addr, rp_val; + if (get_user(rp_val, rp)) + return -EFAULT; + if (rp_val == 0xffffffff) + break; + if (rp_val) { + addr = calc_reloc(rp_val, libinfo, id, 0); if (addr == RELOC_FAILED) { ret = -ENOEXEC; goto err; } - *rp = addr; + if (put_user(addr, rp)) + return -EFAULT; } } } @@ -742,19 +783,23 @@ static int load_flat_file(struct linux_binprm * bprm, * __start to address 4 so that is okay). */ if (rev > OLD_FLAT_VERSION) { - unsigned long persistent = 0; - for (i=0; i < relocs; i++) { + unsigned long __maybe_unused persistent = 0; + for (i = 0; i < relocs; i++) { unsigned long addr, relval; - /* Get the address of the pointer to be - relocated (of course, the address has to be - relocated first). */ - relval = ntohl(reloc[i]); - if (flat_set_persistent (relval, &persistent)) + /* + * Get the address of the pointer to be + * relocated (of course, the address has to be + * relocated first). 
+ */ + if (get_user(relval, reloc + i)) + return -EFAULT; + relval = ntohl(relval); + if (flat_set_persistent(relval, &persistent)) continue; addr = flat_get_relocate_addr(relval); - rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1); - if (rp == (unsigned long *)RELOC_FAILED) { + rp = (unsigned long __user *)calc_reloc(addr, libinfo, id, 1); + if (rp == (unsigned long __user *)RELOC_FAILED) { ret = -ENOEXEC; goto err; } @@ -780,17 +825,23 @@ static int load_flat_file(struct linux_binprm * bprm, } } } else { - for (i=0; i < relocs; i++) - old_reloc(ntohl(reloc[i])); + for (i = 0; i < relocs; i++) { + unsigned long relval; + if (get_user(relval, reloc + i)) + return -EFAULT; + relval = ntohl(relval); + old_reloc(relval); + } } - + flush_icache_range(start_code, end_code); /* zero the BSS, BRK and stack areas */ - memset((void*)(datapos + data_len), 0, bss_len + - (memp + memp_size - stack_len - /* end brk */ - libinfo->lib_list[id].start_brk) + /* start brk */ - stack_len); + if (clear_user((void __user *)(datapos + data_len), bss_len + + (memp + memp_size - stack_len - /* end brk */ + libinfo->lib_list[id].start_brk) + /* start brk */ + stack_len)) + return -EFAULT; return 0; err: @@ -846,7 +897,7 @@ out: allow_write_access(bprm.file); fput(bprm.file); - return(res); + return res; } #endif /* CONFIG_BINFMT_SHARED_FLAT */ @@ -857,18 +908,17 @@ out: * libraries. There is no binary dependent code anywhere else. 
*/ -static int load_flat_binary(struct linux_binprm * bprm) +static int load_flat_binary(struct linux_binprm *bprm) { struct lib_info libinfo; struct pt_regs *regs = current_pt_regs(); - unsigned long p = bprm->p; - unsigned long stack_len; + unsigned long stack_len = 0; unsigned long start_addr; - unsigned long *sp; int res; int i, j; memset(&libinfo, 0, sizeof(libinfo)); + /* * We have to add the size of our arguments to our stack size * otherwise it's too easy for users to create stack overflows @@ -876,38 +926,54 @@ static int load_flat_binary(struct linux_binprm * bprm) * pedantic and include space for the argv/envp array as it may have * a lot of entries. */ -#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *)) - stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ - stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ - stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ - stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ - +#ifndef CONFIG_MMU + stack_len += PAGE_SIZE * MAX_ARG_PAGES - bprm->p; /* the strings */ +#endif + stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ + stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ + stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN); + res = load_flat_file(bprm, &libinfo, 0, &stack_len); if (res < 0) return res; - + /* Update data segment pointers for all libraries */ - for (i=0; i<MAX_SHARED_LIBS; i++) - if (libinfo.lib_list[i].loaded) - for (j=0; j<MAX_SHARED_LIBS; j++) - (-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] = - (libinfo.lib_list[j].loaded)? - libinfo.lib_list[j].start_data:UNLOADED_LIB; + for (i = 0; i < MAX_SHARED_LIBS; i++) { + if (!libinfo.lib_list[i].loaded) + continue; + for (j = 0; j < MAX_SHARED_LIBS; j++) { + unsigned long val = libinfo.lib_list[j].loaded ? 
+ libinfo.lib_list[j].start_data : UNLOADED_LIB; + unsigned long __user *p = (unsigned long __user *) + libinfo.lib_list[i].start_data; + p -= j + 1; + if (put_user(val, p)) + return -EFAULT; + } + } install_exec_creds(bprm); set_binfmt(&flat_format); - p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4; - DBG_FLT("p=%x\n", (int)p); +#ifdef CONFIG_MMU + res = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (!res) + res = create_flat_tables(bprm, bprm->p); +#else + /* Stash our initial stack pointer into the mm structure */ + current->mm->start_stack = + ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4; + pr_debug("sp=%lx\n", current->mm->start_stack); - /* copy the arg pages onto the stack, this could be more efficient :-) */ - for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--) - * (char *) --p = - ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE]; + /* copy the arg pages onto the stack */ + res = transfer_args_to_stack(bprm, &current->mm->start_stack); + if (!res) + res = create_flat_tables(bprm, current->mm->start_stack); +#endif + if (res) + return res; - sp = (unsigned long *) create_flat_tables(p, bprm); - /* Fake some return addresses to ensure the call chain will * initialise library in order for us. We are required to call * lib 1 first, then 2, ... and finally the main program (id 0). 
@@ -915,24 +981,24 @@ static int load_flat_binary(struct linux_binprm * bprm) start_addr = libinfo.lib_list[0].entry; #ifdef CONFIG_BINFMT_SHARED_FLAT - for (i = MAX_SHARED_LIBS-1; i>0; i--) { + for (i = MAX_SHARED_LIBS-1; i > 0; i--) { if (libinfo.lib_list[i].loaded) { /* Push previos first to call address */ - --sp; put_user(start_addr, sp); + unsigned long __user *sp; + current->mm->start_stack -= sizeof(unsigned long); + sp = (unsigned long __user *)current->mm->start_stack; + __put_user(start_addr, sp); start_addr = libinfo.lib_list[i].entry; } } #endif - - /* Stash our initial stack pointer into the mm structure */ - current->mm->start_stack = (unsigned long )sp; #ifdef FLAT_PLAT_INIT FLAT_PLAT_INIT(regs); #endif - DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", - (int)regs, (int)start_addr, (int)current->mm->start_stack); - + + pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n", + regs, start_addr, current->mm->start_stack); start_thread(regs, start_addr, current->mm->start_stack); return 0; @@ -945,9 +1011,6 @@ static int __init init_flat_binfmt(void) register_binfmt(&flat_format); return 0; } - -/****************************************************************************/ - core_initcall(init_flat_binfmt); /****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 5417516f6e59..6103a6362ccd 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -26,6 +26,8 @@ #include <linux/fs.h> #include <linux/uaccess.h> +#include "internal.h" + #ifdef DEBUG # define USE_DEBUG 1 #else @@ -43,6 +45,7 @@ enum {Enabled, Magic}; #define MISC_FMT_PRESERVE_ARGV0 (1 << 31) #define MISC_FMT_OPEN_BINARY (1 << 30) #define MISC_FMT_CREDENTIALS (1 << 29) +#define MISC_FMT_OPEN_FILE (1 << 28) typedef struct { struct list_head list; @@ -54,6 +57,7 @@ typedef struct { char *interpreter; /* filename of interpreter */ char *name; struct dentry *dentry; + struct file 
*interp_file; } Node; static DEFINE_RWLOCK(entries_lock); @@ -201,7 +205,13 @@ static int load_misc_binary(struct linux_binprm *bprm) if (retval < 0) goto error; - interp_file = open_exec(iname); + if (fmt->flags & MISC_FMT_OPEN_FILE && fmt->interp_file) { + interp_file = filp_clone_open(fmt->interp_file); + if (!IS_ERR(interp_file)) + deny_write_access(interp_file); + } else { + interp_file = open_exec(iname); + } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) goto error; @@ -285,6 +295,11 @@ static char *check_special_flags(char *sfs, Node *e) e->flags |= (MISC_FMT_CREDENTIALS | MISC_FMT_OPEN_BINARY); break; + case 'F': + pr_debug("register: flag: F: open interpreter file now\n"); + p++; + e->flags |= MISC_FMT_OPEN_FILE; + break; default: cont = 0; } @@ -543,6 +558,8 @@ static void entry_status(Node *e, char *page) *dp++ = 'O'; if (e->flags & MISC_FMT_CREDENTIALS) *dp++ = 'C'; + if (e->flags & MISC_FMT_OPEN_FILE) + *dp++ = 'F'; *dp++ = '\n'; if (!test_bit(Magic, &e->flags)) { @@ -590,6 +607,11 @@ static void kill_node(Node *e) } write_unlock(&entries_lock); + if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) { + filp_close(e->interp_file, NULL); + e->interp_file = NULL; + } + if (dentry) { drop_nlink(d_inode(dentry)); d_drop(dentry); @@ -696,6 +718,21 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, goto out2; } + if (e->flags & MISC_FMT_OPEN_FILE) { + struct file *f; + + f = open_exec(e->interpreter); + if (IS_ERR(f)) { + err = PTR_ERR(f); + pr_notice("register: failed to install interpreter file %s\n", e->interpreter); + simple_release_fs(&bm_mnt, &entry_count); + iput(inode); + inode = NULL; + goto out2; + } + e->interp_file = f; + } + e->dentry = dget(dentry); inode->i_private = e; inode->i_fop = &bm_entry_operations; @@ -713,7 +750,7 @@ out: if (err) { kfree(e); - return -EINVAL; + return err; } return count; } diff --git a/fs/block_dev.c b/fs/block_dev.c index ada42cf42d06..c3cdde87cc8c 100644 --- 
a/fs/block_dev.c +++ b/fs/block_dev.c @@ -416,7 +416,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); blk_queue_exit(bdev->bd_queue); return result; } @@ -445,7 +445,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, struct page *page, struct writeback_control *wbc) { int result; - int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; if (!ops->rw_page || bdev_get_integrity(bdev)) @@ -455,7 +454,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, return result; set_page_writeback(page); - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true); if (result) end_page_writeback(page); else @@ -1275,11 +1274,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && - blk_queue_dax(disk->queue)) - bdev->bd_inode->i_flags = S_DAX; - else - bdev->bd_inode->i_flags = 0; + bdev->bd_inode->i_flags = 0; if (!partno) { ret = -ENXIO; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 67a607709d4f..53bb7af4e5f0 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -55,8 +55,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) } if (size > 0) { acl = posix_acl_from_xattr(&init_user_ns, value, size); - } else if (size == -ENOENT || size == -ENODATA || size == 0) { - /* FIXME, who returns -ENOENT? 
I think nobody */ + } else if (size == -ERANGE || size == -ENODATA || size == 0) { acl = NULL; } else { acl = ERR_PTR(-EIO); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5fb60ea7eee2..e0f071f6b5a7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -34,6 +34,10 @@ struct __btrfs_workqueue { struct workqueue_struct *normal_wq; + + /* File system this workqueue services */ + struct btrfs_fs_info *fs_info; + /* List head pointing to ordered work list */ struct list_head ordered_list; @@ -70,6 +74,18 @@ void btrfs_##name(struct work_struct *arg) \ normal_work_helper(work); \ } +struct btrfs_fs_info * +btrfs_workqueue_owner(struct __btrfs_workqueue *wq) +{ + return wq->fs_info; +} + +struct btrfs_fs_info * +btrfs_work_owner(struct btrfs_work *work) +{ + return work->wq->fs_info; +} + BTRFS_WORK_HELPER(worker_helper); BTRFS_WORK_HELPER(delalloc_helper); BTRFS_WORK_HELPER(flush_delalloc_helper); @@ -94,14 +110,15 @@ BTRFS_WORK_HELPER(scrubnc_helper); BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, - int thresh) +__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags, int limit_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; + ret->fs_info = fs_info; ret->limit_active = limit_active; atomic_set(&ret->pending, 0); if (thresh == 0) @@ -143,7 +160,8 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); -struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, int limit_active, int thresh) @@ -153,7 +171,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, if (!ret) return NULL; - 
ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + ret->normal = __btrfs_alloc_workqueue(fs_info, name, + flags & ~WQ_HIGHPRI, limit_active, thresh); if (!ret->normal) { kfree(ret); @@ -161,8 +180,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, } if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(name, flags, limit_active, - thresh); + ret->high = __btrfs_alloc_workqueue(fs_info, name, flags, + limit_active, thresh); if (!ret->high) { __btrfs_destroy_workqueue(ret->normal); kfree(ret); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ad4d0647d1a6..8e52484cd461 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -21,6 +21,7 @@ #define __BTRFS_ASYNC_THREAD_ #include <linux/workqueue.h> +struct btrfs_fs_info; struct btrfs_workqueue; /* Internal use only */ struct __btrfs_workqueue; @@ -67,7 +68,8 @@ BTRFS_WORK_HELPER_PROTO(scrubnc_helper); BTRFS_WORK_HELPER_PROTO(scrubparity_helper); -struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, int limit_active, int thresh); @@ -80,4 +82,6 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); +struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work); +struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8bb3509099e8..2b88439c2ee8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -139,7 +139,7 @@ int __init btrfs_prelim_ref_init(void) btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", sizeof(struct __prelim_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_prelim_ref_cache) return 
-ENOMEM; @@ -361,7 +361,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (btrfs_test_is_dummy_root(root)) { + if (btrfs_is_testing(fs_info)) { srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -ENOENT; goto out; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 5d5cae05818d..66789471b49d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2945,7 +2945,7 @@ static void __btrfsic_submit_bio(struct bio *bio) printk(KERN_INFO "submit_bio(rw=%d,0x%x, bi_vcnt=%u," " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_rw, bio->bi_vcnt, + bio_op(bio), bio->bi_opf, bio->bi_vcnt, (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); @@ -2976,18 +2976,18 @@ static void __btrfsic_submit_bio(struct bio *bio) btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, bio->bi_vcnt, bio, &bio_is_patched, - NULL, bio->bi_rw); + NULL, bio->bi_opf); while (i > 0) { i--; kunmap(bio->bi_io_vec[i].bv_page); } kfree(mapped_datav); - } else if (NULL != dev_state && (bio->bi_rw & REQ_PREFLUSH)) { + } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO "submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_rw, bio->bi_bdev); + bio_op(bio), bio->bi_opf, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | @@ -3005,7 +3005,7 @@ static void __btrfsic_submit_bio(struct bio *bio) block->never_written = 0; block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = bio->bi_rw; + block->submit_bio_bh_rw = bio->bi_opf; block->orig_bio_bh_private = bio->bi_private; block->orig_bio_bh_end_io.bio = bio->bi_end_io; block->next_in_same_bio = NULL; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 
cefedabf0a92..029db6e1105c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -403,7 +403,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } ret = btrfs_map_bio(root, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } bio_put(bio); @@ -434,7 +437,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } ret = btrfs_map_bio(root, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } bio_put(bio); return 0; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a85cf7d23309..d1c56c94dd5a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1153,14 +1153,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -1198,7 +1198,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(root->fs_info, buf); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -1505,7 +1505,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; /* ensure we can see the force_cow */ @@ -1771,6 +1771,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb, unsigned long map_len = 0; int err; + if (low > high) { + btrfs_err(eb->fs_info, + "%s: low (%d) > high (%d) eb %llu owner %llu level %d", + __func__, low, high, eb->start, + btrfs_header_owner(eb), 
btrfs_header_level(eb)); + return -EINVAL; + } + while (low < high) { mid = (low + high) / 2; offset = p + mid * item_size; @@ -1858,7 +1866,6 @@ static void root_sub_used(struct btrfs_root *root, u32 size) /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). - * NULL is returned on error. */ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) @@ -1866,19 +1873,16 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, int level = btrfs_header_level(parent); struct extent_buffer *eb; - if (slot < 0) - return NULL; - if (slot >= btrfs_header_nritems(parent)) - return NULL; + if (slot < 0 || slot >= btrfs_header_nritems(parent)) + return ERR_PTR(-ENOENT); BUG_ON(level == 0); eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { - if (!IS_ERR(eb)) - free_extent_buffer(eb); - eb = NULL; + if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + eb = ERR_PTR(-EIO); } return eb; @@ -1931,8 +1935,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - if (!child) { - ret = -EROFS; + if (IS_ERR(child)) { + ret = PTR_ERR(child); btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } @@ -1970,6 +1974,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; left = read_node_slot(root, parent, pslot - 1); + if (IS_ERR(left)) + left = NULL; + if (left) { btrfs_tree_lock(left); btrfs_set_lock_blocking(left); @@ -1980,7 +1987,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } } + right = read_node_slot(root, parent, pslot + 1); + if (IS_ERR(right)) + right = NULL; + if (right) { btrfs_tree_lock(right); 
btrfs_set_lock_blocking(right); @@ -2135,6 +2146,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return 1; left = read_node_slot(root, parent, pslot - 1); + if (IS_ERR(left)) + left = NULL; /* first, try to make some room in the middle buffer */ if (left) { @@ -2185,6 +2198,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, free_extent_buffer(left); } right = read_node_slot(root, parent, pslot + 1); + if (IS_ERR(right)) + right = NULL; /* * then try to empty the right most buffer into the middle @@ -3240,7 +3255,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, push_items); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(dst, src, @@ -3315,7 +3330,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, src_nritems - push_items, push_items); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(dst, src, @@ -3519,7 +3534,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(split, c, @@ -3773,7 +3788,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); - if (right == NULL) + /* + * slot + 1 is not valid or we fail to read the right node, + * no big deal, just return. 
+ */ + if (IS_ERR(right)) return 1; btrfs_tree_lock(right); @@ -4003,7 +4022,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); - if (left == NULL) + /* + * slot - 1 is not valid or we fail to read the left node, + * no big deal, just return. + */ + if (IS_ERR(left)) return 1; btrfs_tree_lock(left); @@ -5210,7 +5233,10 @@ find_next_key: } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); /* -ENOMEM */ + if (IS_ERR(cur)) { + ret = PTR_ERR(cur); + goto out; + } btrfs_tree_read_lock(cur); @@ -5229,15 +5255,21 @@ out: return ret; } -static void tree_move_down(struct btrfs_root *root, +static int tree_move_down(struct btrfs_root *root, struct btrfs_path *path, int *level, int root_level) { + struct extent_buffer *eb; + BUG_ON(*level == 0); - path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], - path->slots[*level]); + eb = read_node_slot(root, path->nodes[*level], path->slots[*level]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + + path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; + return 0; } static int tree_move_next_or_upnext(struct btrfs_root *root, @@ -5282,8 +5314,7 @@ static int tree_advance(struct btrfs_root *root, if (*level == 0 || !allow_down) { ret = tree_move_next_or_upnext(root, path, level, root_level); } else { - tree_move_down(root, path, level, root_level); - ret = 0; + ret = tree_move_down(root, path, level, root_level); } if (ret >= 0) { if (*level == 0) @@ -5457,8 +5488,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, left_root_level, advance_left != ADVANCE_ONLY_NEXT, &left_key); - if (ret < 0) + if (ret == -1) left_end_reached = ADVANCE; + else if (ret < 0) + goto out; advance_left = 0; } if (advance_right && !right_end_reached) { @@ -5466,8 +5499,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_root_level, advance_right 
!= ADVANCE_ONLY_NEXT, &right_key); - if (ret < 0) + if (ret == -1) right_end_reached = ADVANCE; + else if (ret < 0) + goto out; advance_right = 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 443fcc402114..2fe8f89091a3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -117,6 +117,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_REMOUNTING 1 #define BTRFS_FS_STATE_TRANS_ABORTED 2 #define BTRFS_FS_STATE_DEV_REPLACING 3 +#define BTRFS_FS_STATE_DUMMY_FS_INFO 4 #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 @@ -144,21 +145,6 @@ struct btrfs_header { u8 level; } __attribute__ ((__packed__)); -#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ - sizeof(struct btrfs_header)) / \ - sizeof(struct btrfs_key_ptr)) -#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) -#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize)) -#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ - (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) - \ - BTRFS_FILE_EXTENT_INLINE_DATA_START) -#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) -\ - sizeof(struct btrfs_dir_item)) - - /* * this is a very generous portion of the super block, giving us * room to translate 14 chunks with 3 stripes each. 
@@ -1114,12 +1100,11 @@ struct btrfs_subvolume_writers { #define BTRFS_ROOT_REF_COWS 1 #define BTRFS_ROOT_TRACK_DIRTY 2 #define BTRFS_ROOT_IN_RADIX 3 -#define BTRFS_ROOT_DUMMY_ROOT 4 -#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 -#define BTRFS_ROOT_DEFRAG_RUNNING 6 -#define BTRFS_ROOT_FORCE_COW 7 -#define BTRFS_ROOT_MULTI_LOG_TASKS 8 -#define BTRFS_ROOT_DIRTY 9 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4 +#define BTRFS_ROOT_DEFRAG_RUNNING 5 +#define BTRFS_ROOT_FORCE_COW 6 +#define BTRFS_ROOT_MULTI_LOG_TASKS 7 +#define BTRFS_ROOT_DIRTY 8 /* * in ram representation of the tree. extent_root is used for all allocations @@ -1181,8 +1166,10 @@ struct btrfs_root { u64 highest_objectid; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ u64 alloc_bytenr; +#endif u64 defrag_trans_start; struct btrfs_key defrag_progress; @@ -1259,6 +1246,39 @@ struct btrfs_root { atomic_t qgroup_meta_rsv; }; +static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize) +{ + return blocksize - sizeof(struct btrfs_header); +} + +static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root) +{ + return __BTRFS_LEAF_DATA_SIZE(root->nodesize); +} + +static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root) +{ + return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); +} + +static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root) +{ + return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr); +} + +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root) +{ + return BTRFS_MAX_ITEM_SIZE(root) - + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root) +{ + return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item); +} + /* * Flags for mount options. 
* @@ -1299,21 +1319,21 @@ struct btrfs_root { #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) -#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -#define btrfs_set_and_info(root, opt, fmt, args...) \ +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ { \ - if (!btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_set_opt(root->fs_info->mount_opt, opt); \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ } -#define btrfs_clear_and_info(root, opt, fmt, args...) \ +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ { \ - if (btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_clear_opt(root->fs_info->mount_opt, opt); \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ } #ifdef CONFIG_BTRFS_DEBUG @@ -1321,9 +1341,9 @@ static inline int btrfs_should_fragment_free_space(struct btrfs_root *root, struct btrfs_block_group_cache *block_group) { - return (btrfs_test_opt(root, FRAGMENT_METADATA) && + return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) && block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(root, FRAGMENT_DATA) && + (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) && block_group->flags & BTRFS_BLOCK_GROUP_DATA); } #endif @@ -2886,9 +2906,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, u64 root_id, u64 ref_id, u64 
dirid, u64 sequence, @@ -3362,23 +3379,23 @@ const char *btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, + const char *function, unsigned int line, int errno); /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. */ -#define btrfs_abort_transaction(trans, root, errno) \ +#define btrfs_abort_transaction(trans, errno) \ do { \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((root)->fs_info->fs_state))) { \ + &((trans)->fs_info->fs_state))) { \ WARN(1, KERN_DEBUG \ "BTRFS: Transaction aborted (error %d)\n", \ (errno)); \ } \ - __btrfs_abort_transaction((trans), (root), __func__, \ + __btrfs_abort_transaction((trans), __func__, \ __LINE__, (errno)); \ } while (0) @@ -3610,13 +3627,13 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) void btrfs_test_destroy_inode(struct inode *inode); #endif -static inline int btrfs_test_is_dummy_root(struct btrfs_root *root) +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) return 1; #endif return 0; } - #endif diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h new file mode 100644 index 000000000000..83ebfe28da9e --- /dev/null +++ b/fs/btrfs/dedupe.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2016 Fujitsu. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_DEDUPE__ +#define __BTRFS_DEDUPE__ + +/* later in-band dedupe will expand this struct */ +struct btrfs_dedupe_hash; +#endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index dd3c040139a2..3eeb9cd8cfa5 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -34,7 +34,7 @@ int __init btrfs_delayed_inode_init(void) delayed_node_cache = kmem_cache_create("btrfs_delayed_node", sizeof(struct btrfs_delayed_node), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!delayed_node_cache) return -ENOMEM; @@ -1170,7 +1170,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, if (ret) { btrfs_release_delayed_node(curr_node); curr_node = NULL; - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 430b3689b112..d9ddcfc18c91 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -606,7 +606,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; - qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, + qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, + delayed_refs, qrecord); if (qexisting) kfree(qrecord); @@ -615,7 +616,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); - trace_add_delayed_ref_head(ref, head_ref, action); + 
trace_add_delayed_ref_head(fs_info, ref, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); @@ -682,7 +683,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->type = BTRFS_TREE_BLOCK_REF_KEY; full_ref->level = level; - trace_add_delayed_tree_ref(ref, full_ref, action); + trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); @@ -739,7 +740,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, full_ref->objectid = owner; full_ref->offset = offset; - trace_add_delayed_data_ref(ref, full_ref, action); + trace_add_delayed_data_ref(fs_info, ref, full_ref, action); ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); @@ -861,33 +862,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 ref_root, u64 bytenr, u64 num_bytes) -{ - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_head *ref_head; - int ret = 0; - - if (!fs_info->quota_enabled || !is_fstree(ref_root)) - return 0; - - delayed_refs = &trans->transaction->delayed_refs; - - spin_lock(&delayed_refs->lock); - ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0); - if (!ref_head) { - ret = -ENOENT; - goto out; - } - WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root); - ref_head->qgroup_ref_root = ref_root; - ref_head->qgroup_reserved = num_bytes; -out: - spin_unlock(&delayed_refs->lock); - return ret; -} - int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -940,28 +914,28 @@ int btrfs_delayed_ref_init(void) btrfs_delayed_ref_head_cachep = kmem_cache_create( "btrfs_delayed_ref_head", sizeof(struct btrfs_delayed_ref_head), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if 
(!btrfs_delayed_ref_head_cachep) goto fail; btrfs_delayed_tree_ref_cachep = kmem_cache_create( "btrfs_delayed_tree_ref", sizeof(struct btrfs_delayed_tree_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_tree_ref_cachep) goto fail; btrfs_delayed_data_ref_cachep = kmem_cache_create( "btrfs_delayed_data_ref", sizeof(struct btrfs_delayed_data_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_data_ref_cachep) goto fail; btrfs_delayed_extent_op_cachep = kmem_cache_create( "btrfs_delayed_extent_op", sizeof(struct btrfs_delayed_extent_op), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_extent_op_cachep) goto fail; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 5fca9534a271..43f3629760e9 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -250,9 +250,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 parent, u64 ref_root, u64 owner, u64 offset, u64 reserved, int action, struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 ref_root, u64 bytenr, u64 num_bytes); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 63ef9cdf0144..e9bbff3c0029 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -142,7 +142,7 @@ no_valid_dev_replace_entry_found: * missing */ if (!dev_replace->srcdev && - !btrfs_test_opt(dev_root, DEGRADED)) { + !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -151,7 +151,7 @@ no_valid_dev_replace_entry_found: src_devid); } if (!dev_replace->tgtdev && - !btrfs_test_opt(dev_root, DEGRADED)) { + 
!btrfs_test_opt(dev_root->fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9a726ded2c6d..59febfb8d04a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -101,7 +101,7 @@ int __init btrfs_end_io_wq_init(void) btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", sizeof(struct btrfs_end_io_wq), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_end_io_wq_cache) return -ENOMEM; @@ -870,7 +870,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (bio->bi_rw & REQ_SYNC) + if (bio->bi_opf & REQ_SYNC) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); @@ -1140,7 +1140,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return alloc_test_extent_buffer(root->fs_info, bytenr, root->nodesize); return alloc_extent_buffer(root->fs_info, bytenr); @@ -1227,6 +1227,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { + bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); root->node = NULL; root->commit_root = NULL; root->sectorsize = sectorsize; @@ -1281,14 +1282,14 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; - if (fs_info) + if (!dummy) extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, 
sizeof(root->defrag_progress)); - if (fs_info) + if (!dummy) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; @@ -1309,17 +1310,20 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ -struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize) +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize) { struct btrfs_root *root; - root = btrfs_alloc_root(NULL, GFP_KERNEL); + if (!fs_info) + return ERR_PTR(-EINVAL); + + root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); /* We don't use the stripesize in selftest, set it as sectorsize */ - __setup_root(nodesize, sectorsize, sectorsize, root, NULL, + __setup_root(nodesize, sectorsize, sectorsize, root, fs_info, BTRFS_ROOT_TREE_OBJECTID); - set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); root->alloc_bytenr = 0; return root; @@ -1594,14 +1598,14 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) - goto free_writers; + goto fail; mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, &root->highest_objectid); if (ret) { mutex_unlock(&root->objectid_mutex); - goto free_root_dev; + goto fail; } ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1609,14 +1613,8 @@ int btrfs_init_fs_root(struct btrfs_root *root) mutex_unlock(&root->objectid_mutex); return 0; - -free_root_dev: - free_anon_bdev(root->anon_dev); -free_writers: - btrfs_free_subvolume_writers(root->subv_writers); fail: - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); + /* the caller is responsible to call free_fs_root */ return ret; } @@ -2310,17 +2308,19 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - 
btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, - max_active, 16); + btrfs_alloc_workqueue(fs_info, "worker", + flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = - btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "delalloc", + flags, max_active, 2); fs_info->flush_workers = - btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "flush_delalloc", + flags, max_active, 0); fs_info->caching_workers = - btrfs_alloc_workqueue("cache", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); /* * a higher idle thresh on the submit workers makes it much more @@ -2328,41 +2328,48 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, * devices */ fs_info->submit_workers = - btrfs_alloc_workqueue("submit", flags, + btrfs_alloc_workqueue(fs_info, "submit", flags, min_t(u64, fs_devices->num_devices, max_active), 64); fs_info->fixup_workers = - btrfs_alloc_workqueue("fixup", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ fs_info->endio_workers = - btrfs_alloc_workqueue("endio", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); fs_info->endio_meta_workers = - btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-meta", flags, + max_active, 4); fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, + max_active, 2); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, + max_active, 4); fs_info->endio_repair_workers = - btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0); fs_info->rmw_workers = 
- btrfs_alloc_workqueue("rmw", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); fs_info->endio_write_workers = - btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-write", flags, + max_active, 2); fs_info->endio_freespace_worker = - btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "freespace-write", flags, + max_active, 0); fs_info->delayed_workers = - btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, + max_active, 0); fs_info->readahead_workers = - btrfs_alloc_workqueue("readahead", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "readahead", flags, + max_active, 2); fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); fs_info->extent_workers = - btrfs_alloc_workqueue("extent-refs", flags, + btrfs_alloc_workqueue(fs_info, "extent-refs", flags, min_t(u64, fs_devices->num_devices, max_active), 8); @@ -3010,8 +3017,8 @@ retry_root_backup: if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; - if (!btrfs_test_opt(tree_root, SSD) && - !btrfs_test_opt(tree_root, NOSSD) && + if (!btrfs_test_opt(tree_root->fs_info, SSD) && + !btrfs_test_opt(tree_root->fs_info, NOSSD) && !fs_info->fs_devices->rotating) { btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); btrfs_set_opt(fs_info->mount_opt, SSD); @@ -3024,9 +3031,9 @@ retry_root_backup: btrfs_apply_pending_changes(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, - btrfs_test_opt(tree_root, + btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? 
1 : 0, fs_info->check_integrity_print_mask); @@ -3042,7 +3049,7 @@ retry_root_backup: /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && - !btrfs_test_opt(tree_root, NOLOGREPLAY)) { + !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) { ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; @@ -3083,7 +3090,7 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; - if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && + if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); @@ -3120,7 +3127,7 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (btrfs_test_opt(tree_root, CLEAR_CACHE) && + if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); @@ -3141,7 +3148,7 @@ retry_root_backup: close_ctree(tree_root); return ret; } - } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + } else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super)) { btrfs_info(fs_info, "checking UUID tree"); @@ -3218,7 +3225,7 @@ fail: return err; recovery_tree_root: - if (!btrfs_test_opt(tree_root, USEBACKUPROOT)) + if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT)) goto fail_tree_roots; free_root_pointers(fs_info, 0); @@ -3634,7 +3641,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - do_barriers = !btrfs_test_opt(root, NOBARRIER); + do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER); backup_super_roots(root->fs_info); sb = root->fs_info->super_for_commit; @@ -3918,7 +3925,7 @@ void close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); #ifdef 
CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY)) + if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY)) btrfsic_unmount(root, fs_info->fs_devices); #endif diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index dbf3e1aab69e..b3207a0e09f7 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -90,7 +90,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize); +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize); #endif /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e9376b1657e2..61b494e8e604 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2180,7 +2180,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -2204,7 +2204,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(node, ref, node->action); + trace_run_delayed_data_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; @@ -2359,7 +2359,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(node, ref, node->action); + trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; @@ -2423,7 +2423,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); - 
trace_run_delayed_ref_head(node, head, node->action); + trace_run_delayed_ref_head(root->fs_info, node, head, + node->action); if (insert_reserved) { btrfs_pin_extent(root, node->bytenr, @@ -2778,7 +2779,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) u64 num_csums_per_leaf; u64 num_csums; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + csum_size = BTRFS_MAX_ITEM_SIZE(root); num_csums_per_leaf = div64_u64(csum_size, (u64)btrfs_super_csum_size(root->fs_info->super_copy)); num_csums = div64_u64(csum_bytes, root->sectorsize); @@ -2970,7 +2971,7 @@ again: trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, root, count); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -3234,7 +3235,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, u64, u64, u64, u64, u64, u64); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; ref_root = btrfs_header_owner(buf); @@ -3429,7 +3430,7 @@ again: * transaction, this only happens in really bad situations * anyway. 
*/ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } WARN_ON(ret); @@ -3447,7 +3448,7 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE)) { + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3524,7 +3525,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, struct btrfs_path *path; if (list_empty(&cur_trans->dirty_bgs) || - !btrfs_test_opt(root, SPACE_CACHE)) + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) return 0; path = btrfs_alloc_path(); @@ -3669,7 +3670,7 @@ again: } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } } @@ -3815,7 +3816,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache); } if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } /* if its not on the io list, we need to put the block group */ @@ -4443,7 +4444,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + btrfs_calc_trans_metadata_size(root, 1); - if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); dump_space_info(info, 0, 0); @@ -4588,7 +4589,7 @@ out: */ if (trans->can_flush_pending_bgs && trans->chunk_bytes_reserved >= (u64)SZ_2M) { - btrfs_create_pending_block_groups(trans, trans->root); + btrfs_create_pending_block_groups(trans, extent_root); btrfs_trans_release_chunk_metadata(trans); } return ret; @@ -5729,7 +5730,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, */ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { - struct btrfs_fs_info 
*fs_info = trans->root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->chunk_bytes_reserved) return; @@ -6100,7 +6101,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; trace_btrfs_space_reservation(root->fs_info, "delalloc", @@ -6215,7 +6216,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -6597,7 +6598,7 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; - bool ssd = btrfs_test_opt(root, SSD); + bool ssd = btrfs_test_opt(root->fs_info, SSD); *empty_cluster = 0; if (btrfs_mixed_space_info(space_info)) @@ -6742,7 +6743,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, break; } - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, end + 1 - start, NULL); @@ -6880,7 +6881,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, NULL, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -6929,7 +6930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } extent_slot = path->slots[0]; @@ -6940,10 +6941,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu 
offset %llu", bytenr, parent, root_objectid, owner_objectid, owner_offset); - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6955,7 +6956,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6974,7 +6975,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6999,7 +7000,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_err(info, "trying to drop %d refs but we only have %Lu " "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } refs -= refs_to_drop; @@ -7022,7 +7023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, iref, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -7045,7 +7046,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -7053,7 +7054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -7061,13 
+7062,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = add_to_free_space_tree(trans, root->fs_info, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -7216,7 +7217,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(fs_info)) return 0; add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); @@ -7851,8 +7852,7 @@ loop: * can do more things. */ if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); else ret = 0; if (!exist) @@ -7906,8 +7906,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly, - (info->full) ? "" : "not "); + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use, (info->full) ? 
"" : "not "); printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, @@ -7961,7 +7961,7 @@ again: if (num_bytes == min_alloc_size) final_tried = true; goto again; - } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, flags); @@ -7992,7 +7992,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, if (pin) pin_down_extent(root, cache, start, len, 1); else { - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); @@ -8300,7 +8300,7 @@ again: goto again; } - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8354,13 +8354,15 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (btrfs_test_is_dummy_root(root)) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (btrfs_is_testing(root->fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) @@ -8540,7 +8542,8 @@ static int record_one_subtree_extent(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord)) + if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, + delayed_refs, qrecord)) kfree(qrecord); spin_unlock(&delayed_refs->lock); @@ 
-9325,7 +9328,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, &root->root_key, root_item); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -9352,7 +9355,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_del_root(trans, tree_root, &root->root_key); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -9360,7 +9363,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } else if (ret > 0) { @@ -9731,7 +9734,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) int full = 0; int ret = 0; - debug = btrfs_test_opt(root, ENOSPC_DEBUG); + debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG); block_group = btrfs_lookup_block_group(root->fs_info, bytenr); @@ -9887,7 +9890,22 @@ static int find_first_block_group(struct btrfs_root *root, if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, found_key.objectid, + found_key.offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(root->fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; + } else { + ret = 0; + } goto out; } path->slots[0]++; @@ -10129,10 +10147,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && 
btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(root->fs_info, CLEAR_CACHE)) need_clear = 1; while (1) { @@ -10163,7 +10181,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR; } @@ -10305,11 +10323,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, extent_root, &key, &item, sizeof(item)); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); ret = btrfs_finish_chunk_alloc(trans, extent_root, key.objectid, key.offset); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, root->fs_info, block_group); /* already aborted the transaction if it failed. */ next: @@ -10622,7 +10640,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { WARN_ON(block_group->space_info->total_bytes < block_group->key.offset); WARN_ON(block_group->space_info->bytes_readonly @@ -10890,7 +10908,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&space_info->lock); /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(root, DISCARD); + trimming = btrfs_test_opt(root->fs_info, DISCARD); /* Implicit trim during transaction commit. 
*/ if (trimming) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cee4cb99b8ce..44fe66b53c8b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -163,13 +163,13 @@ int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; @@ -2049,7 +2049,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } bio->bi_bdev = dev->bdev; - bio->bi_rw = WRITE_SYNC; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(bio)) { @@ -2697,12 +2697,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; - -#ifdef CONFIG_BLK_CGROUP - /* FIXME, put this into bio_clone_bioset */ - if (bio->bi_css) - bio_associate_blkcg(new, bio->bi_css); -#endif } return new; } @@ -2756,7 +2750,6 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, if (tree->ops && tree->ops->merge_bio_hook) ret = tree->ops->merge_bio_hook(page, offset, size, bio, bio_flags); - BUG_ON(ret < 0); return ret; } @@ -2879,6 +2872,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * into the tree that are removed when the IO is done (by the end_io * handlers) * XXX JDM: This needs looking at to ensure proper page locking + * return 0 on success, otherwise return error */ static int __do_readpage(struct extent_io_tree *tree, struct page *page, @@ -2900,7 +2894,7 @@ static int __do_readpage(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; 
struct block_device *bdev; - int ret; + int ret = 0; int nr = 0; size_t pg_offset = 0; size_t iosize; @@ -3081,6 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree, } else { SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + goto out; } cur = cur + iosize; pg_offset += iosize; @@ -3091,7 +3086,7 @@ out: SetPageUptodate(page); unlock_page(page); } - return 0; + return ret; } static inline void __do_contiguous_readpages(struct extent_io_tree *tree, @@ -5230,14 +5225,31 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = eb->pages[i]; + if (!PageUptodate(page)) { + if (ret) { + atomic_dec(&eb->io_pages); + unlock_page(page); + continue; + } + ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags, REQ_META); - if (err) + if (err) { ret = err; + /* + * We use &bio in above __extent_read_full_page, + * so we ensure that if it returns error, the + * current page fails to add itself to bio and + * it's been unlocked. + * + * We must dec io_pages by ourselves. 
+ */ + atomic_dec(&eb->io_pages); + } } else { unlock_page(page); } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index e0715fcfb11e..26f9ac719d20 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -13,7 +13,7 @@ int __init extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 62a81ee13a5f..d0d571c47d33 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -250,7 +250,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, offset + root->sectorsize - 1, EXTENT_NODATASUM); } else { - btrfs_info(BTRFS_I(inode)->root->fs_info, + btrfs_info_rl(BTRFS_I(inode)->root->fs_info, "no csum found for inode %llu start %llu", btrfs_ino(inode), offset); } @@ -699,7 +699,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, */ ret = btrfs_split_item(trans, root, path, &key, offset); if (ret && ret != -EAGAIN) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bcfb4a27ddd4..5842423f8f47 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -132,7 +132,7 @@ static int __btrfs_add_inode_defrag(struct inode *inode, static inline int __need_auto_defrag(struct btrfs_root *root) { - if (!btrfs_test_opt(root, AUTO_DEFRAG)) + if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG)) return 0; if (btrfs_fs_closing(root->fs_info)) @@ -950,7 +950,7 @@ delete_extent_item: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -974,7 +974,7 @@ delete_extent_item: path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) - btrfs_abort_transaction(trans, root, ret); + 
btrfs_abort_transaction(trans, ret); } leaf = path->nodes[0]; @@ -1190,7 +1190,7 @@ again: goto again; } if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -1278,7 +1278,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -2033,6 +2033,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * An ordered extent might have started before and completed + * already with io errors, in which case the inode was not + * updated and we end up here. So check the inode's mapping + * flags for any errors that might have happened while doing + * writeback of file data. + */ + ret = btrfs_inode_check_errors(inode); inode_unlock(inode); goto out; } @@ -2975,7 +2983,7 @@ int btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", sizeof(struct inode_defrag), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 69d270f6602c..d571bd2b697b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -280,7 +280,7 @@ fail: if (locked) mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -3026,7 +3026,7 @@ int btrfs_find_space_cluster(struct btrfs_root *root, * For metadata, allow allocates with smaller extents. For * data, keep it dense. 
*/ - if (btrfs_test_opt(root, SSD_SPREAD)) { + if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) { cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; @@ -3470,7 +3470,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) int ret = 0; u64 root_gen = btrfs_root_generation(&root->root_item); - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; /* @@ -3514,7 +3514,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_io_ctl io_ctl; bool release_metadata = true; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; memset(&io_ctl, 0, sizeof(io_ctl)); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 53dbeaf6ce94..87e7e3d3e676 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -305,7 +305,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, out: kvfree(bitmap); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -454,7 +454,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, out: kvfree(bitmap); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -851,7 +851,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1047,7 +1047,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1193,7 +1193,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) abort: 
fs_info->creating_free_space_tree = 0; - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -1280,7 +1280,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) return 0; abort: - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -1333,7 +1333,7 @@ out: btrfs_free_path(path); mutex_unlock(&block_group->free_space_lock); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1410,7 +1410,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 70107f7c9307..aa6fabaee72e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -38,7 +38,7 @@ static int caching_kthread(void *data) int slot; int ret; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -141,7 +141,7 @@ static void start_caching(struct btrfs_root *root) int ret; u64 objectid; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; spin_lock(&root->ino_cache_lock); @@ -185,7 +185,7 @@ static void start_caching(struct btrfs_root *root) int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) { - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return btrfs_find_free_objectid(root, objectid); again: @@ -211,7 +211,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, 
INODE_MAP_CACHE)) return; again: if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { @@ -251,7 +251,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) struct rb_node *n; u64 count; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; while (1) { @@ -412,7 +412,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (btrfs_root_refs(&root->root_item) == 0) return 0; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -458,7 +458,7 @@ again: BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } @@ -466,7 +466,7 @@ again: ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) { if (ret != -ENOSPC) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8078077d1090..08dfc57e2270 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -60,6 +60,7 @@ #include "hash.h" #include "props.h" #include "qgroup.h" +#include "dedupe.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -105,8 +106,9 @@ static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + u64 start, u64 end, u64 delalloc_end, + int *page_started, unsigned long *nr_written, + int unlock, struct btrfs_dedupe_hash *hash); static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, @@ -294,7 +296,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, start, aligned_end, NULL, 1, 1, 
extent_item_size, &extent_inserted); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -305,7 +307,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; @@ -374,12 +376,12 @@ static inline int inode_need_compress(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; /* force compress */ - if (btrfs_test_opt(root, FORCE_COMPRESS)) + if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) return 1; /* bad compression ratios */ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) return 0; - if (btrfs_test_opt(root, COMPRESS) || + if (btrfs_test_opt(root->fs_info, COMPRESS) || BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || BTRFS_I(inode)->force_compress) return 1; @@ -585,9 +587,27 @@ cont: will_compress = 0; } else { num_bytes = total_in; + *num_added += 1; + + /* + * The async work queues will take care of doing actual + * allocation on disk for these compressed pages, and + * will submit them to the elevator. 
+ */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret, + compress_type); + + if (start + num_bytes < end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + return; } } - if (!will_compress && pages) { + if (pages) { /* * the compression code ran but failed to make things smaller, * free any pages it allocated and our page pointer array @@ -602,48 +622,28 @@ cont: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(root, FORCE_COMPRESS) && + if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) && !(BTRFS_I(inode)->force_compress)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } } - if (will_compress) { - *num_added += 1; - - /* the async work queues will take care of doing actual - * allocation on disk for these compressed pages, - * and will submit them to the elevator. - */ - add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret, - compress_type); - - if (start + num_bytes < end) { - start += num_bytes; - pages = NULL; - cond_resched(); - goto again; - } - } else { cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in - * the file we've been given so far. redirty the locked - * page if it corresponds to our extent and set things up - * for the async work queue to run cow_file_range to do - * the normal delalloc dance - */ - if (page_offset(locked_page) >= start && - page_offset(locked_page) <= end) { - __set_page_dirty_nobuffers(locked_page); - /* unlocked later on in the async handlers */ - } - if (redirty) - extent_range_redirty_for_io(inode, start, end); - add_async_extent(async_cow, start, end - start + 1, - 0, NULL, 0, BTRFS_COMPRESS_NONE); - *num_added += 1; - } + /* + * No compression, but we still need to write the pages in the file + * we've been given so far. 
redirty the locked page if it corresponds + * to our extent and set things up for the async work queue to run + * cow_file_range to do the normal delalloc dance. + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + + if (redirty) + extent_range_redirty_for_io(inode, start, end); + add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); + *num_added += 1; return; @@ -712,7 +712,10 @@ retry: async_extent->start, async_extent->start + async_extent->ram_size - 1, - &page_started, &nr_written, 0); + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0, + NULL); /* JDM XXX */ @@ -925,9 +928,9 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, */ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) + u64 start, u64 end, u64 delalloc_end, + int *page_started, unsigned long *nr_written, + int unlock, struct btrfs_dedupe_hash *hash) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 alloc_hint = 0; @@ -1156,7 +1159,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->start = start; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(root, FORCE_COMPRESS)) + !btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) cur_end = end; else cur_end = min(end, start + SZ_512K - 1); @@ -1418,7 +1421,8 @@ out_check: if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, - page_started, nr_written, 1); + end, page_started, nr_written, 1, + NULL); if (ret) { if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); @@ -1501,8 +1505,8 @@ out_check: } if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, end, - page_started, nr_written, 1); + 
ret = cow_file_range(inode, locked_page, cow_start, end, end, + page_started, nr_written, 1, NULL); if (ret) goto error; } @@ -1561,8 +1565,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_need_compress(inode)) { - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, start, end, end, + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); @@ -1740,7 +1744,7 @@ static void btrfs_set_bit_hook(struct inode *inode, } /* For sanity tests */ - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; __percpu_counter_add(&root->fs_info->delalloc_bytes, len, @@ -1799,7 +1803,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_delalloc_release_metadata(inode, len); /* For sanity tests. */ - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID @@ -1822,6 +1826,10 @@ static void btrfs_clear_bit_hook(struct inode *inode, /* * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks + * + * return 1 if page cannot be merged to bio + * return 0 if page can be merged to bio + * return error otherwise */ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, @@ -1840,8 +1848,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, map_length = length; ret = btrfs_map_block(root->fs_info, bio_op(bio), logical, &map_length, NULL, 0); - /* Will always return 0 with map_multi == NULL */ - BUG_ON(ret < 0); + if (ret < 0) + return ret; if (map_length < length + size) return 1; return 0; @@ -2594,7 +2602,7 @@ again: ret = btrfs_insert_empty_item(trans, root, path, &key, 
sizeof(*extent)); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -2621,7 +2629,7 @@ again: backref->root_id, backref->inum, new->file_pos); /* start - extent_offset */ if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -2890,7 +2898,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2950,7 +2958,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->len, trans->transid); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_unlock; } @@ -2960,7 +2968,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_unlock; } ret = 0; @@ -3204,7 +3212,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); else clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); @@ -3295,7 +3303,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (ret != -EEXIST) { clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -3307,7 +3315,7 @@ int 
btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -3427,10 +3435,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); ret = PTR_ERR_OR_ZERO(inode); - if (ret && ret != -ESTALE) + if (ret && ret != -ENOENT) goto out; - if (ret == -ESTALE && root == root->fs_info->tree_root) { + if (ret == -ENOENT && root == root->fs_info->tree_root) { struct btrfs_root *dead_root; struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; @@ -3466,7 +3474,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * Inode is already gone but the orphan item is still there, * kill the orphan item. */ - if (ret == -ESTALE) { + if (ret == -ENOENT) { trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -3625,7 +3633,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* * read an inode from the btree into the in-memory inode */ -static void btrfs_read_locked_inode(struct inode *inode) +static int btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -3644,14 +3652,19 @@ static void btrfs_read_locked_inode(struct inode *inode) filled = true; path = btrfs_alloc_path(); - if (!path) + if (!path) { + ret = -ENOMEM; goto make_bad; + } memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); - if (ret) + if (ret) { + if (ret > 0) + ret = -ENOENT; goto make_bad; + } leaf = path->nodes[0]; @@ -3804,11 +3817,12 @@ cache_acl: } btrfs_update_iflags(inode); - return; + return 0; make_bad: btrfs_free_path(path); make_bad_inode(inode); + return ret; } /* @@ -4006,20 +4020,20 @@ static int 
__btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_info(root->fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", name_len, name, ino, dir_ino); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } skip_backref: ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir_ino); if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } @@ -4028,7 +4042,7 @@ skip_backref: if (ret == -ENOENT) ret = 0; else if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err: btrfs_free_path(path); if (ret) @@ -4142,7 +4156,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -4152,7 +4166,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, dir_ino, &index, name, name_len); if (ret < 0) { if (ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } di = btrfs_search_dir_index_item(root, path, dir_ino, @@ -4162,7 +4176,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = -ENOENT; else ret = PTR_ERR(di); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -4175,7 +4189,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -4184,7 +4198,7 @@ int 
btrfs_unlink_subvol(struct btrfs_trans_handle *trans, dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -4196,6 +4210,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; + u64 last_unlink_trans; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4218,11 +4233,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (err) goto out; + last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; + /* now the directory is empty */ err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); - if (!err) + if (!err) { btrfs_i_size_write(inode, 0); + /* + * Propagate the last_unlink_trans value of the deleted dir to + * its parent directory. 
This is to prevent an unrecoverable + * log tree in the case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + */ + if (last_unlink_trans >= trans->transid) + BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; + } out: btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); @@ -4505,7 +4536,6 @@ search_again: pending_del_nr); if (err) { btrfs_abort_transaction(trans, - root, err); goto error; } @@ -4517,8 +4547,7 @@ search_again: item_end, new_size); if (err) { - btrfs_abort_transaction(trans, - root, err); + btrfs_abort_transaction(trans, err); goto error; } } else if (test_bit(BTRFS_ROOT_REF_COWS, @@ -4582,8 +4611,7 @@ delete: pending_del_slot, pending_del_nr); if (ret) { - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); goto error; } pending_del_nr = 0; @@ -4616,7 +4644,7 @@ out: ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } error: if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) @@ -4785,7 +4813,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); return ret; } @@ -4793,7 +4821,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 0, 0, len, 0, len, 0, 0, 0); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); else btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); @@ -5020,7 +5048,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) 
i_size_write(inode, BTRFS_I(inode)->disk_i_size); err = btrfs_orphan_del(trans, inode); if (err) - btrfs_abort_transaction(trans, root, err); + btrfs_abort_transaction(trans, err); btrfs_end_transaction(trans, root); } } @@ -5158,11 +5186,18 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; int steal_from_global = 0; - u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 min_size; int ret; trace_btrfs_inode_evict(inode); + if (!root) { + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + return; + } + + min_size = btrfs_calc_trunc_metadata_size(root, 1); + evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -5594,7 +5629,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - btrfs_read_locked_inode(inode); + int ret; + + ret = btrfs_read_locked_inode(inode); if (!is_bad_inode(inode)) { inode_tree_add(inode); unlock_new_inode(inode); @@ -5603,7 +5640,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, } else { unlock_new_inode(inode); iput(inode); - inode = ERR_PTR(-ESTALE); + ASSERT(ret < 0); + inode = ERR_PTR(ret < 0 ? 
ret : -ESTALE); } } @@ -6239,9 +6277,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(root->fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(root->fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; } @@ -6319,7 +6357,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -6330,7 +6368,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode(trans, root, parent_inode); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; fail_dir_item: @@ -8197,7 +8235,7 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio_op(bio), bio->bi_rw, + btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf, (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -8361,7 +8399,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw); + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8399,7 +8437,7 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw); + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = 
file_offset; @@ -9385,25 +9423,25 @@ int btrfs_init_cachep(void) btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", sizeof(struct btrfs_transaction), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_cachep) goto fail; @@ -9553,7 +9591,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = btrfs_update_inode(trans, root, old_inode); } if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9573,7 +9611,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = btrfs_update_inode(trans, dest, new_inode); } if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9581,7 +9619,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_dentry->d_name.name, new_dentry->d_name.len, 0, old_idx); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9589,7 +9627,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_dentry->d_name.name, old_dentry->d_name.len, 0, new_idx); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9828,7 +9866,7 @@ static int btrfs_rename(struct inode 
*old_dir, struct dentry *old_dentry, ret = btrfs_update_inode(trans, root, old_inode); } if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9852,7 +9890,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, d_inode(new_dentry)); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } } @@ -9861,7 +9899,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9881,7 +9919,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } } @@ -10307,7 +10345,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans, root); break; @@ -10367,7 +10405,7 @@ next: ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans, root); break; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 05173563e4a6..14ed1e9e6bc8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -561,7 +561,7 @@ static noinline int create_subvol(struct inode *dir, new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -570,7 +570,7 @@ static noinline int 
create_subvol(struct inode *dir, ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); if (ret) { /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -583,7 +583,7 @@ static noinline int create_subvol(struct inode *dir, */ ret = btrfs_set_inode_index(dir, &index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -591,7 +591,7 @@ static noinline int create_subvol(struct inode *dir, name, namelen, dir, &key, BTRFS_FT_DIR, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -608,7 +608,7 @@ static noinline int create_subvol(struct inode *dir, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); fail: kfree(root_item); @@ -1948,8 +1948,7 @@ static noinline int key_in_sk(struct btrfs_key *key, return 1; } -static noinline int copy_to_sk(struct btrfs_root *root, - struct btrfs_path *path, +static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, size_t *buf_size, @@ -2120,7 +2119,7 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; goto err; } - ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf, + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); if (ret) @@ -2406,7 +2405,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * rmdir(2). 
*/ err = -EPERM; - if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; /* @@ -2489,7 +2488,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dentry->d_name.len); if (ret) { err = ret; - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -2505,7 +2504,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, root->fs_info->tree_root, dest->root_key.objectid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2515,7 +2514,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2525,7 +2524,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, BTRFS_UUID_KEY_RECEIVED_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -3292,7 +3291,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3694,7 +3693,7 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); + ret); btrfs_end_transaction(trans, root); goto out; } @@ -3702,8 +3701,7 @@ process_slot: ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3735,7 +3733,6 @@ process_slot: new_key.offset - 
datao); if (ret) { btrfs_abort_transaction(trans, - root, ret); btrfs_end_transaction(trans, root); @@ -3772,7 +3769,6 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); btrfs_end_transaction(trans, root); goto out; @@ -3828,7 +3824,7 @@ process_slot: last_dest_end, destoff + len, 1); if (ret) { if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -5164,13 +5160,13 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_commit_transaction(trans, root); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index aca8264f4a49..3b78d38173b3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1122,7 +1122,7 @@ int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", sizeof(struct btrfs_ordered_extent), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_ordered_extent_cache) return -ENOMEM; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 36992128c746..cf0b444ac4f3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -350,6 +350,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *parent_root) { + struct super_block *sb = root->fs_info->sb; struct btrfs_key key; struct inode *parent_inode, *child_inode; int ret; @@ -358,12 +359,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, - parent_root, NULL); + 
parent_inode = btrfs_iget(sb, &key, parent_root, NULL); if (IS_ERR(parent_inode)) return PTR_ERR(parent_inode); - child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + child_inode = btrfs_iget(sb, &key, root, NULL); if (IS_ERR(child_inode)) { iput(parent_inode); return PTR_ERR(child_inode); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9d4c05b14f6e..93ee1c18ef9d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -571,7 +571,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; - if (btrfs_test_is_dummy_root(quota_root)) + if (btrfs_is_testing(quota_root->fs_info)) return 0; path = btrfs_alloc_path(); @@ -728,7 +728,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; key.objectid = 0; @@ -1453,9 +1453,10 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, return ret; } -struct btrfs_qgroup_extent_record -*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) +struct btrfs_qgroup_extent_record * +btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; struct rb_node *parent_node = NULL; @@ -1463,7 +1464,7 @@ struct btrfs_qgroup_extent_record u64 bytenr = record->bytenr; assert_spin_locked(&delayed_refs->lock); - trace_btrfs_qgroup_insert_dirty_extent(record); + trace_btrfs_qgroup_insert_dirty_extent(fs_info, record); while (*p) { parent_node = *p; @@ -1595,8 +1596,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); - trace_qgroup_update_counters(qg->qgroupid, cur_old_count, 
- cur_new_count); + trace_qgroup_update_counters(fs_info, qg->qgroupid, + cur_old_count, cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { @@ -1687,8 +1688,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, goto out_free; BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots, - nr_new_roots); + trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes, + nr_old_roots, nr_new_roots); qgroups = ulist_alloc(GFP_NOFS); if (!qgroups) { @@ -1759,7 +1760,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record = rb_entry(node, struct btrfs_qgroup_extent_record, node); - trace_btrfs_qgroup_account_extents(record); + trace_btrfs_qgroup_account_extents(fs_info, record); if (!ret) { /* @@ -2195,7 +2196,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - btrfs_err(trans->root->fs_info, + btrfs_err(trans->fs_info, "qgroups not uptodate in trans handle %p: list is%s empty, " "seq is %#x.%x", trans, list_empty(&trans->qgroup_ref_list) ? 
"" : " not", diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index ecb2c143ef75..710887c06aaf 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -63,9 +63,10 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -struct btrfs_qgroup_extent_record -*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); +struct btrfs_qgroup_extent_record * +btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -88,7 +89,7 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); - trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes); + trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); } void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fc067b07e31f..b26a5aea41b4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -235,12 +235,12 @@ static void backref_cache_cleanup(struct backref_cache *cache) cache->last_trans = 0; for (i = 0; i < BTRFS_MAX_LEVEL; i++) - BUG_ON(!list_empty(&cache->pending[i])); - BUG_ON(!list_empty(&cache->changed)); - BUG_ON(!list_empty(&cache->detached)); - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); - BUG_ON(cache->nr_nodes); - BUG_ON(cache->nr_edges); + ASSERT(list_empty(&cache->pending[i])); + ASSERT(list_empty(&cache->changed)); + ASSERT(list_empty(&cache->detached)); + ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); + ASSERT(!cache->nr_nodes); + ASSERT(!cache->nr_edges); } static struct backref_node 
*alloc_backref_node(struct backref_cache *cache) @@ -1171,8 +1171,12 @@ out: lower = list_entry(useless.next, struct backref_node, list); list_del_init(&lower->list); + if (lower == node) + node = NULL; free_backref_node(cache, lower); } + + free_backref_node(cache, node); return ERR_PTR(err); } ASSERT(!node || !node->detached); @@ -1719,7 +1723,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -1727,7 +1731,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, parent, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index f1c30861d062..7fd7e1830cfe 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -150,7 +150,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -176,20 +176,20 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_item(trans, root, path); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } l = path->nodes[0]; @@ -448,7 +448,7 @@ again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); if (ret) { - 
btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e08b6bc676e3..1d195d2b32c6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3785,27 +3785,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) fs_info->scrub_workers = - btrfs_alloc_workqueue("scrub", flags, + btrfs_alloc_workqueue(fs_info, "scrub", flags, 1, 4); else fs_info->scrub_workers = - btrfs_alloc_workqueue("scrub", flags, + btrfs_alloc_workqueue(fs_info, "scrub", flags, max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue("scrubwrc", flags, + btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue("scrubnc", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0); if (!fs_info->scrub_nocow_workers) goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = - btrfs_alloc_workqueue("scrubparity", flags, + btrfs_alloc_workqueue(fs_info, "scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; @@ -3860,7 +3860,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ - btrfs_err(fs_info, + btrfs_err_rl(fs_info, "scrub: size assumption sectorsize != PAGE_SIZE " "(%d != %lu) fails", fs_info->chunk_root->sectorsize, PAGE_SIZE); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b71dd298385c..efe129fe2678 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -231,7 +231,6 @@ struct pending_dir_move { u64 parent_ino; u64 ino; u64 gen; - bool is_orphan; struct list_head update_refs; }; 
@@ -274,6 +273,39 @@ struct name_cache_entry { char name[]; }; +static void inconsistent_snapshot_error(struct send_ctx *sctx, + enum btrfs_compare_tree_result result, + const char *what) +{ + const char *result_string; + + switch (result) { + case BTRFS_COMPARE_TREE_NEW: + result_string = "new"; + break; + case BTRFS_COMPARE_TREE_DELETED: + result_string = "deleted"; + break; + case BTRFS_COMPARE_TREE_CHANGED: + result_string = "updated"; + break; + case BTRFS_COMPARE_TREE_SAME: + ASSERT(0); + result_string = "unchanged"; + break; + default: + ASSERT(0); + result_string = "unexpected"; + } + + btrfs_err(sctx->send_root->fs_info, + "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", + result_string, what, sctx->cmp_key->objectid, + sctx->send_root->root_key.objectid, + (sctx->parent_root ? + sctx->parent_root->root_key.objectid : 0)); +} + static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); static struct waiting_dir_move * @@ -1861,7 +1893,8 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, * was already unlinked/moved, so we can safely assume that we will not * overwrite anything at this point in time. 
*/ - if (other_inode > sctx->send_progress) { + if (other_inode > sctx->send_progress || + is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, who_gen, NULL, NULL, NULL, NULL); if (ret < 0) @@ -2502,6 +2535,8 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret > 0) + ret = -ENOENT; if (ret < 0) goto out; @@ -2947,6 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, } if (loc.objectid > send_progress) { + struct orphan_dir_info *odi; + + odi = get_orphan_dir_info(sctx, dir); + free_orphan_dir_info(sctx, odi); ret = 0; goto out; } @@ -3047,7 +3086,6 @@ static int add_pending_dir_move(struct send_ctx *sctx, pm->parent_ino = parent_ino; pm->ino = ino; pm->gen = ino_gen; - pm->is_orphan = is_orphan; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); @@ -3113,6 +3151,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, return NULL; } +static int path_loop(struct send_ctx *sctx, struct fs_path *name, + u64 ino, u64 gen, u64 *ancestor_ino) +{ + int ret = 0; + u64 parent_inode = 0; + u64 parent_gen = 0; + u64 start_ino = ino; + + *ancestor_ino = 0; + while (ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino)) + break; + if (is_waiting_for_move(sctx, ino)) { + if (*ancestor_ino == 0) + *ancestor_ino = ino; + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret > 0) { + ret = 0; + break; + } + } + if (ret < 0) + break; + if (parent_inode == start_ino) { + ret = 1; + if (*ancestor_ino == 0) + *ancestor_ino = ino; + break; + } + ino = parent_inode; + gen = parent_gen; + } + return ret; +} + static int apply_dir_move(struct send_ctx 
*sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; @@ -3123,6 +3203,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) u64 parent_ino, parent_gen; struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; + u64 ancestor; + bool is_orphan; int ret; name = fs_path_alloc(); @@ -3135,9 +3217,10 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) dm = get_waiting_dir_move(sctx, pm->ino); ASSERT(dm); rmdir_ino = dm->rmdir_ino; + is_orphan = dm->orphanized; free_waiting_dir_move(sctx, dm); - if (pm->is_orphan) { + if (is_orphan) { ret = gen_unique_name(sctx, pm->ino, pm->gen, from_path); } else { @@ -3155,6 +3238,24 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) goto out; sctx->send_progress = sctx->cur_ino + 1; + ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); + if (ret < 0) + goto out; + if (ret) { + LIST_HEAD(deleted_refs); + ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); + ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, + &pm->update_refs, &deleted_refs, + is_orphan); + if (ret < 0) + goto out; + if (rmdir_ino) { + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + dm->rmdir_ino = rmdir_ino; + } + goto out; + } fs_path_reset(name); to_path = name; name = NULL; @@ -3174,7 +3275,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) /* already deleted */ goto finish; } - ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); + ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino); if (ret < 0) goto out; if (!ret) @@ -3204,8 +3305,18 @@ finish: * and old parent(s). 
*/ list_for_each_entry(cur, &pm->update_refs, list) { - if (cur->dir == rmdir_ino) + /* + * The parent inode might have been deleted in the send snapshot + */ + ret = get_inode_info(sctx->send_root, cur->dir, NULL, + NULL, NULL, NULL, NULL, NULL); + if (ret == -ENOENT) { + ret = 0; continue; + } + if (ret < 0) + goto out; + ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; @@ -3325,6 +3436,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, u64 left_gen; u64 right_gen; int ret = 0; + struct waiting_dir_move *wdm; if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) return 0; @@ -3383,7 +3495,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - if (is_waiting_for_move(sctx, di_key.objectid)) { + wdm = get_waiting_dir_move(sctx, di_key.objectid); + if (wdm && !wdm->orphanized) { ret = add_pending_dir_move(sctx, sctx->cur_ino, sctx->cur_inode_gen, @@ -3470,7 +3583,8 @@ static int wait_for_parent_move(struct send_ctx *sctx, ret = is_ancestor(sctx->parent_root, sctx->cur_ino, sctx->cur_inode_gen, ino, path_before); - break; + if (ret) + break; } fs_path_reset(path_before); @@ -3643,11 +3757,26 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); goto out; if (ret) { struct name_cache_entry *nce; + struct waiting_dir_move *wdm; ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) goto out; + + /* + * If ow_inode has its rename operation delayed + * make sure that its orphanized name is used in + * the source path when performing its rename + * operation. + */ + if (is_waiting_for_move(sctx, ow_inode)) { + wdm = get_waiting_dir_move(sctx, + ow_inode); + ASSERT(wdm); + wdm->orphanized = true; + } + /* * Make sure we clear our orphanized inode's * name from the name cache. 
This is because the @@ -3663,6 +3792,19 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); name_cache_delete(sctx, nce); kfree(nce); } + + /* + * ow_inode might currently be an ancestor of + * cur_ino, therefore compute valid_path (the + * current path of cur_ino) again because it + * might contain the pre-orphanization name of + * ow_inode, which is no longer valid. + */ + fs_path_reset(valid_path); + ret = get_cur_path(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; } else { ret = send_unlink(sctx, cur->full_path); if (ret < 0) @@ -5602,7 +5744,10 @@ static int changed_ref(struct send_ctx *sctx, { int ret = 0; - BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "reference"); + return -EIO; + } if (!sctx->cur_inode_new_gen && sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { @@ -5627,7 +5772,10 @@ static int changed_xattr(struct send_ctx *sctx, { int ret = 0; - BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "xattr"); + return -EIO; + } if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) @@ -5651,7 +5799,10 @@ static int changed_extent(struct send_ctx *sctx, { int ret = 0; - BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "extent"); + return -EIO; + } if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result != BTRFS_COMPARE_TREE_DELETED) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 60e7179ed4b7..864ce334f696 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -184,6 +184,22 @@ static const char * const logtypes[] = { "debug", }; + +/* + * Use one ratelimit state per log level so that a flood of less important + * messages doesn't cause more important ones to 
be dropped. + */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; @@ -192,6 +208,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) va_list args; const char *type = logtypes[4]; int kern_level; + struct ratelimit_state *ratelimit; va_start(args, fmt); @@ -202,13 +219,18 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) lvl[size] = '\0'; fmt += size; type = logtypes[kern_level - '0']; - } else + ratelimit = &printk_limits[kern_level - '0']; + } else { *lvl = '\0'; + /* Default to debug output */ + ratelimit = &printk_limits[7]; + } vaf.fmt = fmt; vaf.va = &args; - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + if (__ratelimit(ratelimit)) + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); va_end(args); } @@ -229,9 +251,11 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) */ __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, + const char *function, unsigned int line, int errno) { + struct btrfs_fs_info *fs_info = trans->fs_info; + trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. 
*/ @@ -239,16 +263,16 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *errstr; errstr = btrfs_decode_error(errno); - btrfs_warn(root->fs_info, + btrfs_warn(fs_info, "%s:%d: Aborting unused transaction(%s).", function, line, errstr); return; } ACCESS_ONCE(trans->transaction->aborted) = errno; /* Wake up anybody who may be waiting on this transaction */ - wake_up(&root->fs_info->transaction_wait); - wake_up(&root->fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL); + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); } /* * __btrfs_panic decodes unexpected, fatal errors from the caller, @@ -432,12 +456,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, */ break; case Opt_nodatasum: - btrfs_set_and_info(root, NODATASUM, + btrfs_set_and_info(info, NODATASUM, "setting nodatasum"); break; case Opt_datasum: - if (btrfs_test_opt(root, NODATASUM)) { - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATASUM)) { + if (btrfs_test_opt(info, NODATACOW)) btrfs_info(root->fs_info, "setting datasum, datacow enabled"); else btrfs_info(root->fs_info, "setting datasum"); @@ -446,9 +470,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - if (!btrfs_test_opt(root, NODATACOW)) { - if (!btrfs_test_opt(root, COMPRESS) || - !btrfs_test_opt(root, FORCE_COMPRESS)) { + if (!btrfs_test_opt(info, NODATACOW)) { + if (!btrfs_test_opt(info, COMPRESS) || + !btrfs_test_opt(info, FORCE_COMPRESS)) { btrfs_info(root->fs_info, "setting nodatacow, compression disabled"); } else { @@ -461,7 +485,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_datacow: - btrfs_clear_and_info(root, NODATACOW, + btrfs_clear_and_info(info, 
NODATACOW, "setting datacow"); break; case Opt_compress_force: @@ -470,10 +494,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, /* Fallthrough */ case Opt_compress: case Opt_compress_type: - saved_compress_type = btrfs_test_opt(root, COMPRESS) ? + saved_compress_type = btrfs_test_opt(info, + COMPRESS) ? info->compress_type : BTRFS_COMPRESS_NONE; saved_compress_force = - btrfs_test_opt(root, FORCE_COMPRESS); + btrfs_test_opt(info, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -513,10 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } - if ((btrfs_test_opt(root, COMPRESS) && + if ((btrfs_test_opt(info, COMPRESS) && (info->compress_type != saved_compress_type || compress_force != saved_compress_force)) || - (!btrfs_test_opt(root, COMPRESS) && + (!btrfs_test_opt(info, COMPRESS) && no_compress == 1)) { btrfs_info(root->fs_info, "%s %s compression", @@ -526,25 +551,25 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, compress_force = false; break; case Opt_ssd: - btrfs_set_and_info(root, SSD, + btrfs_set_and_info(info, SSD, "use ssd allocation scheme"); break; case Opt_ssd_spread: - btrfs_set_and_info(root, SSD_SPREAD, + btrfs_set_and_info(info, SSD_SPREAD, "use spread ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_set_and_info(root, NOSSD, + btrfs_set_and_info(info, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; case Opt_barrier: - btrfs_clear_and_info(root, NOBARRIER, + btrfs_clear_and_info(info, NOBARRIER, "turning on barriers"); break; case Opt_nobarrier: - btrfs_set_and_info(root, NOBARRIER, + btrfs_set_and_info(info, NOBARRIER, "turning off barriers"); break; case Opt_thread_pool: @@ -604,24 +629,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, root->fs_info->sb->s_flags &= 
~MS_POSIXACL; break; case Opt_notreelog: - btrfs_set_and_info(root, NOTREELOG, + btrfs_set_and_info(info, NOTREELOG, "disabling tree log"); break; case Opt_treelog: - btrfs_clear_and_info(root, NOTREELOG, + btrfs_clear_and_info(info, NOTREELOG, "enabling tree log"); break; case Opt_norecovery: case Opt_nologreplay: - btrfs_set_and_info(root, NOLOGREPLAY, + btrfs_set_and_info(info, NOLOGREPLAY, "disabling log replay at mount time"); break; case Opt_flushoncommit: - btrfs_set_and_info(root, FLUSHONCOMMIT, + btrfs_set_and_info(info, FLUSHONCOMMIT, "turning on flush-on-commit"); break; case Opt_noflushoncommit: - btrfs_clear_and_info(root, FLUSHONCOMMIT, + btrfs_clear_and_info(info, FLUSHONCOMMIT, "turning off flush-on-commit"); break; case Opt_ratio: @@ -638,11 +663,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, } break; case Opt_discard: - btrfs_set_and_info(root, DISCARD, + btrfs_set_and_info(info, DISCARD, "turning on discard"); break; case Opt_nodiscard: - btrfs_clear_and_info(root, DISCARD, + btrfs_clear_and_info(info, DISCARD, "turning off discard"); break; case Opt_space_cache: @@ -651,12 +676,13 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, strcmp(args[0].from, "v1") == 0) { btrfs_clear_opt(root->fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_set_and_info(root, SPACE_CACHE, + btrfs_set_and_info(info, SPACE_CACHE, "enabling disk space caching"); } else if (strcmp(args[0].from, "v2") == 0) { btrfs_clear_opt(root->fs_info->mount_opt, SPACE_CACHE); - btrfs_set_and_info(root, FREE_SPACE_TREE, + btrfs_set_and_info(info, + FREE_SPACE_TREE, "enabling free space tree"); } else { ret = -EINVAL; @@ -667,12 +693,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - if (btrfs_test_opt(root, SPACE_CACHE)) { - btrfs_clear_and_info(root, SPACE_CACHE, + if (btrfs_test_opt(info, SPACE_CACHE)) { + btrfs_clear_and_info(info, + SPACE_CACHE, 
"disabling disk space caching"); } - if (btrfs_test_opt(root, FREE_SPACE_TREE)) { - btrfs_clear_and_info(root, FREE_SPACE_TREE, + if (btrfs_test_opt(info, FREE_SPACE_TREE)) { + btrfs_clear_and_info(info, + FREE_SPACE_TREE, "disabling free space tree"); } break; @@ -685,7 +713,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, "disabling inode map caching"); break; case Opt_clear_cache: - btrfs_set_and_info(root, CLEAR_CACHE, + btrfs_set_and_info(info, CLEAR_CACHE, "force clearing of disk cache"); break; case Opt_user_subvol_rm_allowed: @@ -698,11 +726,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); break; case Opt_defrag: - btrfs_set_and_info(root, AUTO_DEFRAG, + btrfs_set_and_info(info, AUTO_DEFRAG, "enabling auto defrag"); break; case Opt_nodefrag: - btrfs_clear_and_info(root, AUTO_DEFRAG, + btrfs_clear_and_info(info, AUTO_DEFRAG, "disabling auto defrag"); break; case Opt_recovery: @@ -810,22 +838,22 @@ check: /* * Extra check for current option against current flag */ - if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { + if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { btrfs_err(root->fs_info, "nologreplay must be used with ro mount option"); ret = -EINVAL; } out: if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && - !btrfs_test_opt(root, FREE_SPACE_TREE) && - !btrfs_test_opt(root, CLEAR_CACHE)) { + !btrfs_test_opt(info, FREE_SPACE_TREE) && + !btrfs_test_opt(info, CLEAR_CACHE)) { btrfs_err(root->fs_info, "cannot disable free space tree"); ret = -EINVAL; } - if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + if (!ret && btrfs_test_opt(info, SPACE_CACHE)) btrfs_info(root->fs_info, "disk space caching is enabled"); - if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE)) + if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) btrfs_info(root->fs_info, "using free space tree"); kfree(orig); return ret; @@ -1149,7 +1177,7 @@ int 
btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - trace_btrfs_sync_fs(wait); + trace_btrfs_sync_fs(fs_info, wait); if (!wait) { filemap_flush(fs_info->btree_inode->i_mapping); @@ -1192,13 +1220,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) struct btrfs_root *root = info->tree_root; char *compress_type; - if (btrfs_test_opt(root, DEGRADED)) + if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(info, NODATASUM)) seq_puts(seq, ",nodatasum"); - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATACOW)) seq_puts(seq, ",nodatacow"); - if (btrfs_test_opt(root, NOBARRIER)) + if (btrfs_test_opt(info, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); @@ -1207,56 +1235,56 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); - if (btrfs_test_opt(root, COMPRESS)) { + if (btrfs_test_opt(info, COMPRESS)) { if (info->compress_type == BTRFS_COMPRESS_ZLIB) compress_type = "zlib"; else compress_type = "lzo"; - if (btrfs_test_opt(root, FORCE_COMPRESS)) + if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); } - if (btrfs_test_opt(root, NOSSD)) + if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); - if (btrfs_test_opt(root, SSD_SPREAD)) + if (btrfs_test_opt(info, SSD_SPREAD)) seq_puts(seq, ",ssd_spread"); - else if (btrfs_test_opt(root, SSD)) + else if (btrfs_test_opt(info, SSD)) seq_puts(seq, ",ssd"); - if (btrfs_test_opt(root, NOTREELOG)) + if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); - if 
(btrfs_test_opt(root, NOLOGREPLAY)) + if (btrfs_test_opt(info, NOLOGREPLAY)) seq_puts(seq, ",nologreplay"); - if (btrfs_test_opt(root, FLUSHONCOMMIT)) + if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(info, DISCARD)) seq_puts(seq, ",discard"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(info, SPACE_CACHE)) seq_puts(seq, ",space_cache"); - else if (btrfs_test_opt(root, FREE_SPACE_TREE)) + else if (btrfs_test_opt(info, FREE_SPACE_TREE)) seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); - if (btrfs_test_opt(root, RESCAN_UUID_TREE)) + if (btrfs_test_opt(info, RESCAN_UUID_TREE)) seq_puts(seq, ",rescan_uuid_tree"); - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(info, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); - if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED)) seq_puts(seq, ",user_subvol_rm_allowed"); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) + if (btrfs_test_opt(info, ENOSPC_DEBUG)) seq_puts(seq, ",enospc_debug"); - if (btrfs_test_opt(root, AUTO_DEFRAG)) + if (btrfs_test_opt(info, AUTO_DEFRAG)) seq_puts(seq, ",autodefrag"); - if (btrfs_test_opt(root, INODE_MAP_CACHE)) + if (btrfs_test_opt(info, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); - if (btrfs_test_opt(root, SKIP_BALANCE)) + if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) seq_puts(seq, ",check_int_data"); - else if (btrfs_test_opt(root, CHECK_INTEGRITY)) + else if (btrfs_test_opt(info, CHECK_INTEGRITY)) seq_puts(seq, ",check_int"); if (info->check_integrity_print_mask) seq_printf(seq, ",check_int_print_mask=%d", @@ -1265,14 +1293,14 @@ static int 
btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (info->metadata_ratio) seq_printf(seq, ",metadata_ratio=%d", info->metadata_ratio); - if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); #ifdef CONFIG_BTRFS_DEBUG - if (btrfs_test_opt(root, FRAGMENT_DATA)) + if (btrfs_test_opt(info, FRAGMENT_DATA)) seq_puts(seq, ",fragment=data"); - if (btrfs_test_opt(root, FRAGMENT_METADATA)) + if (btrfs_test_opt(info, FRAGMENT_METADATA)) seq_puts(seq, ",fragment=metadata"); #endif seq_printf(seq, ",subvolid=%llu", @@ -2030,9 +2058,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * chunk). * * If metadata is exhausted, f_bavail will be 0. - * - * FIXME: not accurate for mixed block groups, total and free/used are ok, - * available appears slightly larger. 
*/ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -2319,49 +2344,6 @@ static void btrfs_print_mod_info(void) btrfs_crc32c_impl()); } -static int btrfs_run_sanity_tests(void) -{ - int ret, i; - u32 sectorsize, nodesize; - u32 test_sectorsize[] = { - PAGE_SIZE, - }; - ret = btrfs_init_test_fs(); - if (ret) - return ret; - for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { - sectorsize = test_sectorsize[i]; - for (nodesize = sectorsize; - nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; - nodesize <<= 1) { - pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", - sectorsize, nodesize); - ret = btrfs_test_free_space_cache(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_extent_buffer_operations(sectorsize, - nodesize); - if (ret) - goto out; - ret = btrfs_test_extent_io(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_inodes(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_qgroups(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_free_space_tree(sectorsize, nodesize); - if (ret) - goto out; - } - } -out: - btrfs_destroy_test_fs(); - return ret; -} - static int __init init_btrfs_fs(void) { int err; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 4879656bda3c..c6569905d3d1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -326,6 +326,7 @@ SPACE_INFO_ATTR(bytes_used); SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); @@ -337,6 +338,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(bytes_pinned), BTRFS_ATTR_PTR(bytes_reserved), BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(bytes_readonly), BTRFS_ATTR_PTR(disk_used), BTRFS_ATTR_PTR(disk_total), BTRFS_ATTR_PTR(total_bytes_pinned), diff --git a/fs/btrfs/tests/btrfs-tests.c 
b/fs/btrfs/tests/btrfs-tests.c index 02223f3f78f4..bf62ad919a95 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -54,7 +54,7 @@ struct inode *btrfs_new_test_inode(void) return new_inode(test_mnt->mnt_sb); } -int btrfs_init_test_fs(void) +static int btrfs_init_test_fs(void) { int ret; @@ -73,7 +73,7 @@ int btrfs_init_test_fs(void) return 0; } -void btrfs_destroy_test_fs(void) +static void btrfs_destroy_test_fs(void) { kern_unmount(test_mnt); unregister_filesystem(&test_type); @@ -128,14 +128,27 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) extent_io_tree_init(&fs_info->freed_extents[0], NULL); extent_io_tree_init(&fs_info->freed_extents[1], NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; + set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + + test_mnt->mnt_sb->s_fs_info = fs_info; + return fs_info; } -static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { struct radix_tree_iter iter; void **slot; + if (!fs_info) + return; + + if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) + return; + + test_mnt->mnt_sb->s_fs_info = NULL; + spin_lock(&fs_info->buffer_lock); radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { struct extent_buffer *eb; @@ -167,10 +180,11 @@ void btrfs_free_dummy_root(struct btrfs_root *root) { if (!root) return; + /* Will be freed by btrfs_free_fs_roots */ + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) + return; if (root->node) free_extent_buffer(root->node); - if (root->fs_info) - btrfs_free_dummy_fs_info(root->fs_info); kfree(root); } @@ -220,3 +234,46 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) INIT_LIST_HEAD(&trans->qgroup_ref_list); trans->type = __TRANS_DUMMY; } + +int btrfs_run_sanity_tests(void) +{ + int ret, i; + u32 sectorsize, nodesize; + u32 test_sectorsize[] = { + PAGE_SIZE, + }; + ret = btrfs_init_test_fs(); + if (ret) + 
return ret; + for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { + sectorsize = test_sectorsize[i]; + for (nodesize = sectorsize; + nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; + nodesize <<= 1) { + pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", + sectorsize, nodesize); + ret = btrfs_test_free_space_cache(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(sectorsize, + nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_io(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_inodes(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_qgroups(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(sectorsize, nodesize); + if (ret) + goto out; + } + } +out: + btrfs_destroy_test_fs(); + return ret; +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 66fb6b701eb7..b17ffbe8f9f3 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -20,57 +20,29 @@ #define __BTRFS_TESTS #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_run_sanity_tests(void); #define test_msg(fmt, ...) 
pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) struct btrfs_root; struct btrfs_trans_handle; -int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); -int btrfs_init_test_fs(void); -void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else -static inline int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_extent_buffer_operations(u32 sectorsize, - u32 nodesize) -{ - return 0; -} -static inline int btrfs_init_test_fs(void) -{ - return 0; -} -static inline void btrfs_destroy_test_fs(void) -{ -} -static inline int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_inodes(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) +static inline int btrfs_run_sanity_tests(void) { return 0; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 4f8cbd1ec5ee..199569174637 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -24,8 +24,9 @@ static int 
test_btrfs_split_item(u32 sectorsize, u32 nodesize) { - struct btrfs_path *path; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path = NULL; + struct btrfs_root *root = NULL; struct extent_buffer *eb; struct btrfs_item *item; char *value = "mary had a little lamb"; @@ -40,17 +41,24 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) test_msg("Running btrfs_split_item tests\n"); - root = btrfs_alloc_dummy_root(sectorsize, nodesize); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Could not allocate fs_info\n"); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Could not allocate root\n"); - return PTR_ERR(root); + ret = PTR_ERR(root); + goto out; } path = btrfs_alloc_path(); if (!path) { test_msg("Could not allocate path\n"); - kfree(root); - return -ENOMEM; + ret = -ENOMEM; + goto out; } path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize, @@ -219,7 +227,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) } out: btrfs_free_path(path); - kfree(root); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 3956bb2ff84c..3221c8dee272 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -837,6 +837,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; struct btrfs_block_group_cache *cache; struct btrfs_root *root = NULL; int ret = -ENOMEM; @@ -855,15 +856,17 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) return 0; } - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - ret = PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + ret = -ENOMEM; goto out; } - root->fs_info 
= btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + ret = PTR_ERR(root); goto out; + } root->fs_info->extent_root = root; cache->fs_info = root->fs_info; @@ -882,6 +885,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); test_msg("Free space cache tests finished\n"); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index aac507085ab0..7508d3b42780 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -443,23 +443,24 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *, static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; struct btrfs_root *root = NULL; struct btrfs_block_group_cache *cache = NULL; struct btrfs_trans_handle trans; struct btrfs_path *path = NULL; int ret; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate dummy root\n"); - ret = PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); - ret = -ENOMEM; + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate dummy root\n"); + ret = PTR_ERR(root); goto out; } @@ -534,6 +535,7 @@ out: btrfs_free_path(path); btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 29648c0a39f1..9f72aeda9220 100644 --- a/fs/btrfs/tests/inode-tests.c +++ 
b/fs/btrfs/tests/inode-tests.c @@ -230,6 +230,7 @@ static unsigned long vacancy_only = 0; static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; struct extent_map *em = NULL; @@ -248,19 +249,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - /* - * We do this since btrfs_get_extent wants to assign em->bdev to - * root->fs_info->fs_devices->latest_bdev. - */ - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -835,11 +832,13 @@ out: free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } static int test_hole_first(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; struct extent_map *em = NULL; @@ -855,15 +854,15 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = 
btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -934,11 +933,13 @@ out: free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } static int test_extent_accounting(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; int ret = -ENOMEM; @@ -949,15 +950,15 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) return ret; } - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -1132,6 +1133,7 @@ out: NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 57a12c0d680b..4407fef7c16c 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -453,22 +453,24 @@ static int test_multiple_refs(struct btrfs_root *root, int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct btrfs_root *root; struct btrfs_root *tmp_root; int ret = 0; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); - return PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + return -ENOMEM; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs 
info\n"); - ret = -ENOMEM; + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + ret = PTR_ERR(root); goto out; } + /* We are using this root as our extent root */ root->fs_info->extent_root = root; @@ -495,7 +497,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) btrfs_set_header_nritems(root->node, 0); root->alloc_bytenr += 2 * nodesize; - tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); + tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); @@ -510,7 +512,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) goto out; } - tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); + tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); @@ -531,5 +533,6 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) ret = test_multiple_refs(root, sectorsize, nodesize); out: btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 948aa186b353..9cca0a721961 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -561,6 +561,7 @@ again: h->transaction = cur_trans; h->root = root; h->use_count = 1; + h->fs_info = root->fs_info; h->type = type; h->can_flush_pending_bgs = true; @@ -1491,7 +1492,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto dir_item_existed; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } btrfs_release_path(path); @@ -1504,7 +1505,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_run_delayed_items(trans, root); if (ret) { /* Transaction aborted */ - 
btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1543,7 +1544,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1554,7 +1555,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } /* see comments in should_cow_block() */ @@ -1568,7 +1569,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(tmp); free_extent_buffer(tmp); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1580,7 +1581,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1588,19 +1589,19 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1622,7 +1623,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* We have check then name at the beginning, so it is 
impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1632,13 +1633,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { @@ -1647,14 +1648,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1709,7 +1710,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) super->cache_generation = root_item->generation; if (root->fs_info->update_uuid_tree_gen) super->uuid_tree_generation = root_item->generation; @@ -1850,7 +1851,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, WARN_ON(trans->use_count > 1); - btrfs_abort_transaction(trans, root, err); + btrfs_abort_transaction(trans, err); spin_lock(&root->fs_info->trans_lock); @@ -1895,14 +1896,14 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static inline int 
btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { - if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) return btrfs_start_delalloc_roots(fs_info, 1, -1); return 0; } static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { - if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index c5abee4f01ad..efb122643380 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -128,6 +128,7 @@ struct btrfs_trans_handle { * Subvolume quota depends on this */ struct btrfs_root *root; + struct btrfs_fs_info *fs_info; struct seq_list delayed_ref_elem; struct list_head qgroup_ref_list; struct list_head new_bgs; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c05f69a8ec42..fff3f3efa436 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2757,7 +2757,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && + if (!btrfs_test_opt(root->fs_info, SSD) && test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); @@ -2788,7 +2788,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { blk_finish_plug(&plug); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); btrfs_set_log_full_commit(root->fs_info, trans); mutex_unlock(&root->log_mutex); @@ -2838,7 +2838,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); if (ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + 
btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } @@ -2898,7 +2898,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; @@ -2934,7 +2934,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -2991,7 +2991,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, ret = walk_log_tree(trans, log, &wc); /* I don't think this can happen but just in case */ if (ret) - btrfs_abort_transaction(trans, log, ret); + btrfs_abort_transaction(trans, ret); while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -3160,7 +3160,7 @@ out_unlock: btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); @@ -3193,7 +3193,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); return ret; @@ -4469,7 +4469,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, static int btrfs_check_ref_name_override(struct extent_buffer *eb, const int slot, const struct btrfs_key *key, - struct inode *inode) + struct inode *inode, + u64 *other_ino) { int ret; struct btrfs_path *search_path; @@ -4528,7 +4529,16 @@ static int btrfs_check_ref_name_override(struct 
extent_buffer *eb, search_path, parent, name, this_name_len, 0); if (di && !IS_ERR(di)) { - ret = 1; + struct btrfs_key di_key; + + btrfs_dir_item_key_to_cpu(search_path->nodes[0], + di, &di_key); + if (di_key.type == BTRFS_INODE_ITEM_KEY) { + ret = 1; + *other_ino = di_key.objectid; + } else { + ret = -EAGAIN; + } goto out; } else if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -4703,6 +4713,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ins_nr = 0; ret = btrfs_search_forward(root, &min_key, path, trans->transid); + if (ret < 0) { + err = ret; + goto out_unlock; + } if (ret != 0) break; again: @@ -4718,16 +4732,71 @@ again: if ((min_key.type == BTRFS_INODE_REF_KEY || min_key.type == BTRFS_INODE_EXTREF_KEY) && BTRFS_I(inode)->generation == trans->transid) { + u64 other_ino = 0; + ret = btrfs_check_ref_name_override(path->nodes[0], path->slots[0], - &min_key, inode); + &min_key, inode, + &other_ino); if (ret < 0) { err = ret; goto out_unlock; } else if (ret > 0) { - err = 1; - btrfs_set_log_full_commit(root->fs_info, trans); - goto out_unlock; + struct btrfs_key inode_key; + struct inode *other_inode; + + if (ins_nr > 0) { + ins_nr++; + } else { + ins_nr = 1; + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, + logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + btrfs_release_path(path); + inode_key.objectid = other_ino; + inode_key.type = BTRFS_INODE_ITEM_KEY; + inode_key.offset = 0; + other_inode = btrfs_iget(root->fs_info->sb, + &inode_key, root, + NULL); + /* + * If the other inode that had a conflicting dir + * entry was deleted in the current transaction, + * we don't need to do more work nor fallback to + * a transaction commit. 
+ */ + if (IS_ERR(other_inode) && + PTR_ERR(other_inode) == -ENOENT) { + goto next_key; + } else if (IS_ERR(other_inode)) { + err = PTR_ERR(other_inode); + goto out_unlock; + } + /* + * We are safe logging the other inode without + * acquiring its i_mutex as long as we log with + * the LOG_INODE_EXISTS mode. We're safe against + * concurrent renames of the other inode as well + * because during a rename we pin the log and + * update the log with the new name before we + * unpin it. + */ + err = btrfs_log_inode(trans, root, other_inode, + LOG_INODE_EXISTS, + 0, LLONG_MAX, ctx); + iput(other_inode); + if (err) + goto out_unlock; + else + goto next_key; } } @@ -4795,7 +4864,7 @@ next_slot: ins_nr = 0; } btrfs_release_path(path); - +next_key: if (min_key.offset < (u64)-1) { min_key.offset++; } else if (min_key.type < max_key.type) { @@ -4989,8 +5058,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; - if (IS_ROOT(parent)) + if (IS_ROOT(parent)) { + inode = d_inode(parent); + if (btrfs_must_commit_transaction(trans, inode)) + ret = 1; break; + } parent = dget_parent(parent); dput(old_parent); @@ -5301,7 +5374,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, sb = inode->i_sb; - if (btrfs_test_opt(root, NOTREELOG)) { + if (btrfs_test_opt(root->fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0fb4a959012e..51f125508771 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -140,7 +140,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static void btrfs_close_one_device(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ 
-853,6 +852,46 @@ static void free_device(struct rcu_head *head) schedule_work(&device->rcu_work); } +static void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + if (device->bdev && device->writeable) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); +} + static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; @@ -2399,14 +2438,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = init_first_rw_device(trans, root, device); unlock_chunks(root); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } } ret = btrfs_add_device(trans, root, device); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } @@ -2415,7 +2454,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_finish_sprout(trans, root); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } @@ -2801,7 +2840,7 @@ int btrfs_remove_chunk(struct 
btrfs_trans_handle *trans, &dev_extent_len); if (ret) { mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2820,7 +2859,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) { mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -2829,7 +2868,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2838,14 +2877,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2902,7 +2941,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) * chunk tree entries */ ret = btrfs_remove_chunk(trans, root, chunk_offset); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans, extent_root); return ret; } @@ -3421,7 +3460,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u64 size_to_free; u64 chunk_type; struct btrfs_chunk *chunk; - struct btrfs_path *path; + struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_trans_handle *trans; @@ -3455,13 +3494,33 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) ret = btrfs_shrink_device(device, old_size - size_to_free); if (ret == -ENOSPC) break; - BUG_ON(ret); + if (ret) { + /* btrfs_shrink_device 
never returns ret > 0 */ + WARN_ON(ret > 0); + goto error; + } trans = btrfs_start_transaction(dev_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_info_in_rcu(fs_info, + "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", + rcu_str_deref(device->name), ret, + old_size, old_size - size_to_free); + goto error; + } ret = btrfs_grow_device(trans, device, old_size); - BUG_ON(ret); + if (ret) { + btrfs_end_transaction(trans, dev_root); + /* btrfs_grow_device never returns ret > 0 */ + WARN_ON(ret > 0); + btrfs_info_in_rcu(fs_info, + "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", + rcu_str_deref(device->name), ret, + old_size, old_size - size_to_free); + goto error; + } btrfs_end_transaction(trans, dev_root); } @@ -3885,7 +3944,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->balance_lock); - if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { btrfs_info(fs_info, "force skipping balance"); return 0; } @@ -4240,7 +4299,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -4514,8 +4573,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ - - sizeof(struct btrfs_item) \ +#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -5954,7 +6012,7 @@ static void btrfs_end_bio(struct bio *bio) else btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_READ_ERRS); - if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) + if ((bio->bi_opf & 
WRITE_FLUSH) == WRITE_FLUSH) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_FLUSH_ERRS); btrfs_dev_stat_print_on_error(dev); @@ -6031,7 +6089,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, bio->bi_next = NULL; spin_lock(&device->io_lock); - if (bio->bi_rw & REQ_SYNC) + if (bio->bi_opf & REQ_SYNC) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; @@ -6069,7 +6127,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, rcu_read_lock(); name = rcu_dereference(dev->name); pr_debug("btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu " - "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_rw, + "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, name->str, dev->devid, bio->bi_iter.bi_size); rcu_read_unlock(); @@ -6401,7 +6459,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, uuid, NULL); - if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + if (!map->stripes[i].dev && + !btrfs_test_opt(root->fs_info, DEGRADED)) { free_extent_map(em); return -EIO; } @@ -6469,7 +6528,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root, fs_devices = find_fsid(fsid); if (!fs_devices) { - if (!btrfs_test_opt(root, DEGRADED)) + if (!btrfs_test_opt(root->fs_info, DEGRADED)) return ERR_PTR(-ENOENT); fs_devices = alloc_fs_devices(fsid); @@ -6531,7 +6590,7 @@ static int read_one_dev(struct btrfs_root *root, device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); if (!device) { - if (!btrfs_test_opt(root, DEGRADED)) + if (!btrfs_test_opt(root->fs_info, DEGRADED)) return -EIO; device = add_missing_dev(root, fs_devices, devid, dev_uuid); @@ -6540,7 +6599,7 @@ static int read_one_dev(struct btrfs_root *root, btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", devid, dev_uuid); } else { - if (!device->bdev && 
!btrfs_test_opt(root, DEGRADED)) + if (!device->bdev && !btrfs_test_opt(root->fs_info, DEGRADED)) return -EIO; if(!device->bdev && !device->missing) { @@ -7143,38 +7202,3 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } - -static void btrfs_close_one_device(struct btrfs_device *device) -{ - struct btrfs_fs_devices *fs_devices = device->fs_devices; - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->missing) - fs_devices->missing_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); - } - - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; - - call_rcu(&device->rcu, free_device); -} diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 4ae75006e73b..3f7c2cd41f8f 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -263,6 +263,8 @@ requeue: void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, struct cachefiles_object *object) { + blkcnt_t i_blocks = d_backing_inode(object->dentry)->i_blocks; + write_lock(&cache->active_lock); rb_erase(&object->active_node, &cache->active_nodes); clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); @@ -273,8 +275,7 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, /* This object can now be culled, so we need to let the daemon know * that there is something it can remove if it needs to. 
*/ - atomic_long_add(d_backing_inode(object->dentry)->i_blocks, - &cache->b_released); + atomic_long_add(i_blocks, &cache->b_released); if (atomic_inc_return(&cache->f_released)) cachefiles_state_changed(cache); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 99115cae1652..16e6ded0b7f2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1347,9 +1347,12 @@ void ceph_flush_snaps(struct ceph_inode_info *ci, { struct inode *inode = &ci->vfs_inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - struct ceph_mds_session *session = *psession; + struct ceph_mds_session *session = NULL; int mds; + dout("ceph_flush_snaps %p\n", inode); + if (psession) + session = *psession; retry: spin_lock(&ci->i_ceph_lock); if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fa59a85226b2..f72d4ae303b2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2759,6 +2759,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, } else { path = NULL; pathlen = 0; + pathbase = 0; } spin_lock(&ci->i_ceph_lock); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4e532536cbc6..4716c54dbfc6 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -903,10 +903,10 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q) return 0; } -static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry, +static int cifs_ci_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls; + struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; wchar_t c1, c2; int i, l1, l2; diff --git a/fs/dcache.c b/fs/dcache.c index b90cf8e09d5b..5c7cc953ac81 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -316,20 +316,6 @@ static void dentry_free(struct dentry *dentry) call_rcu(&dentry->d_u.d_rcu, __d_free); } -/** - * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk 
lookups - * @dentry: the target dentry - * After this call, in-progress rcu-walk path lookup will fail. This - * should be called after unhashing, and after changing d_inode (if - * the dentry has not already been unhashed). - */ -static inline void dentry_rcuwalk_invalidate(struct dentry *dentry) -{ - lockdep_assert_held(&dentry->d_lock); - /* Go through am invalidation barrier */ - write_seqcount_invalidate(&dentry->d_seq); -} - /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. @@ -468,7 +454,8 @@ void __d_drop(struct dentry *dentry) __hlist_bl_del(&dentry->d_hash); dentry->d_hash.pprev = NULL; hlist_bl_unlock(b); - dentry_rcuwalk_invalidate(dentry); + /* After this call, in-progress rcu-walk path lookup will fail. */ + write_seqcount_invalidate(&dentry->d_seq); } } EXPORT_SYMBOL(__d_drop); @@ -2060,7 +2047,7 @@ static inline bool d_same_name(const struct dentry *dentry, return false; return dentry_cmp(dentry, name->name, name->len) == 0; } - return parent->d_op->d_compare(parent, dentry, + return parent->d_op->d_compare(dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } @@ -2163,7 +2150,7 @@ seqretry: cpu_relax(); goto seqretry; } - if (parent->d_op->d_compare(parent, dentry, + if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) continue; } else { @@ -2352,19 +2339,15 @@ again: } EXPORT_SYMBOL(d_delete); -static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) +static void __d_rehash(struct dentry *entry) { + struct hlist_bl_head *b = d_hash(entry->d_name.hash); BUG_ON(!d_unhashed(entry)); hlist_bl_lock(b); hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } -static void _d_rehash(struct dentry * entry) -{ - __d_rehash(entry, d_hash(entry->d_name.hash)); -} - /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash @@ -2375,7 +2358,7 @@ static void _d_rehash(struct dentry * entry) void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); - 
_d_rehash(entry); + __d_rehash(entry); spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); @@ -2549,7 +2532,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode) raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } - _d_rehash(dentry); + __d_rehash(dentry); if (dir) end_dir_add(dir, n); spin_unlock(&dentry->d_lock); @@ -2611,7 +2594,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) alias = NULL; } else { __dget_dlock(alias); - _d_rehash(alias); + __d_rehash(alias); spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); @@ -2636,7 +2619,7 @@ EXPORT_SYMBOL(d_exact_alias); * Parent inode i_mutex must be held over d_lookup and into this call (to * keep renames and concurrent inserts, and readdir(2) away). */ -void dentry_update_name_case(struct dentry *dentry, struct qstr *name) +void dentry_update_name_case(struct dentry *dentry, const struct qstr *name) { BUG_ON(!inode_is_locked(dentry->d_parent->d_inode)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ @@ -2795,23 +2778,10 @@ static void __d_move(struct dentry *dentry, struct dentry *target, write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); + /* unhash both */ /* __d_drop does write_seqcount_barrier, but they're OK to nest. */ - - /* - * Move the dentry to the target hash queue. Don't bother checking - * for the same hash queue because of how unlikely it is. - */ __d_drop(dentry); - __d_rehash(dentry, d_hash(target->d_name.hash)); - - /* - * Unhash the target (d_delete() is not usable here). If exchanging - * the two dentries, then rehash onto the other's hash queue. - */ __d_drop(target); - if (exchange) { - __d_rehash(target, d_hash(dentry->d_name.hash)); - } /* Switch the names.. 
*/ if (exchange) @@ -2819,6 +2789,11 @@ static void __d_move(struct dentry *dentry, struct dentry *target, else copy_name(dentry, target); + /* rehash in new place(s) */ + __d_rehash(dentry); + if (exchange) + __d_rehash(target); + /* ... and switch them in the tree */ if (IS_ROOT(dentry)) { /* splicing a tree */ @@ -3038,7 +3013,7 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) * Data dependency barrier is needed to make sure that we see that terminating * NUL. Alpha strikes again, film at 11... */ -static int prepend_name(char **buffer, int *buflen, struct qstr *name) +static int prepend_name(char **buffer, int *buflen, const struct qstr *name) { const char *dname = ACCESS_ONCE(name->name); u32 dlen = ACCESS_ONCE(name->len); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index a5e607e8f056..688ccc16b702 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -45,8 +45,7 @@ static struct super_block *efivarfs_sb; * So we need to perform a case-sensitive match on part 1 and a * case-insensitive match on part 2. */ -static int efivarfs_d_compare(const struct dentry *parent, - const struct dentry *dentry, +static int efivarfs_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { diff --git a/fs/exec.c b/fs/exec.c index a1789cd684bf..6fcfb3f7b137 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -762,6 +762,39 @@ out_unlock: } EXPORT_SYMBOL(setup_arg_pages); +#else + +/* + * Transfer the program arguments and environment from the holding pages + * onto the stack. The provided stack pointer is adjusted accordingly. + */ +int transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *sp_location) +{ + unsigned long index, stop, sp; + int ret = 0; + + stop = bprm->p >> PAGE_SHIFT; + sp = *sp_location; + + for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { + unsigned int offset = index == stop ? 
bprm->p & ~PAGE_MASK : 0; + char *src = kmap(bprm->page[index]) + offset; + sp -= PAGE_SIZE - offset; + if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) + ret = -EFAULT; + kunmap(bprm->page[index]); + if (ret) + goto out; + } + + *sp_location = sp; + +out: + return ret; +} +EXPORT_SYMBOL(transfer_args_to_stack); + #endif /* CONFIG_MMU */ static struct file *do_open_execat(int fd, struct filename *name, int flags) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 19efd1197fa5..61ad490ed67b 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -358,8 +358,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx) * and the entry itself. Page is returned mapped and unlocked. * Entry is guaranteed to be valid. */ -struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir, - struct qstr *child, struct page ** res_page) +struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, + const struct qstr *child, struct page **res_page) { const char *name = child->name; int namelen = child->len; @@ -435,7 +435,7 @@ struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) return de; } -ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child) +ino_t ext2_inode_by_name(struct inode *dir, const struct qstr *child) { ino_t res = 0; struct ext2_dir_entry_2 *de; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 3fb93681bf7f..06af2f92226c 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -757,9 +757,9 @@ extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_wind /* dir.c */ extern int ext2_add_link (struct dentry *, struct inode *); -extern ino_t ext2_inode_by_name(struct inode *, struct qstr *); +extern ino_t ext2_inode_by_name(struct inode *, const struct qstr *); extern int ext2_make_empty(struct inode *, struct inode *); -extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **); +extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,const struct qstr *, struct page 
**); extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *); extern int ext2_empty_dir (struct inode *); extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e2624275d828..d64d2a515cb2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -245,7 +245,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_put(bio); return -EFAULT; } - bio->bi_rw = fio->op_flags; bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a485f68a76b1..9054aeac8015 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -219,7 +219,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, * Entry is guaranteed to be valid. */ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - struct qstr *child, struct page **res_page) + const struct qstr *child, struct page **res_page) { unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; @@ -272,7 +272,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) return f2fs_find_entry(dir, &dotdot, p); } -ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr, +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct page **page) { ino_t res = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7890e9071499..675fa79d86f6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1914,10 +1914,10 @@ struct page *init_inode_metadata(struct inode *, struct inode *, void update_parent_metadata(struct inode *, struct inode *, unsigned int); int room_for_filename(const void *, int, int); void f2fs_drop_nlink(struct inode *, struct inode *); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, +struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct 
inode *, struct qstr *, struct page **); +ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, struct inode *, const struct qstr *); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 1337c0c7527d..664655b2c55f 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -162,10 +162,10 @@ static int msdos_hash(const struct dentry *dentry, struct qstr *qstr) * Compare two msdos names. If either of the names are invalid, * we fall back to doing the standard name comparison. */ -static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry, +static int msdos_cmp(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; + struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME]; int error; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6ccdf3f34f90..92b7363dafa9 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -138,10 +138,10 @@ static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr) /* * Case insensitive compare of two vfat names. */ -static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry, +static int vfat_cmpi(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; + struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; unsigned int alen, blen; /* A filename cannot end in '.' or we treat it like it has none */ @@ -157,7 +157,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry, /* * Case sensitive compare of two vfat names. 
*/ -static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry, +static int vfat_cmp(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { unsigned int alen, blen; @@ -652,8 +652,8 @@ out_free: return err; } -static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir, - int cluster, struct timespec *ts, +static int vfat_add_entry(struct inode *dir, const struct qstr *qname, + int is_dir, int cluster, struct timespec *ts, struct fat_slot_info *sinfo) { struct msdos_dir_slot *slots; @@ -688,7 +688,7 @@ cleanup: return err; } -static int vfat_find(struct inode *dir, struct qstr *qname, +static int vfat_find(struct inode *dir, const struct qstr *qname, struct fat_slot_info *sinfo) { unsigned int len = vfat_striptail_len(qname); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 56c8fda436c0..05713a5da083 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1327,6 +1327,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) dirty = inode->i_state & I_DIRTY; if (inode->i_state & I_DIRTY_TIME) { if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + wbc->sync_mode == WB_SYNC_ALL || unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || unlikely(time_after(jiffies, (inode->dirtied_time_when + @@ -1948,6 +1949,12 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; + /* + * If we are expecting writeback progress we must submit plugged IO. 
+ */ + if (blk_needs_flush_plug(current)) + blk_schedule_flush_plug(current); + if (!nr_pages) nr_pages = get_nr_dirty_pages(); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5f1627725791..c47b7780ce37 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -146,7 +146,7 @@ static void fuse_invalidate_entry(struct dentry *entry) } static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, - u64 nodeid, struct qstr *name, + u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); @@ -282,7 +282,7 @@ int fuse_valid_type(int m) S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } -int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, +int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { struct fuse_conn *fc = get_fuse_conn_super(sb); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5db5d24f91a5..d98d8cc84def 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -704,7 +704,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); -int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, +int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode); /** diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9b7cb37b4ba8..4e05b51120f4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -673,13 +673,11 @@ static struct dentry *fuse_get_dentry(struct super_block *sb, inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); if (!inode) { struct fuse_entry_out outarg; - struct qstr name; + const struct qstr name = QSTR_INIT(".", 1); if (!fc->export_support) goto out_err; - name.len = 1; - name.name = "."; err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, &inode); if (err && err != -ENOENT) @@ -775,14 
+773,12 @@ static struct dentry *fuse_get_parent(struct dentry *child) struct inode *inode; struct dentry *parent; struct fuse_entry_out outarg; - struct qstr name; + const struct qstr name = QSTR_INIT("..", 2); int err; if (!fc->export_support) return ERR_PTR(-ESTALE); - name.len = 2; - name.name = ".."; err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), &name, &outarg, &inode); if (err) { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e0621cacf134..e4da0ecd3285 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1800,7 +1800,7 @@ int gfs2_permission(struct inode *inode, int mask) } if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) - error = -EACCES; + error = -EPERM; else error = generic_permission(inode, mask); if (gfs2_holder_initialized(&i_gh)) diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 98cde8ba5dc2..8f4afd3f5108 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -20,7 +20,7 @@ * * Given the ID of the parent and the name build a search key. */ -void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, struct qstr *name) +void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, const struct qstr *name) { key->cat.reserved = 0; key->cat.ParID = cpu_to_be32(parent); @@ -64,7 +64,7 @@ static int hfs_cat_build_record(hfs_cat_rec *rec, u32 cnid, struct inode *inode) static int hfs_cat_build_thread(struct super_block *sb, hfs_cat_rec *rec, int type, - u32 parentid, struct qstr *name) + u32 parentid, const struct qstr *name) { rec->type = type; memset(rec->thread.reserved, 0, sizeof(rec->thread.reserved)); @@ -79,7 +79,7 @@ static int hfs_cat_build_thread(struct super_block *sb, * Add a new file or directory to the catalog B-tree and * return a (struct hfs_cat_entry) for it in '*result'. 
*/ -int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) +int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct inode *inode) { struct hfs_find_data fd; struct super_block *sb; @@ -210,7 +210,7 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid, * Delete the indicated file or directory. * The associated thread is also removed unless ('with_thread'==0). */ -int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) +int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) { struct super_block *sb; struct hfs_find_data fd; @@ -277,8 +277,8 @@ out: * If the destination exists it is removed and a * (struct hfs_cat_entry) for it is returned in '*result'. */ -int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name, - struct inode *dst_dir, struct qstr *dst_name) +int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, + struct inode *dst_dir, const struct qstr *dst_name) { struct super_block *sb; struct hfs_find_data src_fd, dst_fd; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index ee2f385811c8..16f5172ee40d 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -178,11 +178,11 @@ extern int hfs_clear_vbm_bits(struct super_block *, u16, u16); extern int hfs_cat_keycmp(const btree_key *, const btree_key *); struct hfs_find_data; extern int hfs_cat_find_brec(struct super_block *, u32, struct hfs_find_data *); -extern int hfs_cat_create(u32, struct inode *, struct qstr *, struct inode *); -extern int hfs_cat_delete(u32, struct inode *, struct qstr *); -extern int hfs_cat_move(u32, struct inode *, struct qstr *, - struct inode *, struct qstr *); -extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, struct qstr *); +extern int hfs_cat_create(u32, struct inode *, const struct qstr *, struct inode *); +extern int hfs_cat_delete(u32, struct inode *, const struct qstr *); +extern int hfs_cat_move(u32, struct inode *, const struct qstr 
*, + struct inode *, const struct qstr *); +extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, const struct qstr *); /* dir.c */ extern const struct file_operations hfs_dir_operations; @@ -201,7 +201,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; -extern struct inode *hfs_new_inode(struct inode *, struct qstr *, umode_t); +extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); extern int hfs_inode_setattr(struct dentry *, struct iattr *); @@ -233,11 +233,11 @@ extern const struct dentry_operations hfs_dentry_operations; extern int hfs_hash_dentry(const struct dentry *, struct qstr *); extern int hfs_strcmp(const unsigned char *, unsigned int, const unsigned char *, unsigned int); -extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, +extern int hfs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* trans.c */ -extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *); +extern void hfs_asc2mac(struct super_block *, struct hfs_name *, const struct qstr *); extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *); /* super.c */ diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 02a3845363f7..c6a32415735b 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -177,7 +177,7 @@ const struct address_space_operations hfs_aops = { /* * hfs_new_inode */ -struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, umode_t mode) +struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode = 
new_inode(sb); diff --git a/fs/hfs/string.c b/fs/hfs/string.c index ec9f164c35a5..3912209153a8 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -92,7 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1, * Test for equality of two strings in the HFS filename character ordering. * return 1 on failure and 0 on success */ -int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, +int hfs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const unsigned char *n1, *n2; diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index b1ce4c7ad3fb..39f5e343bf4d 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -94,7 +94,7 @@ out: * This routine is a inverse to hfs_mac2triv(). * A ':' is replaced by a '/'. */ -void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, struct qstr *in) +void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr *in) { struct nls_table *nls_disk = HFS_SB(sb)->nls_disk; struct nls_table *nls_io = HFS_SB(sb)->nls_io; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index fb707e8f423a..142534d3c2d5 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -40,7 +40,7 @@ int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, /* Generates key for catalog file/folders record. 
*/ int hfsplus_cat_build_key(struct super_block *sb, - hfsplus_btree_key *key, u32 parent, struct qstr *str) + hfsplus_btree_key *key, u32 parent, const struct qstr *str) { int len, err; @@ -174,7 +174,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, static int hfsplus_fill_cat_thread(struct super_block *sb, hfsplus_cat_entry *entry, int type, - u32 parentid, struct qstr *str) + u32 parentid, const struct qstr *str) { int err; @@ -250,7 +250,7 @@ static void hfsplus_subfolders_dec(struct inode *dir) } int hfsplus_create_cat(u32 cnid, struct inode *dir, - struct qstr *str, struct inode *inode) + const struct qstr *str, struct inode *inode) { struct super_block *sb = dir->i_sb; struct hfs_find_data fd; @@ -318,7 +318,7 @@ err2: return err; } -int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) +int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) { struct super_block *sb = dir->i_sb; struct hfs_find_data fd; @@ -415,8 +415,8 @@ out: } int hfsplus_rename_cat(u32 cnid, - struct inode *src_dir, struct qstr *src_name, - struct inode *dst_dir, struct qstr *dst_name) + struct inode *src_dir, const struct qstr *src_name, + struct inode *dst_dir, const struct qstr *dst_name) { struct super_block *sb = src_dir->i_sb; struct hfs_find_data src_fd, dst_fd; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 047245bd2cd6..a3f03b247463 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -445,17 +445,17 @@ int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); int hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, - u32 parent, struct qstr *str); + u32 parent, const struct qstr *str); void hfsplus_cat_build_key_with_cnid(struct super_block *sb, hfsplus_btree_key *key, u32 parent); void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); int 
hfsplus_find_cat(struct super_block *sb, u32 cnid, struct hfs_find_data *fd); -int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, +int hfsplus_create_cat(u32 cnid, struct inode *dir, const struct qstr *str, struct inode *inode); -int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str); -int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, struct qstr *src_name, - struct inode *dst_dir, struct qstr *dst_name); +int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str); +int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, const struct qstr *src_name, + struct inode *dst_dir, const struct qstr *dst_name); /* dir.c */ extern const struct inode_operations hfsplus_dir_inode_operations; @@ -520,8 +520,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len); int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); -int hfsplus_compare_dentry(const struct dentry *parent, - const struct dentry *dentry, unsigned int len, +int hfsplus_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* wrapper.c */ diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index c13c8a240be3..e563939882f3 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -385,10 +385,10 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. 
*/ -int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry, +int hfsplus_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - struct super_block *sb = parent->d_sb; + struct super_block *sb = dentry->d_sb; int casefold, decompose, size; int dsize1, dsize2, len1, len2; const u16 *dstr1, *dstr2; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5c57654927a6..90e46cd752fe 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -959,10 +959,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) if (S_ISLNK(root_inode->i_mode)) { char *name = follow_link(host_root_path); - if (IS_ERR(name)) + if (IS_ERR(name)) { err = PTR_ERR(name); - else - err = read_name(root_inode, name); + goto out_put; + } + err = read_name(root_inode, name); kfree(name); if (err) goto out_put; diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index 60e6d334d79a..bb87d65f0d97 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -34,7 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) return 0; } -static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, +static int hpfs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { unsigned al = len; @@ -50,7 +50,7 @@ static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry if (hpfs_chk_name(name->name, &bl)) return 1; - if (hpfs_compare_names(parent->d_sb, str, al, name->name, bl, 0)) + if (hpfs_compare_names(dentry->d_sb, str, al, name->name, bl, 0)) return 1; return 0; } diff --git a/fs/inode.c b/fs/inode.c index ad445542c285..7e3ef3af3db9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1729,7 +1729,6 @@ int dentry_needs_remove_privs(struct dentry *dentry) mask |= ATTR_KILL_PRIV; return mask; } -EXPORT_SYMBOL(dentry_needs_remove_privs); static int __remove_privs(struct 
dentry *dentry, int kill) { @@ -1749,8 +1748,8 @@ static int __remove_privs(struct dentry *dentry, int kill) */ int file_remove_privs(struct file *file) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = d_inode(dentry); + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); int kill; int error = 0; @@ -1758,7 +1757,7 @@ int file_remove_privs(struct file *file) if (IS_NOSEC(inode)) return 0; - kill = file_needs_remove_privs(file); + kill = dentry_needs_remove_privs(dentry); if (kill < 0) return kill; if (kill) diff --git a/fs/internal.h b/fs/internal.h index cef0913e5d41..ba0737649d4a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -111,12 +111,14 @@ extern long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag); extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); +extern struct file *filp_clone_open(struct file *); /* * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); +extern int dentry_needs_remove_privs(struct dentry *dentry); /* * fs-writeback.c diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 761fade7680f..ad0c745ebad7 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -29,18 +29,15 @@ #define BEQUIET static int isofs_hashi(const struct dentry *parent, struct qstr *qstr); -static int isofs_dentry_cmpi(const struct dentry *parent, - const struct dentry *dentry, +static int isofs_dentry_cmpi(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr); static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr); -static int isofs_dentry_cmpi_ms(const struct dentry *parent, - const struct dentry *dentry, +static int isofs_dentry_cmpi_ms(const struct dentry *dentry, unsigned 
int len, const char *str, const struct qstr *name); -static int isofs_dentry_cmp_ms(const struct dentry *parent, - const struct dentry *dentry, +static int isofs_dentry_cmp_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #endif @@ -235,7 +232,7 @@ isofs_hashi(const struct dentry *dentry, struct qstr *qstr) } static int -isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, +isofs_dentry_cmpi(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 0, 1); @@ -276,14 +273,14 @@ isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr) } static int -isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry, +isofs_dentry_cmp_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 0); } static int -isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry, +isofs_dentry_cmpi_ms(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 1); diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 7b543e6b6526..aee592767f1d 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -22,7 +22,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen) qstr.len = dlen; if (likely(!dentry->d_op)) return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen); - return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr); + return dentry->d_op->d_compare(NULL, dentry->d_name.len, dentry->d_name.name, &qstr); } /* diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 04baf0dfc40c..814b0c58016c 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1572,7 +1572,7 @@ static int jfs_ci_hash(const struct dentry *dir, struct qstr *this) return 0; } -static int 
jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry, +static int jfs_ci_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { int i, result = 1; diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index bcd754d216bd..9568064ecadf 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -156,7 +156,7 @@ static pgoff_t hash_index(u32 hash, int round) static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) { - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; struct page *page; struct logfs_disk_dentry *dd; u32 hash = logfs_hash_32(name->name, name->len, 0); @@ -323,7 +323,7 @@ static int logfs_readdir(struct file *file, struct dir_context *ctx) return 0; } -static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) +static void logfs_set_name(struct logfs_disk_dentry *dd, const struct qstr *name) { dd->namelen = cpu_to_be16(name->len); memcpy(dd->name, name->name, name->len); diff --git a/fs/mpage.c b/fs/mpage.c index 2ca1f39c8cba..d2413af0823a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -50,7 +50,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, bio_data_dir(bio), bio->bi_error); + page_endio(page, op_is_write(bio_op(bio)), bio->bi_error); } bio_put(bio); diff --git a/fs/namei.c b/fs/namei.c index c386a329ab20..adb04146df09 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -410,7 +410,7 @@ int __inode_permission(struct inode *inode, int mask) * Nobody gets write access to an immutable file. 
*/ if (IS_IMMUTABLE(inode)) - return -EACCES; + return -EPERM; /* * Updating mtime will likely cause i_uid and i_gid to be diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 9add7ab747a5..17de5c13dfae 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -74,7 +74,7 @@ const struct inode_operations ncp_dir_inode_operations = */ static int ncp_lookup_validate(struct dentry *, unsigned int); static int ncp_hash_dentry(const struct dentry *, struct qstr *); -static int ncp_compare_dentry(const struct dentry *, const struct dentry *, +static int ncp_compare_dentry(const struct dentry *, unsigned int, const char *, const struct qstr *); static int ncp_delete_dentry(const struct dentry *); static void ncp_d_prune(struct dentry *dentry); @@ -154,7 +154,7 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) * the callers will handle races. */ static int -ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, +ncp_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct inode *pinode; @@ -162,7 +162,7 @@ ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, if (len != name->len) return 1; - pinode = d_inode_rcu(parent); + pinode = d_inode_rcu(dentry->d_parent); if (!pinode) return 1; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index cb28cceefebe..698be9361280 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -144,7 +144,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs3_proc_lookup(struct inode *dir, struct qstr *name, +nfs3_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -404,7 +404,7 @@ out: } static int -nfs3_proc_remove(struct inode *dir, struct qstr *name) +nfs3_proc_remove(struct inode *dir, const struct qstr *name) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), @@ -480,7 +480,7 @@ nfs3_proc_rename_done(struct 
rpc_task *task, struct inode *old_dir, } static int -nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +nfs3_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { struct nfs3_linkargs arg = { .fromfh = NFS_FH(inode), @@ -582,7 +582,7 @@ out: } static int -nfs3_proc_rmdir(struct inode *dir, struct qstr *name) +nfs3_proc_rmdir(struct inode *dir, const struct qstr *name) { struct nfs_fattr *dir_attr; struct nfs3_diropargs arg = { diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 33da841a21bb..6f4752734804 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -338,6 +338,8 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case 0: break; case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4be567a54958..9bf64eacba5b 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -225,7 +225,8 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *, extern struct file_system_type nfs4_fs_type; /* nfs4namespace.c */ -struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *); +struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, + const struct qstr *); struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); int nfs4_replace_transport(struct nfs_server *server, @@ -252,7 +253,7 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, struct page *page, struct rpc_cred *); extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *); -extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, +extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, const struct qstr *, struct 
nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); extern const struct xattr_handler *nfs4_xattr_handlers[]; @@ -395,6 +396,10 @@ extern void nfs4_schedule_state_renewal(struct nfs_client *); extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); +extern void nfs4_set_lease_period(struct nfs_client *clp, + unsigned long lease, + unsigned long lastrenewed); + /* nfs4state.c */ struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index f592672373cb..d21104912676 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -208,7 +208,7 @@ static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt, */ struct rpc_clnt * nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode, - struct qstr *name) + const struct qstr *name) { struct page *page; struct nfs4_secinfo_flavors *flavors; @@ -397,7 +397,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, rpc_authflavor_t flavor = server->client->cl_auth->au_flavor; struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; struct rpc_clnt *client; struct vfsmount *mnt; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index da5c9e58e907..1949bbd806eb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3538,7 +3538,7 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) } static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, - struct qstr *name, struct nfs_fh *fhandle, + const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_exception exception = { }; @@ -3580,7 +3580,7 @@ out: return err; } -static int 
nfs4_proc_lookup(struct inode *dir, struct qstr *name, +static int nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -3596,7 +3596,7 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, } struct rpc_clnt * -nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name, +nfs4_proc_lookup_mountpoint(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct rpc_clnt *client = NFS_CLIENT(dir); @@ -3755,7 +3755,7 @@ out: return status; } -static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) +static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) { struct nfs_server *server = NFS_SERVER(dir); struct nfs_removeargs args = { @@ -3778,7 +3778,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) return status; } -static int nfs4_proc_remove(struct inode *dir, struct qstr *name) +static int nfs4_proc_remove(struct inode *dir, const struct qstr *name) { struct nfs4_exception exception = { }; int err; @@ -3861,7 +3861,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 1; } -static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_link_arg arg = { @@ -3908,7 +3908,7 @@ out: return status; } -static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +static int nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { struct nfs4_exception exception = { }; int err; @@ -3930,7 +3930,7 @@ struct nfs4_createdata { }; static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, - struct qstr *name, struct iattr *sattr, u32 ftype) + const struct qstr *name, struct iattr *sattr, u32 ftype) { struct 
nfs4_createdata *data; @@ -4237,12 +4237,9 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str err = _nfs4_do_fsinfo(server, fhandle, fsinfo); trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err); if (err == 0) { - struct nfs_client *clp = server->nfs_client; - - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo->lease_time * HZ; - clp->cl_last_renewal = now; - spin_unlock(&clp->cl_lock); + nfs4_set_lease_period(server->nfs_client, + fsinfo->lease_time * HZ, + now); break; } err = nfs4_handle_exception(server, err, &exception); diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index e1ba58c3d1ad..82e77198d17e 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -136,6 +136,26 @@ nfs4_kill_renewd(struct nfs_client *clp) cancel_delayed_work_sync(&clp->cl_renewd); } +/** + * nfs4_set_lease_period - Sets the lease period on a nfs_client + * + * @clp: pointer to nfs_client + * @lease: new value for lease period + * @lastrenewed: time at which lease was last renewed + */ +void nfs4_set_lease_period(struct nfs_client *clp, + unsigned long lease, + unsigned long lastrenewed) +{ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = lease; + clp->cl_last_renewal = lastrenewed; + spin_unlock(&clp->cl_lock); + + /* Cap maximum reconnect timeout at 1/2 lease period */ + rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1); +} + /* * Local variables: * c-basic-offset: 8 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 834b875900d6..cada00aa5096 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -277,20 +277,17 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) { int status; struct nfs_fsinfo fsinfo; + unsigned long now; if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { nfs4_schedule_state_renewal(clp); return 0; } + now = jiffies; status = nfs4_proc_get_lease_time(clp, &fsinfo); if (status == 0) { - /* Update lease time and schedule renewal */ - spin_lock(&clp->cl_lock); - 
clp->cl_lease_time = fsinfo.lease_time * HZ; - clp->cl_last_renewal = jiffies; - spin_unlock(&clp->cl_lock); - + nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now); nfs4_schedule_state_renewal(clp); } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b417bbcd9704..b7bca8303989 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -145,7 +145,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct inode *dir, struct qstr *name, +nfs_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -299,7 +299,7 @@ out: } static int -nfs_proc_remove(struct inode *dir, struct qstr *name) +nfs_proc_remove(struct inode *dir, const struct qstr *name) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), @@ -357,7 +357,7 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, } static int -nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) +nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { struct nfs_linkargs arg = { .fromfh = NFS_FH(inode), @@ -456,7 +456,7 @@ out: } static int -nfs_proc_rmdir(struct inode *dir, struct qstr *name) +nfs_proc_rmdir(struct inode *dir, const struct qstr *name) { struct nfs_diropargs arg = { .fh = NFS_FH(dir), diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 1868246f56e6..191aa577dd1f 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -162,7 +162,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) * @dentry: dentry to unlink */ static int -nfs_async_unlink(struct dentry *dentry, struct qstr *name) +nfs_async_unlink(struct dentry *dentry, const struct qstr *name) { struct nfs_unlinkdata *data; int status = -ENOMEM; diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index c9f583d7bac8..47febcf99185 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -90,6 +90,7 @@ config NFSD_BLOCKLAYOUT bool "NFSv4.1 server support for pNFS 
block layouts" depends on NFSD_V4 && BLOCK select NFSD_PNFS + select EXPORTFS_BLOCK_OPS help This option enables support for the exporting pNFS block layouts in the kernel's NFS server. The pNFS block layout enables NFS @@ -102,6 +103,7 @@ config NFSD_SCSILAYOUT bool "NFSv4.1 server support for pNFS SCSI layouts" depends on NFSD_V4 && BLOCK select NFSD_PNFS + select EXPORTFS_BLOCK_OPS help This option enables support for the exporting pNFS SCSI layouts in the kernel's NFS server. The pNFS SCSI layout enables NFS @@ -111,6 +113,23 @@ config NFSD_SCSILAYOUT If unsure, say N. +config NFSD_FLEXFILELAYOUT + bool "NFSv4.1 server support for pNFS Flex File layouts" + depends on NFSD_V4 + select NFSD_PNFS + help + This option enables support for the exporting pNFS Flex File + layouts in the kernel's NFS server. The pNFS Flex File layout + enables NFS clients to directly perform I/O to NFSv3 devices + accessible to both the server and the clients. See + draft-ietf-nfsv4-flex-files for more details. + + Warning, this server implements the bare minimum functionality + to be a flex file server - it is for testing the client, + not for use in production. + + If unsure, say N. 
+ config NFSD_V4_SECURITY_LABEL bool "Provide Security Label support for NFSv4 server" depends on NFSD_V4 && SECURITY diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 3ae5f3c77e28..5f5d3a76980c 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -20,3 +20,4 @@ nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o +nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index ad2c05e80a83..5a1708441510 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -163,6 +163,7 @@ nfsd4_block_get_device_info_simple(struct super_block *sb, static __be32 nfsd4_block_proc_getdeviceinfo(struct super_block *sb, + struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { @@ -355,6 +356,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, static __be32 nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, + struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 4ebaaf4b8d8a..ac6f54546fdd 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -44,7 +44,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) switch (b->type) { case PNFS_BLOCK_VOLUME_SIMPLE: - len = 4 + 4 + 8 + 4 + b->simple.sig_len; + len = 4 + 4 + 8 + 4 + (XDR_QUADLEN(b->simple.sig_len) << 2); p = xdr_reserve_space(xdr, len); if (!p) return -ETOOSMALL; @@ -55,7 +55,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); break; case PNFS_BLOCK_VOLUME_SCSI: - len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8; + len = 4 + 4 + 4 + 4 + (XDR_QUADLEN(b->scsi.designator_len) 
<< 2) + 8; p = xdr_reserve_space(xdr, len); if (!p) return -ETOOSMALL; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b4d84b579f20..43e109cc0ccc 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -706,7 +706,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; - new->ex_layout_type = 0; + new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; } @@ -731,7 +731,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; - new->ex_layout_type = item->ex_layout_type; + new->ex_layout_types = item->ex_layout_types; new->ex_nflavors = item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; @@ -954,6 +954,16 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) return 0; } + + /* If the compound op contains a spo_must_allowed op, + * it will be sent with integrity/protection which + * will have to be expressly allowed on mounts that + * don't support it + */ + + if (nfsd4_spo_must_allow(rqstp)) + return 0; + return nfserr_wrongsec; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 2e315072bf3f..730f15eeb7ed 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -57,7 +57,7 @@ struct svc_export { struct nfsd4_fs_locations ex_fslocs; uint32_t ex_nflavors; struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; - enum pnfs_layouttype ex_layout_type; + u32 ex_layout_types; struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; }; diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c new file mode 100644 index 000000000000..df880e9fa71f --- /dev/null +++ b/fs/nfsd/flexfilelayout.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2016 Tom Haynes 
<loghyr@primarydata.com> + * + * The following implements a super-simple flex-file server + * where the NFSv4.1 mds is also the ds. And the storage is + * the same. I.e., writing to the mds via a NFSv4.1 WRITE + * goes to the same location as the NFSv3 WRITE. + */ +#include <linux/slab.h> + +#include <linux/nfsd/debug.h> + +#include <linux/sunrpc/addr.h> + +#include "flexfilelayoutxdr.h" +#include "pnfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +static __be32 +nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, + struct nfsd4_layoutget *args) +{ + struct nfsd4_layout_seg *seg = &args->lg_seg; + u32 device_generation = 0; + int error; + uid_t u; + + struct pnfs_ff_layout *fl; + + /* + * The super simple flex file server has 1 mirror, 1 data server, + * and 1 file handle. So instead of 4 allocs, do 1 for now. + * Zero it out for the stateid - don't want junk in there! + */ + error = -ENOMEM; + fl = kzalloc(sizeof(*fl), GFP_KERNEL); + if (!fl) + goto out_error; + args->lg_content = fl; + + /* + * Avoid layout commit, try to force the I/O to the DS, + * and for fun, cause all IOMODE_RW layout segments to + * effectively be WRITE only. 
+ */ + fl->flags = FF_FLAGS_NO_LAYOUTCOMMIT | FF_FLAGS_NO_IO_THRU_MDS | + FF_FLAGS_NO_READ_IO; + + /* Do not allow an IOMODE_READ segment to have write permissions */ + if (seg->iomode == IOMODE_READ) { + u = from_kuid(&init_user_ns, inode->i_uid) + 1; + fl->uid = make_kuid(&init_user_ns, u); + } else + fl->uid = inode->i_uid; + fl->gid = inode->i_gid; + + error = nfsd4_set_deviceid(&fl->deviceid, fhp, device_generation); + if (error) + goto out_error; + + fl->fh.size = fhp->fh_handle.fh_size; + memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size); + + /* Give whole file layout segments */ + seg->offset = 0; + seg->length = NFS4_MAX_UINT64; + + dprintk("GET: 0x%llx:0x%llx %d\n", seg->offset, seg->length, + seg->iomode); + return 0; + +out_error: + seg->length = 0; + return nfserrno(error); +} + +static __be32 +nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, + struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_ff_device_addr *da; + + u16 port; + char addr[INET6_ADDRSTRLEN]; + + da = kzalloc(sizeof(struct pnfs_ff_device_addr), GFP_KERNEL); + if (!da) + return nfserrno(-ENOMEM); + + gdp->gd_device = da; + + da->version = 3; + da->minor_version = 0; + + da->rsize = svc_max_payload(rqstp); + da->wsize = da->rsize; + + rpc_ntop((struct sockaddr *)&rqstp->rq_daddr, + addr, INET6_ADDRSTRLEN); + if (rqstp->rq_daddr.ss_family == AF_INET) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&rqstp->rq_daddr; + port = ntohs(sin->sin_port); + snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp"); + da->netaddr.netid_len = 3; + } else { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&rqstp->rq_daddr; + port = ntohs(sin6->sin6_port); + snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp6"); + da->netaddr.netid_len = 4; + } + + da->netaddr.addr_len = + snprintf(da->netaddr.addr, FF_ADDR_LEN + 1, + "%s.%hhu.%hhu", addr, port >> 8, port & 0xff); + + da->tightly_coupled = false; + + return 0; +} + +const 
struct nfsd4_layout_ops ff_layout_ops = { + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo, + .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, + .proc_layoutget = nfsd4_ff_proc_layoutget, + .encode_layoutget = nfsd4_ff_encode_layoutget, +}; diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c new file mode 100644 index 000000000000..5e3fd7fc1a9f --- /dev/null +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + */ +#include <linux/sunrpc/svc.h> +#include <linux/nfs4.h> + +#include "nfsd.h" +#include "flexfilelayoutxdr.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct ff_idmap { + char buf[11]; + int len; +}; + +__be32 +nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp) +{ + struct pnfs_ff_layout *fl = lgp->lg_content; + int len, mirror_len, ds_len, fh_len; + __be32 *p; + + /* + * Unlike nfsd4_encode_user, we know these will + * always be stringified. 
+ */ + struct ff_idmap uid; + struct ff_idmap gid; + + fh_len = 4 + fl->fh.size; + + uid.len = sprintf(uid.buf, "%u", from_kuid(&init_user_ns, fl->uid)); + gid.len = sprintf(gid.buf, "%u", from_kgid(&init_user_ns, fl->gid)); + + /* 8 + len for recording the length, name, and padding */ + ds_len = 20 + sizeof(stateid_opaque_t) + 4 + fh_len + + 8 + uid.len + 8 + gid.len; + + mirror_len = 4 + ds_len; + + /* The layout segment */ + len = 20 + mirror_len; + + p = xdr_reserve_space(xdr, sizeof(__be32) + len); + if (!p) + return nfserr_toosmall; + + *p++ = cpu_to_be32(len); + p = xdr_encode_hyper(p, 0); /* stripe unit of 1 */ + + *p++ = cpu_to_be32(1); /* single mirror */ + *p++ = cpu_to_be32(1); /* single data server */ + + p = xdr_encode_opaque_fixed(p, &fl->deviceid, + sizeof(struct nfsd4_deviceid)); + + *p++ = cpu_to_be32(1); /* efficiency */ + + *p++ = cpu_to_be32(fl->stateid.si_generation); + p = xdr_encode_opaque_fixed(p, &fl->stateid.si_opaque, + sizeof(stateid_opaque_t)); + + *p++ = cpu_to_be32(1); /* single file handle */ + p = xdr_encode_opaque(p, fl->fh.data, fl->fh.size); + + p = xdr_encode_opaque(p, uid.buf, uid.len); + p = xdr_encode_opaque(p, gid.buf, gid.len); + + *p++ = cpu_to_be32(fl->flags); + *p++ = cpu_to_be32(0); /* No stats collect hint */ + + return 0; +} + +__be32 +nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_ff_device_addr *da = gdp->gd_device; + int len; + int ver_len; + int addr_len; + __be32 *p; + + /* len + padding for two strings */ + addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len; + ver_len = 20; + + len = 4 + ver_len + 4 + addr_len; + + p = xdr_reserve_space(xdr, len + sizeof(__be32)); + if (!p) + return nfserr_resource; + + /* + * Fill in the overall length and number of volumes at the beginning + * of the layout. 
+ */ + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(1); /* 1 netaddr */ + p = xdr_encode_opaque(p, da->netaddr.netid, da->netaddr.netid_len); + p = xdr_encode_opaque(p, da->netaddr.addr, da->netaddr.addr_len); + + *p++ = cpu_to_be32(1); /* 1 versions */ + + *p++ = cpu_to_be32(da->version); + *p++ = cpu_to_be32(da->minor_version); + *p++ = cpu_to_be32(da->rsize); + *p++ = cpu_to_be32(da->wsize); + *p++ = cpu_to_be32(da->tightly_coupled); + + return 0; +} diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h new file mode 100644 index 000000000000..467defd4e563 --- /dev/null +++ b/fs/nfsd/flexfilelayoutxdr.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + */ +#ifndef _NFSD_FLEXFILELAYOUTXDR_H +#define _NFSD_FLEXFILELAYOUTXDR_H 1 + +#include <linux/inet.h> +#include "xdr4.h" + +#define FF_FLAGS_NO_LAYOUTCOMMIT 1 +#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_READ_IO 4 + +struct xdr_stream; + +#define FF_NETID_LEN (4) +#define FF_ADDR_LEN (INET6_ADDRSTRLEN + 8) +struct pnfs_ff_netaddr { + char netid[FF_NETID_LEN + 1]; + char addr[FF_ADDR_LEN + 1]; + u32 netid_len; + u32 addr_len; +}; + +struct pnfs_ff_device_addr { + struct pnfs_ff_netaddr netaddr; + u32 version; + u32 minor_version; + u32 rsize; + u32 wsize; + bool tightly_coupled; +}; + +struct pnfs_ff_layout { + u32 flags; + u32 stats_collect_hint; + kuid_t uid; + kgid_t gid; + struct nfsd4_deviceid deviceid; + stateid_t stateid; + struct nfs_fh fh; +}; + +__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp); +__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp); + +#endif /* _NFSD_FLEXFILELAYOUTXDR_H */ diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 953c0755cb37..2be9602b0221 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -27,6 +27,9 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lock_manager_operations 
nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { +#ifdef CONFIG_NFSD_FLEXFILELAYOUT + [LAYOUT_FLEX_FILES] = &ff_layout_ops, +#endif #ifdef CONFIG_NFSD_BLOCKLAYOUT [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, #endif @@ -122,28 +125,35 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, void nfsd4_setup_layout_type(struct svc_export *exp) { +#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT) struct super_block *sb = exp->ex_path.mnt->mnt_sb; +#endif if (!(exp->ex_flags & NFSEXP_PNFS)) return; /* - * Check if the file system supports exporting a block-like layout. + * If flex file is configured, use it by default. Otherwise + * check if the file system supports exporting a block-like layout. * If the block device supports reservations prefer the SCSI layout, * otherwise advertise the block layout. */ +#ifdef CONFIG_NFSD_FLEXFILELAYOUT + exp->ex_layout_types |= 1 << LAYOUT_FLEX_FILES; +#endif #ifdef CONFIG_NFSD_BLOCKLAYOUT + /* overwrite flex file layout selection if needed */ if (sb->s_export_op->get_uuid && sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks) - exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; + exp->ex_layout_types |= 1 << LAYOUT_BLOCK_VOLUME; #endif #ifdef CONFIG_NFSD_SCSILAYOUT /* overwrite block layout selection if needed */ if (sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks && sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops) - exp->ex_layout_type = LAYOUT_SCSI; + exp->ex_layout_types |= 1 << LAYOUT_SCSI; #endif } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index de1ff1d98bb1..1fb222752b2b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -605,8 +605,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fh_init(&resfh, NFS4_FHSIZE); - status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, - NFSD_MAY_CREATE); + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_NOP); if (status) return 
status; @@ -1219,12 +1218,12 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static const struct nfsd4_layout_ops * nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) { - if (!exp->ex_layout_type) { + if (!exp->ex_layout_types) { dprintk("%s: export does not support pNFS\n", __func__); return NULL; } - if (exp->ex_layout_type != layout_type) { + if (!(exp->ex_layout_types & (1 << layout_type))) { dprintk("%s: layout type %d not supported\n", __func__, layout_type); return NULL; @@ -1270,7 +1269,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp, nfserr = nfs_ok; if (gdp->gd_maxcount != 0) { nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, - cstate->session->se_client, gdp); + rqstp, cstate->session->se_client, gdp); } gdp->gd_notify_types &= ops->notify_types; @@ -2335,6 +2334,45 @@ static struct nfsd4_operation nfsd4_ops[] = { }, }; +/** + * nfsd4_spo_must_allow - Determine if the compound op contains an + * operation that is allowed to be sent with machine credentials + * + * @rqstp: a pointer to the struct svc_rqst + * + * Checks to see if the compound contains a spo_must_allow op + * and confirms that it was sent with the proper machine creds. 
+ */ + +bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_compound_state *cstate = &resp->cstate; + struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; + u32 opiter; + + if (!cstate->minorversion) + return false; + + if (cstate->spo_must_allowed == true) + return true; + + opiter = resp->opcnt; + while (opiter < argp->opcnt) { + this = &argp->ops[opiter++]; + if (test_bit(this->opnum, allow->u.longs) && + cstate->clp->cl_mach_cred && + nfsd4_mach_creds_match(cstate->clp, rqstp)) { + cstate->spo_must_allowed = true; + return true; + } + } + cstate->spo_must_allowed = false; + return false; +} + int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) { struct nfsd4_operation *opdesc; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 70d0b9b33031..a204d7e109d4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1200,27 +1200,6 @@ free_ol_stateid_reaplist(struct list_head *reaplist) } } -static void release_lockowner(struct nfs4_lockowner *lo) -{ - struct nfs4_client *clp = lo->lo_owner.so_client; - struct nfs4_ol_stateid *stp; - struct list_head reaplist; - - INIT_LIST_HEAD(&reaplist); - - spin_lock(&clp->cl_lock); - unhash_lockowner_locked(lo); - while (!list_empty(&lo->lo_owner.so_stateids)) { - stp = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_ol_stateid, st_perstateowner); - WARN_ON(!unhash_lock_stateid(stp)); - put_ol_stateid_locked(stp, &reaplist); - } - spin_unlock(&clp->cl_lock); - free_ol_stateid_reaplist(&reaplist); - nfs4_put_stateowner(&lo->lo_owner); -} - static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, struct list_head *reaplist) { @@ -1972,7 +1951,7 @@ static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) service == RPC_GSS_SVC_PRIVACY; } -static bool mach_creds_match(struct nfs4_client *cl, 
struct svc_rqst *rqstp) +bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) { struct svc_cred *cr = &rqstp->rq_cred; @@ -2388,6 +2367,22 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, switch (exid->spa_how) { case SP4_MACH_CRED: + exid->spo_must_enforce[0] = 0; + exid->spo_must_enforce[1] = ( + 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32)); + + exid->spo_must_allow[0] &= (1 << (OP_CLOSE) | + 1 << (OP_OPEN_DOWNGRADE) | + 1 << (OP_LOCKU) | + 1 << (OP_DELEGRETURN)); + + exid->spo_must_allow[1] &= ( + 1 << (OP_TEST_STATEID - 32) | + 1 << (OP_FREE_STATEID - 32)); if (!svc_rqst_integrity_protected(rqstp)) { status = nfserr_inval; goto out_nolock; @@ -2424,7 +2419,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, status = nfserr_inval; goto out; } - if (!mach_creds_match(conf, rqstp)) { + if (!nfsd4_mach_creds_match(conf, rqstp)) { status = nfserr_wrong_cred; goto out; } @@ -2473,6 +2468,8 @@ out_new: goto out; } new->cl_minorversion = cstate->minorversion; + new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; + new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1]; gen_clid(new, nn); add_to_unconfirmed(new); @@ -2676,7 +2673,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (conf) { status = nfserr_wrong_cred; - if (!mach_creds_match(conf, rqstp)) + if (!nfsd4_mach_creds_match(conf, rqstp)) goto out_free_conn; cs_slot = &conf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); @@ -2692,7 +2689,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; } status = nfserr_wrong_cred; - if (!mach_creds_match(unconf, rqstp)) + if (!nfsd4_mach_creds_match(unconf, rqstp)) goto out_free_conn; cs_slot = &unconf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); @@ -2801,7 +2798,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, if (!session) 
goto out_no_session; status = nfserr_wrong_cred; - if (!mach_creds_match(session->se_client, rqstp)) + if (!nfsd4_mach_creds_match(session->se_client, rqstp)) goto out; status = nfsd4_map_bcts_dir(&bcts->dir); if (status) @@ -2848,7 +2845,7 @@ nfsd4_destroy_session(struct svc_rqst *r, if (!ses) goto out_client_lock; status = nfserr_wrong_cred; - if (!mach_creds_match(ses->se_client, r)) + if (!nfsd4_mach_creds_match(ses->se_client, r)) goto out_put_session; status = mark_session_dead_locked(ses, 1 + ref_held_by_me); if (status) @@ -3087,7 +3084,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta status = nfserr_stale_clientid; goto out; } - if (!mach_creds_match(clp, rqstp)) { + if (!nfsd4_mach_creds_match(clp, rqstp)) { clp = NULL; status = nfserr_wrong_cred; goto out; @@ -3112,7 +3109,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta * We don't take advantage of the rca_one_fs case. * That's OK, it's optional, we can safely ignore it. 
*/ - return nfs_ok; + return nfs_ok; } status = nfserr_complete_already; @@ -4906,6 +4903,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfs_ok; } +static __be32 +nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) +{ + struct nfs4_ol_stateid *stp = openlockstateid(s); + __be32 ret; + + mutex_lock(&stp->st_mutex); + + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + goto out; + + ret = nfserr_locks_held; + if (check_for_locks(stp->st_stid.sc_file, + lockowner(stp->st_stateowner))) + goto out; + + release_lock_stateid(stp); + ret = nfs_ok; + +out: + mutex_unlock(&stp->st_mutex); + nfs4_put_stid(s); + return ret; +} + __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *free_stateid) @@ -4913,7 +4936,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; - struct nfs4_ol_stateid *stp; struct nfs4_client *cl = cstate->session->se_client; __be32 ret = nfserr_bad_stateid; @@ -4932,18 +4954,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_locks_held; break; case NFS4_LOCK_STID: - ret = check_stateid_generation(stateid, &s->sc_stateid, 1); - if (ret) - break; - stp = openlockstateid(s); - ret = nfserr_locks_held; - if (check_for_locks(stp->st_stid.sc_file, - lockowner(stp->st_stateowner))) - break; - WARN_ON(!unhash_lock_stateid(stp)); + atomic_inc(&s->sc_count); spin_unlock(&cl->cl_lock); - nfs4_put_stid(s); - ret = nfs_ok; + ret = nfsd4_free_lock_stateid(stateid, s); goto out; case NFS4_REVOKED_DELEG_STID: dp = delegstateid(s); @@ -5510,7 +5523,7 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, - struct nfs4_ol_stateid **lst, bool *new) + struct nfs4_ol_stateid **plst, 
bool *new) { __be32 status; struct nfs4_file *fi = ost->st_stid.sc_file; @@ -5518,7 +5531,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_client *cl = oo->oo_owner.so_client; struct inode *inode = d_inode(cstate->current_fh.fh_dentry); struct nfs4_lockowner *lo; + struct nfs4_ol_stateid *lst; unsigned int strhashval; + bool hashed; lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { @@ -5534,12 +5549,27 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, goto out; } - *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); - if (*lst == NULL) { +retry: + lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); + if (lst == NULL) { status = nfserr_jukebox; goto out; } + + mutex_lock(&lst->st_mutex); + + /* See if it's still hashed to avoid race with FREE_STATEID */ + spin_lock(&cl->cl_lock); + hashed = !list_empty(&lst->st_perfile); + spin_unlock(&cl->cl_lock); + + if (!hashed) { + mutex_unlock(&lst->st_mutex); + nfs4_put_stid(&lst->st_stid); + goto retry; + } status = nfs_ok; + *plst = lst; out: nfs4_put_stateowner(&lo->lo_owner); return status; @@ -5606,8 +5636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); - if (status == nfs_ok) - mutex_lock(&lock_stp->st_mutex); } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, @@ -5945,6 +5973,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfs4_client *clp; + LIST_HEAD (reaplist); dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); @@ -5975,9 +6004,23 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, nfs4_get_stateowner(sop); break; } + if (!lo) { + spin_unlock(&clp->cl_lock); + return status; + } + + unhash_lockowner_locked(lo); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = 
list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, + st_perstateowner); + WARN_ON(!unhash_lock_stateid(stp)); + put_ol_stateid_locked(stp, &reaplist); + } spin_unlock(&clp->cl_lock); - if (lo) - release_lockowner(lo); + free_ol_stateid_reaplist(&reaplist); + nfs4_put_stateowner(&lo->lo_owner); + return status; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9df898ba648f..0aa0236a1429 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1299,16 +1299,14 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, break; case SP4_MACH_CRED: /* spo_must_enforce */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; - + status = nfsd4_decode_bitmap(argp, + exid->spo_must_enforce); + if (status) + goto out; /* spo_must_allow */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; + status = nfsd4_decode_bitmap(argp, exid->spo_must_allow); + if (status) + goto out; break; case SP4_SSV: /* ssp_ops */ @@ -2164,22 +2162,20 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, } static inline __be32 -nfsd4_encode_layout_type(struct xdr_stream *xdr, enum pnfs_layouttype layout_type) +nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types) { - __be32 *p; + __be32 *p; + unsigned long i = hweight_long(layout_types); - if (layout_type) { - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(1); - *p++ = cpu_to_be32(layout_type); - } else { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); - } + p = xdr_reserve_space(xdr, 4 + 4 * i); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(i); + + for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) + if (layout_types & (1 << i)) + *p++ = cpu_to_be32(i); return 0; } @@ -2754,13 +2750,13 @@ out_acl: } #ifdef CONFIG_NFSD_PNFS if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = nfsd4_encode_layout_type(xdr, 
exp->ex_layout_type); + status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); if (status) goto out; } if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) { - status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type); + status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); if (status) goto out; } @@ -3867,14 +3863,6 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w return nfserr; } -static const u32 nfs4_minimal_spo_must_enforce[2] = { - [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | - 1 << (OP_EXCHANGE_ID - 32) | - 1 << (OP_CREATE_SESSION - 32) | - 1 << (OP_DESTROY_SESSION - 32) | - 1 << (OP_DESTROY_CLIENTID - 32) -}; - static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_exchange_id *exid) @@ -3885,6 +3873,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, char *server_scope; int major_id_sz; int server_scope_sz; + int status = 0; uint64_t minor_id = 0; if (nfserr) @@ -3913,18 +3902,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, case SP4_NONE: break; case SP4_MACH_CRED: - /* spo_must_enforce, spo_must_allow */ - p = xdr_reserve_space(xdr, 16); - if (!p) - return nfserr_resource; - /* spo_must_enforce bitmap: */ - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]); - *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]); - /* empty spo_must_allow bitmap: */ - *p++ = cpu_to_be32(0); - + status = nfsd4_encode_bitmap(xdr, + exid->spo_must_enforce[0], + exid->spo_must_enforce[1], + exid->spo_must_enforce[2]); + if (status) + goto out; + /* spo_must_allow bitmap: */ + status = nfsd4_encode_bitmap(xdr, + exid->spo_must_allow[0], + exid->spo_must_allow[1], + exid->spo_must_allow[2]); + if (status) + goto out; break; default: WARN_ON_ONCE(1); @@ -3951,6 +3942,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, /* Implementation id */ *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 
array */ return 0; +out: + return status; } static __be32 diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index cf980523898b..9446849888d5 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -124,6 +124,7 @@ void nfs4_state_shutdown_net(struct net *net); void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); +bool nfsd4_spo_must_allow(struct svc_rqst *rqstp); #else static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } @@ -134,6 +135,10 @@ static inline void nfs4_state_shutdown_net(struct net *net) { } static inline void nfs4_reset_lease(time_t leasetime) { } static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } static inline char * nfs4_recoverydir(void) {return NULL; } +static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) +{ + return false; +} #endif /* diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index a8919444c460..cfe7500d5847 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -59,14 +59,20 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) * the write call). */ static inline __be32 -nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested) +nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry, + umode_t requested) { - mode &= S_IFMT; + umode_t mode = d_inode(dentry)->i_mode & S_IFMT; if (requested == 0) /* the caller doesn't care */ return nfs_ok; - if (mode == requested) + if (mode == requested) { + if (mode == S_IFDIR && !d_can_lookup(dentry)) { + WARN_ON_ONCE(1); + return nfserr_notdir; + } return nfs_ok; + } /* * v4 has an error more specific than err_notdir which we should * return in preference to err_notdir: @@ -298,7 +304,7 @@ out: * that it expects something not of the given type. * * @access is formed from the NFSD_MAY_* constants defined in - * include/linux/nfsd/nfsd.h. + * fs/nfsd/vfs.h. 
*/ __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) @@ -340,7 +346,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) if (error) goto out; - error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type); + error = nfsd_mode_check(rqstp, dentry, type); if (error) goto out; @@ -533,7 +539,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, * the reference filehandle (if it is in the same export) * or the export options. */ - set_version_and_fsid_type(fhp, exp, ref_fh); + set_version_and_fsid_type(fhp, exp, ref_fh); if (ref_fh == fhp) fh_put(ref_fh); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 4cd78ef4c95c..e9214768cde9 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -251,9 +251,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ - nfserr = nfserr_acces; - if (!argp->len) - goto done; nfserr = nfserr_exist; if (isdotent(argp->name, argp->len)) goto done; @@ -362,8 +359,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, nfserr = 0; if (!inode) { /* File doesn't exist. 
Create it and set attrs */ - nfserr = nfsd_create(rqstp, dirfhp, argp->name, argp->len, - attr, type, rdev, newfhp); + nfserr = nfsd_create_locked(rqstp, dirfhp, argp->name, + argp->len, attr, type, rdev, newfhp); } else if (type == S_IFREG) { dprintk("nfsd: existing %s, valid=%x, size=%ld\n", argp->name, attr->ia_valid, (long) attr->ia_size); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 79d964aa8079..41b468a6a90f 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -240,7 +240,7 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p, || !(p = decode_filename(p, &args->name, &args->len))) return 0; - return xdr_argsize_check(rqstp, p); + return xdr_argsize_check(rqstp, p); } int diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 7d073b9b1553..0c2a716e8741 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -21,6 +21,7 @@ struct nfsd4_layout_ops { u32 notify_types; __be32 (*proc_getdeviceinfo)(struct super_block *sb, + struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdevp); __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, @@ -44,6 +45,9 @@ extern const struct nfsd4_layout_ops bl_layout_ops; #ifdef CONFIG_NFSD_SCSILAYOUT extern const struct nfsd4_layout_ops scsi_layout_ops; #endif +#ifdef CONFIG_NFSD_FLEXFILELAYOUT +extern const struct nfsd4_layout_ops ff_layout_ops; +#endif __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 64053eadeb81..b95adf9a1595 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -345,6 +345,7 @@ struct nfs4_client { u32 cl_exchange_flags; /* number of rpc's in progress over an associated session: */ atomic_t cl_refcount; + struct nfs4_op_map cl_spo_must_allow; /* for nfs41 callbacks */ /* We currently support a single back channel with a single slot */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6fbd81ecb410..ff476e654b8f 100644 --- a/fs/nfsd/vfs.c +++ 
b/fs/nfsd/vfs.c @@ -1135,96 +1135,37 @@ nfsd_check_ignore_resizing(struct iattr *iap) iap->ia_valid &= ~ATTR_SIZE; } -/* - * Create a file (regular, directory, device, fifo); UNIX sockets - * not yet implemented. - * If the response fh has been verified, the parent directory should - * already be locked. Note that the parent directory is left locked. - * - * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp - */ +/* The parent directory should already be locked: */ __be32 -nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, +nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, struct iattr *iap, int type, dev_t rdev, struct svc_fh *resfhp) { - struct dentry *dentry, *dchild = NULL; + struct dentry *dentry, *dchild; struct inode *dirp; __be32 err; __be32 err2; int host_err; - err = nfserr_perm; - if (!flen) - goto out; - err = nfserr_exist; - if (isdotent(fname, flen)) - goto out; - - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); - if (err) - goto out; - dentry = fhp->fh_dentry; dirp = d_inode(dentry); - err = nfserr_notdir; - if (!dirp->i_op->lookup) - goto out; - /* - * Check whether the response file handle has been verified yet. - * If it has, the parent directory should already be locked. 
- */ - if (!resfhp->fh_dentry) { - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - - /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ - fh_lock_nested(fhp, I_MUTEX_PARENT); - dchild = lookup_one_len(fname, dentry, flen); - host_err = PTR_ERR(dchild); - if (IS_ERR(dchild)) - goto out_nfserr; - err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - if (err) - goto out; - } else { - /* called from nfsd_proc_create */ - dchild = dget(resfhp->fh_dentry); - if (!fhp->fh_locked) { - /* not actually possible */ - printk(KERN_ERR - "nfsd_create: parent %pd2 not locked!\n", + dchild = dget(resfhp->fh_dentry); + if (!fhp->fh_locked) { + WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n", dentry); - err = nfserr_io; - goto out; - } - } - /* - * Make sure the child dentry is still negative ... - */ - err = nfserr_exist; - if (d_really_is_positive(dchild)) { - dprintk("nfsd_create: dentry %pd/%pd not negative!\n", - dentry, dchild); - goto out; + err = nfserr_io; + goto out; } + err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE); + if (err) + goto out; + if (!(iap->ia_valid & ATTR_MODE)) iap->ia_mode = 0; iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; - err = nfserr_inval; - if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) { - printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", - type); - goto out; - } - - /* - * Get the dir op function pointer. - */ err = 0; host_err = 0; switch (type) { @@ -1242,6 +1183,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFSOCK: host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; + default: + printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", + type); + host_err = -EINVAL; } if (host_err < 0) goto out_nfserr; @@ -1251,7 +1196,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, /* * nfsd_create_setattr already committed the child. 
Transactional * filesystems had a chance to commit changes for both parent and - * child * simultaneously making the following commit_metadata a + * child simultaneously making the following commit_metadata a * noop. */ err2 = nfserrno(commit_metadata(fhp)); @@ -1263,8 +1208,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!err) err = fh_update(resfhp); out: - if (dchild && !IS_ERR(dchild)) - dput(dchild); + dput(dchild); return err; out_nfserr: @@ -1272,6 +1216,53 @@ out_nfserr: goto out; } +/* + * Create a filesystem object (regular, directory, special). + * Note that the parent directory is left locked. + * + * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp + */ +__be32 +nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + int type, dev_t rdev, struct svc_fh *resfhp) +{ + struct dentry *dentry, *dchild = NULL; + struct inode *dirp; + __be32 err; + int host_err; + + if (isdotent(fname, flen)) + return nfserr_exist; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_NOP); + if (err) + return err; + + dentry = fhp->fh_dentry; + dirp = d_inode(dentry); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + fh_lock_nested(fhp, I_MUTEX_PARENT); + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + return nfserrno(host_err); + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + /* + * We unconditionally drop our ref to dchild as fh_compose will have + * already grabbed its own ref for it. + */ + dput(dchild); + if (err) + return err; + return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type, + rdev, resfhp); +} + #ifdef CONFIG_NFSD_V3 /* @@ -1304,12 +1295,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; dirp = d_inode(dentry); - /* Get all the sanity checks out of the way before - * we lock the parent. 
*/ - err = nfserr_notdir; - if (!dirp->i_op->lookup) - goto out; - host_err = fh_want_write(fhp); if (host_err) goto out_nfserr; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 2d573ec057f8..3cbb1b33777b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -59,6 +59,9 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, __be32 nfsd4_clone_file_range(struct file *, u64, struct file *, u64, u64); #endif /* CONFIG_NFSD_V4 */ +__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, int type, dev_t rdev, struct svc_fh *res); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d9554813e58a..beea0c5edc51 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -59,6 +59,7 @@ struct nfsd4_compound_state { struct nfsd4_session *session; struct nfsd4_slot *slot; int data_offset; + bool spo_must_allowed; size_t iovlen; u32 minorversion; __be32 status; @@ -403,6 +404,8 @@ struct nfsd4_exchange_id { clientid_t clientid; u32 seqid; int spa_how; + u32 spo_must_enforce[3]; + u32 spo_must_allow[3]; }; struct nfsd4_sequence { @@ -654,6 +657,8 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) } + +bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, struct nfsd4_compoundargs *); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 12e064b8be9a..533bd524e41e 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -172,12 +172,10 @@ void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { struct hlist_head *bucket; - struct qstr *q; assert_spin_locked(&dlm->spinlock); - q = &res->lockname; 
- bucket = dlm_lockres_hash(dlm, q->hash); + bucket = dlm_lockres_hash(dlm, res->lockname.hash); /* get a reference for our hashtable */ dlm_lockres_get(res); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 47b3b2d4e775..ef474cdd6404 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -469,7 +469,7 @@ static int dlmfs_mkdir(struct inode * dir, { int status; struct inode *inode = NULL; - struct qstr *domain = &dentry->d_name; + const struct qstr *domain = &dentry->d_name; struct dlmfs_inode_private *ip; struct ocfs2_cluster_connection *conn; @@ -518,7 +518,7 @@ static int dlmfs_create(struct inode *dir, { int status = 0; struct inode *inode; - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; mlog(0, "create %.*s\n", name->len, name->name); diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 0499e3fb7bdb..f70cda2f090d 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -667,7 +667,7 @@ void user_dlm_set_locking_protocol(void) ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version); } -struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name) +struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name) { int rc; struct ocfs2_cluster_connection *conn; diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h index 3b42d79531d7..ede94a6e7fd3 100644 --- a/fs/ocfs2/dlmfs/userdlm.h +++ b/fs/ocfs2/dlmfs/userdlm.h @@ -83,7 +83,7 @@ void user_dlm_write_lvb(struct inode *inode, ssize_t user_dlm_read_lvb(struct inode *inode, char *val, unsigned int len); -struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name); +struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name); void user_dlm_unregister(struct ocfs2_cluster_connection *conn); void user_dlm_set_locking_protocol(void); diff --git a/fs/open.c b/fs/open.c index bf66cf1a9f5c..4fd6e256f4f4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -998,6 
+998,26 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(file_open_root); +struct file *filp_clone_open(struct file *oldfile) +{ + struct file *file; + int retval; + + file = get_empty_filp(); + if (IS_ERR(file)) + return file; + + file->f_flags = oldfile->f_flags; + retval = vfs_open(&oldfile->f_path, file, oldfile->f_cred); + if (retval) { + put_filp(file); + return ERR_PTR(retval); + } + + return file; +} +EXPORT_SYMBOL(filp_clone_open); + long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_flags op; diff --git a/fs/pipe.c b/fs/pipe.c index 4b32928f5426..4ebe6b2e5217 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -144,10 +144,8 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct page *page = buf->page; if (page_count(page) == 1) { - if (memcg_kmem_enabled()) { + if (memcg_kmem_enabled()) memcg_kmem_uncharge(page, 0); - __ClearPageKmemcg(page); - } __SetPageLocked(page); return 0; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 09e18fdf61e5..b9a8c813e5e6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -46,7 +46,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) cached = 0; for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_page_state(NR_LRU_BASE + lru); + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 62d8c6975d34..2ed3d71d4767 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -479,7 +479,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, { struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); @@ -839,7 +839,7 @@ static int sysctl_is_seen(struct ctl_table_header *p) 
return res; } -static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, +static int proc_sys_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct ctl_table_header *head; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 47516a794011..7a034d62cf8c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -486,30 +486,21 @@ static int ramoops_parse_dt(struct platform_device *pdev, struct ramoops_platform_data *pdata) { struct device_node *of_node = pdev->dev.of_node; - struct device_node *mem_region; - struct resource res; + struct resource *res; u32 value; int ret; dev_dbg(&pdev->dev, "using Device Tree\n"); - mem_region = of_parse_phandle(of_node, "memory-region", 0); - if (!mem_region) { - dev_err(&pdev->dev, "no memory-region phandle\n"); - return -ENODEV; - } - - ret = of_address_to_resource(mem_region, 0, &res); - of_node_put(mem_region); - if (ret) { + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { dev_err(&pdev->dev, - "failed to translate memory-region to resource: %d\n", - ret); - return ret; + "failed to locate DT /reserved-memory resource\n"); + return -EINVAL; } - pdata->mem_size = resource_size(&res); - pdata->mem_address = res.start; + pdata->mem_size = resource_size(res); + pdata->mem_address = res->start; pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops"); @@ -652,11 +643,11 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; - kfree(cxt->mprz); + persistent_ram_free(cxt->mprz); fail_init_mprz: - kfree(cxt->fprz); + persistent_ram_free(cxt->fprz); fail_init_fprz: - kfree(cxt->cprz); + persistent_ram_free(cxt->cprz); fail_init_cprz: ramoops_free_przs(cxt); fail_out: diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 9718da86ad01..821b34816976 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -100,10 +100,6 @@ static int switch_gc_head(struct ubifs_info 
*c) if (err) return err; - err = ubifs_wbuf_sync_nolock(wbuf); - if (err) - return err; - err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); if (err) return err; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 70349954e78b..4ec051089186 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -520,19 +520,19 @@ static int init_constants_early(struct ubifs_info *c) c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { - ubifs_err(c, "too small LEBs (%d bytes), min. is %d bytes", - c->leb_size, UBIFS_MIN_LEB_SZ); + ubifs_errc(c, "too small LEBs (%d bytes), min. is %d bytes", + c->leb_size, UBIFS_MIN_LEB_SZ); return -EINVAL; } if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { - ubifs_err(c, "too few LEBs (%d), min. is %d", - c->leb_cnt, UBIFS_MIN_LEB_CNT); + ubifs_errc(c, "too few LEBs (%d), min. is %d", + c->leb_cnt, UBIFS_MIN_LEB_CNT); return -EINVAL; } if (!is_power_of_2(c->min_io_size)) { - ubifs_err(c, "bad min. I/O size %d", c->min_io_size); + ubifs_errc(c, "bad min. I/O size %d", c->min_io_size); return -EINVAL; } @@ -543,8 +543,8 @@ static int init_constants_early(struct ubifs_info *c) if (c->max_write_size < c->min_io_size || c->max_write_size % c->min_io_size || !is_power_of_2(c->max_write_size)) { - ubifs_err(c, "bad write buffer size %d for %d min. I/O unit", - c->max_write_size, c->min_io_size); + ubifs_errc(c, "bad write buffer size %d for %d min. 
I/O unit", + c->max_write_size, c->min_io_size); return -EINVAL; } @@ -2108,8 +2108,9 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", - current->pid, name, (int)PTR_ERR(ubi)); + if (!(flags & MS_SILENT)) + pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", + current->pid, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index ddf9f6b9eee2..4617d459022a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1783,8 +1783,8 @@ void ubifs_err(const struct ubifs_info *c, const char *fmt, ...); __printf(2, 3) void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...); /* - * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description - * object as an argument. + * A conditional variant of 'ubifs_err()' which doesn't output anything + * if probing (ie. MS_SILENT set). */ #define ubifs_errc(c, fmt, ...) 
\ do { \ diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index b5fc27969e9d..e237811f09ce 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -592,19 +592,19 @@ static int ubifs_xattr_set(const struct xattr_handler *handler, return __ubifs_removexattr(inode, name); } -const struct xattr_handler ubifs_user_xattr_handler = { +static const struct xattr_handler ubifs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = ubifs_xattr_get, .set = ubifs_xattr_set, }; -const struct xattr_handler ubifs_trusted_xattr_handler = { +static const struct xattr_handler ubifs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = ubifs_xattr_get, .set = ubifs_xattr_set, }; -const struct xattr_handler ubifs_security_xattr_handler = { +static const struct xattr_handler ubifs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = ubifs_xattr_get, .set = ubifs_xattr_set, diff --git a/fs/utimes.c b/fs/utimes.c index 85c40f4f373d..794f5f5b1fb5 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -92,10 +92,11 @@ static int utimes_common(struct path *path, struct timespec *times) * then we need to check permissions, because * inode_change_ok() won't do it. 
*/ - error = -EACCES; + error = -EPERM; if (IS_IMMUTABLE(inode)) goto mnt_drop_write_and_out; + error = -EACCES; if (!inode_owner_or_capable(inode)) { error = inode_permission(inode, MAY_WRITE); if (error) diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 3542d94fddce..fc593c869493 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -39,6 +39,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_btree.o \ xfs_da_btree.o \ xfs_da_format.o \ + xfs_defer.o \ xfs_dir2.o \ xfs_dir2_block.o \ xfs_dir2_data.o \ @@ -51,6 +52,8 @@ xfs-y += $(addprefix libxfs/, \ xfs_inode_fork.o \ xfs_inode_buf.o \ xfs_log_rlimit.o \ + xfs_rmap.o \ + xfs_rmap_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ xfs_trans_resv.o \ @@ -100,11 +103,13 @@ xfs-y += xfs_log.o \ xfs_extfree_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ + xfs_rmap_item.o \ xfs_log_recover.o \ xfs_trans_ail.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ + xfs_trans_rmap.o \ # optional features xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ @@ -121,5 +126,4 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o -xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o -xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o +xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 88c26b827a2d..776ae2f325d1 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -24,8 +24,10 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_rmap.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" @@ -49,6 +51,81 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +xfs_extlen_t +xfs_prealloc_blocks( + struct xfs_mount *mp) +{ + 
if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + return XFS_RMAP_BLOCK(mp) + 1; + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + return XFS_FIBT_BLOCK(mp) + 1; + return XFS_IBT_BLOCK(mp) + 1; +} + +/* + * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of + * AGF buffer (PV 947395), we place constraints on the relationship among + * actual allocations for data blocks, freelist blocks, and potential file data + * bmap btree blocks. However, these restrictions may result in no actual space + * allocated for a delayed extent, for example, a data block in a certain AG is + * allocated but there is no additional block for the additional bmap btree + * block due to a split of the bmap btree of the file. The result of this may + * lead to an infinite loop when the file gets flushed to disk and all delayed + * extents need to be actually allocated. To get around this, we explicitly set + * aside a few blocks which will not be reserved in delayed allocation. + * + * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist + * and 4 more to handle a potential split of the file's bmap btree. + * + * When rmap is enabled, we must also be able to handle two rmap btree inserts + * to record both the file data extent and a new bmbt block. The bmbt block + * might not be in the same AG as the file data extent. In the worst case + * the bmap btree splits multiple levels and all the new blocks come from + * different AGs, so set aside enough to handle rmap btree splits in all AGs. + */ +unsigned int +xfs_alloc_set_aside( + struct xfs_mount *mp) +{ + unsigned int blocks; + + blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels; + return blocks; +} + +/* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size the AG. 
However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks, and optionally + * the AGI free inode and rmap btree root blocks. + * - blocks on the AGFL according to xfs_alloc_set_aside() limits + * - the rmapbt root block + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + */ +unsigned int +xfs_alloc_ag_max_usable( + struct xfs_mount *mp) +{ + unsigned int blocks; + + blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ + blocks += XFS_ALLOC_AGFL_RESERVE; + blocks += 3; /* AGF, AGI btree root blocks */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + blocks++; /* finobt root block */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + blocks++; /* rmap root block */ + + return mp->m_sb.sb_agblocks - blocks; +} + /* * Lookup the record equal to [bno, len] in the btree given by cur. */ @@ -636,6 +713,14 @@ xfs_alloc_ag_vextent( ASSERT(!args->wasfromfl || !args->isfl); ASSERT(args->agbno % args->alignment == 0); + /* if not file data, insert new block into the reverse map btree */ + if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) { + error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, + args->agbno, args->len, &args->oinfo); + if (error) + return error; + } + if (!args->wasfromfl) { error = xfs_alloc_update_counters(args->tp, args->pag, args->agbp, @@ -1577,14 +1662,15 @@ error0: /* * Free the extent starting at agno/bno for length. */ -STATIC int /* error */ +STATIC int xfs_free_ag_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *agbp, /* buffer for a.g. 
freelist header */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t bno, /* starting block number */ - xfs_extlen_t len, /* length of extent */ - int isfl) /* set if is freelist blocks - no sb acctg */ + xfs_trans_t *tp, + xfs_buf_t *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + int isfl) { xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ @@ -1601,12 +1687,19 @@ xfs_free_ag_extent( xfs_extlen_t nlen; /* new length of freespace */ xfs_perag_t *pag; /* per allocation group data */ + bno_cur = cnt_cur = NULL; mp = tp->t_mountp; + + if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) { + error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); + if (error) + goto error0; + } + /* * Allocate and initialize a cursor for the by-block btree. */ bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); - cnt_cur = NULL; /* * Look for a neighboring block on the left (lower block numbers) * that is contiguous with this space. @@ -1875,6 +1968,11 @@ xfs_alloc_min_freelist( /* space needed by-size freespace btree */ min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); + /* space needed reverse mapping used space btree */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + min_free += min_t(unsigned int, + pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, + mp->m_rmap_maxlevels); return min_free; } @@ -1992,21 +2090,34 @@ xfs_alloc_fix_freelist( * anything other than extra overhead when we need to put more blocks * back on the free list? Maybe we should only do this when space is * getting low or the AGFL is more than half full? + * + * The NOSHRINK flag prevents the AGFL from being shrunk if it's too + * big; the NORMAP flag prevents AGFL expand/shrink operations from + * updating the rmapbt. Both flags are used in xfs_repair while we're + * rebuilding the rmapbt, and neither are used by the kernel. 
They're + * both required to ensure that rmaps are correctly recorded for the + * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and + * repair/rmap.c in xfsprogs for details. */ - while (pag->pagf_flcount > need) { + memset(&targs, 0, sizeof(targs)); + if (flags & XFS_ALLOC_FLAG_NORMAP) + xfs_rmap_skip_owner_update(&targs.oinfo); + else + xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG); + while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { struct xfs_buf *bp; error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) goto out_agbp_relse; - error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1); + error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, + &targs.oinfo, 1); if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); xfs_trans_binval(tp, bp); } - memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; @@ -2271,6 +2382,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) return false; + if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS) + return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. 
growfs ensures we can't @@ -2402,6 +2517,8 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); pag->pagf_levels[XFS_BTNUM_CNTi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); + pag->pagf_levels[XFS_BTNUM_RMAPi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; @@ -2691,7 +2808,8 @@ int /* error */ xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len) /* length of extent */ + xfs_extlen_t len, /* length of extent */ + struct xfs_owner_info *oinfo) /* extent owner */ { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; @@ -2701,6 +2819,11 @@ xfs_free_extent( ASSERT(len != 0); + if (XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_FREE_EXTENT, + XFS_RANDOM_FREE_EXTENT)) + return -EIO; + error = xfs_free_extent_fix_freelist(tp, agno, &agbp); if (error) return error; @@ -2712,7 +2835,7 @@ xfs_free_extent( agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), err); - error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0); if (error) goto err; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index cf268b2d0b6c..6fe2d6b7cfe9 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -54,41 +54,8 @@ typedef unsigned int xfs_alloctype_t; */ #define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ #define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ - -/* - * In order to avoid ENOSPC-related deadlock caused by - * out-of-order locking of AGF buffer (PV 947395), we place - * constraints on the relationship among actual allocations for - * data blocks, freelist blocks, and potential file data bmap - * btree blocks. 
However, these restrictions may result in no - * actual space allocated for a delayed extent, for example, a data - * block in a certain AG is allocated but there is no additional - * block for the additional bmap btree block due to a split of the - * bmap btree of the file. The result of this may lead to an - * infinite loop in xfssyncd when the file gets flushed to disk and - * all delayed extents need to be actually allocated. To get around - * this, we explicitly set aside a few blocks which will not be - * reserved in delayed allocation. Considering the minimum number of - * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap - * btree requires 1 fsb, so we set the number of set-aside blocks - * to 4 + 4*agcount. - */ -#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) - -/* - * When deciding how much space to allocate out of an AG, we limit the - * allocation maximum size to the size the AG. However, we cannot use all the - * blocks in the AG - some are permanently used by metadata. These - * blocks are generally: - * - the AG superblock, AGF, AGI and AGFL - * - the AGF (bno and cnt) and AGI btree root blocks - * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits - * - * The AG headers are sector sized, so the amount of space they take up is - * dependent on filesystem geometry. The others are all single blocks. 
- */ -#define XFS_ALLOC_AG_MAX_USABLE(mp) \ - ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) +#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ +#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ /* @@ -123,6 +90,7 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* mask defining userdata treatment */ xfs_fsblock_t firstblock; /* io first block allocated */ + struct xfs_owner_info oinfo; /* owner of blocks being allocated */ } xfs_alloc_arg_t; /* @@ -132,6 +100,11 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ #define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +/* freespace limit calculations */ +#define XFS_ALLOC_AGFL_RESERVE 4 +unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); +unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); + xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag, xfs_extlen_t need); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, @@ -208,9 +181,10 @@ xfs_alloc_vextent( */ int /* error */ xfs_free_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len); /* length of extent */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + struct xfs_owner_info *oinfo);/* extent owner */ int /* error */ xfs_alloc_lookup_ge( @@ -232,4 +206,6 @@ int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **agbp); +xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index d9b42425291e..5ba2dac5e67c 100644 --- 
a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -212,17 +212,6 @@ xfs_allocbt_init_key_from_rec( } STATIC void -xfs_allocbt_init_rec_from_key( - union xfs_btree_key *key, - union xfs_btree_rec *rec) -{ - ASSERT(key->alloc.ar_startblock != 0); - - rec->alloc.ar_startblock = key->alloc.ar_startblock; - rec->alloc.ar_blockcount = key->alloc.ar_blockcount; -} - -STATIC void xfs_allocbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) @@ -406,7 +395,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, - .init_rec_from_key = xfs_allocbt_init_rec_from_key, .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 4e126f41a0aa..af1ecb19121e 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr_sf.h" @@ -203,7 +204,7 @@ xfs_attr_set( { struct xfs_mount *mp = dp->i_mount; struct xfs_da_args args; - struct xfs_bmap_free flist; + struct xfs_defer_ops dfops; struct xfs_trans_res tres; xfs_fsblock_t firstblock; int rsvd = (flags & ATTR_ROOT) != 0; @@ -221,7 +222,7 @@ xfs_attr_set( args.value = value; args.valuelen = valuelen; args.firstblock = &firstblock; - args.flist = &flist; + args.dfops = &dfops; args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; args.total = xfs_attr_calc_size(&args, &local); @@ -316,13 +317,13 @@ xfs_attr_set( * It won't fit in the shortform, transform to a leaf block. * GROT: another possible req'mt for a double-split btree op. 
*/ - xfs_bmap_init(args.flist, args.firstblock); + xfs_defer_init(args.dfops, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); if (!error) - error = xfs_bmap_finish(&args.trans, args.flist, dp); + error = xfs_defer_finish(&args.trans, args.dfops, dp); if (error) { args.trans = NULL; - xfs_bmap_cancel(&flist); + xfs_defer_cancel(&dfops); goto out; } @@ -382,7 +383,7 @@ xfs_attr_remove( { struct xfs_mount *mp = dp->i_mount; struct xfs_da_args args; - struct xfs_bmap_free flist; + struct xfs_defer_ops dfops; xfs_fsblock_t firstblock; int error; @@ -399,7 +400,7 @@ xfs_attr_remove( return error; args.firstblock = &firstblock; - args.flist = &flist; + args.dfops = &dfops; /* * we have no control over the attribute names that userspace passes us @@ -584,13 +585,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit that transaction so that the node_addname() call * can manage its own transactions. */ - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } @@ -674,15 +675,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * If the result is small enough, shrink it all into the inode. 
*/ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) - error = xfs_bmap_finish(&args->trans, - args->flist, dp); + error = xfs_defer_finish(&args->trans, + args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } } @@ -737,14 +738,14 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } } @@ -863,14 +864,14 @@ restart: */ xfs_da_state_free(state); state = NULL; - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); if (!error) - error = xfs_bmap_finish(&args->trans, - args->flist, dp); + error = xfs_defer_finish(&args->trans, + args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } @@ -891,13 +892,13 @@ restart: * in the index/blkno/rmtblkno/rmtblkcnt fields and * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. 
*/ - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_split(state); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } } else { @@ -990,14 +991,14 @@ restart: * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); if (!error) - error = xfs_bmap_finish(&args->trans, - args->flist, dp); + error = xfs_defer_finish(&args->trans, + args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } } @@ -1113,13 +1114,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } /* @@ -1146,15 +1147,15 @@ xfs_attr_node_removename(xfs_da_args_t *args) goto out; if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) - error = xfs_bmap_finish(&args->trans, - args->flist, dp); + error = xfs_defer_finish(&args->trans, + args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); goto out; } } else diff --git 
a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 01a5ecfedfcf..8ea91f363093 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -792,7 +792,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) nargs.dp = dp; nargs.geo = args->geo; nargs.firstblock = args->firstblock; - nargs.flist = args->flist; + nargs.dfops = args->dfops; nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; @@ -922,7 +922,7 @@ xfs_attr3_leaf_to_shortform( nargs.geo = args->geo; nargs.dp = dp; nargs.firstblock = args->firstblock; - nargs.flist = args->flist; + nargs.dfops = args->dfops; nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index a572532a55cd..d52f525f5b2d 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -24,6 +24,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -460,16 +461,16 @@ xfs_attr_rmtval_set( * extent and then crash then the block may not contain the * correct metadata after log recovery occurs. 
*/ - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); nmap = 1; error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, - args->total, &map, &nmap, args->flist); + args->total, &map, &nmap, args->dfops); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, dp); + error = xfs_defer_finish(&args->trans, args->dfops, dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } @@ -503,7 +504,7 @@ xfs_attr_rmtval_set( ASSERT(blkcnt > 0); - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, blkcnt, &map, &nmap, @@ -603,16 +604,16 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; done = 0; while (!done) { - xfs_bmap_init(args->flist, args->firstblock); + xfs_defer_init(args->dfops, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, args->firstblock, - args->flist, &done); + args->dfops, &done); if (!error) - error = xfs_bmap_finish(&args->trans, args->flist, + error = xfs_defer_finish(&args->trans, args->dfops, args->dp); if (error) { args->trans = NULL; - xfs_bmap_cancel(args->flist); + xfs_defer_cancel(args->dfops); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2f2c85cc8117..b060bca93402 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_dir2.h" @@ -45,6 +46,7 @@ #include "xfs_symlink.h" #include "xfs_attr_leaf.h" #include "xfs_filestream.h" +#include "xfs_rmap.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -570,12 +572,13 @@ xfs_bmap_validate_ret( */ void xfs_bmap_add_free( - struct xfs_mount *mp, /* mount point structure 
*/ - struct xfs_bmap_free *flist, /* list of extents */ - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len) /* length of extent */ + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t bno, + xfs_filblks_t len, + struct xfs_owner_info *oinfo) { - struct xfs_bmap_free_item *new; /* new element */ + struct xfs_extent_free_item *new; /* new element */ #ifdef DEBUG xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -592,44 +595,17 @@ xfs_bmap_add_free( ASSERT(agbno + len <= mp->m_sb.sb_agblocks); #endif ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); - new->xbfi_startblock = bno; - new->xbfi_blockcount = (xfs_extlen_t)len; - list_add(&new->xbfi_list, &flist->xbf_flist); - flist->xbf_count++; -} - -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. - */ -void -xfs_bmap_del_free( - struct xfs_bmap_free *flist, /* free item list header */ - struct xfs_bmap_free_item *free) /* list item to be freed */ -{ - list_del(&free->xbfi_list); - flist->xbf_count--; - kmem_zone_free(xfs_bmap_free_item_zone, free); -} - -/* - * Free up any items left in the list. 
- */ -void -xfs_bmap_cancel( - struct xfs_bmap_free *flist) /* list of bmap_free_items */ -{ - struct xfs_bmap_free_item *free; /* free list item */ - if (flist->xbf_count == 0) - return; - while (!list_empty(&flist->xbf_flist)) { - free = list_first_entry(&flist->xbf_flist, - struct xfs_bmap_free_item, xbfi_list); - xfs_bmap_del_free(flist, free); - } - ASSERT(flist->xbf_count == 0); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xefi_startblock = bno; + new->xefi_blockcount = (xfs_extlen_t)len; + if (oinfo) + new->xefi_oinfo = *oinfo; + else + xfs_rmap_skip_owner_update(&new->xefi_oinfo); + trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0, + XFS_FSB_TO_AGBNO(mp, bno), len); + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); } /* @@ -659,6 +635,7 @@ xfs_bmap_btree_to_extents( xfs_mount_t *mp; /* mount point structure */ __be64 *pp; /* ptr to block address */ struct xfs_btree_block *rblock;/* root btree block */ + struct xfs_owner_info oinfo; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); @@ -682,7 +659,8 @@ xfs_bmap_btree_to_extents( cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; - xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1); + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); + xfs_bmap_add_free(mp, cur->bc_private.b.dfops, cbno, 1, &oinfo); ip->i_d.di_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); @@ -705,7 +683,7 @@ xfs_bmap_extents_to_btree( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ + struct xfs_defer_ops *dfops, /* blocks freed in xaction */ xfs_btree_cur_t **curp, /* cursor returned to caller */ int wasdel, /* converting a delayed alloc */ int *logflagsp, /* inode logging flags */ @@ -754,7 +732,7 @@ xfs_bmap_extents_to_btree( */ 
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; + cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; /* * Convert to a btree with two levels, one record in root. @@ -763,11 +741,12 @@ xfs_bmap_extents_to_btree( memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; + xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork); args.firstblock = *firstblock; if (*firstblock == NULLFSBLOCK) { args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); - } else if (flist->xbf_low) { + } else if (dfops->dop_low) { args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = *firstblock; } else { @@ -788,7 +767,7 @@ xfs_bmap_extents_to_btree( ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(*firstblock == NULLFSBLOCK || args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (flist->xbf_low && + (dfops->dop_low && args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); *firstblock = cur->bc_private.b.firstblock = args.fsbno; cur->bc_private.b.allocated++; @@ -909,6 +888,7 @@ xfs_bmap_local_to_extents( memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; + xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0); args.firstblock = *firstblock; /* * Allocate a block. 
We know we need only one, since the @@ -973,7 +953,7 @@ xfs_bmap_add_attrfork_btree( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ + struct xfs_defer_ops *dfops, /* blocks to free at commit */ int *flags) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -986,7 +966,7 @@ xfs_bmap_add_attrfork_btree( *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); - cur->bc_private.b.flist = flist; + cur->bc_private.b.dfops = dfops; cur->bc_private.b.firstblock = *firstblock; if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) goto error0; @@ -1016,7 +996,7 @@ xfs_bmap_add_attrfork_extents( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ + struct xfs_defer_ops *dfops, /* blocks to free at commit */ int *flags) /* inode logging flags */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ @@ -1025,7 +1005,7 @@ xfs_bmap_add_attrfork_extents( if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) return 0; cur = NULL; - error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, dfops, &cur, 0, flags, XFS_DATA_FORK); if (cur) { cur->bc_private.b.allocated = 0; @@ -1051,7 +1031,7 @@ xfs_bmap_add_attrfork_local( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ + struct xfs_defer_ops *dfops, /* blocks to free at commit */ int *flags) /* inode logging flags */ { xfs_da_args_t dargs; /* args for dir/attr code */ @@ -1064,7 +1044,7 @@ xfs_bmap_add_attrfork_local( dargs.geo = ip->i_mount->m_dir_geo; 
dargs.dp = ip; dargs.firstblock = firstblock; - dargs.flist = flist; + dargs.dfops = dfops; dargs.total = dargs.geo->fsbcount; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; @@ -1092,7 +1072,7 @@ xfs_bmap_add_attrfork( int rsvd) /* xact may use reserved blks */ { xfs_fsblock_t firstblock; /* 1st block/ag allocated */ - xfs_bmap_free_t flist; /* freed extent records */ + struct xfs_defer_ops dfops; /* freed extent records */ xfs_mount_t *mp; /* mount structure */ xfs_trans_t *tp; /* transaction pointer */ int blks; /* space reservation */ @@ -1158,18 +1138,18 @@ xfs_bmap_add_attrfork( ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; - xfs_bmap_init(&flist, &firstblock); + xfs_defer_init(&dfops, &firstblock); switch (ip->i_d.di_format) { case XFS_DINODE_FMT_LOCAL: - error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &dfops, &logflags); break; case XFS_DINODE_FMT_EXTENTS: error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, - &flist, &logflags); + &dfops, &logflags); break; case XFS_DINODE_FMT_BTREE: - error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &dfops, &logflags); break; default: @@ -1198,7 +1178,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_bmap_finish(&tp, &flist, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto bmap_cancel; error = xfs_trans_commit(tp); @@ -1206,7 +1186,7 @@ xfs_bmap_add_attrfork( return error; bmap_cancel: - xfs_bmap_cancel(&flist); + xfs_defer_cancel(&dfops); trans_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -2003,7 +1983,7 @@ xfs_bmap_add_extent_delay_real( if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->flist, + bma->firstblock, bma->dfops, &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; 
if (error) @@ -2087,7 +2067,7 @@ xfs_bmap_add_extent_delay_real( if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->flist, &bma->cur, 1, + bma->firstblock, bma->dfops, &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) @@ -2156,7 +2136,7 @@ xfs_bmap_add_extent_delay_real( if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->flist, &bma->cur, + bma->firstblock, bma->dfops, &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) @@ -2199,13 +2179,18 @@ xfs_bmap_add_extent_delay_real( ASSERT(0); } + /* add reverse mapping */ + error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new); + if (error) + goto done; + /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->flist, &bma->cur, + bma->firstblock, bma->dfops, &bma->cur, da_old > 0, &tmp_logflags, whichfork); bma->logflags |= tmp_logflags; if (error) @@ -2247,7 +2232,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ + struct xfs_defer_ops *dfops, /* list of extents to be freed */ int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -2735,12 +2720,17 @@ xfs_bmap_add_extent_unwritten_real( ASSERT(0); } + /* update reverse mappings */ + error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new); + if (error) + goto done; + /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); - error = 
xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur, 0, &tmp_logflags, XFS_DATA_FORK); *logflagsp |= tmp_logflags; if (error) @@ -3127,13 +3117,18 @@ xfs_bmap_add_extent_hole_real( break; } + /* add reverse mapping */ + error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new); + if (error) + goto done; + /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->flist, &bma->cur, + bma->firstblock, bma->dfops, &bma->cur, 0, &tmp_logflags, whichfork); bma->logflags |= tmp_logflags; if (error) @@ -3691,9 +3686,10 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; + xfs_rmap_skip_owner_update(&args.oinfo); /* Trim the allocation back to the maximum an AG can fit. */ - args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.maxlen = MIN(ap->length, mp->m_ag_max_usable); args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { @@ -3708,7 +3704,7 @@ xfs_bmap_btalloc( error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); if (error) return error; - } else if (ap->flist->xbf_low) { + } else if (ap->dfops->dop_low) { if (xfs_inode_is_filestream(ap->ip)) args.type = XFS_ALLOCTYPE_FIRST_AG; else @@ -3741,7 +3737,7 @@ xfs_bmap_btalloc( * is >= the stripe unit and the allocation offset is * at the end of file. 
*/ - if (!ap->flist->xbf_low && ap->aeof) { + if (!ap->dfops->dop_low && ap->aeof) { if (!ap->offset) { args.alignment = stripe_align; atype = args.type; @@ -3834,7 +3830,7 @@ xfs_bmap_btalloc( args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; - ap->flist->xbf_low = 1; + ap->dfops->dop_low = true; } if (args.fsbno != NULLFSBLOCK) { /* @@ -3844,7 +3840,7 @@ xfs_bmap_btalloc( ASSERT(*ap->firstblock == NULLFSBLOCK || XFS_FSB_TO_AGNO(mp, *ap->firstblock) == XFS_FSB_TO_AGNO(mp, args.fsbno) || - (ap->flist->xbf_low && + (ap->dfops->dop_low && XFS_FSB_TO_AGNO(mp, *ap->firstblock) < XFS_FSB_TO_AGNO(mp, args.fsbno))); @@ -3852,7 +3848,7 @@ xfs_bmap_btalloc( if (*ap->firstblock == NULLFSBLOCK) *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno == args.agno || - (ap->flist->xbf_low && fb_agno < args.agno)); + (ap->dfops->dop_low && fb_agno < args.agno)); ap->length = args.len; ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); @@ -4319,7 +4315,7 @@ xfs_bmapi_allocate( if (error) return error; - if (bma->flist->xbf_low) + if (bma->dfops->dop_low) bma->minleft = 0; if (bma->cur) bma->cur->bc_private.b.firstblock = *bma->firstblock; @@ -4328,7 +4324,7 @@ xfs_bmapi_allocate( if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); bma->cur->bc_private.b.firstblock = *bma->firstblock; - bma->cur->bc_private.b.flist = bma->flist; + bma->cur->bc_private.b.dfops = bma->dfops; } /* * Bump the number of extents we've allocated @@ -4409,7 +4405,7 @@ xfs_bmapi_convert_unwritten( bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, bma->ip, whichfork); bma->cur->bc_private.b.firstblock = *bma->firstblock; - bma->cur->bc_private.b.flist = bma->flist; + bma->cur->bc_private.b.dfops = bma->dfops; } mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? 
XFS_EXT_NORM : XFS_EXT_UNWRITTEN; @@ -4426,7 +4422,7 @@ xfs_bmapi_convert_unwritten( } error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, - &bma->cur, mval, bma->firstblock, bma->flist, + &bma->cur, mval, bma->firstblock, bma->dfops, &tmp_logflags); /* * Log the inode core unconditionally in the unwritten extent conversion @@ -4480,7 +4476,7 @@ xfs_bmapi_write( xfs_extlen_t total, /* total blocks needed */ struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap, /* i/o: mval size/count */ - struct xfs_bmap_free *flist) /* i/o: list extents to free */ + struct xfs_defer_ops *dfops) /* i/o: list extents to free */ { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -4570,7 +4566,7 @@ xfs_bmapi_write( bma.ip = ip; bma.total = total; bma.userdata = 0; - bma.flist = flist; + bma.dfops = dfops; bma.firstblock = firstblock; while (bno < end && n < *nmap) { @@ -4684,7 +4680,7 @@ error0: XFS_FSB_TO_AGNO(mp, *firstblock) == XFS_FSB_TO_AGNO(mp, bma.cur->bc_private.b.firstblock) || - (flist->xbf_low && + (dfops->dop_low && XFS_FSB_TO_AGNO(mp, *firstblock) < XFS_FSB_TO_AGNO(mp, bma.cur->bc_private.b.firstblock))); @@ -4768,7 +4764,7 @@ xfs_bmap_del_extent( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ xfs_extnum_t *idx, /* extent number to update/delete */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ + struct xfs_defer_ops *dfops, /* list of extents to be freed */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ @@ -4870,6 +4866,7 @@ xfs_bmap_del_extent( nblks = 0; do_fx = 0; } + /* * Set flag value to use in switch statement. * Left-contig is 2, right-contig is 1. 
@@ -5052,12 +5049,20 @@ xfs_bmap_del_extent( ++*idx; break; } + + /* remove reverse mapping */ + if (!delay) { + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del); + if (error) + goto done; + } + /* * If we need to, add to list of extents to delete. */ if (do_fx) - xfs_bmap_add_free(mp, flist, del->br_startblock, - del->br_blockcount); + xfs_bmap_add_free(mp, dfops, del->br_startblock, + del->br_blockcount, NULL); /* * Adjust inode # blocks in the file. */ @@ -5097,7 +5102,7 @@ xfs_bunmapi( xfs_extnum_t nexts, /* number of extents max */ xfs_fsblock_t *firstblock, /* first allocated block controls a.g. for allocs */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ + struct xfs_defer_ops *dfops, /* i/o: list extents to free */ int *done) /* set if not done yet */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ @@ -5170,7 +5175,7 @@ xfs_bunmapi( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; + cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; } else cur = NULL; @@ -5179,8 +5184,10 @@ xfs_bunmapi( /* * Synchronize by locking the bitmap inode. 
*/ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); } extno = 0; @@ -5262,7 +5269,7 @@ xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, - &lastx, &cur, &del, firstblock, flist, + &lastx, &cur, &del, firstblock, dfops, &logflags); if (error) goto error0; @@ -5321,7 +5328,7 @@ xfs_bunmapi( lastx--; error = xfs_bmap_add_extent_unwritten_real(tp, ip, &lastx, &cur, &prev, - firstblock, flist, &logflags); + firstblock, dfops, &logflags); if (error) goto error0; goto nodelete; @@ -5330,7 +5337,7 @@ xfs_bunmapi( del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, &lastx, &cur, &del, - firstblock, flist, &logflags); + firstblock, dfops, &logflags); if (error) goto error0; goto nodelete; @@ -5388,7 +5395,7 @@ xfs_bunmapi( } else if (cur) cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; - error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, + error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del, &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) @@ -5422,7 +5429,7 @@ nodelete: */ if (xfs_bmap_needs_btree(ip, whichfork)) { ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, dfops, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) @@ -5589,7 +5596,8 @@ xfs_bmse_shift_one( struct xfs_bmbt_rec_host *gotp, struct xfs_btree_cur *cur, int *logflags, - enum shift_direction direction) + enum shift_direction direction, + struct xfs_defer_ops *dfops) { struct xfs_ifork *ifp; struct xfs_mount *mp; @@ -5637,9 +5645,13 @@ xfs_bmse_shift_one( /* check whether to merge the extent or shift it down */ if (xfs_bmse_can_merge(&adj_irec, &got, offset_shift_fsb)) 
{ - return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, gotp, adj_irecp, - cur, logflags); + error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + *current_ext, gotp, adj_irecp, + cur, logflags); + if (error) + return error; + adj_irec = got; + goto update_rmap; } } else { startoff = got.br_startoff + offset_shift_fsb; @@ -5676,9 +5688,10 @@ update_current_ext: (*current_ext)--; xfs_bmbt_set_startoff(gotp, startoff); *logflags |= XFS_ILOG_CORE; + adj_irec = got; if (!cur) { *logflags |= XFS_ILOG_DEXT; - return 0; + goto update_rmap; } error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, @@ -5688,8 +5701,18 @@ update_current_ext: XFS_WANT_CORRUPTED_RETURN(mp, i == 1); got.br_startoff = startoff; - return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, got.br_state); + error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, got.br_state); + if (error) + return error; + +update_rmap: + /* update reverse mapping */ + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &adj_irec); + if (error) + return error; + adj_irec.br_startoff = startoff; + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &adj_irec); } /* @@ -5711,7 +5734,7 @@ xfs_bmap_shift_extents( int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, + struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts) { @@ -5756,7 +5779,7 @@ xfs_bmap_shift_extents( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; + cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; } @@ -5817,7 +5840,7 @@ xfs_bmap_shift_extents( while (nexts++ < num_exts) { error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, &current_ext, gotp, cur, &logflags, - direction); + direction, dfops); if (error) goto del_cursor; /* @@ -5865,7 +5888,7 @@ 
xfs_bmap_split_extent_at( struct xfs_inode *ip, xfs_fileoff_t split_fsb, xfs_fsblock_t *firstfsb, - struct xfs_bmap_free *free_list) + struct xfs_defer_ops *dfops) { int whichfork = XFS_DATA_FORK; struct xfs_btree_cur *cur = NULL; @@ -5927,7 +5950,7 @@ xfs_bmap_split_extent_at( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstfsb; - cur->bc_private.b.flist = free_list; + cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, @@ -5980,7 +6003,7 @@ xfs_bmap_split_extent_at( int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list, + error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, dfops, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; } @@ -6004,7 +6027,7 @@ xfs_bmap_split_extent( { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - struct xfs_bmap_free free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t firstfsb; int error; @@ -6016,21 +6039,21 @@ xfs_bmap_split_extent( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_bmap_init(&free_list, &firstfsb); + xfs_defer_init(&dfops, &firstfsb); error = xfs_bmap_split_extent_at(tp, ip, split_fsb, - &firstfsb, &free_list); + &firstfsb, &dfops); if (error) goto out; - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out; return xfs_trans_commit(tp); out: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f1f3ae6c0a3f..254034f96941 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -32,7 +32,7 @@ extern kmem_zone_t *xfs_bmap_free_item_zone; */ struct xfs_bmalloca { xfs_fsblock_t *firstblock; /* i/o first block allocated */ - struct xfs_bmap_free 
*flist; /* bmap freelist */ + struct xfs_defer_ops *dfops; /* bmap freelist */ struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct xfs_bmbt_irec prev; /* extent before the new one */ @@ -62,34 +62,14 @@ struct xfs_bmalloca { * List of extents to be free "later". * The list is kept sorted on xbf_startblock. */ -struct xfs_bmap_free_item +struct xfs_extent_free_item { - xfs_fsblock_t xbfi_startblock;/* starting fs block number */ - xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ - struct list_head xbfi_list; + xfs_fsblock_t xefi_startblock;/* starting fs block number */ + xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ + struct list_head xefi_list; + struct xfs_owner_info xefi_oinfo; /* extent owner */ }; -/* - * Header for free extent list. - * - * xbf_low is used by the allocator to activate the lowspace algorithm - - * when free space is running low the extent allocator may choose to - * allocate an extent from an AG without leaving sufficient space for - * a btree split when inserting the new extent. In this case the allocator - * will enable the lowspace algorithm which is supposed to allow further - * allocations (such as btree splits and newroots) to allocate from - * sequential AGs. In order to avoid locking AGs out of order the lowspace - * algorithm will start searching for free space from AG 0. If the correct - * transaction reservations have been made then this algorithm will eventually - * find all the space it needs. 
- */ -typedef struct xfs_bmap_free -{ - struct list_head xbf_flist; /* list of to-be-free extents */ - int xbf_count; /* count of items on list */ - int xbf_low; /* alloc in low mode */ -} xfs_bmap_free_t; - #define XFS_BMAP_MAX_NMAP 4 /* @@ -139,14 +119,6 @@ static inline int xfs_bmapi_aflag(int w) #define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) #define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) -static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) -{ - INIT_LIST_HEAD(&flp->xbf_flist); - flp->xbf_count = 0; - flp->xbf_low = 0; - *fbp = NULLFSBLOCK; -} - /* * Flags for xfs_bmap_add_extent*. */ @@ -193,11 +165,9 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); -void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist, - xfs_fsblock_t bno, xfs_filblks_t len); -void xfs_bmap_cancel(struct xfs_bmap_free *flist); -int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, - struct xfs_inode *ip); +void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + xfs_fsblock_t bno, xfs_filblks_t len, + struct xfs_owner_info *oinfo); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -218,18 +188,18 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_fsblock_t *firstblock, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap, - struct xfs_bmap_free *flist); + struct xfs_defer_ops *dfops); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, int *done); + struct xfs_defer_ops *dfops, int *done); int 
xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, enum shift_direction direction, + struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index db0c71e470c9..cd85274e810c 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" @@ -34,6 +35,7 @@ #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_rmap.h" /* * Determine the extent state. @@ -406,11 +408,11 @@ xfs_bmbt_dup_cursor( cur->bc_private.b.ip, cur->bc_private.b.whichfork); /* - * Copy the firstblock, flist, and flags values, + * Copy the firstblock, dfops, and flags values, * since init cursor doesn't get them. 
*/ new->bc_private.b.firstblock = cur->bc_private.b.firstblock; - new->bc_private.b.flist = cur->bc_private.b.flist; + new->bc_private.b.dfops = cur->bc_private.b.dfops; new->bc_private.b.flags = cur->bc_private.b.flags; return new; @@ -423,7 +425,7 @@ xfs_bmbt_update_cursor( { ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) || (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); - ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist); + ASSERT(dst->bc_private.b.dfops == src->bc_private.b.dfops); dst->bc_private.b.allocated += src->bc_private.b.allocated; dst->bc_private.b.firstblock = src->bc_private.b.firstblock; @@ -446,6 +448,8 @@ xfs_bmbt_alloc_block( args.mp = cur->bc_mp; args.fsbno = cur->bc_private.b.firstblock; args.firstblock = args.fsbno; + xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino, + cur->bc_private.b.whichfork); if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); @@ -462,7 +466,7 @@ xfs_bmbt_alloc_block( * block allocation here and corrupt the filesystem. 
*/ args.minleft = args.tp->t_blk_res; - } else if (cur->bc_private.b.flist->xbf_low) { + } else if (cur->bc_private.b.dfops->dop_low) { args.type = XFS_ALLOCTYPE_START_BNO; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -490,7 +494,7 @@ xfs_bmbt_alloc_block( error = xfs_alloc_vextent(&args); if (error) goto error0; - cur->bc_private.b.flist->xbf_low = 1; + cur->bc_private.b.dfops->dop_low = true; } if (args.fsbno == NULLFSBLOCK) { XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); @@ -525,8 +529,10 @@ xfs_bmbt_free_block( struct xfs_inode *ip = cur->bc_private.b.ip; struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + struct xfs_owner_info oinfo; - xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1); + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork); + xfs_bmap_add_free(mp, cur->bc_private.b.dfops, fsbno, 1, &oinfo); ip->i_d.di_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -600,17 +606,6 @@ xfs_bmbt_init_key_from_rec( } STATIC void -xfs_bmbt_init_rec_from_key( - union xfs_btree_key *key, - union xfs_btree_rec *rec) -{ - ASSERT(key->bmbt.br_startoff != 0); - - xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff), - 0, 0, XFS_EXT_NORM); -} - -STATIC void xfs_bmbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) @@ -760,7 +755,6 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .get_minrecs = xfs_bmbt_get_minrecs, .get_dmaxrecs = xfs_bmbt_get_dmaxrecs, .init_key_from_rec = xfs_bmbt_init_key_from_rec, - .init_rec_from_key = xfs_bmbt_init_rec_from_key, .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, @@ -800,7 +794,7 @@ xfs_bmbt_init_cursor( cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); cur->bc_private.b.ip = ip; cur->bc_private.b.firstblock = NULLFSBLOCK; - cur->bc_private.b.flist = NULL; + cur->bc_private.b.dfops = NULL; cur->bc_private.b.allocated = 0; 
cur->bc_private.b.flags = 0; cur->bc_private.b.whichfork = whichfork; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 07eeb0b4ca74..b5c213a051cd 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" @@ -43,15 +44,14 @@ kmem_zone_t *xfs_btree_cur_zone; * Btree magic numbers. */ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { - { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, XFS_FIBT_MAGIC }, - { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, + { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } }; #define xfs_btree_magic(cur) \ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] - STATIC int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lblock( struct xfs_btree_cur *cur, /* btree cursor */ @@ -428,6 +428,50 @@ xfs_btree_dup_cursor( * into a btree block (xfs_btree_*_offset) or return a pointer to the given * record, key or pointer (xfs_btree_*_addr). Note that all addressing * inside the btree block is done using indices starting at one, not zero! + * + * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing + * overlapping intervals. In such a tree, records are still sorted lowest to + * highest and indexed by the smallest key value that refers to the record. + * However, nodes are different: each pointer has two associated keys -- one + * indexing the lowest key available in the block(s) below (the same behavior + * as the key in a regular btree) and another indexing the highest key + * available in the block(s) below. 
Because records are /not/ sorted by the + * highest key, all leaf block updates require us to compute the highest key + * that matches any record in the leaf and to recursively update the high keys + * in the nodes going further up in the tree, if necessary. Nodes look like + * this: + * + * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ + * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... | + * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ + * + * To perform an interval query on an overlapped tree, perform the usual + * depth-first search and use the low and high keys to decide if we can skip + * that particular node. If a leaf node is reached, return the records that + * intersect the interval. Note that an interval query may return numerous + * entries. For a non-overlapped tree, simply search for the record associated + * with the lowest key and iterate forward until a non-matching record is + * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by + * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discusses this in + * more detail. + * + * Why do we care about overlapping intervals? Let's say you have a bunch of + * reverse mapping records on a reflink filesystem: + * + * 1: +- file A startblock B offset C length D -----------+ + * 2: +- file E startblock F offset G length H --------------+ + * 3: +- file I startblock F offset J length K --+ + * 4: +- file L... --+ + * + * Now say we want to map block (B+D) into file A at offset (C+D). Ideally, + * we'd simply increment the length of record 1. But how do we find the record + * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return + * record 3 because the keys are ordered first by startblock. An interval + * query would return records 1 and 2 because they both overlap (B+D-1), and + * from that we can pick out record 1 as the appropriate left neighbor. 
+ * + * In the non-overlapped case you can do a LE lookup and decrement the cursor + * because a record's interval must end before the next record. */ /* @@ -479,6 +523,18 @@ xfs_btree_key_offset( } /* + * Calculate offset of the n-th high key in a btree block. + */ +STATIC size_t +xfs_btree_high_key_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2); +} + +/* * Calculate offset of the n-th block pointer in a btree block. */ STATIC size_t @@ -519,6 +575,19 @@ xfs_btree_key_addr( } /* + * Return a pointer to the n-th high key in the btree block. + */ +STATIC union xfs_btree_key * +xfs_btree_high_key_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_key *) + ((char *)block + xfs_btree_high_key_offset(cur, n)); +} + +/* * Return a pointer to the n-th block pointer in the btree block. */ STATIC union xfs_btree_ptr * @@ -1144,6 +1213,9 @@ xfs_btree_set_refs( case XFS_BTNUM_BMAP: xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); break; + case XFS_BTNUM_RMAP: + xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); + break; default: ASSERT(0); } @@ -1879,32 +1951,214 @@ error0: return error; } +/* Find the high key storage area from a regular key. 
*/ +STATIC union xfs_btree_key * +xfs_btree_high_key_from_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); + return (union xfs_btree_key *)((char *)key + + (cur->bc_ops->key_len / 2)); +} + +/* Determine the low (and high if overlapped) keys of a leaf block */ +STATIC void +xfs_btree_get_leaf_keys( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_key *key) +{ + union xfs_btree_key max_hkey; + union xfs_btree_key hkey; + union xfs_btree_rec *rec; + union xfs_btree_key *high; + int n; + + rec = xfs_btree_rec_addr(cur, 1, block); + cur->bc_ops->init_key_from_rec(key, rec); + + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + + cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); + for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { + rec = xfs_btree_rec_addr(cur, n, block); + cur->bc_ops->init_high_key_from_rec(&hkey, rec); + if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey) + > 0) + max_hkey = hkey; + } + + high = xfs_btree_high_key_from_key(cur, key); + memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); + } +} + +/* Determine the low (and high if overlapped) keys of a node block */ +STATIC void +xfs_btree_get_node_keys( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_key *key) +{ + union xfs_btree_key *hkey; + union xfs_btree_key *max_hkey; + union xfs_btree_key *high; + int n; + + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + memcpy(key, xfs_btree_key_addr(cur, 1, block), + cur->bc_ops->key_len / 2); + + max_hkey = xfs_btree_high_key_addr(cur, 1, block); + for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { + hkey = xfs_btree_high_key_addr(cur, n, block); + if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0) + max_hkey = hkey; + } + + high = xfs_btree_high_key_from_key(cur, key); + memcpy(high, max_hkey, cur->bc_ops->key_len / 2); + } else { + memcpy(key, xfs_btree_key_addr(cur, 1, block), + cur->bc_ops->key_len); + } +} + +/* 
Derive the keys for any btree block. */ +STATIC void +xfs_btree_get_keys( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_key *key) +{ + if (be16_to_cpu(block->bb_level) == 0) + xfs_btree_get_leaf_keys(cur, block, key); + else + xfs_btree_get_node_keys(cur, block, key); +} + /* - * Update keys at all levels from here to the root along the cursor's path. + * Decide if we need to update the parent keys of a btree block. For + * a standard btree this is only necessary if we're updating the first + * record/key. For an overlapping btree, we must always update the + * keys because the highest key can be in any of the records or keys + * in the block. + */ +static inline bool +xfs_btree_needs_key_update( + struct xfs_btree_cur *cur, + int ptr) +{ + return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1; +} + +/* + * Update the low and high parent keys of the given level, progressing + * towards the root. If force_all is false, stop if the keys for a given + * level do not need updating. */ STATIC int -xfs_btree_updkey( +__xfs_btree_updkeys( + struct xfs_btree_cur *cur, + int level, + struct xfs_btree_block *block, + struct xfs_buf *bp0, + bool force_all) +{ + union xfs_btree_bigkey key; /* keys from current level */ + union xfs_btree_key *lkey; /* keys from the next level up */ + union xfs_btree_key *hkey; + union xfs_btree_key *nlkey; /* keys from the next level up */ + union xfs_btree_key *nhkey; + struct xfs_buf *bp; + int ptr; + + ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); + + /* Exit if there aren't any parent levels to update. 
*/ + if (level + 1 >= cur->bc_nlevels) + return 0; + + trace_xfs_btree_updkeys(cur, level, bp0); + + lkey = (union xfs_btree_key *)&key; + hkey = xfs_btree_high_key_from_key(cur, lkey); + xfs_btree_get_keys(cur, block, lkey); + for (level++; level < cur->bc_nlevels; level++) { +#ifdef DEBUG + int error; +#endif + block = xfs_btree_get_block(cur, level, &bp); + trace_xfs_btree_updkeys(cur, level, bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + nlkey = xfs_btree_key_addr(cur, ptr, block); + nhkey = xfs_btree_high_key_addr(cur, ptr, block); + if (!force_all && + !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 || + cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0)) + break; + xfs_btree_copy_keys(cur, nlkey, lkey, 1); + xfs_btree_log_keys(cur, bp, ptr, ptr); + if (level + 1 >= cur->bc_nlevels) + break; + xfs_btree_get_node_keys(cur, block, lkey); + } + + return 0; +} + +/* Update all the keys from some level in cursor back to the root. */ +STATIC int +xfs_btree_updkeys_force( + struct xfs_btree_cur *cur, + int level) +{ + struct xfs_buf *bp; + struct xfs_btree_block *block; + + block = xfs_btree_get_block(cur, level, &bp); + return __xfs_btree_updkeys(cur, level, block, bp, true); +} + +/* + * Update the parent keys of the given level, progressing towards the root. 
+ */ +STATIC int +xfs_btree_update_keys( struct xfs_btree_cur *cur, - union xfs_btree_key *keyp, int level) { struct xfs_btree_block *block; struct xfs_buf *bp; union xfs_btree_key *kp; + union xfs_btree_key key; int ptr; + ASSERT(level >= 0); + + block = xfs_btree_get_block(cur, level, &bp); + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) + return __xfs_btree_updkeys(cur, level, block, bp, false); + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGIK(cur, level, keyp); - ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1); - /* * Go up the tree from this level toward the root. * At each level, update the key value to the value input. * Stop when we reach a level where the cursor isn't pointing * at the first entry in the block. */ - for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { + xfs_btree_get_keys(cur, block, &key); + for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { #ifdef DEBUG int error; #endif @@ -1918,7 +2172,7 @@ xfs_btree_updkey( #endif ptr = cur->bc_ptrs[level]; kp = xfs_btree_key_addr(cur, ptr, block); - xfs_btree_copy_keys(cur, kp, keyp, 1); + xfs_btree_copy_keys(cur, kp, &key, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); } @@ -1970,12 +2224,9 @@ xfs_btree_update( ptr, LASTREC_UPDATE); } - /* Updating first rec in leaf. Pass new key value up to our parent. */ - if (ptr == 1) { - union xfs_btree_key key; - - cur->bc_ops->init_key_from_rec(&key, rec); - error = xfs_btree_updkey(cur, &key, 1); + /* Pass new key value up to our parent. 
*/ + if (xfs_btree_needs_key_update(cur, ptr)) { + error = xfs_btree_update_keys(cur, 0); if (error) goto error0; } @@ -1998,18 +2249,19 @@ xfs_btree_lshift( int level, int *stat) /* success/failure */ { - union xfs_btree_key key; /* btree key */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ int lrecs; /* left record count */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ int rrecs; /* right record count */ union xfs_btree_ptr lptr; /* left btree pointer */ union xfs_btree_key *rkp = NULL; /* right btree key */ union xfs_btree_ptr *rpp = NULL; /* right address pointer */ union xfs_btree_rec *rrp = NULL; /* right record pointer */ int error; /* error return value */ + int i; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_TRACE_ARGI(cur, level); @@ -2139,18 +2391,33 @@ xfs_btree_lshift( xfs_btree_rec_addr(cur, 2, right), -1, rrecs); xfs_btree_log_recs(cur, rbp, 1, rrecs); + } - /* - * If it's the first record in the block, we'll need a key - * structure to pass up to the next level (updkey). - */ - cur->bc_ops->init_key_from_rec(&key, - xfs_btree_rec_addr(cur, 1, right)); - rkp = &key; + /* + * Using a temporary cursor, update the parent key values of the + * block on the left. + */ + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error1; + + /* Update the parent high keys of the left block, if needed. */ + error = xfs_btree_update_keys(tcur, level); + if (error) + goto error1; + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); } - /* Update the parent key values of right. 
*/ - error = xfs_btree_updkey(cur, rkp, level + 1); + /* Update the parent keys of the right block. */ + error = xfs_btree_update_keys(cur, level); if (error) goto error0; @@ -2169,6 +2436,11 @@ out0: error0: XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; + +error1: + XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; } /* @@ -2181,7 +2453,6 @@ xfs_btree_rshift( int level, int *stat) /* success/failure */ { - union xfs_btree_key key; /* btree key */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ struct xfs_buf *rbp; /* right buffer pointer */ @@ -2290,12 +2561,6 @@ xfs_btree_rshift( /* Now put the new data in, and log it. */ xfs_btree_copy_recs(cur, rrp, lrp, 1); xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); - - cur->bc_ops->init_key_from_rec(&key, rrp); - rkp = &key; - - ASSERT(cur->bc_ops->recs_inorder(cur, rrp, - xfs_btree_rec_addr(cur, 2, right))); } /* @@ -2315,13 +2580,21 @@ xfs_btree_rshift( if (error) goto error0; i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0); error = xfs_btree_increment(tcur, level, &i); if (error) goto error1; - error = xfs_btree_updkey(tcur, rkp, level + 1); + /* Update the parent high keys of the left block, if needed. */ + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + error = xfs_btree_update_keys(cur, level); + if (error) + goto error1; + } + + /* Update the parent keys of the right block. */ + error = xfs_btree_update_keys(tcur, level); if (error) goto error1; @@ -2422,6 +2695,11 @@ __xfs_btree_split( XFS_BTREE_STATS_ADD(cur, moves, rrecs); + /* Adjust numrecs for the later get_*_keys() calls. */ + lrecs -= rrecs; + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); + /* * Copy btree block entries from the left block over to the * new block, the right. 
Update the right block and log the @@ -2447,14 +2725,15 @@ __xfs_btree_split( } #endif + /* Copy the keys & pointers to the new block. */ xfs_btree_copy_keys(cur, rkp, lkp, rrecs); xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); xfs_btree_log_keys(cur, rbp, 1, rrecs); xfs_btree_log_ptrs(cur, rbp, 1, rrecs); - /* Grab the keys to the entries moved to the right block */ - xfs_btree_copy_keys(cur, key, rkp, 1); + /* Stash the keys of the new block for later insertion. */ + xfs_btree_get_node_keys(cur, right, key); } else { /* It's a leaf. Move records. */ union xfs_btree_rec *lrp; /* left record pointer */ @@ -2463,27 +2742,23 @@ __xfs_btree_split( lrp = xfs_btree_rec_addr(cur, src_index, left); rrp = xfs_btree_rec_addr(cur, 1, right); + /* Copy records to the new block. */ xfs_btree_copy_recs(cur, rrp, lrp, rrecs); xfs_btree_log_recs(cur, rbp, 1, rrecs); - cur->bc_ops->init_key_from_rec(key, - xfs_btree_rec_addr(cur, 1, right)); + /* Stash the keys of the new block for later insertion. */ + xfs_btree_get_leaf_keys(cur, right, key); } - /* * Find the left block number by looking in the buffer. - * Adjust numrecs, sibling pointers. + * Adjust sibling pointers. */ xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); - lrecs -= rrecs; - xfs_btree_set_numrecs(left, lrecs); - xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); - xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); @@ -2499,6 +2774,14 @@ __xfs_btree_split( xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); } + + /* Update the parent high keys of the left block, if needed. 
*/ + if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + error = xfs_btree_update_keys(cur, level); + if (error) + goto error0; + } + /* * If the cursor is really in the right block, move it there. * If it's just pointing past the last entry in left, then we'll @@ -2802,6 +3085,7 @@ xfs_btree_new_root( bp = lbp; nptr = 2; } + /* Fill in the new block's btree header and log it. */ xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); @@ -2810,19 +3094,24 @@ xfs_btree_new_root( /* Fill in the key data in the new root. */ if (xfs_btree_get_level(left) > 0) { - xfs_btree_copy_keys(cur, - xfs_btree_key_addr(cur, 1, new), - xfs_btree_key_addr(cur, 1, left), 1); - xfs_btree_copy_keys(cur, - xfs_btree_key_addr(cur, 2, new), - xfs_btree_key_addr(cur, 1, right), 1); + /* + * Get the keys for the left block's keys and put them directly + * in the parent block. Do the same for the right block. + */ + xfs_btree_get_node_keys(cur, left, + xfs_btree_key_addr(cur, 1, new)); + xfs_btree_get_node_keys(cur, right, + xfs_btree_key_addr(cur, 2, new)); } else { - cur->bc_ops->init_key_from_rec( - xfs_btree_key_addr(cur, 1, new), - xfs_btree_rec_addr(cur, 1, left)); - cur->bc_ops->init_key_from_rec( - xfs_btree_key_addr(cur, 2, new), - xfs_btree_rec_addr(cur, 1, right)); + /* + * Get the keys for the left block's records and put them + * directly in the parent block. Do the same for the right + * block. 
+ */ + xfs_btree_get_leaf_keys(cur, left, + xfs_btree_key_addr(cur, 1, new)); + xfs_btree_get_leaf_keys(cur, right, + xfs_btree_key_addr(cur, 2, new)); } xfs_btree_log_keys(cur, nbp, 1, 2); @@ -2858,10 +3147,9 @@ xfs_btree_make_block_unfull( int *index, /* new tree index */ union xfs_btree_ptr *nptr, /* new btree ptr */ struct xfs_btree_cur **ncur, /* new btree cursor */ - union xfs_btree_rec *nrec, /* new record */ + union xfs_btree_key *key, /* key of new block */ int *stat) { - union xfs_btree_key key; /* new btree key value */ int error = 0; if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && @@ -2871,6 +3159,7 @@ xfs_btree_make_block_unfull( if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + *stat = 1; } else { /* A root block that needs replacing */ int logflags = 0; @@ -2906,13 +3195,12 @@ xfs_btree_make_block_unfull( * If this works we have to re-set our variables because we * could be in a different block now. 
*/ - error = xfs_btree_split(cur, level, nptr, &key, ncur, stat); + error = xfs_btree_split(cur, level, nptr, key, ncur, stat); if (error || *stat == 0) return error; *index = cur->bc_ptrs[level]; - cur->bc_ops->init_rec_from_key(&key, nrec); return 0; } @@ -2925,16 +3213,17 @@ xfs_btree_insrec( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level to insert record at */ union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ - union xfs_btree_rec *recp, /* i/o: record data inserted */ + union xfs_btree_rec *rec, /* record to insert */ + union xfs_btree_key *key, /* i/o: block key for ptrp */ struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ int *stat) /* success/failure */ { struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ - union xfs_btree_key key; /* btree key */ union xfs_btree_ptr nptr; /* new block ptr */ struct xfs_btree_cur *ncur; /* new btree cursor */ - union xfs_btree_rec nrec; /* new record count */ + union xfs_btree_bigkey nkey; /* new block key */ + union xfs_btree_key *lkey; int optr; /* old key/record index */ int ptr; /* key/record index */ int numrecs;/* number of records */ @@ -2942,11 +3231,13 @@ xfs_btree_insrec( #ifdef DEBUG int i; #endif + xfs_daddr_t old_bn; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp); + XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec); ncur = NULL; + lkey = (union xfs_btree_key *)&nkey; /* * If we have an external root pointer, and we've made it to the @@ -2969,15 +3260,13 @@ xfs_btree_insrec( return 0; } - /* Make a key out of the record data to be inserted, and save it. */ - cur->bc_ops->init_key_from_rec(&key, recp); - optr = ptr; XFS_BTREE_STATS_INC(cur, insrec); /* Get pointers to the btree buffer and block. */ block = xfs_btree_get_block(cur, level, &bp); + old_bn = bp ? 
bp->b_bn : XFS_BUF_DADDR_NULL; numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG @@ -2988,10 +3277,10 @@ xfs_btree_insrec( /* Check that the new entry is being inserted in the right place. */ if (ptr <= numrecs) { if (level == 0) { - ASSERT(cur->bc_ops->recs_inorder(cur, recp, + ASSERT(cur->bc_ops->recs_inorder(cur, rec, xfs_btree_rec_addr(cur, ptr, block))); } else { - ASSERT(cur->bc_ops->keys_inorder(cur, &key, + ASSERT(cur->bc_ops->keys_inorder(cur, key, xfs_btree_key_addr(cur, ptr, block))); } } @@ -3004,7 +3293,7 @@ xfs_btree_insrec( xfs_btree_set_ptr_null(cur, &nptr); if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { error = xfs_btree_make_block_unfull(cur, level, numrecs, - &optr, &ptr, &nptr, &ncur, &nrec, stat); + &optr, &ptr, &nptr, &ncur, lkey, stat); if (error || *stat == 0) goto error0; } @@ -3054,7 +3343,7 @@ xfs_btree_insrec( #endif /* Now put the new data in, bump numrecs and log it. */ - xfs_btree_copy_keys(cur, kp, &key, 1); + xfs_btree_copy_keys(cur, kp, key, 1); xfs_btree_copy_ptrs(cur, pp, ptrp, 1); numrecs++; xfs_btree_set_numrecs(block, numrecs); @@ -3075,7 +3364,7 @@ xfs_btree_insrec( xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); /* Now put the new data in, bump numrecs and log it. */ - xfs_btree_copy_recs(cur, rp, recp, 1); + xfs_btree_copy_recs(cur, rp, rec, 1); xfs_btree_set_numrecs(block, ++numrecs); xfs_btree_log_recs(cur, bp, ptr, numrecs); #ifdef DEBUG @@ -3089,9 +3378,18 @@ xfs_btree_insrec( /* Log the new number of records in the btree header. */ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); - /* If we inserted at the start of a block, update the parents' keys. */ - if (optr == 1) { - error = xfs_btree_updkey(cur, &key, level + 1); + /* + * If we just inserted into a new tree block, we have to + * recalculate nkey here because nkey is out of date. + * + * Otherwise we're just updating an existing block (having shoved + * some records into the new tree block), so use the regular key + * update mechanism. 
+ */ + if (bp && bp->b_bn != old_bn) { + xfs_btree_get_keys(cur, block, lkey); + } else if (xfs_btree_needs_key_update(cur, optr)) { + error = xfs_btree_update_keys(cur, level); if (error) goto error0; } @@ -3101,7 +3399,7 @@ xfs_btree_insrec( * we are at the far right edge of the tree, update it. */ if (xfs_btree_is_lastrec(cur, block, level)) { - cur->bc_ops->update_lastrec(cur, block, recp, + cur->bc_ops->update_lastrec(cur, block, rec, ptr, LASTREC_INSREC); } @@ -3111,7 +3409,7 @@ xfs_btree_insrec( */ *ptrp = nptr; if (!xfs_btree_ptr_is_null(cur, &nptr)) { - *recp = nrec; + xfs_btree_copy_keys(cur, key, lkey, 1); *curp = ncur; } @@ -3142,14 +3440,20 @@ xfs_btree_insert( union xfs_btree_ptr nptr; /* new block number (split result) */ struct xfs_btree_cur *ncur; /* new cursor (split result) */ struct xfs_btree_cur *pcur; /* previous level's cursor */ + union xfs_btree_bigkey bkey; /* key of block to insert */ + union xfs_btree_key *key; union xfs_btree_rec rec; /* record to insert */ level = 0; ncur = NULL; pcur = cur; + key = (union xfs_btree_key *)&bkey; xfs_btree_set_ptr_null(cur, &nptr); + + /* Make a key out of the record data to be inserted, and save it. */ cur->bc_ops->init_rec_from_cur(cur, &rec); + cur->bc_ops->init_key_from_rec(key, &rec); /* * Loop going up the tree, starting at the leaf level. @@ -3161,7 +3465,8 @@ xfs_btree_insert( * Insert nrec/nptr into this level of the tree. * Note if we fail, nptr will be null. 
*/ - error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i); + error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, + &ncur, &i); if (error) { if (pcur != cur) xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); @@ -3385,8 +3690,6 @@ xfs_btree_delrec( struct xfs_buf *bp; /* buffer for block */ int error; /* error return value */ int i; /* loop counter */ - union xfs_btree_key key; /* storage for keyp */ - union xfs_btree_key *keyp = &key; /* passed to the next level */ union xfs_btree_ptr lptr; /* left sibling block ptr */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ @@ -3457,13 +3760,6 @@ xfs_btree_delrec( xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); } - - /* - * If it's the first record in the block, we'll need to pass a - * key up to the next level (updkey). - */ - if (ptr == 1) - keyp = xfs_btree_key_addr(cur, 1, block); } else { /* It's a leaf. operate on records */ if (ptr < numrecs) { @@ -3472,16 +3768,6 @@ xfs_btree_delrec( -1, numrecs - ptr); xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); } - - /* - * If it's the first record in the block, we'll need a key - * structure to pass up to the next level (updkey). - */ - if (ptr == 1) { - cur->bc_ops->init_key_from_rec(&key, - xfs_btree_rec_addr(cur, 1, block)); - keyp = &key; - } } /* @@ -3548,8 +3834,8 @@ xfs_btree_delrec( * If we deleted the leftmost entry in the block, update the * key values above us in the tree. */ - if (ptr == 1) { - error = xfs_btree_updkey(cur, keyp, level + 1); + if (xfs_btree_needs_key_update(cur, ptr)) { + error = xfs_btree_update_keys(cur, level); if (error) goto error0; } @@ -3878,6 +4164,16 @@ xfs_btree_delrec( if (level > 0) cur->bc_ptrs[level]--; + /* + * We combined blocks, so we have to update the parent keys if the + * btree supports overlapped intervals. However, bc_ptrs[level + 1] + * points to the old block so that the caller knows which record to + * delete. 
Therefore, the caller must be savvy enough to call updkeys + * for us if we return stat == 2. The other exit points from this + * function don't require deletions further up the tree, so they can + * call updkeys directly. + */ + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); /* Return value means the next level up has something to do. */ *stat = 2; @@ -3903,6 +4199,7 @@ xfs_btree_delete( int error; /* error return value */ int level; int i; + bool joined = false; XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); @@ -3916,6 +4213,18 @@ xfs_btree_delete( error = xfs_btree_delrec(cur, level, &i); if (error) goto error0; + if (i == 2) + joined = true; + } + + /* + * If we combined blocks as part of deleting the record, delrec won't + * have updated the parent high keys so we have to do that here. + */ + if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) { + error = xfs_btree_updkeys_force(cur, 0); + if (error) + goto error0; } if (i == 0) { @@ -3978,6 +4287,81 @@ xfs_btree_get_rec( return 0; } +/* Visit a block in a btree. */ +STATIC int +xfs_btree_visit_block( + struct xfs_btree_cur *cur, + int level, + xfs_btree_visit_blocks_fn fn, + void *data) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_ptr rptr; + int error; + + /* do right sibling readahead */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + block = xfs_btree_get_block(cur, level, &bp); + + /* process the block */ + error = fn(cur, level, data); + if (error) + return error; + + /* now read rh sibling block for next iteration */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + return -ENOENT; + + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); +} + + +/* Visit every block in a btree. 
*/ +int +xfs_btree_visit_blocks( + struct xfs_btree_cur *cur, + xfs_btree_visit_blocks_fn fn, + void *data) +{ + union xfs_btree_ptr lptr; + int level; + struct xfs_btree_block *block = NULL; + int error = 0; + + cur->bc_ops->init_ptr_from_cur(cur, &lptr); + + /* for each level */ + for (level = cur->bc_nlevels - 1; level >= 0; level--) { + /* grab the left hand block */ + error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); + if (error) + return error; + + /* readahead the left most block for the next level down */ + if (level > 0) { + union xfs_btree_ptr *ptr; + + ptr = xfs_btree_ptr_addr(cur, 1, block); + xfs_btree_readahead_ptr(cur, ptr, 1); + + /* save for the next iteration of the loop */ + lptr = *ptr; + } + + /* for each buffer in the level */ + do { + error = xfs_btree_visit_block(cur, level, fn, data); + } while (!error); + + if (error != -ENOENT) + return error; + } + + return 0; +} + /* * Change the owner of a btree. * @@ -4002,26 +4386,27 @@ xfs_btree_get_rec( * just queue the modified buffer as delayed write buffer so the transaction * recovery completion writes the changes to disk. 
*/ +struct xfs_btree_block_change_owner_info { + __uint64_t new_owner; + struct list_head *buffer_list; +}; + static int xfs_btree_block_change_owner( struct xfs_btree_cur *cur, int level, - __uint64_t new_owner, - struct list_head *buffer_list) + void *data) { + struct xfs_btree_block_change_owner_info *bbcoi = data; struct xfs_btree_block *block; struct xfs_buf *bp; - union xfs_btree_ptr rptr; - - /* do right sibling readahead */ - xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - block->bb_u.l.bb_owner = cpu_to_be64(new_owner); + block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); else - block->bb_u.s.bb_owner = cpu_to_be32(new_owner); + block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); /* * If the block is a root block hosted in an inode, we might not have a @@ -4035,19 +4420,14 @@ xfs_btree_block_change_owner( xfs_trans_ordered_buf(cur->bc_tp, bp); xfs_btree_log_block(cur, bp, XFS_BB_OWNER); } else { - xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_delwri_queue(bp, bbcoi->buffer_list); } } else { ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(level == cur->bc_nlevels - 1); } - /* now read rh sibling block for next iteration */ - xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); - if (xfs_btree_ptr_is_null(cur, &rptr)) - return -ENOENT; - - return xfs_btree_lookup_get_block(cur, level, &rptr, &block); + return 0; } int @@ -4056,43 +4436,13 @@ xfs_btree_change_owner( __uint64_t new_owner, struct list_head *buffer_list) { - union xfs_btree_ptr lptr; - int level; - struct xfs_btree_block *block = NULL; - int error = 0; - - cur->bc_ops->init_ptr_from_cur(cur, &lptr); - - /* for each level */ - for (level = cur->bc_nlevels - 1; level >= 0; level--) { - /* grab the left hand block */ - error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); - if (error) - return error; - - /* readahead the left most block for the next 
level down */ - if (level > 0) { - union xfs_btree_ptr *ptr; - - ptr = xfs_btree_ptr_addr(cur, 1, block); - xfs_btree_readahead_ptr(cur, ptr, 1); - - /* save for the next iteration of the loop */ - lptr = *ptr; - } - - /* for each buffer in the level */ - do { - error = xfs_btree_block_change_owner(cur, level, - new_owner, - buffer_list); - } while (!error); + struct xfs_btree_block_change_owner_info bbcoi; - if (error != -ENOENT) - return error; - } + bbcoi.new_owner = new_owner; + bbcoi.buffer_list = buffer_list; - return 0; + return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, + &bbcoi); } /** @@ -4171,3 +4521,267 @@ xfs_btree_compute_maxlevels( maxblocks = (maxblocks + limits[1] - 1) / limits[1]; return level; } + +/* + * Query a regular btree for all records overlapping a given interval. + * Start with a LE lookup of the key of low_rec and return all records + * until we find a record with a key greater than the key of high_rec. + */ +STATIC int +xfs_btree_simple_query_range( + struct xfs_btree_cur *cur, + union xfs_btree_key *low_key, + union xfs_btree_key *high_key, + xfs_btree_query_range_fn fn, + void *priv) +{ + union xfs_btree_rec *recp; + union xfs_btree_key rec_key; + __int64_t diff; + int stat; + bool firstrec = true; + int error; + + ASSERT(cur->bc_ops->init_high_key_from_rec); + ASSERT(cur->bc_ops->diff_two_keys); + + /* + * Find the leftmost record. The btree cursor must be set + * to the low record used to generate low_key. + */ + stat = 0; + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); + if (error) + goto out; + + while (stat) { + /* Find the record. */ + error = xfs_btree_get_rec(cur, &recp, &stat); + if (error || !stat) + break; + cur->bc_ops->init_high_key_from_rec(&rec_key, recp); + + /* Skip if high_key(rec) < low_key. */ + if (firstrec) { + firstrec = false; + diff = cur->bc_ops->diff_two_keys(cur, low_key, + &rec_key); + if (diff > 0) + goto advloop; + } + + /* Stop if high_key < low_key(rec). 
*/ + diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); + if (diff > 0) + break; + + /* Callback */ + error = fn(cur, recp, priv); + if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT) + break; + +advloop: + /* Move on to the next record. */ + error = xfs_btree_increment(cur, 0, &stat); + if (error) + break; + } + +out: + return error; +} + +/* + * Query an overlapped interval btree for all records overlapping a given + * interval. This function roughly follows the algorithm given in + * "Interval Trees" of _Introduction to Algorithms_, which is section + * 14.3 in the 2nd and 3rd editions. + * + * First, generate keys for the low and high records passed in. + * + * For any leaf node, generate the high and low keys for the record. + * If the record keys overlap with the query low/high keys, pass the + * record to the function iterator. + * + * For any internal node, compare the low and high keys of each + * pointer against the query low/high keys. If there's an overlap, + * follow the pointer. + * + * As an optimization, we stop scanning a block when we find a low key + * that is greater than the query's high key. + */ +STATIC int +xfs_btree_overlapped_query_range( + struct xfs_btree_cur *cur, + union xfs_btree_key *low_key, + union xfs_btree_key *high_key, + xfs_btree_query_range_fn fn, + void *priv) +{ + union xfs_btree_ptr ptr; + union xfs_btree_ptr *pp; + union xfs_btree_key rec_key; + union xfs_btree_key rec_hkey; + union xfs_btree_key *lkp; + union xfs_btree_key *hkp; + union xfs_btree_rec *recp; + struct xfs_btree_block *block; + __int64_t ldiff; + __int64_t hdiff; + int level; + struct xfs_buf *bp; + int i; + int error; + + /* Load the root of the btree. 
*/ + level = cur->bc_nlevels - 1; + cur->bc_ops->init_ptr_from_cur(cur, &ptr); + error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); + if (error) + return error; + xfs_btree_get_block(cur, level, &bp); + trace_xfs_btree_overlapped_query_range(cur, level, bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto out; +#endif + cur->bc_ptrs[level] = 1; + + while (level < cur->bc_nlevels) { + block = xfs_btree_get_block(cur, level, &bp); + + /* End of node, pop back towards the root. */ + if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) { +pop_up: + if (level < cur->bc_nlevels - 1) + cur->bc_ptrs[level + 1]++; + level++; + continue; + } + + if (level == 0) { + /* Handle a leaf node. */ + recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); + + cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); + ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey, + low_key); + + cur->bc_ops->init_key_from_rec(&rec_key, recp); + hdiff = cur->bc_ops->diff_two_keys(cur, high_key, + &rec_key); + + /* + * If (record's high key >= query's low key) and + * (query's high key >= record's low key), then + * this record overlaps the query range; callback. + */ + if (ldiff >= 0 && hdiff >= 0) { + error = fn(cur, recp, priv); + if (error < 0 || + error == XFS_BTREE_QUERY_RANGE_ABORT) + break; + } else if (hdiff < 0) { + /* Record is larger than high key; pop. */ + goto pop_up; + } + cur->bc_ptrs[level]++; + continue; + } + + /* Handle an internal node. */ + lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block); + hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block); + pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block); + + ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key); + hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp); + + /* + * If (pointer's high key >= query's low key) and + * (query's high key >= pointer's low key), then + * this record overlaps the query range; follow pointer. 
+ */ + if (ldiff >= 0 && hdiff >= 0) { + level--; + error = xfs_btree_lookup_get_block(cur, level, pp, + &block); + if (error) + goto out; + xfs_btree_get_block(cur, level, &bp); + trace_xfs_btree_overlapped_query_range(cur, level, bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto out; +#endif + cur->bc_ptrs[level] = 1; + continue; + } else if (hdiff < 0) { + /* The low key is larger than the upper range; pop. */ + goto pop_up; + } + cur->bc_ptrs[level]++; + } + +out: + /* + * If we don't end this function with the cursor pointing at a record + * block, a subsequent non-error cursor deletion will not release + * node-level buffers, causing a buffer leak. This is quite possible + * with a zero-results range query, so release the buffers if we + * failed to return any results. + */ + if (cur->bc_bufs[0] == NULL) { + for (i = 0; i < cur->bc_nlevels; i++) { + if (cur->bc_bufs[i]) { + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); + cur->bc_bufs[i] = NULL; + cur->bc_ptrs[i] = 0; + cur->bc_ra[i] = 0; + } + } + } + + return error; +} + +/* + * Query a btree for all records overlapping a given interval of keys. The + * supplied function will be called with each record found; return one of the + * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error + * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a + * negative error code. + */ +int +xfs_btree_query_range( + struct xfs_btree_cur *cur, + union xfs_btree_irec *low_rec, + union xfs_btree_irec *high_rec, + xfs_btree_query_range_fn fn, + void *priv) +{ + union xfs_btree_rec rec; + union xfs_btree_key low_key; + union xfs_btree_key high_key; + + /* Find the keys of both ends of the interval. 
*/ + cur->bc_rec = *high_rec; + cur->bc_ops->init_rec_from_cur(cur, &rec); + cur->bc_ops->init_key_from_rec(&high_key, &rec); + + cur->bc_rec = *low_rec; + cur->bc_ops->init_rec_from_cur(cur, &rec); + cur->bc_ops->init_key_from_rec(&low_key, &rec); + + /* Enforce low key < high key. */ + if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0) + return -EINVAL; + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return xfs_btree_simple_query_range(cur, &low_key, + &high_key, fn, priv); + return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, + fn, priv); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 785a99682159..04d0865e5e6d 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -19,7 +19,7 @@ #define __XFS_BTREE_H__ struct xfs_buf; -struct xfs_bmap_free; +struct xfs_defer_ops; struct xfs_inode; struct xfs_mount; struct xfs_trans; @@ -38,17 +38,37 @@ union xfs_btree_ptr { }; union xfs_btree_key { - xfs_bmbt_key_t bmbt; - xfs_bmdr_key_t bmbr; /* bmbt root block */ - xfs_alloc_key_t alloc; - xfs_inobt_key_t inobt; + struct xfs_bmbt_key bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + struct xfs_inobt_key inobt; + struct xfs_rmap_key rmap; +}; + +/* + * In-core key that holds both low and high keys for overlapped btrees. + * The two keys are packed next to each other on disk, so do the same + * in memory. Preserve the existing xfs_btree_key as a single key to + * avoid the mental model breakage that would happen if we passed a + * bigkey into a function that operates on a single key. 
+ */ +union xfs_btree_bigkey { + struct xfs_bmbt_key bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + struct xfs_inobt_key inobt; + struct { + struct xfs_rmap_key rmap; + struct xfs_rmap_key rmap_hi; + }; }; union xfs_btree_rec { - xfs_bmbt_rec_t bmbt; - xfs_bmdr_rec_t bmbr; /* bmbt root block */ - xfs_alloc_rec_t alloc; - xfs_inobt_rec_t inobt; + struct xfs_bmbt_rec bmbt; + xfs_bmdr_rec_t bmbr; /* bmbt root block */ + struct xfs_alloc_rec alloc; + struct xfs_inobt_rec inobt; + struct xfs_rmap_rec rmap; }; /* @@ -63,6 +83,7 @@ union xfs_btree_rec { #define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) #define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) +#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) /* * For logging record fields. @@ -95,6 +116,7 @@ do { \ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ + case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -115,11 +137,13 @@ do { \ __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ case XFS_BTNUM_FINO: \ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ + case XFS_BTNUM_RMAP: \ + __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) -#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ +#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ struct xfs_btree_ops { /* size of the key and record structures */ @@ -158,17 +182,25 @@ struct xfs_btree_ops { /* init values of btree structures */ void (*init_key_from_rec)(union xfs_btree_key *key, union xfs_btree_rec *rec); - void (*init_rec_from_key)(union xfs_btree_key *key, - union xfs_btree_rec *rec); void (*init_rec_from_cur)(struct 
xfs_btree_cur *cur, union xfs_btree_rec *rec); void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); + void (*init_high_key_from_rec)(union xfs_btree_key *key, + union xfs_btree_rec *rec); /* difference between key value and cursor value */ __int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); + /* + * Difference between key2 and key1 -- positive if key1 > key2, + * negative if key1 < key2, and zero if equal. + */ + __int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, + union xfs_btree_key *key1, + union xfs_btree_key *key2); + const struct xfs_buf_ops *buf_ops; #if defined(DEBUG) || defined(XFS_WARN) @@ -192,6 +224,13 @@ struct xfs_btree_ops { #define LASTREC_DELREC 2 +union xfs_btree_irec { + struct xfs_alloc_rec_incore a; + struct xfs_bmbt_irec b; + struct xfs_inobt_rec_incore i; + struct xfs_rmap_irec r; +}; + /* * Btree cursor structure. * This collects all information needed by the btree code in one place. @@ -202,11 +241,7 @@ typedef struct xfs_btree_cur struct xfs_mount *bc_mp; /* file system mount struct */ const struct xfs_btree_ops *bc_ops; uint bc_flags; /* btree features - below */ - union { - xfs_alloc_rec_incore_t a; - xfs_bmbt_irec_t b; - xfs_inobt_rec_incore_t i; - } bc_rec; /* current insert/search record value */ + union xfs_btree_irec bc_rec; /* current insert/search record value */ struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ @@ -218,11 +253,12 @@ typedef struct xfs_btree_cur union { struct { /* needed for BNO, CNT, INO */ struct xfs_buf *agbp; /* agf/agi buffer pointer */ + struct xfs_defer_ops *dfops; /* deferred updates */ xfs_agnumber_t agno; /* ag number */ } a; struct { /* needed for BMAP */ struct xfs_inode *ip; /* pointer to our inode */ - struct xfs_bmap_free *flist; /* list to free after */ + struct xfs_defer_ops *dfops; /* deferred updates */ 
xfs_fsblock_t firstblock; /* 1st blk allocated */ int allocated; /* count of alloced */ short forksize; /* fork's inode space */ @@ -238,6 +274,7 @@ typedef struct xfs_btree_cur #define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ #define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ +#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ #define XFS_BTREE_NOERROR 0 @@ -477,4 +514,19 @@ bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, unsigned long len); +/* return codes */ +#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ +#define XFS_BTREE_QUERY_RANGE_ABORT 1 /* stop iterating */ +typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, void *priv); + +int xfs_btree_query_range(struct xfs_btree_cur *cur, + union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec, + xfs_btree_query_range_fn fn, void *priv); + +typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, + void *data); +int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, + xfs_btree_visit_blocks_fn fn, void *data); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 0f1f165f4048..f2dc1a950c85 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2029,7 +2029,7 @@ xfs_da_grow_inode_int( error = xfs_bmapi_write(tp, dp, *bno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, - args->flist); + args->dfops); if (error) return error; @@ -2052,7 +2052,7 @@ xfs_da_grow_inode_int( error = xfs_bmapi_write(tp, dp, b, c, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->firstblock, args->total, - &mapp[mapi], &nmap, args->flist); + &mapp[mapi], &nmap, args->dfops); if (error) goto out_free_map; if 
(nmap < 1) @@ -2362,7 +2362,7 @@ xfs_da_shrink_inode( */ error = xfs_bunmapi(tp, dp, dead_blkno, count, xfs_bmapi_aflag(w), 0, args->firstblock, - args->flist, &done); + args->dfops, &done); if (error == -ENOSPC) { if (w != XFS_DATA_FORK) break; diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 6e153e399a77..98c75cbe6ac2 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -19,7 +19,7 @@ #ifndef __XFS_DA_BTREE_H__ #define __XFS_DA_BTREE_H__ -struct xfs_bmap_free; +struct xfs_defer_ops; struct xfs_inode; struct xfs_trans; struct zone; @@ -70,7 +70,7 @@ typedef struct xfs_da_args { xfs_ino_t inumber; /* input/output inode number */ struct xfs_inode *dp; /* directory inode to manipulate */ xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */ - struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */ + struct xfs_defer_ops *dfops; /* ptr to freelist for bmap_finish */ struct xfs_trans *trans; /* current trans (changes over time) */ xfs_extlen_t total; /* total blocks needed, for 1st bmap */ int whichfork; /* data or attribute fork */ diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 685f23b67056..9a492a9e19bd 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -629,6 +629,7 @@ typedef struct xfs_attr_shortform { struct xfs_attr_sf_hdr { /* constant-structure header block */ __be16 totsize; /* total bytes in shortform list */ __u8 count; /* count of active entries */ + __u8 padding; } hdr; struct xfs_attr_sf_entry { __uint8_t namelen; /* actual length of name (no NULL) */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c new file mode 100644 index 000000000000..054a2032fdb3 --- /dev/null +++ b/fs/xfs/libxfs/xfs_defer.c @@ -0,0 +1,463 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trace.h" + +/* + * Deferred Operations in XFS + * + * Due to the way locking rules work in XFS, certain transactions (block + * mapping and unmapping, typically) have permanent reservations so that + * we can roll the transaction to adhere to AG locking order rules and + * to unlock buffers between metadata updates. Prior to rmap/reflink, + * the mapping code had a mechanism to perform these deferrals for + * extents that were going to be freed; this code makes that facility + * more generic. + * + * When adding the reverse mapping and reflink features, it became + * necessary to perform complex remapping multi-transactions to comply + * with AG locking order rules, and to be able to spread a single + * refcount update operation (an operation on an n-block extent can + * update as many as n records!) among multiple transactions. 
XFS can + * roll a transaction to facilitate this, but using this facility + * requires us to log "intent" items in case log recovery needs to + * redo the operation, and to log "done" items to indicate that redo + * is not necessary. + * + * Deferred work is tracked in xfs_defer_pending items. Each pending + * item tracks one type of deferred work. Incoming work items (which + * have not yet had an intent logged) are attached to a pending item + * on the dop_intake list, where they wait for the caller to finish + * the deferred operations. + * + * Finishing a set of deferred operations is an involved process. To + * start, we define "rolling a deferred-op transaction" as follows: + * + * > For each xfs_defer_pending item on the dop_intake list, + * - Sort the work items in AG order. XFS locking + * order rules require us to lock buffers in AG order. + * - Create a log intent item for that type. + * - Attach it to the pending item. + * - Move the pending item from the dop_intake list to the + * dop_pending list. + * > Roll the transaction. + * + * NOTE: To avoid exceeding the transaction reservation, we limit the + * number of items that we attach to a given xfs_defer_pending. + * + * The actual finishing process looks like this: + * + * > For each xfs_defer_pending in the dop_pending list, + * - Roll the deferred-op transaction as above. + * - Create a log done item for that type, and attach it to the + * log intent item. + * - For each work item attached to the log intent item, + * * Perform the described action. + * * Attach the work item to the log done item. + * + * The key here is that we must log an intent item for all pending + * work items every time we roll the transaction, and that we must log + * a done item as soon as the work is completed. With this mechanism + * we can perform complex remapping operations, chaining intent items + * as needed. 
+ * + * This is an example of remapping the extent (E, E+B) into file X at + * offset A and dealing with the extent (C, C+B) already being mapped + * there: + * +-------------------------------------------------+ + * | Unmap file X startblock C offset A length B | t0 + * | Intent to reduce refcount for extent (C, B) | + * | Intent to remove rmap (X, C, A, B) | + * | Intent to free extent (D, 1) (bmbt block) | + * | Intent to map (X, A, B) at startblock E | + * +-------------------------------------------------+ + * | Map file X startblock E offset A length B | t1 + * | Done mapping (X, E, A, B) | + * | Intent to increase refcount for extent (E, B) | + * | Intent to add rmap (X, E, A, B) | + * +-------------------------------------------------+ + * | Reduce refcount for extent (C, B) | t2 + * | Done reducing refcount for extent (C, B) | + * | Increase refcount for extent (E, B) | + * | Done increasing refcount for extent (E, B) | + * | Intent to free extent (C, B) | + * | Intent to free extent (F, 1) (refcountbt block) | + * | Intent to remove rmap (F, 1, REFC) | + * +-------------------------------------------------+ + * | Remove rmap (X, C, A, B) | t3 + * | Done removing rmap (X, C, A, B) | + * | Add rmap (X, E, A, B) | + * | Done adding rmap (X, E, A, B) | + * | Remove rmap (F, 1, REFC) | + * | Done removing rmap (F, 1, REFC) | + * +-------------------------------------------------+ + * | Free extent (C, B) | t4 + * | Done freeing extent (C, B) | + * | Free extent (D, 1) | + * | Done freeing extent (D, 1) | + * | Free extent (F, 1) | + * | Done freeing extent (F, 1) | + * +-------------------------------------------------+ + * + * If we should crash before t2 commits, log recovery replays + * the following intent items: + * + * - Intent to reduce refcount for extent (C, B) + * - Intent to remove rmap (X, C, A, B) + * - Intent to free extent (D, 1) (bmbt block) + * - Intent to increase refcount for extent (E, B) + * - Intent to add rmap (X, E, A, B) + * + * In the 
process of recovering, it should also generate and take care + * of these intent items: + * + * - Intent to free extent (C, B) + * - Intent to free extent (F, 1) (refcountbt block) + * - Intent to remove rmap (F, 1, REFC) + */ + +static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; + +/* + * For each pending item in the intake list, log its intent item and the + * associated extents, then add the entire intake list to the end of + * the pending list. + */ +STATIC void +xfs_defer_intake_work( + struct xfs_trans *tp, + struct xfs_defer_ops *dop) +{ + struct list_head *li; + struct xfs_defer_pending *dfp; + + list_for_each_entry(dfp, &dop->dop_intake, dfp_list) { + trace_xfs_defer_intake_work(tp->t_mountp, dfp); + dfp->dfp_intent = dfp->dfp_type->create_intent(tp, + dfp->dfp_count); + list_sort(tp->t_mountp, &dfp->dfp_work, + dfp->dfp_type->diff_items); + list_for_each(li, &dfp->dfp_work) + dfp->dfp_type->log_item(tp, dfp->dfp_intent, li); + } + + list_splice_tail_init(&dop->dop_intake, &dop->dop_pending); +} + +/* Abort all the intents that were committed. */ +STATIC void +xfs_defer_trans_abort( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + int error) +{ + struct xfs_defer_pending *dfp; + + trace_xfs_defer_trans_abort(tp->t_mountp, dop); + /* + * If the transaction was committed, drop the intent reference + * since we're bailing out of here. The other reference is + * dropped when the intent hits the AIL. If the transaction + * was not committed, the intent is freed by the intent item + * unlock handler on abort. + */ + if (!dop->dop_committed) + return; + + /* Abort intent items. */ + list_for_each_entry(dfp, &dop->dop_pending, dfp_list) { + trace_xfs_defer_pending_abort(tp->t_mountp, dfp); + if (dfp->dfp_committed) + dfp->dfp_type->abort_intent(dfp->dfp_intent); + } + + /* Shut down FS. */ + xfs_force_shutdown(tp->t_mountp, (error == -EFSCORRUPTED) ? 
+ SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR); +} + +/* Roll a transaction so we can do some deferred op processing. */ +STATIC int +xfs_defer_trans_roll( + struct xfs_trans **tp, + struct xfs_defer_ops *dop, + struct xfs_inode *ip) +{ + int i; + int error; + + /* Log all the joined inodes except the one we passed in. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { + if (dop->dop_inodes[i] == ip) + continue; + xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); + } + + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); + + /* Roll the transaction. */ + error = xfs_trans_roll(tp, ip); + if (error) { + trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error); + xfs_defer_trans_abort(*tp, dop, error); + return error; + } + dop->dop_committed = true; + + /* Rejoin the joined inodes except the one we passed in. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { + if (dop->dop_inodes[i] == ip) + continue; + xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); + } + + return error; +} + +/* Do we have any work items to finish? */ +bool +xfs_defer_has_unfinished_work( + struct xfs_defer_ops *dop) +{ + return !list_empty(&dop->dop_pending) || !list_empty(&dop->dop_intake); +} + +/* + * Add this inode to the deferred op. Each joined inode is relogged + * each time we roll the transaction, in addition to any inode passed + * to xfs_defer_finish(). + */ +int +xfs_defer_join( + struct xfs_defer_ops *dop, + struct xfs_inode *ip) +{ + int i; + + for (i = 0; i < XFS_DEFER_OPS_NR_INODES; i++) { + if (dop->dop_inodes[i] == ip) + return 0; + else if (dop->dop_inodes[i] == NULL) { + dop->dop_inodes[i] = ip; + return 0; + } + } + + return -EFSCORRUPTED; +} + +/* + * Finish all the pending work. 
This involves logging intent items for + * any work items that wandered in since the last transaction roll (if + * one has even happened), rolling the transaction, and finishing the + * work items in the first item on the logged-and-pending list. + * + * If an inode is provided, relog it to the new transaction. + */ +int +xfs_defer_finish( + struct xfs_trans **tp, + struct xfs_defer_ops *dop, + struct xfs_inode *ip) +{ + struct xfs_defer_pending *dfp; + struct list_head *li; + struct list_head *n; + void *done_item = NULL; + void *state; + int error = 0; + void (*cleanup_fn)(struct xfs_trans *, void *, int); + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + + trace_xfs_defer_finish((*tp)->t_mountp, dop); + + /* Until we run out of pending work to finish... */ + while (xfs_defer_has_unfinished_work(dop)) { + /* Log intents for work items sitting in the intake. */ + xfs_defer_intake_work(*tp, dop); + + /* Roll the transaction. */ + error = xfs_defer_trans_roll(tp, dop, ip); + if (error) + goto out; + + /* Mark all pending intents as committed. */ + list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) { + if (dfp->dfp_committed) + break; + trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp); + dfp->dfp_committed = true; + } + + /* Log an intent-done item for the first pending item. */ + dfp = list_first_entry(&dop->dop_pending, + struct xfs_defer_pending, dfp_list); + trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); + done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, + dfp->dfp_count); + cleanup_fn = dfp->dfp_type->finish_cleanup; + + /* Finish the work items. */ + state = NULL; + list_for_each_safe(li, n, &dfp->dfp_work) { + list_del(li); + dfp->dfp_count--; + error = dfp->dfp_type->finish_item(*tp, dop, li, + done_item, &state); + if (error) { + /* + * Clean up after ourselves and jump out. + * xfs_defer_cancel will take care of freeing + * all these lists and stuff. 
+ */ + if (cleanup_fn) + cleanup_fn(*tp, state, error); + xfs_defer_trans_abort(*tp, dop, error); + goto out; + } + } + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); + kmem_free(dfp); + + if (cleanup_fn) + cleanup_fn(*tp, state, error); + } + +out: + if (error) + trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error); + else + trace_xfs_defer_finish_done((*tp)->t_mountp, dop); + return error; +} + +/* + * Free up any items left in the list. + */ +void +xfs_defer_cancel( + struct xfs_defer_ops *dop) +{ + struct xfs_defer_pending *dfp; + struct xfs_defer_pending *pli; + struct list_head *pwi; + struct list_head *n; + + trace_xfs_defer_cancel(NULL, dop); + + /* + * Free the pending items. Caller should already have arranged + * for the intent items to be released. + */ + list_for_each_entry_safe(dfp, pli, &dop->dop_intake, dfp_list) { + trace_xfs_defer_intake_cancel(NULL, dfp); + list_del(&dfp->dfp_list); + list_for_each_safe(pwi, n, &dfp->dfp_work) { + list_del(pwi); + dfp->dfp_count--; + dfp->dfp_type->cancel_item(pwi); + } + ASSERT(dfp->dfp_count == 0); + kmem_free(dfp); + } + list_for_each_entry_safe(dfp, pli, &dop->dop_pending, dfp_list) { + trace_xfs_defer_pending_cancel(NULL, dfp); + list_del(&dfp->dfp_list); + list_for_each_safe(pwi, n, &dfp->dfp_work) { + list_del(pwi); + dfp->dfp_count--; + dfp->dfp_type->cancel_item(pwi); + } + ASSERT(dfp->dfp_count == 0); + kmem_free(dfp); + } +} + +/* Add an item for later deferred processing. */ +void +xfs_defer_add( + struct xfs_defer_ops *dop, + enum xfs_defer_ops_type type, + struct list_head *li) +{ + struct xfs_defer_pending *dfp = NULL; + + /* + * Add the item to a pending item at the end of the intake list. + * If the last pending item has the same type, reuse it. Else, + * create a new pending item at the end of the intake list. 
+ */ + if (!list_empty(&dop->dop_intake)) { + dfp = list_last_entry(&dop->dop_intake, + struct xfs_defer_pending, dfp_list); + if (dfp->dfp_type->type != type || + (dfp->dfp_type->max_items && + dfp->dfp_count >= dfp->dfp_type->max_items)) + dfp = NULL; + } + if (!dfp) { + dfp = kmem_alloc(sizeof(struct xfs_defer_pending), + KM_SLEEP | KM_NOFS); + dfp->dfp_type = defer_op_types[type]; + dfp->dfp_committed = false; + dfp->dfp_intent = NULL; + dfp->dfp_count = 0; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, &dop->dop_intake); + } + + list_add_tail(li, &dfp->dfp_work); + dfp->dfp_count++; +} + +/* Initialize a deferred operation list. */ +void +xfs_defer_init_op_type( + const struct xfs_defer_op_type *type) +{ + defer_op_types[type->type] = type; +} + +/* Initialize a deferred operation. */ +void +xfs_defer_init( + struct xfs_defer_ops *dop, + xfs_fsblock_t *fbp) +{ + dop->dop_committed = false; + dop->dop_low = false; + memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes)); + *fbp = NULLFSBLOCK; + INIT_LIST_HEAD(&dop->dop_intake); + INIT_LIST_HEAD(&dop->dop_pending); + trace_xfs_defer_init(NULL, dop); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h new file mode 100644 index 000000000000..cc3981c48296 --- /dev/null +++ b/fs/xfs/libxfs/xfs_defer.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_DEFER_H__ +#define __XFS_DEFER_H__ + +struct xfs_defer_op_type; + +/* + * Save a log intent item and a list of extents, so that we can replay + * whatever action had to happen to the extent list and file the log done + * item. + */ +struct xfs_defer_pending { + const struct xfs_defer_op_type *dfp_type; /* function pointers */ + struct list_head dfp_list; /* pending items */ + bool dfp_committed; /* committed trans? */ + void *dfp_intent; /* log intent item */ + struct list_head dfp_work; /* work items */ + unsigned int dfp_count; /* # extent items */ +}; + +/* + * Header for deferred operation list. + * + * dop_low is used by the allocator to activate the lowspace algorithm - + * when free space is running low the extent allocator may choose to + * allocate an extent from an AG without leaving sufficient space for + * a btree split when inserting the new extent. In this case the allocator + * will enable the lowspace algorithm which is supposed to allow further + * allocations (such as btree splits and newroots) to allocate from + * sequential AGs. In order to avoid locking AGs out of order the lowspace + * algorithm will start searching for free space from AG 0. If the correct + * transaction reservations have been made then this algorithm will eventually + * find all the space it needs. + */ +enum xfs_defer_ops_type { + XFS_DEFER_OPS_TYPE_RMAP, + XFS_DEFER_OPS_TYPE_FREE, + XFS_DEFER_OPS_TYPE_MAX, +}; + +#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ + +struct xfs_defer_ops { + bool dop_committed; /* did any trans commit? 
*/ + bool dop_low; /* alloc in low mode */ + struct list_head dop_intake; /* unlogged pending work */ + struct list_head dop_pending; /* logged pending work */ + + /* relog these inodes with each roll */ + struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES]; +}; + +void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, + struct list_head *h); +int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop, + struct xfs_inode *ip); +void xfs_defer_cancel(struct xfs_defer_ops *dop); +void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); +bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); +int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip); + +/* Description of a deferred type. */ +struct xfs_defer_op_type { + enum xfs_defer_ops_type type; + unsigned int max_items; + void (*abort_intent)(void *); + void *(*create_done)(struct xfs_trans *, void *, unsigned int); + int (*finish_item)(struct xfs_trans *, struct xfs_defer_ops *, + struct list_head *, void *, void **); + void (*finish_cleanup)(struct xfs_trans *, void *, int); + void (*cancel_item)(struct list_head *); + int (*diff_items)(void *, struct list_head *, struct list_head *); + void *(*create_intent)(struct xfs_trans *, uint); + void (*log_item)(struct xfs_trans *, void *, struct list_head *); +}; + +void xfs_defer_init_op_type(const struct xfs_defer_op_type *type); + +#endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index af0f9d171f8a..20a96dd5af7e 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -21,6 +21,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -259,7 +260,7 @@ xfs_dir_createname( struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_fsblock_t *first, /* bmap's firstblock */ - xfs_bmap_free_t *flist, /* bmap's 
freeblock list */ + struct xfs_defer_ops *dfops, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; @@ -286,7 +287,7 @@ xfs_dir_createname( args->inumber = inum; args->dp = dp; args->firstblock = first; - args->flist = flist; + args->dfops = dfops; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; @@ -436,7 +437,7 @@ xfs_dir_removename( struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, /* bmap's firstblock */ - xfs_bmap_free_t *flist, /* bmap's freeblock list */ + struct xfs_defer_ops *dfops, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; @@ -458,7 +459,7 @@ xfs_dir_removename( args->inumber = ino; args->dp = dp; args->firstblock = first; - args->flist = flist; + args->dfops = dfops; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; @@ -498,7 +499,7 @@ xfs_dir_replace( struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_fsblock_t *first, /* bmap's firstblock */ - xfs_bmap_free_t *flist, /* bmap's freeblock list */ + struct xfs_defer_ops *dfops, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; @@ -523,7 +524,7 @@ xfs_dir_replace( args->inumber = inum; args->dp = dp; args->firstblock = first; - args->flist = flist; + args->dfops = dfops; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; @@ -680,7 +681,7 @@ xfs_dir2_shrink_inode( /* Unmap the fsblock(s). 
*/ error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0, - args->firstblock, args->flist, &done); + args->firstblock, args->dfops, &done); if (error) { /* * ENOSPC actually can happen if we're in a removename with no diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index e55353651f5b..becc926c3e3d 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -18,7 +18,7 @@ #ifndef __XFS_DIR2_H__ #define __XFS_DIR2_H__ -struct xfs_bmap_free; +struct xfs_defer_ops; struct xfs_da_args; struct xfs_inode; struct xfs_mount; @@ -129,18 +129,18 @@ extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t inum, xfs_fsblock_t *first, - struct xfs_bmap_free *flist, xfs_extlen_t tot); + struct xfs_defer_ops *dfops, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, - struct xfs_bmap_free *flist, xfs_extlen_t tot); + struct xfs_defer_ops *dfops, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t inum, xfs_fsblock_t *first, - struct xfs_bmap_free *flist, xfs_extlen_t tot); + struct xfs_defer_ops *dfops, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name); diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index adb204d40f22..f814d42c73b2 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -455,8 +455,10 @@ xfs_sb_has_compat_feature( } #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ - 
(XFS_SB_FEAT_RO_COMPAT_FINOBT) + (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ + XFS_SB_FEAT_RO_COMPAT_RMAPBT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -538,6 +540,12 @@ static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp) (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID); } +static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); +} + /* * end of superblock version macros */ @@ -598,10 +606,10 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) /* - * Btree number 0 is bno, 1 is cnt. This value gives the size of the + * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the * arrays below. */ -#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) +#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1) /* * The second word of agf_levels in the first a.g. overlaps the EFS @@ -618,12 +626,10 @@ typedef struct xfs_agf { __be32 agf_seqno; /* sequence # starting from 0 */ __be32 agf_length; /* size in blocks of a.g. */ /* - * Freespace information + * Freespace and rmap information */ __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ - __be32 agf_spare0; /* spare field */ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ - __be32 agf_spare1; /* spare field */ __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ @@ -1308,17 +1314,118 @@ typedef __be32 xfs_inobt_ptr_t; #define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) /* - * The first data block of an AG depends on whether the filesystem was formatted - * with the finobt feature. If so, account for the finobt reserved root btree - * block. 
+ * Reverse mapping btree format definitions + * + * There is a btree for the reverse map per allocation group + */ +#define XFS_RMAP_CRC_MAGIC 0x524d4233 /* 'RMB3' */ + +/* + * Ownership info for an extent. This is used to create reverse-mapping + * entries. */ -#define XFS_PREALLOC_BLOCKS(mp) \ +#define XFS_OWNER_INFO_ATTR_FORK (1 << 0) +#define XFS_OWNER_INFO_BMBT_BLOCK (1 << 1) +struct xfs_owner_info { + uint64_t oi_owner; + xfs_fileoff_t oi_offset; + unsigned int oi_flags; +}; + +/* + * Special owner types. + * + * Seeing as we only support up to 8EB, we have the upper bit of the owner field + * to tell us we have a special owner value. We use these for static metadata + * allocated at mkfs/growfs time, as well as for freespace management metadata. + */ +#define XFS_RMAP_OWN_NULL (-1ULL) /* No owner, for growfs */ +#define XFS_RMAP_OWN_UNKNOWN (-2ULL) /* Unknown owner, for EFI recovery */ +#define XFS_RMAP_OWN_FS (-3ULL) /* static fs metadata */ +#define XFS_RMAP_OWN_LOG (-4ULL) /* static fs metadata */ +#define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */ +#define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */ +#define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */ +#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */ + +#define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63))) + +/* + * Data record structure + */ +struct xfs_rmap_rec { + __be32 rm_startblock; /* extent start block */ + __be32 rm_blockcount; /* extent length */ + __be64 rm_owner; /* extent owner */ + __be64 rm_offset; /* offset within the owner */ +}; + +/* + * rmap btree record + * rm_offset:63 is the attribute fork flag + * rm_offset:62 is the bmbt block flag + * rm_offset:61 is the unwritten extent flag (same as l0:63 in bmbt) + * rm_offset:54-60 aren't used and should be zero + * rm_offset:0-53 is the block offset within the inode + */ +#define XFS_RMAP_OFF_ATTR_FORK ((__uint64_t)1ULL << 63) +#define XFS_RMAP_OFF_BMBT_BLOCK ((__uint64_t)1ULL << 62) +#define 
XFS_RMAP_OFF_UNWRITTEN ((__uint64_t)1ULL << 61) + +#define XFS_RMAP_LEN_MAX ((__uint32_t)~0U) +#define XFS_RMAP_OFF_FLAGS (XFS_RMAP_OFF_ATTR_FORK | \ + XFS_RMAP_OFF_BMBT_BLOCK | \ + XFS_RMAP_OFF_UNWRITTEN) +#define XFS_RMAP_OFF_MASK ((__uint64_t)0x3FFFFFFFFFFFFFULL) + +#define XFS_RMAP_OFF(off) ((off) & XFS_RMAP_OFF_MASK) + +#define XFS_RMAP_IS_BMBT_BLOCK(off) (!!((off) & XFS_RMAP_OFF_BMBT_BLOCK)) +#define XFS_RMAP_IS_ATTR_FORK(off) (!!((off) & XFS_RMAP_OFF_ATTR_FORK)) +#define XFS_RMAP_IS_UNWRITTEN(len) (!!((off) & XFS_RMAP_OFF_UNWRITTEN)) + +#define RMAPBT_STARTBLOCK_BITLEN 32 +#define RMAPBT_BLOCKCOUNT_BITLEN 32 +#define RMAPBT_OWNER_BITLEN 64 +#define RMAPBT_ATTRFLAG_BITLEN 1 +#define RMAPBT_BMBTFLAG_BITLEN 1 +#define RMAPBT_EXNTFLAG_BITLEN 1 +#define RMAPBT_UNUSED_OFFSET_BITLEN 7 +#define RMAPBT_OFFSET_BITLEN 54 + +#define XFS_RMAP_ATTR_FORK (1 << 0) +#define XFS_RMAP_BMBT_BLOCK (1 << 1) +#define XFS_RMAP_UNWRITTEN (1 << 2) +#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \ + XFS_RMAP_BMBT_BLOCK) +#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN) +struct xfs_rmap_irec { + xfs_agblock_t rm_startblock; /* extent start block */ + xfs_extlen_t rm_blockcount; /* extent length */ + __uint64_t rm_owner; /* extent owner */ + __uint64_t rm_offset; /* offset within the owner */ + unsigned int rm_flags; /* state flags */ +}; + +/* + * Key structure + * + * We don't use the length for lookups + */ +struct xfs_rmap_key { + __be32 rm_startblock; /* extent start block */ + __be64 rm_owner; /* extent owner */ + __be64 rm_offset; /* offset within the owner */ +} __attribute__((packed)); + +/* btree pointer type */ +typedef __be32 xfs_rmap_ptr_t; + +#define XFS_RMAP_BLOCK(mp) \ (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? 
\ XFS_FIBT_BLOCK(mp) + 1 : \ XFS_IBT_BLOCK(mp) + 1) - - /* * BMAP Btree format definitions * diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index f5ec9c5ccae6..79455058b752 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -206,6 +206,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ #define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ +#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* Reverse mapping btree */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 4b1e408169a8..51b4e0de1fdc 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" @@ -39,6 +40,7 @@ #include "xfs_icache.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_rmap.h" /* @@ -614,6 +616,7 @@ xfs_ialloc_ag_alloc( args.tp = tp; args.mp = tp->t_mountp; args.fsbno = NULLFSBLOCK; + xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES); #ifdef DEBUG /* randomly do sparse inode allocations */ @@ -1817,19 +1820,21 @@ xfs_difree_inode_chunk( struct xfs_mount *mp, xfs_agnumber_t agno, struct xfs_inobt_rec_incore *rec, - struct xfs_bmap_free *flist) + struct xfs_defer_ops *dfops) { xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino); int startidx, endidx; int nextbit; xfs_agblock_t agbno; int contigblk; + struct xfs_owner_info oinfo; DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno), - mp->m_ialloc_blks); + xfs_bmap_add_free(mp, dfops, 
XFS_AGB_TO_FSB(mp, agno, sagbno), + mp->m_ialloc_blks, &oinfo); return; } @@ -1872,8 +1877,8 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno), - contigblk); + xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, agbno), + contigblk, &oinfo); /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -1889,7 +1894,7 @@ xfs_difree_inobt( struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agino_t agino, - struct xfs_bmap_free *flist, + struct xfs_defer_ops *dfops, struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { @@ -1976,7 +1981,7 @@ xfs_difree_inobt( goto error0; } - xfs_difree_inode_chunk(mp, agno, &rec, flist); + xfs_difree_inode_chunk(mp, agno, &rec, dfops); } else { xic->deleted = 0; @@ -2121,7 +2126,7 @@ int xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ - struct xfs_bmap_free *flist, /* extents to free */ + struct xfs_defer_ops *dfops, /* extents to free */ struct xfs_icluster *xic) /* cluster info if deleted */ { /* REFERENCED */ @@ -2173,7 +2178,7 @@ xfs_difree( /* * Fix up the inode allocation btree. 
*/ - error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec); + error = xfs_difree_inobt(mp, tp, agbp, agino, dfops, xic, &rec); if (error) goto error0; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 6e450df2979b..0bb89669fc07 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -95,7 +95,7 @@ int /* error */ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ - struct xfs_bmap_free *flist, /* extents to free */ + struct xfs_defer_ops *dfops, /* extents to free */ struct xfs_icluster *ifree); /* cluster info if deleted */ /* diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 89c21d771e35..31ca2208c03d 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -32,6 +32,7 @@ #include "xfs_trace.h" #include "xfs_cksum.h" #include "xfs_trans.h" +#include "xfs_rmap.h" STATIC int @@ -96,6 +97,7 @@ xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; + xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INOBT); args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); args.minlen = 1; args.maxlen = 1; @@ -125,8 +127,12 @@ xfs_inobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { + struct xfs_owner_info oinfo; + + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); return xfs_free_extent(cur->bc_tp, - XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1); + XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, + &oinfo); } STATIC int @@ -146,14 +152,6 @@ xfs_inobt_init_key_from_rec( } STATIC void -xfs_inobt_init_rec_from_key( - union xfs_btree_key *key, - union xfs_btree_rec *rec) -{ - rec->inobt.ir_startino = key->inobt.ir_startino; -} - -STATIC void xfs_inobt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) @@ -314,7 +312,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs 
= xfs_inobt_get_maxrecs, .init_key_from_rec = xfs_inobt_init_key_from_rec, - .init_rec_from_key = xfs_inobt_init_rec_from_key, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, @@ -336,7 +333,6 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, .init_key_from_rec = xfs_inobt_init_key_from_rec, - .init_rec_from_key = xfs_inobt_init_rec_from_key, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 9d9559eb2835..4b9769e23c83 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -22,6 +22,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_error.h" #include "xfs_cksum.h" diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index e8f49c029ff0..a6eed43fa7cd 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -110,7 +110,9 @@ static inline uint xlog_get_cycle(char *ptr) #define XLOG_REG_TYPE_COMMIT 18 #define XLOG_REG_TYPE_TRANSHDR 19 #define XLOG_REG_TYPE_ICREATE 20 -#define XLOG_REG_TYPE_MAX 20 +#define XLOG_REG_TYPE_RUI_FORMAT 21 +#define XLOG_REG_TYPE_RUD_FORMAT 22 +#define XLOG_REG_TYPE_MAX 22 /* * Flags to log operation header @@ -227,6 +229,8 @@ typedef struct xfs_trans_header { #define XFS_LI_DQUOT 0x123d #define XFS_LI_QUOTAOFF 0x123e #define XFS_LI_ICREATE 0x123f +#define XFS_LI_RUI 0x1240 /* rmap update intent */ +#define XFS_LI_RUD 0x1241 #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -236,7 +240,9 @@ typedef struct xfs_trans_header { { XFS_LI_BUF, "XFS_LI_BUF" }, \ { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ - { 
XFS_LI_ICREATE, "XFS_LI_ICREATE" } + { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \ + { XFS_LI_RUI, "XFS_LI_RUI" }, \ + { XFS_LI_RUD, "XFS_LI_RUD" } /* * Inode Log Item Format definitions. @@ -604,6 +610,59 @@ typedef struct xfs_efd_log_format_64 { } xfs_efd_log_format_64_t; /* + * RUI/RUD (reverse mapping) log format definitions + */ +struct xfs_map_extent { + __uint64_t me_owner; + __uint64_t me_startblock; + __uint64_t me_startoff; + __uint32_t me_len; + __uint32_t me_flags; +}; + +/* rmap me_flags: upper bits are flags, lower byte is type code */ +#define XFS_RMAP_EXTENT_MAP 1 +#define XFS_RMAP_EXTENT_UNMAP 3 +#define XFS_RMAP_EXTENT_CONVERT 5 +#define XFS_RMAP_EXTENT_ALLOC 7 +#define XFS_RMAP_EXTENT_FREE 8 +#define XFS_RMAP_EXTENT_TYPE_MASK 0xFF + +#define XFS_RMAP_EXTENT_ATTR_FORK (1U << 31) +#define XFS_RMAP_EXTENT_BMBT_BLOCK (1U << 30) +#define XFS_RMAP_EXTENT_UNWRITTEN (1U << 29) + +#define XFS_RMAP_EXTENT_FLAGS (XFS_RMAP_EXTENT_TYPE_MASK | \ + XFS_RMAP_EXTENT_ATTR_FORK | \ + XFS_RMAP_EXTENT_BMBT_BLOCK | \ + XFS_RMAP_EXTENT_UNWRITTEN) + +/* + * This is the structure used to lay out an rui log item in the + * log. The rui_extents field is a variable size array whose + * size is given by rui_nextents. + */ +struct xfs_rui_log_format { + __uint16_t rui_type; /* rui log item type */ + __uint16_t rui_size; /* size of this item */ + __uint32_t rui_nextents; /* # extents to rmap */ + __uint64_t rui_id; /* rui identifier */ + struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */ +}; + +/* + * This is the structure used to lay out an rud log item in the + * log. It has no extent array; rud_rui_id holds the id of the + * corresponding rui. + */ +struct xfs_rud_log_format { + __uint16_t rud_type; /* rud log item type */ + __uint16_t rud_size; /* size of this item */ + __uint32_t __pad; + __uint64_t rud_rui_id; /* id of corresponding rui */ +}; + +/* * Dquot Log format definitions.
* * The first two fields must be the type and size fitting into diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c new file mode 100644 index 000000000000..73d05407d663 --- /dev/null +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -0,0 +1,1399 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_bmap.h" +#include "xfs_inode.h" + +/* + * Lookup the first record less than or equal to [bno, len, owner, offset] + * in the btree given by cur. 
+ */ +int +xfs_rmap_lookup_le( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags, + int *stat) +{ + cur->bc_rec.r.rm_startblock = bno; + cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_owner = owner; + cur->bc_rec.r.rm_offset = offset; + cur->bc_rec.r.rm_flags = flags; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Lookup the record exactly matching [bno, len, owner, offset] + * in the btree given by cur. + */ +int +xfs_rmap_lookup_eq( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags, + int *stat) +{ + cur->bc_rec.r.rm_startblock = bno; + cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_owner = owner; + cur->bc_rec.r.rm_offset = offset; + cur->bc_rec.r.rm_flags = flags; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len, owner, offset]. + * This either works (return 0) or gets an EFSCORRUPTED error. 
+ */ +STATIC int +xfs_rmap_update( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *irec) +{ + union xfs_btree_rec rec; + int error; + + trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno, + irec->rm_startblock, irec->rm_blockcount, + irec->rm_owner, irec->rm_offset, irec->rm_flags); + + rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock); + rec.rmap.rm_blockcount = cpu_to_be32(irec->rm_blockcount); + rec.rmap.rm_owner = cpu_to_be64(irec->rm_owner); + rec.rmap.rm_offset = cpu_to_be64( + xfs_rmap_irec_offset_pack(irec)); + error = xfs_btree_update(cur, &rec); + if (error) + trace_xfs_rmap_update_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +int +xfs_rmap_insert( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int i; + int error; + + trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + len, owner, offset, flags); + + error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 0, done); + + rcur->bc_rec.r.rm_startblock = agbno; + rcur->bc_rec.r.rm_blockcount = len; + rcur->bc_rec.r.rm_owner = owner; + rcur->bc_rec.r.rm_offset = offset; + rcur->bc_rec.r.rm_flags = flags; + error = xfs_btree_insert(rcur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); +done: + if (error) + trace_xfs_rmap_insert_error(rcur->bc_mp, + rcur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +static int +xfs_rmap_btrec_to_irec( + union xfs_btree_rec *rec, + struct xfs_rmap_irec *irec) +{ + irec->rm_flags = 0; + irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); + irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); + irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner); + return xfs_rmap_irec_offset_unpack(be64_to_cpu(rec->rmap.rm_offset), + irec); +} + +/* + * Get the data from the pointed-to 
record. + */ +int +xfs_rmap_get_rec( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || !*stat) + return error; + + return xfs_rmap_btrec_to_irec(rec, irec); +} + +/* + * Find the extent in the rmap btree and remove it. + * + * The record we find should always be an exact match for the extent that we're + * looking for, since we insert them into the btree without modification. + * + * Special Case #1: when growing the filesystem, we "free" an extent when + * growing the last AG. This extent is new space and so it is not tracked as + * used space in the btree. The growfs code will pass in an owner of + * XFS_RMAP_OWN_NULL to indicate that it expected that there is no owner of this + * extent. We verify that - the extent lookup result in a record that does not + * overlap. + * + * Special Case #2: EFIs do not record the owner of the extent, so when + * recovering EFIs from the log we pass in XFS_RMAP_OWN_UNKNOWN to tell the rmap + * btree to ignore the owner (i.e. wildcard match) so we don't trigger + * corruption checks during log recovery. + */ +STATIC int +xfs_rmap_unmap( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + uint64_t ltoff; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags; + bool ignore_off; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & XFS_RMAP_BMBT_BLOCK); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * We should always have a left record because there's a static record + * for the AG headers at rm_startblock == 0 created by mkfs/growfs that + * will not ever be removed from the tree. 
+ */ + error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + + error = xfs_rmap_get_rec(cur, <rec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_private.a.agno, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + ltoff = ltrec.rm_offset; + + /* + * For growfs, the incoming extent must be beyond the left record we + * just found as it is new space and won't be used by anyone. This is + * just a corruption check as we don't actually do anything with this + * extent. Note that we need to use >= instead of > because it might + * be the case that the "left" extent goes all the way to EOFS. + */ + if (owner == XFS_RMAP_OWN_NULL) { + XFS_WANT_CORRUPTED_GOTO(mp, bno >= ltrec.rm_startblock + + ltrec.rm_blockcount, out_error); + goto out_done; + } + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + + /* Make sure the extent we found covers the entire freeing range. */ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); + + /* Make sure the owner matches what we expect to find in the tree. */ + XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner || + XFS_RMAP_NON_INODE_OWNER(owner), out_error); + + /* Check the offset, if necessary. 
*/ + if (!XFS_RMAP_NON_INODE_OWNER(owner)) { + if (flags & XFS_RMAP_BMBT_BLOCK) { + XFS_WANT_CORRUPTED_GOTO(mp, + ltrec.rm_flags & XFS_RMAP_BMBT_BLOCK, + out_error); + } else { + XFS_WANT_CORRUPTED_GOTO(mp, + ltrec.rm_offset <= offset, out_error); + XFS_WANT_CORRUPTED_GOTO(mp, + ltoff + ltrec.rm_blockcount >= offset + len, + out_error); + } + } + + if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { + /* exact match, simply remove the record from rmap tree */ + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + ltrec.rm_startblock, ltrec.rm_blockcount, + ltrec.rm_owner, ltrec.rm_offset, + ltrec.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + } else if (ltrec.rm_startblock == bno) { + /* + * overlap left hand side of extent: move the start, trim the + * length and update the current record. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + ltrec.rm_startblock += len; + ltrec.rm_blockcount -= len; + if (!ignore_off) + ltrec.rm_offset += len; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) { + /* + * overlap right hand side of extent: trim the length and update + * the current record. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + ltrec.rm_blockcount -= len; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else { + + /* + * overlap middle of extent: trim the length of the existing + * record to the length of the new left-extent size, increment + * the insertion position so we can insert a new record + * containing the remaining right-extent space. 
+ * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrr| |rrrr| + * bno len + */ + xfs_extlen_t orig_len = ltrec.rm_blockcount; + + ltrec.rm_blockcount = bno - ltrec.rm_startblock; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto out_error; + + cur->bc_rec.r.rm_startblock = bno + len; + cur->bc_rec.r.rm_blockcount = orig_len - len - + ltrec.rm_blockcount; + cur->bc_rec.r.rm_owner = ltrec.rm_owner; + if (ignore_off) + cur->bc_rec.r.rm_offset = 0; + else + cur->bc_rec.r.rm_offset = offset + len; + cur->bc_rec.r.rm_flags = flags; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + cur->bc_rec.r.rm_startblock, + cur->bc_rec.r.rm_blockcount, + cur->bc_rec.r.rm_owner, + cur->bc_rec.r.rm_offset, + cur->bc_rec.r.rm_flags); + error = xfs_btree_insert(cur, &i); + if (error) + goto out_error; + } + +out_done: + trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* + * Remove a reference to an extent in the rmap btree. + */ +int +xfs_rmap_free( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + int error; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + + error = xfs_rmap_unmap(cur, bno, len, false, oinfo); + if (error) + goto out_error; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +out_error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * A mergeable rmap must have the same owner and the same values for + * the unwritten, attr_fork, and bmbt flags. The startblock and + * offset are checked separately. 
+ */ +static bool +xfs_rmap_is_mergeable( + struct xfs_rmap_irec *irec, + uint64_t owner, + unsigned int flags) +{ + if (irec->rm_owner == XFS_RMAP_OWN_NULL) + return false; + if (irec->rm_owner != owner) + return false; + if ((flags & XFS_RMAP_UNWRITTEN) ^ + (irec->rm_flags & XFS_RMAP_UNWRITTEN)) + return false; + if ((flags & XFS_RMAP_ATTR_FORK) ^ + (irec->rm_flags & XFS_RMAP_ATTR_FORK)) + return false; + if ((flags & XFS_RMAP_BMBT_BLOCK) ^ + (irec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + return false; + return true; +} + +/* + * When we allocate a new block, the first thing we do is add a reference to + * the extent in the rmap btree. This takes the form of a [agbno, length, + * owner, offset] record. Flags are encoded in the high bits of the offset + * field. + */ +STATIC int +xfs_rmap_map( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int have_lt; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags = 0; + bool ignore_off; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ASSERT(owner != 0); + ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & XFS_RMAP_BMBT_BLOCK); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * For the initial lookup, look for an exact match or the left-adjacent + * record for our insertion point. This will also give us the record for + * start block contiguity tests. 
+ */ + error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + &have_lt); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error); + + error = xfs_rmap_get_rec(cur, <rec, &have_lt); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error); + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_private.a.agno, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + + if (!xfs_rmap_is_mergeable(<rec, owner, flags)) + have_lt = 0; + + XFS_WANT_CORRUPTED_GOTO(mp, + have_lt == 0 || + ltrec.rm_startblock + ltrec.rm_blockcount <= bno, out_error); + + /* + * Increment the cursor to see if we have a right-adjacent record to our + * insertion point. This will give us the record for end block + * contiguity tests. + */ + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + goto out_error; + if (have_gt) { + error = xfs_rmap_get_rec(cur, >rec, &have_gt); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); + XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= gtrec.rm_startblock, + out_error); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (!xfs_rmap_is_mergeable(>rec, owner, flags)) + have_gt = 0; + } + + /* + * Note: cursor currently points one record to the right of ltrec, even + * if there is no record in the tree to the right. + */ + if (have_lt && + ltrec.rm_startblock + ltrec.rm_blockcount == bno && + (ignore_off || ltrec.rm_offset + ltrec.rm_blockcount == offset)) { + /* + * left edge contiguous, merge into left record. 
+ * + * ltbno ltlen + * orig: |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + ltrec.rm_blockcount += len; + if (have_gt && + bno + len == gtrec.rm_startblock && + (ignore_off || offset + len == gtrec.rm_offset) && + (unsigned long)ltrec.rm_blockcount + len + + gtrec.rm_blockcount <= XFS_RMAP_LEN_MAX) { + /* + * right edge also contiguous, delete right record + * and merge into left record. + * + * ltbno ltlen gtbno gtlen + * orig: |ooooooooo| |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| + */ + ltrec.rm_blockcount += gtrec.rm_blockcount; + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + gtrec.rm_startblock, + gtrec.rm_blockcount, + gtrec.rm_owner, + gtrec.rm_offset, + gtrec.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + } + + /* point the cursor back to the left record and update */ + error = xfs_btree_decrement(cur, 0, &have_gt); + if (error) + goto out_error; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else if (have_gt && + bno + len == gtrec.rm_startblock && + (ignore_off || offset + len == gtrec.rm_offset)) { + /* + * right edge contiguous, merge into right record. + * + * gtbno gtlen + * Orig: |ooooooooo| + * adding: |aaaaaaaaa| + * Result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + gtrec.rm_startblock = bno; + gtrec.rm_blockcount += len; + if (!ignore_off) + gtrec.rm_offset = offset; + error = xfs_rmap_update(cur, >rec); + if (error) + goto out_error; + } else { + /* + * no contiguous edge with identical owner, insert + * new record at current cursor position. 
+ */ + cur->bc_rec.r.rm_startblock = bno; + cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_owner = owner; + cur->bc_rec.r.rm_offset = offset; + cur->bc_rec.r.rm_flags = flags; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + owner, offset, flags); + error = xfs_btree_insert(cur, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + } + + trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* + * Add a reference to an extent in the rmap btree. + */ +int +xfs_rmap_alloc( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + int error; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + error = xfs_rmap_map(cur, bno, len, false, oinfo); + if (error) + goto out_error; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +out_error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +#define RMAP_LEFT_CONTIG (1 << 0) +#define RMAP_RIGHT_CONTIG (1 << 1) +#define RMAP_LEFT_FILLING (1 << 2) +#define RMAP_RIGHT_FILLING (1 << 3) +#define RMAP_LEFT_VALID (1 << 6) +#define RMAP_RIGHT_VALID (1 << 7) + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] +#define NEW r[3] + +/* + * Convert an unwritten extent to a real extent or vice versa. + * Does not handle overlapping extents. 
+ */ +STATIC int +xfs_rmap_convert( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec r[4]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + /* new is 3 */ + uint64_t owner; + uint64_t offset; + uint64_t new_endoff; + unsigned int oldext; + unsigned int newext; + unsigned int flags = 0; + int i; + int state = 0; + int error; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); + oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; + new_endoff = offset + len; + trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * For the initial lookup, look for an exact match or the left-adjacent + * record for our insertion point. This will also give us the record for + * start block contiguity tests. + */ + error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + + error = xfs_rmap_get_rec(cur, &PREV, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_private.a.agno, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); + + ASSERT(PREV.rm_offset <= offset); + ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); + ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext); + newext = ~oldext & XFS_RMAP_UNWRITTEN; + + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. 
+ */ + if (PREV.rm_offset == offset) + state |= RMAP_LEFT_FILLING; + if (PREV.rm_offset + PREV.rm_blockcount == new_endoff) + state |= RMAP_RIGHT_FILLING; + + /* + * Decrement the cursor to see if we have a left-adjacent record to our + * insertion point. This will give us the record for end block + * contiguity tests. + */ + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + if (i) { + state |= RMAP_LEFT_VALID; + error = xfs_rmap_get_rec(cur, &LEFT, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, + LEFT.rm_startblock + LEFT.rm_blockcount <= bno, + done); + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, LEFT.rm_startblock, + LEFT.rm_blockcount, LEFT.rm_owner, + LEFT.rm_offset, LEFT.rm_flags); + if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && + LEFT.rm_offset + LEFT.rm_blockcount == offset && + xfs_rmap_is_mergeable(&LEFT, owner, newext)) + state |= RMAP_LEFT_CONTIG; + } + + /* + * Increment the cursor to see if we have a right-adjacent record to our + * insertion point. This will give us the record for end block + * contiguity tests. 
+ */ + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + if (i) { + state |= RMAP_RIGHT_VALID; + error = xfs_rmap_get_rec(cur, &RIGHT, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, + done); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (bno + len == RIGHT.rm_startblock && + offset + len == RIGHT.rm_offset && + xfs_rmap_is_mergeable(&RIGHT, owner, newext)) + state |= RMAP_RIGHT_CONTIG; + } + + /* check that left + prev + right is not too long */ + if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) == + (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) && + (unsigned long)LEFT.rm_blockcount + len + + RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) + state &= ~RMAP_RIGHT_CONTIG; + + trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + _RET_IP_); + + /* reset the cursor back to PREV */ + error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) { + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. 
+ */ + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + RIGHT.rm_startblock, RIGHT.rm_blockcount, + RIGHT.rm_owner, RIGHT.rm_offset, + RIGHT.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + PREV.rm_startblock, PREV.rm_blockcount, + PREV.rm_owner, PREV.rm_offset, + PREV.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW = LEFT; + NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + PREV.rm_startblock, PREV.rm_blockcount, + PREV.rm_owner, PREV.rm_offset, + PREV.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW = LEFT; + NEW.rm_blockcount += PREV.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. 
+ */ + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + RIGHT.rm_startblock, RIGHT.rm_blockcount, + RIGHT.rm_owner, RIGHT.rm_offset, + RIGHT.rm_flags); + error = xfs_btree_delete(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW = PREV; + NEW.rm_blockcount = len + RIGHT.rm_blockcount; + NEW.rm_flags = newext; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING: + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + NEW = PREV; + NEW.rm_flags = newext; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. + */ + NEW = PREV; + NEW.rm_offset += len; + NEW.rm_startblock += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto done; + NEW = LEFT; + NEW.rm_blockcount += len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. 
+ */ + NEW = PREV; + NEW.rm_startblock += len; + NEW.rm_offset += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + NEW.rm_startblock = bno; + NEW.rm_owner = owner; + NEW.rm_offset = offset; + NEW.rm_blockcount = len; + NEW.rm_flags = newext; + cur->bc_rec.r = NEW; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + len, owner, offset, newext); + error = xfs_btree_insert(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + break; + + case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. + */ + NEW = PREV; + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto done; + NEW = RIGHT; + NEW.rm_offset = offset; + NEW.rm_startblock = bno; + NEW.rm_blockcount += len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_RIGHT_FILLING: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. + */ + NEW = PREV; + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset, + oldext, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + NEW.rm_startblock = bno; + NEW.rm_owner = owner; + NEW.rm_offset = offset; + NEW.rm_blockcount = len; + NEW.rm_flags = newext; + cur->bc_rec.r = NEW; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + len, owner, offset, newext); + error = xfs_btree_insert(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. 
+ */ + /* new right extent - oldext */ + NEW.rm_startblock = bno + len; + NEW.rm_owner = owner; + NEW.rm_offset = new_endoff; + NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount - + new_endoff; + NEW.rm_flags = PREV.rm_flags; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + /* new left extent - oldext */ + NEW = PREV; + NEW.rm_blockcount = offset - PREV.rm_offset; + cur->bc_rec.r = NEW; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + NEW.rm_startblock, NEW.rm_blockcount, + NEW.rm_owner, NEW.rm_offset, + NEW.rm_flags); + error = xfs_btree_insert(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + /* + * Reset the cursor to the position of the new extent + * we are about to insert as we can't trust it after + * the previous insert. + */ + error = xfs_rmap_lookup_eq(cur, bno, len, owner, offset, + oldext, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + /* new middle extent - newext */ + cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; + cur->bc_rec.r.rm_flags |= newext; + trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + owner, offset, newext); + error = xfs_btree_insert(cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_CONTIG: + case RMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. 
+ */ + ASSERT(0); + } + + trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +done: + if (error) + trace_xfs_rmap_convert_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +#undef NEW +#undef LEFT +#undef RIGHT +#undef PREV + +struct xfs_rmap_query_range_info { + xfs_rmap_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_rmap_query_range_helper( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_rmap_query_range_info *query = priv; + struct xfs_rmap_irec irec; + int error; + + error = xfs_rmap_btrec_to_irec(rec, &irec); + if (error) + return error; + return query->fn(cur, &irec, query->priv); +} + +/* Find all rmaps between two keys. */ +int +xfs_rmap_query_range( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *low_rec, + struct xfs_rmap_irec *high_rec, + xfs_rmap_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_brec; + union xfs_btree_irec high_brec; + struct xfs_rmap_query_range_info query; + + low_brec.r = *low_rec; + high_brec.r = *high_rec; + query.priv = priv; + query.fn = fn; + return xfs_btree_query_range(cur, &low_brec, &high_brec, + xfs_rmap_query_range_helper, &query); +} + +/* Clean up after calling xfs_rmap_finish_one. */ +void +xfs_rmap_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + struct xfs_buf *agbp; + + if (rcur == NULL) + return; + agbp = rcur->bc_private.a.agbp; + xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (error) + xfs_trans_brelse(tp, agbp); +} + +/* + * Process one of the deferred rmap operations. We pass back the + * btree cursor to maintain our lock on the rmapbt between calls. + * This saves time and eliminates a buffer deadlock between the + * superblock and the AGF because we'll always grab them in the same + * order. 
+ */ +int +xfs_rmap_finish_one( + struct xfs_trans *tp, + enum xfs_rmap_intent_type type, + __uint64_t owner, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state, + struct xfs_btree_cur **pcur) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *rcur; + struct xfs_buf *agbp = NULL; + int error = 0; + xfs_agnumber_t agno; + struct xfs_owner_info oinfo; + xfs_agblock_t bno; + bool unwritten; + + agno = XFS_FSB_TO_AGNO(mp, startblock); + ASSERT(agno != NULLAGNUMBER); + bno = XFS_FSB_TO_AGBNO(mp, startblock); + + trace_xfs_rmap_deferred(mp, agno, type, bno, owner, whichfork, + startoff, blockcount, state); + + if (XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_RMAP_FINISH_ONE, + XFS_RANDOM_RMAP_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor AG doesn't match + * the startblock, get one now. + */ + rcur = *pcur; + if (rcur != NULL && rcur->bc_private.a.agno != agno) { + xfs_rmap_finish_one_cleanup(tp, rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + /* + * Refresh the freelist before we start changing the + * rmapbt, because a shape change could cause us to + * allocate blocks. 
+ */ + error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + if (error) + return error; + if (!agbp) + return -EFSCORRUPTED; + + rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + if (!rcur) { + error = -ENOMEM; + goto out_cur; + } + } + *pcur = rcur; + + xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff); + unwritten = state == XFS_EXT_UNWRITTEN; + bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock); + + switch (type) { + case XFS_RMAP_ALLOC: + case XFS_RMAP_MAP: + error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); + break; + case XFS_RMAP_FREE: + case XFS_RMAP_UNMAP: + error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, + &oinfo); + break; + case XFS_RMAP_CONVERT: + error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, + &oinfo); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + return error; + +out_cur: + xfs_trans_brelse(tp, agbp); + + return error; +} + +/* + * Don't defer an rmap if we aren't an rmap filesystem. + */ +static bool +xfs_rmap_update_is_needed( + struct xfs_mount *mp) +{ + return xfs_sb_version_hasrmapbt(&mp->m_sb); +} + +/* + * Record a rmap intent; the list is kept sorted first by AG and then by + * increasing age. + */ +static int +__xfs_rmap_add( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + enum xfs_rmap_intent_type type, + __uint64_t owner, + int whichfork, + struct xfs_bmbt_irec *bmap) +{ + struct xfs_rmap_intent *ri; + + trace_xfs_rmap_defer(mp, XFS_FSB_TO_AGNO(mp, bmap->br_startblock), + type, + XFS_FSB_TO_AGBNO(mp, bmap->br_startblock), + owner, whichfork, + bmap->br_startoff, + bmap->br_blockcount, + bmap->br_state); + + ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS); + INIT_LIST_HEAD(&ri->ri_list); + ri->ri_type = type; + ri->ri_owner = owner; + ri->ri_whichfork = whichfork; + ri->ri_bmap = *bmap; + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); + return 0; +} + +/* Map an extent into a file. 
*/ +int +xfs_rmap_map_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_rmap_update_is_needed(mp)) + return 0; + + return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino, + whichfork, PREV); +} + +/* Unmap an extent out of a file. */ +int +xfs_rmap_unmap_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_rmap_update_is_needed(mp)) + return 0; + + return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino, + whichfork, PREV); +} + +/* Convert a data fork extent from unwritten to real or vice versa. */ +int +xfs_rmap_convert_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_rmap_update_is_needed(mp)) + return 0; + + return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino, + whichfork, PREV); +} + +/* Schedule the creation of an rmap for non-file data. */ +int +xfs_rmap_alloc_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + __uint64_t owner) +{ + struct xfs_bmbt_irec bmap; + + if (!xfs_rmap_update_is_needed(mp)) + return 0; + + bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); + bmap.br_blockcount = len; + bmap.br_startoff = 0; + bmap.br_state = XFS_EXT_NORM; + + return __xfs_rmap_add(mp, dfops, XFS_RMAP_ALLOC, owner, + XFS_DATA_FORK, &bmap); +} + +/* Schedule the deletion of an rmap for non-file data. 
*/ +int +xfs_rmap_free_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + __uint64_t owner) +{ + struct xfs_bmbt_irec bmap; + + if (!xfs_rmap_update_is_needed(mp)) + return 0; + + bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); + bmap.br_blockcount = len; + bmap.br_startoff = 0; + bmap.br_state = XFS_EXT_NORM; + + return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner, + XFS_DATA_FORK, &bmap); +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h new file mode 100644 index 000000000000..71cf99a4acba --- /dev/null +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +#ifndef __XFS_RMAP_H__ +#define __XFS_RMAP_H__ + +static inline void +xfs_rmap_ag_owner( + struct xfs_owner_info *oi, + uint64_t owner) +{ + oi->oi_owner = owner; + oi->oi_offset = 0; + oi->oi_flags = 0; +} + +static inline void +xfs_rmap_ino_bmbt_owner( + struct xfs_owner_info *oi, + xfs_ino_t ino, + int whichfork) +{ + oi->oi_owner = ino; + oi->oi_offset = 0; + oi->oi_flags = XFS_OWNER_INFO_BMBT_BLOCK; + if (whichfork == XFS_ATTR_FORK) + oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK; +} + +static inline void +xfs_rmap_ino_owner( + struct xfs_owner_info *oi, + xfs_ino_t ino, + int whichfork, + xfs_fileoff_t offset) +{ + oi->oi_owner = ino; + oi->oi_offset = offset; + oi->oi_flags = 0; + if (whichfork == XFS_ATTR_FORK) + oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK; +} + +static inline void +xfs_rmap_skip_owner_update( + struct xfs_owner_info *oi) +{ + oi->oi_owner = XFS_RMAP_OWN_UNKNOWN; +} + +/* Reverse mapping functions. */ + +struct xfs_buf; + +static inline __u64 +xfs_rmap_irec_offset_pack( + const struct xfs_rmap_irec *irec) +{ + __u64 x; + + x = XFS_RMAP_OFF(irec->rm_offset); + if (irec->rm_flags & XFS_RMAP_ATTR_FORK) + x |= XFS_RMAP_OFF_ATTR_FORK; + if (irec->rm_flags & XFS_RMAP_BMBT_BLOCK) + x |= XFS_RMAP_OFF_BMBT_BLOCK; + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) + x |= XFS_RMAP_OFF_UNWRITTEN; + return x; +} + +static inline int +xfs_rmap_irec_offset_unpack( + __u64 offset, + struct xfs_rmap_irec *irec) +{ + if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) + return -EFSCORRUPTED; + irec->rm_offset = XFS_RMAP_OFF(offset); + if (offset & XFS_RMAP_OFF_ATTR_FORK) + irec->rm_flags |= XFS_RMAP_ATTR_FORK; + if (offset & XFS_RMAP_OFF_BMBT_BLOCK) + irec->rm_flags |= XFS_RMAP_BMBT_BLOCK; + if (offset & XFS_RMAP_OFF_UNWRITTEN) + irec->rm_flags |= XFS_RMAP_UNWRITTEN; + return 0; +} + +static inline void +xfs_owner_info_unpack( + struct xfs_owner_info *oinfo, + uint64_t *owner, + uint64_t *offset, + unsigned int *flags) +{ + unsigned int r = 0; + + *owner = 
oinfo->oi_owner; + *offset = oinfo->oi_offset; + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) + r |= XFS_RMAP_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) + r |= XFS_RMAP_BMBT_BLOCK; + *flags = r; +} + +static inline void +xfs_owner_info_pack( + struct xfs_owner_info *oinfo, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + oinfo->oi_owner = owner; + oinfo->oi_offset = XFS_RMAP_OFF(offset); + oinfo->oi_flags = 0; + if (flags & XFS_RMAP_ATTR_FORK) + oinfo->oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (flags & XFS_RMAP_BMBT_BLOCK) + oinfo->oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; +} + +int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp, + xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + struct xfs_owner_info *oinfo); +int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, + xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + struct xfs_owner_info *oinfo); + +int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, uint64_t owner, uint64_t offset, + unsigned int flags, int *stat); +int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, uint64_t owner, uint64_t offset, + unsigned int flags, int *stat); +int xfs_rmap_insert(struct xfs_btree_cur *rcur, xfs_agblock_t agbno, + xfs_extlen_t len, uint64_t owner, uint64_t offset, + unsigned int flags); +int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec, + int *stat); + +typedef int (*xfs_rmap_query_range_fn)( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv); + +int xfs_rmap_query_range(struct xfs_btree_cur *cur, + struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec, + xfs_rmap_query_range_fn fn, void *priv); + +enum xfs_rmap_intent_type { + XFS_RMAP_MAP, + XFS_RMAP_MAP_SHARED, + XFS_RMAP_UNMAP, + XFS_RMAP_UNMAP_SHARED, + XFS_RMAP_CONVERT, + XFS_RMAP_CONVERT_SHARED, + XFS_RMAP_ALLOC, + XFS_RMAP_FREE, +}; + +struct xfs_rmap_intent { + 
struct list_head ri_list; + enum xfs_rmap_intent_type ri_type; + __uint64_t ri_owner; + int ri_whichfork; + struct xfs_bmbt_irec ri_bmap; +}; + +/* functions for updating the rmapbt based on bmbt map/unmap operations */ +int xfs_rmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *imap); +int xfs_rmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *imap); +int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *imap); +int xfs_rmap_alloc_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + __uint64_t owner); +int xfs_rmap_free_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + __uint64_t owner); + +void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, + struct xfs_btree_cur *rcur, int error); +int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, + __uint64_t owner, int whichfork, xfs_fileoff_t startoff, + xfs_fsblock_t startblock, xfs_filblks_t blockcount, + xfs_exntst_t state, struct xfs_btree_cur **pcur); + +#endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c new file mode 100644 index 000000000000..bc1faebc84ec --- /dev/null +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" + +/* + * Reverse map btree. + * + * This is a per-ag tree used to track the owner(s) of a given extent. With + * reflink it is possible for there to be multiple owners, which is a departure + * from classic XFS. Owner records for data extents are inserted when the + * extent is mapped and removed when an extent is unmapped. Owner records for + * all other block types (i.e. metadata) are inserted when an extent is + * allocated and removed when an extent is freed. There can only be one owner + * of a metadata extent, usually an inode or some other metadata structure like + * an AG btree. + * + * The rmap btree is part of the free space management, so blocks for the tree + * are sourced from the agfl. Hence we need transaction reservation support for + * this tree so that the freelist is always large enough. This also impacts on + * the minimum space we need to leave free in the AG. + * + * The tree is ordered by [ag block, owner, offset]. 
This is a large key size, + * but it is the only way to enforce unique keys when a block can be owned by + * multiple files at any offset. There's no need to order/search by extent + * size for online updating/management of the tree. It is intended that most + * reverse lookups will be to find the owner(s) of a particular block, or to + * try to recover tree and file data from corrupt primary metadata. + */ + +static struct xfs_btree_cur * +xfs_rmapbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno); +} + +STATIC void +xfs_rmapbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + int btnum = cur->bc_btnum; + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_roots[btnum] = ptr->s; + be32_add_cpu(&agf->agf_levels[btnum], inc); + pag->pagf_levels[btnum] += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); +} + +STATIC int +xfs_rmapbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + int error; + xfs_agblock_t bno; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* Allocate the new block from the freelist. If we can't, give up. 
*/ + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &bno, 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + bno, 1); + if (bno == NULLAGBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, + false); + + xfs_trans_agbtree_delta(cur->bc_tp, 1); + new->s = cpu_to_be32(bno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +STATIC int +xfs_rmapbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agblock_t bno; + int error; + + bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + bno, 1); + error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + if (error) + return error; + + xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + xfs_trans_agbtree_delta(cur->bc_tp, -1); + + return 0; +} + +STATIC int +xfs_rmapbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_rmap_mnr[level != 0]; +} + +STATIC int +xfs_rmapbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_rmap_mxr[level != 0]; +} + +STATIC void +xfs_rmapbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->rmap.rm_startblock = rec->rmap.rm_startblock; + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = rec->rmap.rm_offset; +} + +/* + * The high key for a reverse mapping record can be computed by shifting + * the startblock and offset to the highest value that would still map + * to that record. 
In practice this means that we add blockcount-1 to + * the startblock for all records, and if the record is for a data/attr + * fork mapping, we add blockcount-1 to the offset too. + */ +STATIC void +xfs_rmapbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __uint64_t off; + int adj; + + adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; + + key->rmap.rm_startblock = rec->rmap.rm_startblock; + be32_add_cpu(&key->rmap.rm_startblock, adj); + key->rmap.rm_owner = rec->rmap.rm_owner; + key->rmap.rm_offset = rec->rmap.rm_offset; + if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || + XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) + return; + off = be64_to_cpu(key->rmap.rm_offset); + off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); + key->rmap.rm_offset = cpu_to_be64(off); +} + +STATIC void +xfs_rmapbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); + rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); + rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); + rec->rmap.rm_offset = cpu_to_be64( + xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); +} + +STATIC void +xfs_rmapbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_roots[cur->bc_btnum] != 0); + + ptr->s = agf->agf_roots[cur->bc_btnum]; +} + +STATIC __int64_t +xfs_rmapbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + struct xfs_rmap_irec *rec = &cur->bc_rec.r; + struct xfs_rmap_key *kp = &key->rmap; + __u64 x, y; + __int64_t d; + + d = (__int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; + if (d) + return d; + + x = be64_to_cpu(kp->rm_owner); + y = rec->rm_owner; + if (x > y) + return 1; + else if (y > x) + return -1; + 
+ x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); + y = rec->rm_offset; + if (x > y) + return 1; + else if (y > x) + return -1; + return 0; +} + +STATIC __int64_t +xfs_rmapbt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + struct xfs_rmap_key *kp1 = &k1->rmap; + struct xfs_rmap_key *kp2 = &k2->rmap; + __int64_t d; + __u64 x, y; + + d = (__int64_t)be32_to_cpu(kp1->rm_startblock) - + be32_to_cpu(kp2->rm_startblock); + if (d) + return d; + + x = be64_to_cpu(kp1->rm_owner); + y = be64_to_cpu(kp2->rm_owner); + if (x > y) + return 1; + else if (y > x) + return -1; + + x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); + y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); + if (x > y) + return 1; + else if (y > x) + return -1; + return 0; +} + +static bool +xfs_rmapbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. 
+ */ + if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) + return false; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return false; + if (!xfs_btree_sblock_v5hdr_verify(bp)) + return false; + + level = be16_to_cpu(block->bb_level); + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) + return false; + } else if (level >= mp->m_rmap_maxlevels) + return false; + + return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); +} + +static void +xfs_rmapbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_rmapbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +static void +xfs_rmapbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_rmapbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_rmapbt_buf_ops = { + .name = "xfs_rmapbt", + .verify_read = xfs_rmapbt_read_verify, + .verify_write = xfs_rmapbt_write_verify, +}; + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_rmapbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + __uint32_t x; + __uint32_t y; + __uint64_t a; + __uint64_t b; + + x = be32_to_cpu(k1->rmap.rm_startblock); + y = be32_to_cpu(k2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(k1->rmap.rm_owner); + b = be64_to_cpu(k2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); + b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} + +STATIC int +xfs_rmapbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + __uint32_t x; + 
__uint32_t y; + __uint64_t a; + __uint64_t b; + + x = be32_to_cpu(r1->rmap.rm_startblock); + y = be32_to_cpu(r2->rmap.rm_startblock); + if (x < y) + return 1; + else if (x > y) + return 0; + a = be64_to_cpu(r1->rmap.rm_owner); + b = be64_to_cpu(r2->rmap.rm_owner); + if (a < b) + return 1; + else if (a > b) + return 0; + a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); + b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); + if (a <= b) + return 1; + return 0; +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_rmapbt_ops = { + .rec_len = sizeof(struct xfs_rmap_rec), + .key_len = 2 * sizeof(struct xfs_rmap_key), + + .dup_cursor = xfs_rmapbt_dup_cursor, + .set_root = xfs_rmapbt_set_root, + .alloc_block = xfs_rmapbt_alloc_block, + .free_block = xfs_rmapbt_free_block, + .get_minrecs = xfs_rmapbt_get_minrecs, + .get_maxrecs = xfs_rmapbt_get_maxrecs, + .init_key_from_rec = xfs_rmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, + .key_diff = xfs_rmapbt_key_diff, + .buf_ops = &xfs_rmapbt_buf_ops, + .diff_two_keys = xfs_rmapbt_diff_two_keys, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_rmapbt_keys_inorder, + .recs_inorder = xfs_rmapbt_recs_inorder, +#endif +}; + +/* + * Allocate a new allocation btree cursor. + */ +struct xfs_btree_cur * +xfs_rmapbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur->bc_tp = tp; + cur->bc_mp = mp; + /* Overlapping btree; 2 keys per pointer. 
*/ + cur->bc_btnum = XFS_BTNUM_RMAP; + cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ops = &xfs_rmapbt_ops; + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + return cur; +} + +/* + * Calculate number of records in an rmap btree block. + */ +int +xfs_rmapbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_RMAP_BLOCK_LEN; + + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); +} + +/* Compute the maximum height of an rmap btree. */ +void +xfs_rmapbt_compute_maxlevels( + struct xfs_mount *mp) +{ + mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_rmap_mnr, mp->m_sb.sb_agblocks); +} diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h new file mode 100644 index 000000000000..e73a55357dab --- /dev/null +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_RMAP_BTREE_H__ +#define __XFS_RMAP_BTREE_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* rmaps only exist on crc enabled filesystems */ +#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_RMAP_REC_ADDR(block, index) \ + ((struct xfs_rmap_rec *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct xfs_rmap_rec)))) + +#define XFS_RMAP_KEY_ADDR(block, index) \ + ((struct xfs_rmap_key *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + ((index) - 1) * 2 * sizeof(struct xfs_rmap_key))) + +#define XFS_RMAP_HIGH_KEY_ADDR(block, index) \ + ((struct xfs_rmap_key *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + sizeof(struct xfs_rmap_key) + \ + ((index) - 1) * 2 * sizeof(struct xfs_rmap_key))) + +#define XFS_RMAP_PTR_ADDR(block, index, maxrecs) \ + ((xfs_rmap_ptr_t *) \ + ((char *)(block) + XFS_RMAP_BLOCK_LEN + \ + (maxrecs) * 2 * sizeof(struct xfs_rmap_key) + \ + ((index) - 1) * sizeof(xfs_rmap_ptr_t))) + +struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *bp, + xfs_agnumber_t agno); +int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf); +extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); + +#endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 12ca86778e02..0e3d4f5ec33c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include 
"xfs_alloc.h" @@ -36,6 +37,7 @@ #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_log.h" +#include "xfs_rmap_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -729,6 +731,11 @@ xfs_sb_mount_common( mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; + mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; + mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); @@ -738,6 +745,8 @@ xfs_sb_mount_common( mp->m_ialloc_min_blks = sbp->sb_spino_align; else mp->m_ialloc_min_blks = mp->m_ialloc_blks; + mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); } /* diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 16002b5ec4eb..0c5b30bd884c 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -38,6 +38,7 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; @@ -116,6 +117,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_INO_BTREE_REF 3 #define XFS_ALLOC_BTREE_REF 2 #define XFS_BMAP_BTREE_REF 2 +#define XFS_RMAP_BTREE_REF 2 #define XFS_DIR_BTREE_REF 2 #define XFS_INO_REF 2 #define XFS_ATTR_BTREE_REF 1 diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 68cb1e7bf2bb..301ef2f4dbd6 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ 
b/fs/xfs/libxfs/xfs_trans_resv.c @@ -64,6 +64,30 @@ xfs_calc_buf_res( } /* + * Per-extent log reservation for the btree changes involved in freeing or + * allocating an extent. In classic XFS there were two trees that will be + * modified (bnobt + cntbt). With rmap enabled, there are three trees + * (rmapbt). The number of blocks reserved is based on the formula: + * + * num trees * ((2 blocks/level * max depth) - 1) + * + * Keep in mind that max depth is calculated separately for each type of tree. + */ +static uint +xfs_allocfree_log_count( + struct xfs_mount *mp, + uint num_ops) +{ + uint blocks; + + blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); + + return blocks; +} + +/* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into * the amount of space they use on disk. @@ -126,7 +150,7 @@ xfs_calc_inode_res( */ STATIC uint xfs_calc_finobt_res( - struct xfs_mount *mp, + struct xfs_mount *mp, int alloc, int modify) { @@ -137,7 +161,7 @@ xfs_calc_finobt_res( res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); if (alloc) - res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + res += xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)); if (modify) res += (uint)XFS_FSB_TO_B(mp, 1); @@ -153,9 +177,9 @@ xfs_calc_finobt_res( * item logged to try to account for the overhead of the transaction mechanism. * * Note: Most of the reservations underestimate the number of allocation - * groups into which they could free extents in the xfs_bmap_finish() call. + * groups into which they could free extents in the xfs_defer_finish() call. * This is because the number in the worst case is quite high and quite - * unusual. In order to fix this we need to change xfs_bmap_finish() to free + * unusual. 
In order to fix this we need to change xfs_defer_finish() to free * extents in only a single AG at a time. This will require changes to the * EFI code as well, however, so that the EFI for the extents not freed is * logged again in each transaction. See SGI PV #261917. @@ -188,10 +212,10 @@ xfs_calc_write_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -217,10 +241,10 @@ xfs_calc_itruncate_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(5, 0) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0))); @@ -247,7 +271,7 @@ xfs_calc_rename_reservation( xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3), XFS_FSB_TO_B(mp, 1)))); } @@ -286,7 +310,7 @@ xfs_calc_link_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)))); } @@ -324,7 +348,7 @@ xfs_calc_remove_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - 
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -371,7 +395,7 @@ xfs_calc_create_resv_alloc( mp->m_sb.sb_sectsize + xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -399,7 +423,7 @@ xfs_calc_icreate_resv_alloc( return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)) + xfs_calc_finobt_res(mp, 0, 0); } @@ -483,7 +507,7 @@ xfs_calc_ifree_reservation( xfs_calc_buf_res(1, 0) + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)) + xfs_calc_finobt_res(mp, 0, 1); } @@ -513,7 +537,7 @@ xfs_calc_growdata_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -535,7 +559,7 @@ xfs_calc_growrtalloc_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -611,7 +635,7 @@ xfs_calc_addafork_reservation( xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -634,7 +658,7 @@ xfs_calc_attrinval_reservation( 
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), XFS_FSB_TO_B(mp, 1)))); } @@ -701,7 +725,7 @@ xfs_calc_attrrm_reservation( XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 797815012c0e..0eb46ed6d404 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -68,16 +68,6 @@ struct xfs_trans_resv { #define M_RES(mp) (&(mp)->m_resv) /* - * Per-extent log reservation for the allocation btree changes - * involved in freeing or allocating an extent. - * 2 trees * (2 blocks/level * max depth - 1) * block size - */ -#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1))) -#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ - ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1))) - -/* * Per-directory log reservation for any directory change. 
* dir blocks: (1 btree block per level + data block + free block) * dblock size * bmap btree: (levels + 2) * max depth * block size diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b79dc66b2ecd..3d503647f26b 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -108,8 +108,8 @@ typedef enum { } xfs_lookup_t; typedef enum { - XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, - XFS_BTNUM_FINOi, XFS_BTNUM_MAX + XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, + XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX } xfs_btnum_t; struct xfs_name { diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index cd4a850564f2..4ece4f2ffc72 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -25,6 +25,7 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_trans.h" @@ -40,6 +41,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" +#include "xfs_rmap_btree.h" /* Kernel only BMAP related definitions and functions */ @@ -79,95 +81,6 @@ xfs_zero_extent( GFP_NOFS, true); } -/* Sort bmap items by AG. */ -static int -xfs_bmap_free_list_cmp( - void *priv, - struct list_head *a, - struct list_head *b) -{ - struct xfs_mount *mp = priv; - struct xfs_bmap_free_item *ra; - struct xfs_bmap_free_item *rb; - - ra = container_of(a, struct xfs_bmap_free_item, xbfi_list); - rb = container_of(b, struct xfs_bmap_free_item, xbfi_list); - return XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) - - XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock); -} - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. We never free any extents in - * the first transaction. - * - * If an inode *ip is provided, rejoin it to the transaction if - * the transaction was committed. 
- */ -int /* error */ -xfs_bmap_finish( - struct xfs_trans **tp, /* transaction pointer addr */ - struct xfs_bmap_free *flist, /* i/o: list extents to free */ - struct xfs_inode *ip) -{ - struct xfs_efd_log_item *efd; /* extent free data */ - struct xfs_efi_log_item *efi; /* extent free intention */ - int error; /* error return value */ - int committed;/* xact committed or not */ - struct xfs_bmap_free_item *free; /* free extent item */ - - ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) - return 0; - - list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp); - - efi = xfs_trans_get_efi(*tp, flist->xbf_count); - list_for_each_entry(free, &flist->xbf_flist, xbfi_list) - xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, - free->xbfi_blockcount); - - error = __xfs_trans_roll(tp, ip, &committed); - if (error) { - /* - * If the transaction was committed, drop the EFD reference - * since we're bailing out of here. The other reference is - * dropped when the EFI hits the AIL. - * - * If the transaction was not committed, the EFI is freed by the - * EFI item unlock handler on abort. Also, we have a new - * transaction so we should return committed=1 even though we're - * returning an error. - */ - if (committed) { - xfs_efi_release(efi); - xfs_force_shutdown((*tp)->t_mountp, - SHUTDOWN_META_IO_ERROR); - } - return error; - } - - /* - * Get an EFD and free each extent in the list, logging to the EFD in - * the process. The remaining bmap free list is cleaned up by the caller - * on error. 
- */ - efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); - while (!list_empty(&flist->xbf_flist)) { - free = list_first_entry(&flist->xbf_flist, - struct xfs_bmap_free_item, xbfi_list); - error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, - free->xbfi_blockcount); - if (error) - return error; - - xfs_bmap_del_free(flist, free); - } - - return 0; -} - int xfs_bmap_rtalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ @@ -214,9 +127,9 @@ xfs_bmap_rtalloc( /* * Lock out modifications to both the RT bitmap and summary inodes */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* @@ -773,7 +686,7 @@ xfs_bmap_punch_delalloc_range( xfs_bmbt_irec_t imap; int nimaps = 1; xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; + struct xfs_defer_ops dfops; /* * Map the range first and check that it is a delalloc extent @@ -804,18 +717,18 @@ xfs_bmap_punch_delalloc_range( WARN_ON(imap.br_blockcount == 0); /* - * Note: while we initialise the firstblock/flist pair, they + * Note: while we initialise the firstblock/dfops pair, they * should never be used because blocks should never be * allocated or freed for a delalloc extent and hence we need * don't cancel or finish them after the xfs_bunmapi() call. 
*/ - xfs_bmap_init(&flist, &firstblock); + xfs_defer_init(&dfops, &firstblock); error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, - &flist, &done); + &dfops, &done); if (error) break; - ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist)); + ASSERT(!xfs_defer_has_unfinished_work(&dfops)); next_block: start_fsb++; remaining--; @@ -972,7 +885,7 @@ xfs_alloc_file_space( int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; uint qblocks, resblks, resrtextents; int error; @@ -1063,17 +976,17 @@ xfs_alloc_file_space( xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_init(&free_list, &firstfsb); + xfs_defer_init(&dfops, &firstfsb); error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, alloc_type, &firstfsb, - resblks, imapp, &nimaps, &free_list); + resblks, imapp, &nimaps, &dfops); if (error) goto error0; /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto error0; @@ -1096,7 +1009,7 @@ xfs_alloc_file_space( return error; error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); error1: /* Just cancel transaction */ @@ -1114,7 +1027,7 @@ xfs_unmap_extent( { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - struct xfs_bmap_free free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t firstfsb; uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); int error; @@ -1133,13 +1046,13 @@ xfs_unmap_extent( xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_init(&free_list, &firstfsb); + xfs_defer_init(&dfops, &firstfsb); error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb, - &free_list, done); + &dfops, done); if (error) goto out_bmap_cancel; - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto 
out_bmap_cancel; @@ -1149,7 +1062,7 @@ out_unlock: return error; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; @@ -1338,7 +1251,7 @@ xfs_shift_file_space( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; - struct xfs_bmap_free free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; @@ -1416,19 +1329,19 @@ xfs_shift_file_space( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* * We are using the write transaction in which max 2 bmbt * updates are allowed */ error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb, - &done, stop_fsb, &first_block, &free_list, + &done, stop_fsb, &first_block, &dfops, direction, XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) goto out_bmap_cancel; - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; @@ -1438,7 +1351,7 @@ xfs_shift_file_space( return error; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); return error; @@ -1622,6 +1535,10 @@ xfs_swap_extents( __uint64_t tmp; int lock_flags; + /* XXX: we can't do this with rmap, will fix later */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + return -EOPNOTSUPP; + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = -ENOMEM; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index f20071432ca6..68a621a8e0c0 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -21,7 +21,7 @@ /* Kernel only BMAP related definitions and functions */ struct xfs_bmbt_irec; -struct xfs_bmap_free_item; +struct xfs_extent_free_item; struct xfs_ifork; struct xfs_inode; struct xfs_mount; @@ -40,8 +40,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, xfs_bmap_format_t 
formatter, void *arg); /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ -void xfs_bmap_del_free(struct xfs_bmap_free *flist, - struct xfs_bmap_free_item *free); int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, int rt, int eof, int delay, int convert, diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 272c3f8b6f7d..4ff499aa7338 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -179,7 +179,7 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || - range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) || range.len < mp->m_sb.sb_blocksize) return -EINVAL; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index ccb0811963b2..7a30b8f11db7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -307,7 +308,7 @@ xfs_qm_dqalloc( xfs_buf_t **O_bpp) { xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; + struct xfs_defer_ops dfops; xfs_bmbt_irec_t map; int nmaps, error; xfs_buf_t *bp; @@ -320,7 +321,7 @@ xfs_qm_dqalloc( /* * Initialize the bmap freelist prior to calling bmapi code. 
*/ - xfs_bmap_init(&flist, &firstblock); + xfs_defer_init(&dfops, &firstblock); xfs_ilock(quotip, XFS_ILOCK_EXCL); /* * Return if this type of quotas is turned off while we didn't @@ -336,7 +337,7 @@ xfs_qm_dqalloc( error = xfs_bmapi_write(tp, quotip, offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &flist); + &map, &nmaps, &dfops); if (error) goto error0; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -368,7 +369,7 @@ xfs_qm_dqalloc( dqp->dq_flags & XFS_DQ_ALLTYPES, bp); /* - * xfs_bmap_finish() may commit the current transaction and + * xfs_defer_finish() may commit the current transaction and * start a second transaction if the freelist is not empty. * * Since we still want to modify this buffer, we need to @@ -382,7 +383,7 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - error = xfs_bmap_finish(tpp, &flist, NULL); + error = xfs_defer_finish(tpp, &dfops, NULL); if (error) goto error1; @@ -398,7 +399,7 @@ xfs_qm_dqalloc( return 0; error1: - xfs_bmap_cancel(&flist); + xfs_defer_cancel(&dfops); error0: xfs_iunlock(quotip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 2e4f67f68856..3d224702fbc0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -90,7 +90,9 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERRTAG_STRATCMPL_IOERR 19 #define XFS_ERRTAG_DIOWRITE_IOERR 20 #define XFS_ERRTAG_BMAPIFORMAT 21 -#define XFS_ERRTAG_MAX 22 +#define XFS_ERRTAG_FREE_EXTENT 22 +#define XFS_ERRTAG_RMAP_FINISH_ONE 23 +#define XFS_ERRTAG_MAX 24 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 
@@ -117,6 +119,8 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) #define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT +#define XFS_RANDOM_FREE_EXTENT 1 +#define XFS_RANDOM_RMAP_FINISH_ONE 1 #ifdef DEBUG extern int xfs_error_test_active; diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index a1b2dd828b9d..fe1bfee35898 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = { .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, .commit_metadata = xfs_fs_nfs_commit_metadata, -#ifdef CONFIG_NFSD_BLOCKLAYOUT +#ifdef CONFIG_EXPORTFS_BLOCK_OPS .get_uuid = xfs_fs_get_uuid, .map_blocks = xfs_fs_map_blocks, .commit_blocks = xfs_fs_commit_blocks, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index ab779460ecbf..d7bc14906af8 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -20,12 +20,15 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_extfree_item.h" #include "xfs_log.h" +#include "xfs_btree.h" +#include "xfs_rmap.h" kmem_zone_t *xfs_efi_zone; @@ -486,3 +489,69 @@ xfs_efd_init( return efdp; } + +/* + * Process an extent free intent item that was recovered from + * the log. We need to free the extents that it describes. + */ +int +xfs_efi_recover( + struct xfs_mount *mp, + struct xfs_efi_log_item *efip) +{ + struct xfs_efd_log_item *efdp; + struct xfs_trans *tp; + int i; + int error = 0; + xfs_extent_t *extp; + xfs_fsblock_t startblock_fsb; + struct xfs_owner_info oinfo; + + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + + /* + * First check the validity of the extents described by the + * EFI. 
If any are bad, then assume that all are bad and + * just toss the EFI. + */ + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &efip->efi_format.efi_extents[i]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, extp->ext_start)); + if (startblock_fsb == 0 || + extp->ext_len == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + extp->ext_len >= mp->m_sb.sb_agblocks) { + /* + * This will pull the EFI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + xfs_efi_release(efip); + return -EIO; + } + } + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + + xfs_rmap_skip_owner_update(&oinfo); + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &efip->efi_format.efi_extents[i]; + error = xfs_trans_free_extent(tp, efdp, extp->ext_start, + extp->ext_len, &oinfo); + if (error) + goto abort_error; + + } + + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + error = xfs_trans_commit(tp); + return error; + +abort_error: + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 8fa8651705e1..a32c794a86b7 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -98,4 +98,7 @@ int xfs_efi_copy_format(xfs_log_iovec_t *buf, void xfs_efi_item_free(xfs_efi_log_item_t *); void xfs_efi_release(struct xfs_efi_log_item *); +int xfs_efi_recover(struct xfs_mount *mp, + struct xfs_efi_log_item *efip); + #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a51353a1f87f..4a33a3304369 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -22,6 +22,7 @@ #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -385,7 +386,7 @@ 
xfs_filestream_new_ag( } flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | - (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); + (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0); err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7191c3878b4a..0f96847b90e1 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -32,6 +33,7 @@ #include "xfs_btree.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" +#include "xfs_rmap_btree.h" #include "xfs_ialloc.h" #include "xfs_fsops.h" #include "xfs_itable.h" @@ -40,6 +42,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_filestream.h" +#include "xfs_rmap.h" /* * File system operations @@ -103,7 +106,9 @@ xfs_fs_geometry( (xfs_sb_version_hasfinobt(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_FINOBT : 0) | (xfs_sb_version_hassparseinodes(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_SPINODES : 0); + XFS_FSOP_GEOM_FLAGS_SPINODES : 0) | + (xfs_sb_version_hasrmapbt(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_RMAPBT : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? 
mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -239,10 +244,16 @@ xfs_growfs_data_private( agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + agf->agf_roots[XFS_BTNUM_RMAPi] = + cpu_to_be32(XFS_RMAP_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); + } + agf->agf_flfirst = cpu_to_be32(1); agf->agf_fllast = 0; agf->agf_flcount = 0; - tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); + tmpsize = agsize - mp->m_ag_prealloc_blocks; agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_sb_version_hascrc(&mp->m_sb)) @@ -339,7 +350,7 @@ xfs_growfs_data_private( agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); + arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); @@ -368,7 +379,7 @@ xfs_growfs_data_private( agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); + arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); @@ -378,6 +389,72 @@ xfs_growfs_data_private( if (error) goto error0; + /* RMAP btree root block */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + struct xfs_rmap_rec *rrec; + struct xfs_btree_block *block; + + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_rmapbt_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0, + agno, XFS_BTREE_CRC_BLOCKS); + block = XFS_BUF_TO_BLOCK(bp); + + + /* + * mark the AG header regions as 
static metadata The BNO + * btree block is the first block after the headers, so + * it's location defines the size of region the static + * metadata consumes. + * + * Note: unlike mkfs, we never have to account for log + * space when growing the data regions + */ + rrec = XFS_RMAP_REC_ADDR(block, 1); + rrec->rm_startblock = 0; + rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + + /* account freespace btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 2); + rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(2); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + + /* account inode btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 3); + rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - + XFS_IBT_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + + /* account for rmap btree root */ + rrec = XFS_RMAP_REC_ADDR(block, 4); + rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } + /* * INO btree root block */ @@ -435,6 +512,8 @@ xfs_growfs_data_private( * There are new blocks in the old last a.g. */ if (new) { + struct xfs_owner_info oinfo; + /* * Change the agi length. */ @@ -462,14 +541,20 @@ xfs_growfs_data_private( be32_to_cpu(agi->agi_length)); xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + /* * Free the new space. + * + * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that + * this doesn't actually exist in the rmap btree. 
*/ - error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, - be32_to_cpu(agf->agf_length) - new), new); - if (error) { + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); + error = xfs_free_extent(tp, + XFS_AGB_TO_FSB(mp, agno, + be32_to_cpu(agf->agf_length) - new), + new, &oinfo); + if (error) goto error0; - } } /* @@ -501,6 +586,7 @@ xfs_growfs_data_private( } else mp->m_maxicount = 0; xfs_set_low_space_thresholds(mp); + mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { @@ -638,7 +724,7 @@ xfs_fs_counts( cnt->allocino = percpu_counter_read_positive(&mp->m_icount); cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - XFS_ALLOC_SET_ASIDE(mp); + mp->m_alloc_set_aside; spin_lock(&mp->m_sb_lock); cnt->freertx = mp->m_sb.sb_frextents; @@ -726,7 +812,7 @@ xfs_reserve_blocks( error = -ENOSPC; do { free = percpu_counter_sum(&mp->m_fdblocks) - - XFS_ALLOC_SET_ASIDE(mp); + mp->m_alloc_set_aside; if (!free) break; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8825bcfd314c..e08eaea6327b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -25,6 +25,7 @@ #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -1122,7 +1123,7 @@ xfs_create( struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; int error; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; prid_t prid; @@ -1182,7 +1183,7 @@ xfs_create( XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); unlock_dp_on_error = true; - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* * Reserve disk quota and the inode. 
@@ -1219,7 +1220,7 @@ xfs_create( unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, - &first_block, &free_list, resblks ? + &first_block, &dfops, resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != -ENOSPC); @@ -1253,7 +1254,7 @@ xfs_create( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; @@ -1269,7 +1270,7 @@ xfs_create( return 0; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); out_release_inode: @@ -1401,7 +1402,7 @@ xfs_link( xfs_mount_t *mp = tdp->i_mount; xfs_trans_t *tp; int error; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; int resblks; @@ -1452,7 +1453,7 @@ xfs_link( goto error_return; } - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* * Handle initial link state of O_TMPFILE inode @@ -1464,7 +1465,7 @@ xfs_link( } error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, - &first_block, &free_list, resblks); + &first_block, &dfops, resblks); if (error) goto error_return; xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -1482,9 +1483,9 @@ xfs_link( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) { - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); goto error_return; } @@ -1526,7 +1527,7 @@ xfs_itruncate_extents( { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; xfs_fileoff_t first_unmap_block; xfs_fileoff_t last_block; @@ -1562,12 +1563,12 @@ xfs_itruncate_extents( ASSERT(first_unmap_block < last_block); unmap_len = last_block - first_unmap_block + 1; 
while (!done) { - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, xfs_bmapi_aflag(whichfork), XFS_ITRUNC_MAX_EXTENTS, - &first_block, &free_list, + &first_block, &dfops, &done); if (error) goto out_bmap_cancel; @@ -1576,7 +1577,7 @@ xfs_itruncate_extents( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_bmap_finish(&tp, &free_list, ip); + error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto out_bmap_cancel; @@ -1602,7 +1603,7 @@ out_bmap_cancel: * the transaction can be properly aborted. We just need to make sure * we're not holding any resources that we were not when we came in. */ - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); goto out; } @@ -1743,7 +1744,7 @@ STATIC int xfs_inactive_ifree( struct xfs_inode *ip) { - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -1780,8 +1781,8 @@ xfs_inactive_ifree( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_init(&free_list, &first_block); - error = xfs_ifree(tp, ip, &free_list); + xfs_defer_init(&dfops, &first_block); + error = xfs_ifree(tp, ip, &dfops); if (error) { /* * If we fail to free the inode, shut down. The cancel @@ -1807,11 +1808,11 @@ xfs_inactive_ifree( * Just ignore errors at this point. There is nothing we can do except * to try to keep going. Make sure it's not a silent error. 
*/ - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) { - xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + xfs_notice(mp, "%s: xfs_defer_finish returned error %d", __func__, error); - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); } error = xfs_trans_commit(tp); if (error) @@ -2367,7 +2368,7 @@ int xfs_ifree( xfs_trans_t *tp, xfs_inode_t *ip, - xfs_bmap_free_t *flist) + struct xfs_defer_ops *dfops) { int error; struct xfs_icluster xic = { 0 }; @@ -2386,7 +2387,7 @@ xfs_ifree( if (error) return error; - error = xfs_difree(tp, ip->i_ino, flist, &xic); + error = xfs_difree(tp, ip->i_ino, dfops, &xic); if (error) return error; @@ -2474,7 +2475,7 @@ xfs_iunpin_wait( * directory entry. * * This is still safe from a transactional point of view - it is not until we - * get to xfs_bmap_finish() that we have the possibility of multiple + * get to xfs_defer_finish() that we have the possibility of multiple * transactions in this operation. Hence as long as we remove the directory * entry and drop the link count in the first transaction of the remove * operation, there are no transactional constraints on the ordering here. 
@@ -2489,7 +2490,7 @@ xfs_remove( xfs_trans_t *tp = NULL; int is_dir = S_ISDIR(VFS_I(ip)->i_mode); int error = 0; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; uint resblks; @@ -2571,9 +2572,9 @@ xfs_remove( if (error) goto out_trans_cancel; - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); error = xfs_dir_removename(tp, dp, name, ip->i_ino, - &first_block, &free_list, resblks); + &first_block, &dfops, resblks); if (error) { ASSERT(error != -ENOENT); goto out_bmap_cancel; @@ -2587,7 +2588,7 @@ xfs_remove( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; @@ -2601,7 +2602,7 @@ xfs_remove( return 0; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); std_return: @@ -2662,7 +2663,7 @@ xfs_sort_for_rename( static int xfs_finish_rename( struct xfs_trans *tp, - struct xfs_bmap_free *free_list) + struct xfs_defer_ops *dfops) { int error; @@ -2673,9 +2674,9 @@ xfs_finish_rename( if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, free_list, NULL); + error = xfs_defer_finish(&tp, dfops, NULL); if (error) { - xfs_bmap_cancel(free_list); + xfs_defer_cancel(dfops); xfs_trans_cancel(tp); return error; } @@ -2697,7 +2698,7 @@ xfs_cross_rename( struct xfs_inode *dp2, struct xfs_name *name2, struct xfs_inode *ip2, - struct xfs_bmap_free *free_list, + struct xfs_defer_ops *dfops, xfs_fsblock_t *first_block, int spaceres) { @@ -2709,14 +2710,14 @@ xfs_cross_rename( /* Swap inode number for dirent in first parent */ error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, - first_block, free_list, spaceres); + first_block, dfops, spaceres); if (error) goto out_trans_abort; /* Swap inode number for dirent in second parent */ error = 
xfs_dir_replace(tp, dp2, name2, ip1->i_ino, - first_block, free_list, spaceres); + first_block, dfops, spaceres); if (error) goto out_trans_abort; @@ -2731,7 +2732,7 @@ xfs_cross_rename( if (S_ISDIR(VFS_I(ip2)->i_mode)) { error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, dp1->i_ino, first_block, - free_list, spaceres); + dfops, spaceres); if (error) goto out_trans_abort; @@ -2758,7 +2759,7 @@ xfs_cross_rename( if (S_ISDIR(VFS_I(ip1)->i_mode)) { error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, dp2->i_ino, first_block, - free_list, spaceres); + dfops, spaceres); if (error) goto out_trans_abort; @@ -2797,10 +2798,10 @@ xfs_cross_rename( } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); - return xfs_finish_rename(tp, free_list); + return xfs_finish_rename(tp, dfops); out_trans_abort: - xfs_bmap_cancel(free_list); + xfs_defer_cancel(dfops); xfs_trans_cancel(tp); return error; } @@ -2855,7 +2856,7 @@ xfs_rename( { struct xfs_mount *mp = src_dp->i_mount; struct xfs_trans *tp; - struct xfs_bmap_free free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; @@ -2944,13 +2945,13 @@ xfs_rename( goto out_trans_cancel; } - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) return xfs_cross_rename(tp, src_dp, src_name, src_ip, target_dp, target_name, target_ip, - &free_list, &first_block, spaceres); + &dfops, &first_block, spaceres); /* * Set up the target. 
@@ -2972,7 +2973,7 @@ xfs_rename( */ error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, &first_block, - &free_list, spaceres); + &dfops, spaceres); if (error) goto out_bmap_cancel; @@ -3012,7 +3013,7 @@ xfs_rename( */ error = xfs_dir_replace(tp, target_dp, target_name, src_ip->i_ino, - &first_block, &free_list, spaceres); + &first_block, &dfops, spaceres); if (error) goto out_bmap_cancel; @@ -3047,7 +3048,7 @@ xfs_rename( */ error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, target_dp->i_ino, - &first_block, &free_list, spaceres); + &first_block, &dfops, spaceres); ASSERT(error != -EEXIST); if (error) goto out_bmap_cancel; @@ -3086,10 +3087,10 @@ xfs_rename( */ if (wip) { error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, - &first_block, &free_list, spaceres); + &first_block, &dfops, spaceres); } else error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, - &first_block, &free_list, spaceres); + &first_block, &dfops, spaceres); if (error) goto out_bmap_cancel; @@ -3124,13 +3125,13 @@ xfs_rename( if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - error = xfs_finish_rename(tp, &free_list); + error = xfs_finish_rename(tp, &dfops); if (wip) IRELE(wip); return error; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); out_release_wip: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8eb78ec4a6e2..e1a411e08f00 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -27,7 +27,7 @@ struct xfs_dinode; struct xfs_inode; struct xfs_buf; -struct xfs_bmap_free; +struct xfs_defer_ops; struct xfs_bmbt_irec; struct xfs_inode_log_item; struct xfs_mount; @@ -398,7 +398,7 @@ uint xfs_ilock_attr_map_shared(struct xfs_inode *); uint xfs_ip2xflags(struct xfs_inode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, - struct xfs_bmap_free *); + struct xfs_defer_ops *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t); 
void xfs_iext_realloc(xfs_inode_t *, int, int); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 9a7c87809d3b..96a70fd1f5d6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -232,7 +232,7 @@ xfs_open_by_handle( } if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - error = -EACCES; + error = -EPERM; goto out_dput; } @@ -387,6 +387,7 @@ xfs_attrlist_by_handle( { int error = -ENOMEM; attrlist_cursor_kern_t *cursor; + struct xfs_fsop_attrlist_handlereq __user *p = arg; xfs_fsop_attrlist_handlereq_t al_hreq; struct dentry *dentry; char *kbuf; @@ -419,6 +420,11 @@ xfs_attrlist_by_handle( if (error) goto out_kfree; + if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { + error = -EFAULT; + goto out_kfree; + } + if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) error = -EFAULT; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 620fc9120444..2114d53df433 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -23,6 +23,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_bmap_btree.h" @@ -128,7 +129,7 @@ xfs_iomap_write_direct( int quota_flag; int rt; xfs_trans_t *tp; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; uint qblocks, resblks, resrtextents; int error; int lockmode; @@ -231,18 +232,18 @@ xfs_iomap_write_direct( * From this point onwards we overwrite the imap pointer that the * caller gave to us. 
*/ - xfs_bmap_init(&free_list, &firstfsb); + xfs_defer_init(&dfops, &firstfsb); nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, &firstfsb, resblks, imap, - &nimaps, &free_list); + &nimaps, &dfops); if (error) goto out_bmap_cancel; /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; @@ -266,7 +267,7 @@ out_unlock: return error; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); out_trans_cancel: xfs_trans_cancel(tp); @@ -685,7 +686,7 @@ xfs_iomap_write_allocate( xfs_fileoff_t offset_fsb, last_block; xfs_fileoff_t end_fsb, map_start_fsb; xfs_fsblock_t first_block; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_filblks_t count_fsb; xfs_trans_t *tp; int nimaps; @@ -727,7 +728,7 @@ xfs_iomap_write_allocate( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* * it is possible that the extents have changed since @@ -783,11 +784,11 @@ xfs_iomap_write_allocate( error = xfs_bmapi_write(tp, ip, map_start_fsb, count_fsb, 0, &first_block, nres, imap, &nimaps, - &free_list); + &dfops); if (error) goto trans_cancel; - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto trans_cancel; @@ -821,7 +822,7 @@ xfs_iomap_write_allocate( } trans_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); error0: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -842,7 +843,7 @@ xfs_iomap_write_unwritten( int nimaps; xfs_trans_t *tp; xfs_bmbt_irec_t imap; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; xfs_fsize_t i_size; uint resblks; int error; @@ -886,11 +887,11 @@ xfs_iomap_write_unwritten( /* * Modify the unwritten extent state of the buffer. 
*/ - xfs_bmap_init(&free_list, &firstfsb); + xfs_defer_init(&dfops, &firstfsb); nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, XFS_BMAPI_CONVERT, &firstfsb, resblks, - &imap, &nimaps, &free_list); + &imap, &nimaps, &dfops); if (error) goto error_on_bmapi_transaction; @@ -909,7 +910,7 @@ xfs_iomap_write_unwritten( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto error_on_bmapi_transaction; @@ -936,7 +937,7 @@ xfs_iomap_write_unwritten( return 0; error_on_bmapi_transaction: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 835997843846..e8638fd2c0c3 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -43,6 +43,7 @@ #include "xfs_bmap_btree.h" #include "xfs_error.h" #include "xfs_dir2.h" +#include "xfs_rmap_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -1911,6 +1912,8 @@ xlog_recover_reorder_trans( case XFS_LI_QUOTAOFF: case XFS_LI_EFD: case XFS_LI_EFI: + case XFS_LI_RUI: + case XFS_LI_RUD: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); list_move_tail(&item->ri_list, &inode_list); @@ -2228,6 +2231,7 @@ xlog_recover_get_buf_lsn( case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: case XFS_ABTC_MAGIC: + case XFS_RMAP_CRC_MAGIC: case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: { struct xfs_btree_block *btb = blk; @@ -2396,6 +2400,9 @@ xlog_recover_validate_buf_type( case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; break; + case XFS_RMAP_CRC_MAGIC: + bp->b_ops = &xfs_rmapbt_buf_ops; + break; default: xfs_warn(mp, "Bad btree block magic!"); ASSERT(0); @@ -3415,6 +3422,99 @@ xlog_recover_efd_pass2( } /* + * This routine is called to create an in-core extent rmap update + * item from the rui format structure which was logged on disk. 
+ * It allocates an in-core rui, copies the extents from the format + * structure into it, and adds the rui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_rui_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_rui_log_item *ruip; + struct xfs_rui_log_format *rui_formatp; + + rui_formatp = item->ri_buf[0].i_addr; + + ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); + if (error) { + xfs_rui_item_free(ruip); + return error; + } + atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * The RUI has two references. One for the RUD and one for RUI to ensure + * it makes it into the AIL. Insert the RUI into the AIL directly and + * drop the RUI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn); + xfs_rui_release(ruip); + return 0; +} + + +/* + * This routine is called when an RUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding RUI if it + * was still in the log. To do this it searches the AIL for the RUI with an id + * equal to that in the RUD format structure. If we find it we drop the RUD + * reference, which removes the RUI from the AIL and frees it. + */ +STATIC int +xlog_recover_rud_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_rud_log_format *rud_formatp; + struct xfs_rui_log_item *ruip = NULL; + struct xfs_log_item *lip; + __uint64_t rui_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + rud_formatp = item->ri_buf[0].i_addr; + ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); + rui_id = rud_formatp->rud_rui_id; + + /* + * Search for the RUI with the id in the RUD format structure in the + * AIL. 
+ */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_RUI) { + ruip = (struct xfs_rui_log_item *)lip; + if (ruip->rui_format.rui_id == rui_id) { + /* + * Drop the RUD reference to the RUI. This + * removes the RUI from the AIL and frees it. + */ + spin_unlock(&ailp->xa_lock); + xfs_rui_release(ruip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* * This routine is called when an inode create format structure is found in a * committed transaction in the log. It's purpose is to initialise the inodes * being allocated on disk. This requires us to get inode cluster buffers that @@ -3639,6 +3739,8 @@ xlog_recover_ra_pass2( case XFS_LI_EFI: case XFS_LI_EFD: case XFS_LI_QUOTAOFF: + case XFS_LI_RUI: + case XFS_LI_RUD: default: break; } @@ -3662,6 +3764,8 @@ xlog_recover_commit_pass1( case XFS_LI_EFD: case XFS_LI_DQUOT: case XFS_LI_ICREATE: + case XFS_LI_RUI: + case XFS_LI_RUD: /* nothing to do in pass 1 */ return 0; default: @@ -3692,6 +3796,10 @@ xlog_recover_commit_pass2( return xlog_recover_efi_pass2(log, item, trans->r_lsn); case XFS_LI_EFD: return xlog_recover_efd_pass2(log, item); + case XFS_LI_RUI: + return xlog_recover_rui_pass2(log, item, trans->r_lsn); + case XFS_LI_RUD: + return xlog_recover_rud_pass2(log, item); case XFS_LI_DQUOT: return xlog_recover_dquot_pass2(log, buffer_list, item, trans->r_lsn); @@ -4164,126 +4272,156 @@ xlog_recover_process_data( return 0; } -/* - * Process an extent free intent item that was recovered from - * the log. We need to free the extents that it describes. - */ +/* Recover the EFI if necessary. 
*/ STATIC int xlog_recover_process_efi( - xfs_mount_t *mp, - xfs_efi_log_item_t *efip) + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) { - xfs_efd_log_item_t *efdp; - xfs_trans_t *tp; - int i; - int error = 0; - xfs_extent_t *extp; - xfs_fsblock_t startblock_fsb; - - ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + struct xfs_efi_log_item *efip; + int error; /* - * First check the validity of the extents described by the - * EFI. If any are bad, then assume that all are bad and - * just toss the EFI. + * Skip EFIs that we've already processed. */ - for (i = 0; i < efip->efi_format.efi_nextents; i++) { - extp = &(efip->efi_format.efi_extents[i]); - startblock_fsb = XFS_BB_TO_FSB(mp, - XFS_FSB_TO_DADDR(mp, extp->ext_start)); - if ((startblock_fsb == 0) || - (extp->ext_len == 0) || - (startblock_fsb >= mp->m_sb.sb_dblocks) || - (extp->ext_len >= mp->m_sb.sb_agblocks)) { - /* - * This will pull the EFI from the AIL and - * free the memory associated with it. - */ - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - xfs_efi_release(efip); - return -EIO; - } - } + efip = container_of(lip, struct xfs_efi_log_item, efi_item); + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) + return 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); - if (error) - return error; - efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + spin_unlock(&ailp->xa_lock); + error = xfs_efi_recover(mp, efip); + spin_lock(&ailp->xa_lock); - for (i = 0; i < efip->efi_format.efi_nextents; i++) { - extp = &(efip->efi_format.efi_extents[i]); - error = xfs_trans_free_extent(tp, efdp, extp->ext_start, - extp->ext_len); - if (error) - goto abort_error; + return error; +} - } +/* Release the EFI since we're cancelling everything. 
*/ +STATIC void +xlog_recover_cancel_efi( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_efi_log_item *efip; - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - error = xfs_trans_commit(tp); - return error; + efip = container_of(lip, struct xfs_efi_log_item, efi_item); + + spin_unlock(&ailp->xa_lock); + xfs_efi_release(efip); + spin_lock(&ailp->xa_lock); +} + +/* Recover the RUI if necessary. */ +STATIC int +xlog_recover_process_rui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_rui_log_item *ruip; + int error; + + /* + * Skip RUIs that we've already processed. + */ + ruip = container_of(lip, struct xfs_rui_log_item, rui_item); + if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)) + return 0; + + spin_unlock(&ailp->xa_lock); + error = xfs_rui_recover(mp, ruip); + spin_lock(&ailp->xa_lock); -abort_error: - xfs_trans_cancel(tp); return error; } +/* Release the RUI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_rui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_rui_log_item *ruip; + + ruip = container_of(lip, struct xfs_rui_log_item, rui_item); + + spin_unlock(&ailp->xa_lock); + xfs_rui_release(ruip); + spin_lock(&ailp->xa_lock); +} + +/* Is this log item a deferred action intent? */ +static inline bool xlog_item_is_intent(struct xfs_log_item *lip) +{ + switch (lip->li_type) { + case XFS_LI_EFI: + case XFS_LI_RUI: + return true; + default: + return false; + } +} + /* - * When this is called, all of the EFIs which did not have - * corresponding EFDs should be in the AIL. What we do now - * is free the extents associated with each one. + * When this is called, all of the log intent items which did not have + * corresponding log done items should be in the AIL. What we do now + * is update the data structures associated with each one. 
* - * Since we process the EFIs in normal transactions, they - * will be removed at some point after the commit. This prevents - * us from just walking down the list processing each one. - * We'll use a flag in the EFI to skip those that we've already - * processed and use the AIL iteration mechanism's generation - * count to try to speed this up at least a bit. + * Since we process the log intent items in normal transactions, they + * will be removed at some point after the commit. This prevents us + * from just walking down the list processing each one. We'll use a + * flag in the intent item to skip those that we've already processed + * and use the AIL iteration mechanism's generation count to try to + * speed this up at least a bit. * - * When we start, we know that the EFIs are the only things in - * the AIL. As we process them, however, other items are added - * to the AIL. Since everything added to the AIL must come after - * everything already in the AIL, we stop processing as soon as - * we see something other than an EFI in the AIL. + * When we start, we know that the intents are the only things in the + * AIL. As we process them, however, other items are added to the + * AIL. */ STATIC int -xlog_recover_process_efis( +xlog_recover_process_intents( struct xlog *log) { struct xfs_log_item *lip; - struct xfs_efi_log_item *efip; int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; + xfs_lsn_t last_lsn; ailp = log->l_ailp; spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); while (lip != NULL) { /* - * We're done when we see something other than an EFI. - * There should be no EFIs left in the AIL now. + * We're done when we see something other than an intent. + * There should be no intents left in the AIL now. 
*/ - if (lip->li_type != XFS_LI_EFI) { + if (!xlog_item_is_intent(lip)) { #ifdef DEBUG for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(lip->li_type != XFS_LI_EFI); + ASSERT(!xlog_item_is_intent(lip)); #endif break; } /* - * Skip EFIs that we've already processed. + * We should never see a redo item with a LSN higher than + * the last transaction we found in the log at the start + * of recovery. */ - efip = container_of(lip, struct xfs_efi_log_item, efi_item); - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { - lip = xfs_trans_ail_cursor_next(ailp, &cur); - continue; - } + ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); - spin_unlock(&ailp->xa_lock); - error = xlog_recover_process_efi(log->l_mp, efip); - spin_lock(&ailp->xa_lock); + switch (lip->li_type) { + case XFS_LI_EFI: + error = xlog_recover_process_efi(log->l_mp, ailp, lip); + break; + case XFS_LI_RUI: + error = xlog_recover_process_rui(log->l_mp, ailp, lip); + break; + } if (error) goto out; lip = xfs_trans_ail_cursor_next(ailp, &cur); @@ -4295,15 +4433,14 @@ out: } /* - * A cancel occurs when the mount has failed and we're bailing out. Release all - * pending EFIs so they don't pin the AIL. + * A cancel occurs when the mount has failed and we're bailing out. + * Release all pending log intent items so they don't pin the AIL. */ STATIC int -xlog_recover_cancel_efis( +xlog_recover_cancel_intents( struct xlog *log) { struct xfs_log_item *lip; - struct xfs_efi_log_item *efip; int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; @@ -4313,22 +4450,25 @@ xlog_recover_cancel_efis( lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { /* - * We're done when we see something other than an EFI. - * There should be no EFIs left in the AIL now. + * We're done when we see something other than an intent. + * There should be no intents left in the AIL now. 
*/ - if (lip->li_type != XFS_LI_EFI) { + if (!xlog_item_is_intent(lip)) { #ifdef DEBUG for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(lip->li_type != XFS_LI_EFI); + ASSERT(!xlog_item_is_intent(lip)); #endif break; } - efip = container_of(lip, struct xfs_efi_log_item, efi_item); - - spin_unlock(&ailp->xa_lock); - xfs_efi_release(efip); - spin_lock(&ailp->xa_lock); + switch (lip->li_type) { + case XFS_LI_EFI: + xlog_recover_cancel_efi(log->l_mp, ailp, lip); + break; + case XFS_LI_RUI: + xlog_recover_cancel_rui(log->l_mp, ailp, lip); + break; + } lip = xfs_trans_ail_cursor_next(ailp, &cur); } @@ -5023,6 +5163,7 @@ xlog_do_recover( xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); return error; } + mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); xlog_recover_check_summary(log); @@ -5139,16 +5280,17 @@ xlog_recover_finish( */ if (log->l_flags & XLOG_RECOVERY_NEEDED) { int error; - error = xlog_recover_process_efis(log); + error = xlog_recover_process_intents(log); if (error) { - xfs_alert(log->l_mp, "Failed to recover EFIs"); + xfs_alert(log->l_mp, "Failed to recover intents"); return error; } + /* - * Sync the log to get all the EFIs out of the AIL. + * Sync the log to get all the intents out of the AIL. * This isn't absolutely necessary, but it helps in * case the unlink transactions would have problems - * pushing the EFIs out of the way. + * pushing the intents out of the way. 
*/ xfs_log_force(log->l_mp, XFS_LOG_SYNC); @@ -5173,7 +5315,7 @@ xlog_recover_cancel( int error = 0; if (log->l_flags & XLOG_RECOVERY_NEEDED) - error = xlog_recover_cancel_efis(log); + error = xlog_recover_cancel_intents(log); return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 970c19ba2f56..faeead671f9f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -24,6 +24,7 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" @@ -41,6 +42,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_sysfs.h" +#include "xfs_rmap_btree.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -230,6 +232,8 @@ xfs_initialize_perag( if (maxagi) *maxagi = index; + + mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; out_unwind: @@ -679,6 +683,7 @@ xfs_mountfs( xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); xfs_ialloc_compute_maxlevels(mp); + xfs_rmapbt_compute_maxlevels(mp); xfs_set_maxicount(mp); @@ -1216,7 +1221,7 @@ xfs_mod_fdblocks( batch = XFS_FDBLOCKS_BATCH; __percpu_counter_add(&mp->m_fdblocks, delta, batch); - if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp), + if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! 
*/ return 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index c1b798c72126..b36676cde103 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -116,9 +116,15 @@ typedef struct xfs_mount { uint m_bmap_dmnr[2]; /* min bmap btree records */ uint m_inobt_mxr[2]; /* max inobt btree records */ uint m_inobt_mnr[2]; /* min inobt btree records */ + uint m_rmap_mxr[2]; /* max rmap btree records */ + uint m_rmap_mnr[2]; /* min rmap btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* max inobt btree levels. */ + uint m_rmap_maxlevels; /* max rmap btree levels */ + xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ + uint m_alloc_set_aside; /* space we can't use */ + uint m_ag_max_usable; /* max space per AG */ struct radix_tree_root m_perag_tree; /* per-ag accounting info */ spinlock_t m_perag_lock; /* lock for m_perag_tree */ struct mutex m_growlock; /* growfs mutex */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 0cc8d8f74356..69e2986a3776 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -49,11 +49,14 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); + XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4); /* dir/attr trees */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index 93f74853961b..e8339f74966b 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -1,7 +1,7 @@ #ifndef _XFS_PNFS_H #define _XFS_PNFS_H 1 -#if defined(CONFIG_NFSD_BLOCKLAYOUT) || 
defined(CONFIG_NFSD_SCSILAYOUT) +#ifdef CONFIG_EXPORTFS_BLOCK_OPS int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, struct iomap *iomap, bool write, u32 *device_generation); @@ -15,5 +15,5 @@ xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) { return 0; } -#endif /* CONFIG_NFSD_PNFS */ +#endif /* CONFIG_EXPORTFS_BLOCK_OPS */ #endif /* _XFS_PNFS_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c new file mode 100644 index 000000000000..2500f28689d5 --- /dev/null +++ b/fs/xfs/xfs_rmap_item.c @@ -0,0 +1,536 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_rmap_item.h" +#include "xfs_log.h" +#include "xfs_rmap.h" + + +kmem_zone_t *xfs_rui_zone; +kmem_zone_t *xfs_rud_zone; + +static inline struct xfs_rui_log_item *RUI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_rui_log_item, rui_item); +} + +void +xfs_rui_item_free( + struct xfs_rui_log_item *ruip) +{ + if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) + kmem_free(ruip); + else + kmem_zone_free(xfs_rui_zone, ruip); +} + +/* + * This returns the number of iovecs needed to log the given rui item. + * We only need 1 iovec for an rui item. It just logs the rui_log_format + * structure. + */ +static inline int +xfs_rui_item_sizeof( + struct xfs_rui_log_item *ruip) +{ + return sizeof(struct xfs_rui_log_format) + + (ruip->rui_format.rui_nextents - 1) * + sizeof(struct xfs_map_extent); +} + +STATIC void +xfs_rui_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip)); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given rui log item. We use only 1 iovec, and we point that + * at the rui_log_format structure embedded in the rui item. + * It is at this point that we assert that all of the extent + * slots in the rui item have been filled. 
+ */ +STATIC void +xfs_rui_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&ruip->rui_next_extent) == + ruip->rui_format.rui_nextents); + + ruip->rui_format.rui_type = XFS_LI_RUI; + ruip->rui_format.rui_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, + xfs_rui_item_sizeof(ruip)); +} + +/* + * Pinning has no meaning for an rui item, so just return. + */ +STATIC void +xfs_rui_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * The unpin operation is the last place an RUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the RUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the RUI to either construct + * and commit the RUD or drop the RUD's reference in the event of error. Simply + * drop the log's RUI reference now that the log is done with it. + */ +STATIC void +xfs_rui_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); + + xfs_rui_release(ruip); +} + +/* + * RUI items have no locking or pushing. However, since RUIs are pulled from + * the AIL when their corresponding RUDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the RUI out of + * the AIL. + */ +STATIC uint +xfs_rui_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The RUI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an RUD isn't going to be + * constructed and thus we free the RUI here directly. 
+ */ +STATIC void +xfs_rui_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_rui_item_free(RUI_ITEM(lip)); +} + +/* + * The RUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_rui_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The RUI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_rui_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all rui log items. + */ +static const struct xfs_item_ops xfs_rui_item_ops = { + .iop_size = xfs_rui_item_size, + .iop_format = xfs_rui_item_format, + .iop_pin = xfs_rui_item_pin, + .iop_unpin = xfs_rui_item_unpin, + .iop_unlock = xfs_rui_item_unlock, + .iop_committed = xfs_rui_item_committed, + .iop_push = xfs_rui_item_push, + .iop_committing = xfs_rui_item_committing, +}; + +/* + * Allocate and initialize an rui item with the given number of extents. 
+ */ +struct xfs_rui_log_item * +xfs_rui_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_rui_log_item *ruip; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_RUI_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(struct xfs_rui_log_item) + + ((nextents - 1) * sizeof(struct xfs_map_extent))); + ruip = kmem_zalloc(size, KM_SLEEP); + } else { + ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); + } + + xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); + ruip->rui_format.rui_nextents = nextents; + ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; + atomic_set(&ruip->rui_next_extent, 0); + atomic_set(&ruip->rui_refcount, 2); + + return ruip; +} + +/* + * Copy an RUI format buffer from the given buf, and into the destination + * RUI format structure. The RUI/RUD items were designed not to need any + * special alignment handling. + */ +int +xfs_rui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_rui_log_format *dst_rui_fmt) +{ + struct xfs_rui_log_format *src_rui_fmt; + uint len; + + src_rui_fmt = buf->i_addr; + len = sizeof(struct xfs_rui_log_format) + + (src_rui_fmt->rui_nextents - 1) * + sizeof(struct xfs_map_extent); + + if (buf->i_len != len) + return -EFSCORRUPTED; + + memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len); + return 0; +} + +/* + * Freeing the RUI requires that we remove it from the AIL if it has already + * been placed there. However, the RUI may not yet have been placed in the AIL + * when called by xfs_rui_release() from RUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the RUI. 
+ */ +void +xfs_rui_release( + struct xfs_rui_log_item *ruip) +{ + if (atomic_dec_and_test(&ruip->rui_refcount)) { + xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_rui_item_free(ruip); + } +} + +static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_rud_log_item, rud_item); +} + +STATIC void +xfs_rud_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_rud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given rud log item. We use only 1 iovec, and we point that + * at the rud_log_format structure embedded in the rud item. + * It is at this point that we assert that all of the extent + * slots in the rud item have been filled. + */ +STATIC void +xfs_rud_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_rud_log_item *rudp = RUD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + rudp->rud_format.rud_type = XFS_LI_RUD; + rudp->rud_format.rud_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format, + sizeof(struct xfs_rud_log_format)); +} + +/* + * Pinning has no meaning for an rud item, so just return. + */ +STATIC void +xfs_rud_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an rud item, unpinning does + * not either. + */ +STATIC void +xfs_rud_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an rud item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_rud_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The RUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the RUI and free the + * RUD. 
+ */ +STATIC void +xfs_rud_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_rud_log_item *rudp = RUD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_rui_release(rudp->rud_ruip); + kmem_zone_free(xfs_rud_zone, rudp); + } +} + +/* + * When the rud item is committed to disk, all we need to do is delete our + * reference to our partner rui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_rud_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_rud_log_item *rudp = RUD_ITEM(lip); + + /* + * Drop the RUI reference regardless of whether the RUD has been + * aborted. Once the RUD transaction is constructed, it is the sole + * responsibility of the RUD to release the RUI (even if the RUI is + * aborted due to log I/O error). + */ + xfs_rui_release(rudp->rud_ruip); + kmem_zone_free(xfs_rud_zone, rudp); + + return (xfs_lsn_t)-1; +} + +/* + * The RUD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_rud_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all rud log items. + */ +static const struct xfs_item_ops xfs_rud_item_ops = { + .iop_size = xfs_rud_item_size, + .iop_format = xfs_rud_item_format, + .iop_pin = xfs_rud_item_pin, + .iop_unpin = xfs_rud_item_unpin, + .iop_unlock = xfs_rud_item_unlock, + .iop_committed = xfs_rud_item_committed, + .iop_push = xfs_rud_item_push, + .iop_committing = xfs_rud_item_committing, +}; + +/* + * Allocate and initialize an rud item with the given number of extents. 
+ */ +struct xfs_rud_log_item * +xfs_rud_init( + struct xfs_mount *mp, + struct xfs_rui_log_item *ruip) + +{ + struct xfs_rud_log_item *rudp; + + rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); + xfs_log_item_init(mp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops); + rudp->rud_ruip = ruip; + rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; + + return rudp; +} + +/* + * Process an rmap update intent item that was recovered from the log. + * We need to update the rmapbt. + */ +int +xfs_rui_recover( + struct xfs_mount *mp, + struct xfs_rui_log_item *ruip) +{ + int i; + int error = 0; + struct xfs_map_extent *rmap; + xfs_fsblock_t startblock_fsb; + bool op_ok; + struct xfs_rud_log_item *rudp; + enum xfs_rmap_intent_type type; + int whichfork; + xfs_exntst_t state; + struct xfs_trans *tp; + struct xfs_btree_cur *rcur = NULL; + + ASSERT(!test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)); + + /* + * First check the validity of the extents described by the + * RUI. If any are bad, then assume that all are bad and + * just toss the RUI. + */ + for (i = 0; i < ruip->rui_format.rui_nextents; i++) { + rmap = &ruip->rui_format.rui_extents[i]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, rmap->me_startblock)); + switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + case XFS_RMAP_EXTENT_UNMAP: + case XFS_RMAP_EXTENT_CONVERT: + case XFS_RMAP_EXTENT_ALLOC: + case XFS_RMAP_EXTENT_FREE: + op_ok = true; + break; + default: + op_ok = false; + break; + } + if (!op_ok || startblock_fsb == 0 || + rmap->me_len == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + rmap->me_len >= mp->m_sb.sb_agblocks || + (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) { + /* + * This will pull the RUI from the AIL and + * free the memory associated with it. 
+ */ + set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); + xfs_rui_release(ruip); + return -EIO; + } + } + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + rudp = xfs_trans_get_rud(tp, ruip); + + for (i = 0; i < ruip->rui_format.rui_nextents; i++) { + rmap = &ruip->rui_format.rui_extents[i]; + state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + type = XFS_RMAP_MAP; + break; + case XFS_RMAP_EXTENT_UNMAP: + type = XFS_RMAP_UNMAP; + break; + case XFS_RMAP_EXTENT_CONVERT: + type = XFS_RMAP_CONVERT; + break; + case XFS_RMAP_EXTENT_ALLOC: + type = XFS_RMAP_ALLOC; + break; + case XFS_RMAP_EXTENT_FREE: + type = XFS_RMAP_FREE; + break; + default: + error = -EFSCORRUPTED; + goto abort_error; + } + error = xfs_trans_log_finish_rmap_update(tp, rudp, type, + rmap->me_owner, whichfork, + rmap->me_startoff, rmap->me_startblock, + rmap->me_len, state, &rcur); + if (error) + goto abort_error; + + } + + xfs_rmap_finish_one_cleanup(tp, rcur, error); + set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); + error = xfs_trans_commit(tp); + return error; + +abort_error: + xfs_rmap_finish_one_cleanup(tp, rcur, error); + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h new file mode 100644 index 000000000000..aefcc3a318a5 --- /dev/null +++ b/fs/xfs/xfs_rmap_item.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_RMAP_ITEM_H__ +#define __XFS_RMAP_ITEM_H__ + +/* + * There are (currently) three pairs of rmap btree redo item types: map, unmap, + * and convert. The common abbreviations for these are RUI (rmap update + * intent) and RUD (rmap update done). The redo item type is encoded in the + * flags field of each xfs_map_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same transaction + * that records the associated rmapbt updates. Typically, the first + * transaction will record a bmbt update, followed by some number of + * transactions containing rmapbt updates, and finally transactions with any + * bnobt/cntbt updates. + * + * Should the system crash after the commit of the first transaction but + * before the commit of the final transaction in a series, log recovery will + * use the redo information recorded by the intent items to replay the + * (rmapbt/bnobt/cntbt) metadata updates in the non-first transaction. + */ + +/* kernel only RUI/RUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_RUI_MAX_FAST_EXTENTS 16 + +/* + * Define RUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_RUI_RECOVERED 1 + +/* + * This is the "rmap update intent" log item. It is used to log the fact that + * some reverse mappings need to change. 
It is used in conjunction with the + * "rmap update done" log item described below. + * + * These log items follow the same rules as struct xfs_efi_log_item; see the + * comments about that structure (in xfs_extfree_item.h) for more details. + */ +struct xfs_rui_log_item { + struct xfs_log_item rui_item; + atomic_t rui_refcount; + atomic_t rui_next_extent; + unsigned long rui_flags; /* misc flags */ + struct xfs_rui_log_format rui_format; +}; + +/* + * This is the "rmap update done" log item. It is used to log the fact that + * some rmapbt updates mentioned in an earlier rui item have been performed. + */ +struct xfs_rud_log_item { + struct xfs_log_item rud_item; + struct xfs_rui_log_item *rud_ruip; + struct xfs_rud_log_format rud_format; +}; + +extern struct kmem_zone *xfs_rui_zone; +extern struct kmem_zone *xfs_rud_zone; + +struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint); +struct xfs_rud_log_item *xfs_rud_init(struct xfs_mount *, + struct xfs_rui_log_item *); +int xfs_rui_copy_format(struct xfs_log_iovec *buf, + struct xfs_rui_log_format *dst_rui_fmt); +void xfs_rui_item_free(struct xfs_rui_log_item *); +void xfs_rui_release(struct xfs_rui_log_item *); +int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip); + +#endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 3938b37d1043..802bcc326d9f 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -23,6 +23,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -769,7 +770,7 @@ xfs_growfs_rt_alloc( xfs_daddr_t d; /* disk block address */ int error; /* error return value */ xfs_fsblock_t firstblock;/* first block allocated in xaction */ - struct xfs_bmap_free flist; /* list of freed blocks */ + struct xfs_defer_ops dfops; /* list of freed blocks */ xfs_fsblock_t fsbno; /* filesystem block for bno */ struct 
xfs_bmbt_irec map; /* block map output */ int nmap; /* number of block maps */ @@ -794,14 +795,14 @@ xfs_growfs_rt_alloc( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_bmap_init(&flist, &firstblock); + xfs_defer_init(&dfops, &firstblock); /* * Allocate blocks to the bitmap file. */ nmap = 1; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist); + resblks, &map, &nmap, &dfops); if (!error && nmap < 1) error = -ENOSPC; if (error) @@ -809,7 +810,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_bmap_finish(&tp, &flist, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); @@ -862,7 +863,7 @@ xfs_growfs_rt_alloc( return 0; out_bmap_cancel: - xfs_bmap_cancel(&flist); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index d266e835ecc3..6e812fe0fd43 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -61,6 +61,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "bmbt2", XFSSTAT_END_BMBT_V2 }, { "ibt2", XFSSTAT_END_IBT_V2 }, { "fibt2", XFSSTAT_END_FIBT_V2 }, + { "rmapbt", XFSSTAT_END_RMAP_V2 }, /* we print both series of quota information together */ { "qm", XFSSTAT_END_QM }, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 483b0eff1988..657865f51e78 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -197,7 +197,23 @@ struct xfsstats { __uint32_t xs_fibt_2_alloc; __uint32_t xs_fibt_2_free; __uint32_t xs_fibt_2_moves; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6) +#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2+15) + __uint32_t xs_rmap_2_lookup; + __uint32_t xs_rmap_2_compare; + __uint32_t xs_rmap_2_insrec; + __uint32_t xs_rmap_2_delrec; + __uint32_t xs_rmap_2_newroot; + __uint32_t xs_rmap_2_killroot; + __uint32_t 
xs_rmap_2_increment; + __uint32_t xs_rmap_2_decrement; + __uint32_t xs_rmap_2_lshift; + __uint32_t xs_rmap_2_rshift; + __uint32_t xs_rmap_2_split; + __uint32_t xs_rmap_2_join; + __uint32_t xs_rmap_2_alloc; + __uint32_t xs_rmap_2_free; + __uint32_t xs_rmap_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_RMAP_V2+6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; __uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0303f1005f88..24ef83ef04de 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -46,6 +46,7 @@ #include "xfs_quota.h" #include "xfs_sysfs.h" #include "xfs_ondisk.h" +#include "xfs_rmap_item.h" #include <linux/namei.h> #include <linux/init.h> @@ -1075,7 +1076,7 @@ xfs_fs_statfs( statp->f_blocks = sbp->sb_dblocks - lsize; spin_unlock(&mp->m_sb_lock); - statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp); + statp->f_bfree = fdblocks - mp->m_alloc_set_aside; statp->f_bavail = statp->f_bfree; fakeinos = statp->f_bfree << sbp->sb_inopblog; @@ -1573,6 +1574,10 @@ xfs_fs_fill_super( } } + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + xfs_alert(mp, + "EXPERIMENTAL reverse mapping btree feature enabled. 
Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1697,7 +1702,7 @@ xfs_init_zones(void) goto out_free_ioend_bioset; xfs_bmap_free_item_zone = kmem_zone_init( - sizeof(struct xfs_bmap_free_item), + sizeof(struct xfs_extent_free_item), "xfs_bmap_free_item"); if (!xfs_bmap_free_item_zone) goto out_destroy_log_ticket_zone; @@ -1765,8 +1770,24 @@ xfs_init_zones(void) if (!xfs_icreate_zone) goto out_destroy_ili_zone; + xfs_rud_zone = kmem_zone_init(sizeof(struct xfs_rud_log_item), + "xfs_rud_item"); + if (!xfs_rud_zone) + goto out_destroy_icreate_zone; + + xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) + + ((XFS_RUI_MAX_FAST_EXTENTS - 1) * + sizeof(struct xfs_map_extent))), + "xfs_rui_item"); + if (!xfs_rui_zone) + goto out_destroy_rud_zone; + return 0; + out_destroy_rud_zone: + kmem_zone_destroy(xfs_rud_zone); + out_destroy_icreate_zone: + kmem_zone_destroy(xfs_icreate_zone); out_destroy_ili_zone: kmem_zone_destroy(xfs_ili_zone); out_destroy_inode_zone: @@ -1805,6 +1826,8 @@ xfs_destroy_zones(void) * destroy caches. 
*/ rcu_barrier(); + kmem_zone_destroy(xfs_rui_zone); + kmem_zone_destroy(xfs_rud_zone); kmem_zone_destroy(xfs_icreate_zone); kmem_zone_destroy(xfs_ili_zone); kmem_zone_destroy(xfs_inode_zone); @@ -1854,6 +1877,9 @@ init_xfs_fs(void) printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); + xfs_extent_free_init_defer_op(); + xfs_rmap_update_init_defer_op(); + xfs_dir_startup(); error = xfs_init_zones(); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 08a46c6181fd..58142aeeeea6 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -26,6 +26,7 @@ #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_defer.h" #include "xfs_dir2.h" #include "xfs_inode.h" #include "xfs_ialloc.h" @@ -172,7 +173,7 @@ xfs_symlink( struct xfs_inode *ip = NULL; int error = 0; int pathlen; - struct xfs_bmap_free free_list; + struct xfs_defer_ops dfops; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; xfs_fileoff_t first_fsb; @@ -269,7 +270,7 @@ xfs_symlink( * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. */ - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); /* * Allocate an inode for the symlink. @@ -313,7 +314,7 @@ xfs_symlink( error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, XFS_BMAPI_METADATA, &first_block, resblks, - mval, &nmaps, &free_list); + mval, &nmaps, &dfops); if (error) goto out_bmap_cancel; @@ -361,7 +362,7 @@ xfs_symlink( * Create the directory entry for the symlink. 
*/ error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, - &first_block, &free_list, resblks); + &first_block, &dfops, resblks); if (error) goto out_bmap_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -376,7 +377,7 @@ xfs_symlink( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish(&tp, &free_list, NULL); + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_bmap_cancel; @@ -392,7 +393,7 @@ xfs_symlink( return 0; out_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); out_trans_cancel: xfs_trans_cancel(tp); out_release_inode: @@ -426,7 +427,7 @@ xfs_inactive_symlink_rmt( int done; int error; xfs_fsblock_t first_block; - xfs_bmap_free_t free_list; + struct xfs_defer_ops dfops; int i; xfs_mount_t *mp; xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; @@ -465,7 +466,7 @@ xfs_inactive_symlink_rmt( * Find the block(s) so we can inval and unmap them. */ done = 0; - xfs_bmap_init(&free_list, &first_block); + xfs_defer_init(&dfops, &first_block); nmaps = ARRAY_SIZE(mval); error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), mval, &nmaps, 0); @@ -485,17 +486,17 @@ xfs_inactive_symlink_rmt( xfs_trans_binval(tp, bp); } /* - * Unmap the dead block(s) to the free_list. + * Unmap the dead block(s) to the dfops. */ error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps, - &first_block, &free_list, &done); + &first_block, &dfops, &done); if (error) goto error_bmap_cancel; ASSERT(done); /* * Commit the first transaction. This logs the EFI and the inode. 
*/ - error = xfs_bmap_finish(&tp, &free_list, ip); + error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto error_bmap_cancel; /* @@ -525,7 +526,7 @@ xfs_inactive_symlink_rmt( return 0; error_bmap_cancel: - xfs_bmap_cancel(&free_list); + xfs_defer_cancel(&dfops); error_trans_cancel: xfs_trans_cancel(tp); error_unlock: diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 13a029806805..7f17ae6d709a 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -22,7 +22,9 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_da_format.h" +#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 145169093fe0..551b7e26980c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -38,6 +38,7 @@ struct xlog_recover_item; struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; +struct xfs_btree_cur; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -2185,6 +2186,379 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall); DEFINE_DISCARD_EVENT(xfs_discard_exclude); DEFINE_DISCARD_EVENT(xfs_discard_busy); +/* btree cursor events */ +DECLARE_EVENT_CLASS(xfs_btree_cur_class, + TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), + TP_ARGS(cur, level, bp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(int, level) + __field(int, nlevels) + __field(int, ptr) + __field(xfs_daddr_t, daddr) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->nlevels = cur->bc_nlevels; + __entry->ptr = cur->bc_ptrs[level]; + __entry->daddr = bp ? 
bp->b_bn : -1; + ), + TP_printk("dev %d:%d btnum %d level %d/%d ptr %d daddr 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->btnum, + __entry->level, + __entry->nlevels, + __entry->ptr, + (unsigned long long)__entry->daddr) +) + +#define DEFINE_BTREE_CUR_EVENT(name) \ +DEFINE_EVENT(xfs_btree_cur_class, name, \ + TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), \ + TP_ARGS(cur, level, bp)) +DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys); +DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range); + +/* deferred ops */ +struct xfs_defer_pending; +struct xfs_defer_intake; +struct xfs_defer_ops; + +DECLARE_EVENT_CLASS(xfs_defer_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), + TP_ARGS(mp, dop), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, dop) + __field(bool, committed) + __field(bool, low) + ), + TP_fast_assign( + __entry->dev = mp ? mp->m_super->s_dev : 0; + __entry->dop = dop; + __entry->committed = dop->dop_committed; + __entry->low = dop->dop_low; + ), + TP_printk("dev %d:%d ops %p committed %d low %d\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dop, + __entry->committed, + __entry->low) +) +#define DEFINE_DEFER_EVENT(name) \ +DEFINE_EVENT(xfs_defer_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), \ + TP_ARGS(mp, dop)) + +DECLARE_EVENT_CLASS(xfs_defer_error_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error), + TP_ARGS(mp, dop, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, dop) + __field(bool, committed) + __field(bool, low) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = mp ? 
mp->m_super->s_dev : 0; + __entry->dop = dop; + __entry->committed = dop->dop_committed; + __entry->low = dop->dop_low; + __entry->error = error; + ), + TP_printk("dev %d:%d ops %p committed %d low %d err %d\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dop, + __entry->committed, + __entry->low, + __entry->error) +) +#define DEFINE_DEFER_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_defer_error_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error), \ + TP_ARGS(mp, dop, error)) + +DECLARE_EVENT_CLASS(xfs_defer_pending_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), + TP_ARGS(mp, dfp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(void *, intent) + __field(bool, committed) + __field(int, nr) + ), + TP_fast_assign( + __entry->dev = mp ? mp->m_super->s_dev : 0; + __entry->type = dfp->dfp_type->type; + __entry->intent = dfp->dfp_intent; + __entry->committed = dfp->dfp_committed; + __entry->nr = dfp->dfp_count; + ), + TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->intent, + __entry->committed, + __entry->nr) +) +#define DEFINE_DEFER_PENDING_EVENT(name) \ +DEFINE_EVENT(xfs_defer_pending_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \ + TP_ARGS(mp, dfp)) + +DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int type, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, type, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, type) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->type = type; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d op %d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + 
__entry->type, + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + int type, \ + xfs_agblock_t bno, \ + xfs_extlen_t len), \ + TP_ARGS(mp, agno, type, bno, len)) + +DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int op, + xfs_agblock_t agbno, + xfs_ino_t ino, + int whichfork, + xfs_fileoff_t offset, + xfs_filblks_t len, + xfs_exntst_t state), + TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __field(xfs_agblock_t, agbno) + __field(int, whichfork) + __field(xfs_fileoff_t, l_loff) + __field(xfs_filblks_t, l_len) + __field(xfs_exntst_t, l_state) + __field(int, op) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->ino = ino; + __entry->agbno = agbno; + __entry->whichfork = whichfork; + __entry->l_loff = offset; + __entry->l_len = len; + __entry->l_state = state; + __entry->op = op; + ), + TP_printk("dev %d:%d op %d agno %u agbno %u owner %lld %s offset %llu len %llu state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->op, + __entry->agno, + __entry->agbno, + __entry->ino, + __entry->whichfork == XFS_ATTR_FORK ? 
"attr" : "data", + __entry->l_loff, + __entry->l_len, + __entry->l_state) +); +#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_map_extent_deferred_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + int op, \ + xfs_agblock_t agbno, \ + xfs_ino_t ino, \ + int whichfork, \ + xfs_fileoff_t offset, \ + xfs_filblks_t len, \ + xfs_exntst_t state), \ + TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state)) + +DEFINE_DEFER_EVENT(xfs_defer_init); +DEFINE_DEFER_EVENT(xfs_defer_cancel); +DEFINE_DEFER_EVENT(xfs_defer_trans_roll); +DEFINE_DEFER_EVENT(xfs_defer_trans_abort); +DEFINE_DEFER_EVENT(xfs_defer_finish); +DEFINE_DEFER_EVENT(xfs_defer_finish_done); + +DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); +DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); +DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error); + +DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); + +#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT +DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); +DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); + +/* rmap tracepoints */ +DECLARE_EVENT_CLASS(xfs_rmap_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, + struct xfs_owner_info *oinfo), + TP_ARGS(mp, agno, agbno, len, unwritten, oinfo), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned long, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + 
__entry->owner = oinfo->oi_owner; + __entry->offset = oinfo->oi_offset; + __entry->flags = oinfo->oi_flags; + if (unwritten) + __entry->flags |= XFS_RMAP_UNWRITTEN; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%lx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); +#define DEFINE_RMAP_EVENT(name) \ +DEFINE_EVENT(xfs_rmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \ + struct xfs_owner_info *oinfo), \ + TP_ARGS(mp, agno, agbno, len, unwritten, oinfo)) + +/* simple AG-based error/%ip tracepoint class */ +DECLARE_EVENT_CLASS(xfs_ag_error_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, + unsigned long caller_ip), + TP_ARGS(mp, agno, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->error = error; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u error %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->error, + (char *)__entry->caller_ip) +); + +#define DEFINE_AG_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_ag_error_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, error, caller_ip)) + +DEFINE_RMAP_EVENT(xfs_rmap_unmap); +DEFINE_RMAP_EVENT(xfs_rmap_unmap_done); +DEFINE_AG_ERROR_EVENT(xfs_rmap_unmap_error); +DEFINE_RMAP_EVENT(xfs_rmap_map); +DEFINE_RMAP_EVENT(xfs_rmap_map_done); +DEFINE_AG_ERROR_EVENT(xfs_rmap_map_error); +DEFINE_RMAP_EVENT(xfs_rmap_convert); +DEFINE_RMAP_EVENT(xfs_rmap_convert_done); +DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_error); +DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_state); + 
+DECLARE_EVENT_CLASS(xfs_rmapbt_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + uint64_t owner, uint64_t offset, unsigned int flags), + TP_ARGS(mp, agno, agbno, len, owner, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; + __entry->offset = offset; + __entry->flags = flags; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); +#define DEFINE_RMAPBT_EVENT(name) \ +DEFINE_EVENT(xfs_rmapbt_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + uint64_t owner, uint64_t offset, unsigned int flags), \ + TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) + +#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT +DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer); +DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred); + +DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block); +DEFINE_BUSY_EVENT(xfs_rmapbt_free_block); +DEFINE_RMAPBT_EVENT(xfs_rmap_update); +DEFINE_RMAPBT_EVENT(xfs_rmap_insert); +DEFINE_RMAPBT_EVENT(xfs_rmap_delete); +DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error); +DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error); +DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result); +DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h 
index 9b2b9fa89331..e2bf86aad33d 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -33,6 +33,9 @@ struct xfs_trans; struct xfs_trans_res; struct xfs_dquot_acct; struct xfs_busy_extent; +struct xfs_rud_log_item; +struct xfs_rui_log_item; +struct xfs_btree_cur; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -210,17 +213,14 @@ void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); -struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); -void xfs_trans_log_efi_extent(xfs_trans_t *, - struct xfs_efi_log_item *, - xfs_fsblock_t, - xfs_extlen_t); -struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *, + +void xfs_extent_free_init_defer_op(void); +struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *, struct xfs_efi_log_item *, uint); int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, - xfs_extlen_t); + xfs_extlen_t, struct xfs_owner_info *); int xfs_trans_commit(struct xfs_trans *); int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); @@ -236,4 +236,16 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern kmem_zone_t *xfs_trans_zone; extern kmem_zone_t *xfs_log_item_desc_zone; +/* rmap updates */ +enum xfs_rmap_intent_type; + +void xfs_rmap_update_init_defer_op(void); +struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp, + struct xfs_rui_log_item *ruip); +int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, + struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type, + __uint64_t owner, int whichfork, xfs_fileoff_t startoff, + xfs_fsblock_t startblock, xfs_filblks_t blockcount, + xfs_exntst_t state, struct xfs_btree_cur **pcur); + #endif /* 
__XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index a96ae540eb62..459ddec137a4 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -21,66 +21,15 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" #include "xfs_alloc.h" - -/* - * This routine is called to allocate an "extent free intention" - * log item that will hold nextents worth of extents. The - * caller must use all nextents extents, because we are not - * flexible about this at all. - */ -xfs_efi_log_item_t * -xfs_trans_get_efi(xfs_trans_t *tp, - uint nextents) -{ - xfs_efi_log_item_t *efip; - - ASSERT(tp != NULL); - ASSERT(nextents > 0); - - efip = xfs_efi_init(tp->t_mountp, nextents); - ASSERT(efip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &efip->efi_item); - return efip; -} - -/* - * This routine is called to indicate that the described - * extent is to be logged as needing to be freed. It should - * be called once for each extent to be freed. - */ -void -xfs_trans_log_efi_extent(xfs_trans_t *tp, - xfs_efi_log_item_t *efip, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len) -{ - uint next_extent; - xfs_extent_t *extp; - - tp->t_flags |= XFS_TRANS_DIRTY; - efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; - - /* - * atomic_inc_return gives us the value after the increment; - * we want to use it as an array index so we need to subtract 1 from - * it. 
- */ - next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; - ASSERT(next_extent < efip->efi_format.efi_nextents); - extp = &(efip->efi_format.efi_extents[next_extent]); - extp->ext_start = start_block; - extp->ext_len = ext_len; -} - +#include "xfs_bmap.h" +#include "xfs_trace.h" /* * This routine is called to allocate an "extent free done" @@ -88,12 +37,12 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, * caller must use all nextents extents, because we are not * flexible about this at all. */ -xfs_efd_log_item_t * -xfs_trans_get_efd(xfs_trans_t *tp, - xfs_efi_log_item_t *efip, - uint nextents) +struct xfs_efd_log_item * +xfs_trans_get_efd(struct xfs_trans *tp, + struct xfs_efi_log_item *efip, + uint nextents) { - xfs_efd_log_item_t *efdp; + struct xfs_efd_log_item *efdp; ASSERT(tp != NULL); ASSERT(nextents > 0); @@ -118,13 +67,19 @@ xfs_trans_free_extent( struct xfs_trans *tp, struct xfs_efd_log_item *efdp, xfs_fsblock_t start_block, - xfs_extlen_t ext_len) + xfs_extlen_t ext_len, + struct xfs_owner_info *oinfo) { + struct xfs_mount *mp = tp->t_mountp; uint next_extent; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, start_block); struct xfs_extent *extp; int error; - error = xfs_free_extent(tp, start_block, ext_len); + trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); + + error = xfs_free_extent(tp, start_block, ext_len, oinfo); /* * Mark the transaction dirty, even on error. This ensures the @@ -145,3 +100,139 @@ xfs_trans_free_extent( return error; } + +/* Sort bmap items by AG. 
*/ +static int +xfs_extent_free_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_mount *mp = priv; + struct xfs_extent_free_item *ra; + struct xfs_extent_free_item *rb; + + ra = container_of(a, struct xfs_extent_free_item, xefi_list); + rb = container_of(b, struct xfs_extent_free_item, xefi_list); + return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) - + XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); +} + +/* Get an EFI. */ +STATIC void * +xfs_extent_free_create_intent( + struct xfs_trans *tp, + unsigned int count) +{ + struct xfs_efi_log_item *efip; + + ASSERT(tp != NULL); + ASSERT(count > 0); + + efip = xfs_efi_init(tp->t_mountp, count); + ASSERT(efip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &efip->efi_item); + return efip; +} + +/* Log a free extent to the intent item. */ +STATIC void +xfs_extent_free_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) +{ + struct xfs_efi_log_item *efip = intent; + struct xfs_extent_free_item *free; + uint next_extent; + struct xfs_extent *extp; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; + ASSERT(next_extent < efip->efi_format.efi_nextents); + extp = &efip->efi_format.efi_extents[next_extent]; + extp->ext_start = free->xefi_startblock; + extp->ext_len = free->xefi_blockcount; +} + +/* Get an EFD so we can process all the free extents. */ +STATIC void * +xfs_extent_free_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_efd(tp, intent, count); +} + +/* Process a free extent. 
*/ +STATIC int +xfs_extent_free_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_extent_free_item *free; + int error; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + error = xfs_trans_free_extent(tp, done_item, + free->xefi_startblock, + free->xefi_blockcount, + &free->xefi_oinfo); + kmem_free(free); + return error; +} + +/* Abort all pending EFIs. */ +STATIC void +xfs_extent_free_abort_intent( + void *intent) +{ + xfs_efi_release(intent); +} + +/* Cancel a free extent. */ +STATIC void +xfs_extent_free_cancel_item( + struct list_head *item) +{ + struct xfs_extent_free_item *free; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + kmem_free(free); +} + +static const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .type = XFS_DEFER_OPS_TYPE_FREE, + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .log_item = xfs_extent_free_log_item, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, +}; + +/* Register the deferred op type. */ +void +xfs_extent_free_init_defer_op(void) +{ + xfs_defer_init_op_type(&xfs_extent_free_defer_type); +} diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c new file mode 100644 index 000000000000..5a50ef881568 --- /dev/null +++ b/fs/xfs/xfs_trans_rmap.c @@ -0,0 +1,271 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_rmap_item.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" + +/* Set the map extent flags for this reverse mapping. */ +static void +xfs_trans_set_rmap_flags( + struct xfs_map_extent *rmap, + enum xfs_rmap_intent_type type, + int whichfork, + xfs_exntst_t state) +{ + rmap->me_flags = 0; + if (state == XFS_EXT_UNWRITTEN) + rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; + if (whichfork == XFS_ATTR_FORK) + rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; + switch (type) { + case XFS_RMAP_MAP: + rmap->me_flags |= XFS_RMAP_EXTENT_MAP; + break; + case XFS_RMAP_UNMAP: + rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; + break; + case XFS_RMAP_CONVERT: + rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; + break; + case XFS_RMAP_ALLOC: + rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; + break; + case XFS_RMAP_FREE: + rmap->me_flags |= XFS_RMAP_EXTENT_FREE; + break; + default: + ASSERT(0); + } +} + +struct xfs_rud_log_item * +xfs_trans_get_rud( + struct xfs_trans *tp, + struct xfs_rui_log_item *ruip) +{ + struct xfs_rud_log_item *rudp; + + rudp = xfs_rud_init(tp->t_mountp, ruip); + xfs_trans_add_item(tp, &rudp->rud_item); + return rudp; +} + +/* + * Finish an rmap update and log it to the RUD. 
Note that the transaction is + * marked dirty regardless of whether the rmap update succeeds or fails to + * support the RUI/RUD lifecycle rules. + */ +int +xfs_trans_log_finish_rmap_update( + struct xfs_trans *tp, + struct xfs_rud_log_item *rudp, + enum xfs_rmap_intent_type type, + __uint64_t owner, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state, + struct xfs_btree_cur **pcur) +{ + int error; + + error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff, + startblock, blockcount, state, pcur); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the RUI and frees the RUD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + rudp->rud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + return error; +} + +/* Sort rmap intents by AG. */ +static int +xfs_rmap_update_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_mount *mp = priv; + struct xfs_rmap_intent *ra; + struct xfs_rmap_intent *rb; + + ra = container_of(a, struct xfs_rmap_intent, ri_list); + rb = container_of(b, struct xfs_rmap_intent, ri_list); + return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) - + XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); +} + +/* Get an RUI. */ +STATIC void * +xfs_rmap_update_create_intent( + struct xfs_trans *tp, + unsigned int count) +{ + struct xfs_rui_log_item *ruip; + + ASSERT(tp != NULL); + ASSERT(count > 0); + + ruip = xfs_rui_init(tp->t_mountp, count); + ASSERT(ruip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &ruip->rui_item); + return ruip; +} + +/* Log rmap updates in the intent item. 
*/ +STATIC void +xfs_rmap_update_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) +{ + struct xfs_rui_log_item *ruip = intent; + struct xfs_rmap_intent *rmap; + uint next_extent; + struct xfs_map_extent *map; + + rmap = container_of(item, struct xfs_rmap_intent, ri_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + ruip->rui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; + ASSERT(next_extent < ruip->rui_format.rui_nextents); + map = &ruip->rui_format.rui_extents[next_extent]; + map->me_owner = rmap->ri_owner; + map->me_startblock = rmap->ri_bmap.br_startblock; + map->me_startoff = rmap->ri_bmap.br_startoff; + map->me_len = rmap->ri_bmap.br_blockcount; + xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, + rmap->ri_bmap.br_state); +} + +/* Get an RUD so we can process all the deferred rmap updates. */ +STATIC void * +xfs_rmap_update_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_rud(tp, intent); +} + +/* Process a deferred rmap update. */ +STATIC int +xfs_rmap_update_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_rmap_intent *rmap; + int error; + + rmap = container_of(item, struct xfs_rmap_intent, ri_list); + error = xfs_trans_log_finish_rmap_update(tp, done_item, + rmap->ri_type, + rmap->ri_owner, rmap->ri_whichfork, + rmap->ri_bmap.br_startoff, + rmap->ri_bmap.br_startblock, + rmap->ri_bmap.br_blockcount, + rmap->ri_bmap.br_state, + (struct xfs_btree_cur **)state); + kmem_free(rmap); + return error; +} + +/* Clean up after processing deferred rmaps. 
*/ +STATIC void +xfs_rmap_update_finish_cleanup( + struct xfs_trans *tp, + void *state, + int error) +{ + struct xfs_btree_cur *rcur = state; + + xfs_rmap_finish_one_cleanup(tp, rcur, error); +} + +/* Abort all pending RUIs. */ +STATIC void +xfs_rmap_update_abort_intent( + void *intent) +{ + xfs_rui_release(intent); +} + +/* Cancel a deferred rmap update. */ +STATIC void +xfs_rmap_update_cancel_item( + struct list_head *item) +{ + struct xfs_rmap_intent *rmap; + + rmap = container_of(item, struct xfs_rmap_intent, ri_list); + kmem_free(rmap); +} + +static const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .type = XFS_DEFER_OPS_TYPE_RMAP, + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .diff_items = xfs_rmap_update_diff_items, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .log_item = xfs_rmap_update_log_item, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_update_finish_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, +}; + +/* Register the deferred op type. */ +void +xfs_rmap_update_init_defer_op(void) +{ + xfs_defer_init_op_type(&xfs_rmap_update_defer_type); +} |