diff options
Diffstat (limited to 'fs')
208 files changed, 4540 insertions, 4381 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 014c8dd62962..57ccb7537dae 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -448,7 +448,7 @@ void v9fs_evict_inode(struct inode *inode) struct v9fs_inode *v9inode = V9FS_I(inode); truncate_inode_pages(inode->i_mapping, 0); - end_writeback(inode); + clear_inode(inode); filemap_fdatawrite(inode->i_mapping); #ifdef CONFIG_9P_FSCACHE diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 88a4b0b50058..8bc4a59f4e7e 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -264,7 +264,7 @@ affs_evict_inode(struct inode *inode) } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); affs_free_prealloc(inode); cache_page = (unsigned long)AFFS_I(inode)->i_lc; if (cache_page) { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index d890ae3b2ce6..95cffd38239f 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -423,7 +423,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); afs_give_up_callback(vnode); diff --git a/fs/attr.c b/fs/attr.c index 73f69a6ce9ed..584620e5dee5 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -47,14 +47,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && - (current_fsuid() != inode->i_uid || - attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN)) + (!uid_eq(current_fsuid(), inode->i_uid) || + !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && - (current_fsuid() != inode->i_uid || - (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) && + (!uid_eq(current_fsuid(), inode->i_uid) || + (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && !capable(CAP_CHOWN)) return -EPERM; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 6e488ebe7784..8a4fed8ead30 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -100,7 +100,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) static void autofs4_evict_inode(struct inode *inode) { - end_writeback(inode); + clear_inode(inode); kfree(inode->i_private); } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index e23dc7c8b884..9870417c26e7 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -174,7 +174,7 @@ static void bfs_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (inode->i_nlink) return; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 16f735417072..e658dd134b95 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -226,10 +226,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec->e_entry); - NEW_AUX_ENT(AT_UID, cred->uid); - NEW_AUX_ENT(AT_EUID, cred->euid); - NEW_AUX_ENT(AT_GID, cred->gid); - NEW_AUX_ENT(AT_EGID, cred->egid); + NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid)); + NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); + NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); + NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); NEW_AUX_ENT(AT_EXECFN, bprm->exec); @@ -1356,8 +1356,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_flag = p->flags; rcu_read_lock(); cred = __task_cred(p); - SET_UID(psinfo->pr_uid, cred->uid); - SET_GID(psinfo->pr_gid, cred->gid); + SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); + SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d390a0fffc65..3d77cf81ba3c 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -627,10 +627,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid); - NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid); - NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid); - NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid); + NEW_AUX_ENT(AT_UID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid)); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid)); + NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid)); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); @@ -1421,8 +1421,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_flag = p->flags; rcu_read_lock(); cred = __task_cred(p); - SET_UID(psinfo->pr_uid, cred->uid); - SET_GID(psinfo->pr_gid, cred->gid); + SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); + SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 613aa0618235..790b3cddca67 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -505,7 +505,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) static void bm_evict_inode(struct inode *inode) { - end_writeback(inode); + clear_inode(inode); kfree(inode->i_private); } diff --git a/fs/block_dev.c b/fs/block_dev.c index ba11c30f302d..c2bbe1fb1326 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -487,7 +487,7 @@ static void bdev_evict_inode(struct inode *inode) struct list_head *p; truncate_inode_pages(&inode->i_data, 0); invalidate_inode_buffers(inode); /* is it needed here? */ - end_writeback(inode); + clear_inode(inode); spin_lock(&bdev_lock); while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { __bd_forget(list_entry(p, struct inode, i_devices)); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 61b16c641ce0..ceb7b9c9edcc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3756,7 +3756,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); no_delete: - end_writeback(inode); + clear_inode(inode); return; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 541ef81f6ae8..0a0fa0250e99 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -272,7 +272,7 @@ static void cifs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); cifs_fscache_release_inode_cookie(inode); } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2870597b5c9d..f1813120d753 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -244,7 +244,7 @@ static void coda_put_super(struct super_block *sb) static void coda_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); coda_cache_clear_inode(inode); } diff --git a/fs/compat.c b/fs/compat.c index f2944ace7a7b..0781e619a62a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -144,8 +144,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - SET_UID(tmp.st_uid, stat->uid); - SET_GID(tmp.st_gid, stat->gid); + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = old_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; diff --git a/fs/dcache.c b/fs/dcache.c index 8c1ab8fb5012..4435d8b32904 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3093,6 +3093,7 @@ static void __init dcache_init_early(void) HASH_EARLY, &d_hash_shift, &d_hash_mask, + 0, 0); for (loop = 0; loop < (1U << d_hash_shift); loop++) @@ -3123,6 +3124,7 @@ static void __init dcache_init(void) 0, &d_hash_shift, &d_hash_mask, + 0, 0); for (loop = 0; loop < (1U << d_hash_shift); loop++) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 5dfafdd1dbd3..2340f6978d6e 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -20,6 +20,7 @@ #include <linux/namei.h> #include <linux/debugfs.h> #include <linux/io.h> +#include <linux/slab.h> static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -520,6 +521,133 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_blob); +struct array_data { + void *array; + u32 elements; +}; + +static int u32_array_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return nonseekable_open(inode, file); +} + +static size_t format_array(char *buf, size_t bufsize, const char *fmt, + u32 *array, u32 array_size) +{ + size_t ret = 0; + u32 i; + + for (i = 0; i < array_size; i++) { + size_t len; + + len = snprintf(buf, bufsize, fmt, array[i]); + len++; /* ' ' or '\n' */ + ret += len; + + if (buf) { + buf += len; + bufsize -= len; + buf[-1] = (i == array_size-1) ? '\n' : ' '; + } + } + + ret++; /* \0 */ + if (buf) + *buf = '\0'; + + return ret; +} + +static char *format_array_alloc(const char *fmt, u32 *array, + u32 array_size) +{ + size_t len = format_array(NULL, 0, fmt, array, array_size); + char *ret; + + ret = kmalloc(len, GFP_KERNEL); + if (ret == NULL) + return NULL; + + format_array(ret, len, fmt, array, array_size); + return ret; +} + +static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, + loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct array_data *data = inode->i_private; + size_t size; + + if (*ppos == 0) { + if (file->private_data) { + kfree(file->private_data); + file->private_data = NULL; + } + + file->private_data = format_array_alloc("%u", data->array, + data->elements); + } + + size = 0; + if (file->private_data) + size = strlen(file->private_data); + + return simple_read_from_buffer(buf, len, ppos, + file->private_data, size); +} + +static int u32_array_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + + return 0; +} + +static const struct file_operations u32_array_fops = { + .owner = THIS_MODULE, + .open = u32_array_open, + .release = u32_array_release, + .read = u32_array_read, + .llseek = no_llseek, +}; + +/** + * debugfs_create_u32_array - create a debugfs file that is used to read u32 + * array. + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @array: u32 array that provides data. + * @elements: total number of elements in the array. + * + * This function creates a file in debugfs with the given name that exports + * @array as data. If the @mode variable is so set it can be read from. + * Writing is not supported. Seek within the file is also not supported. + * Once array is created its size can not be changed. + * + * The function returns a pointer to dentry on success. If debugfs is not + * enabled in the kernel, the value -%ENODEV will be returned. + */ +struct dentry *debugfs_create_u32_array(const char *name, umode_t mode, + struct dentry *parent, + u32 *array, u32 elements) +{ + struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data == NULL) + return NULL; + + data->array = array; + data->elements = elements; + + return debugfs_create_file(name, mode, parent, data, &u32_array_fops); +} +EXPORT_SYMBOL_GPL(debugfs_create_u32_array); + #ifdef CONFIG_HAS_IOMEM /* diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 10f5e0b484db..979c1e309c73 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -98,8 +98,8 @@ static struct vfsmount *devpts_mnt; struct pts_mount_opts { int setuid; int setgid; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t mode; umode_t ptmxmode; int newinstance; @@ -158,11 +158,13 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) { char *p; + kuid_t uid; + kgid_t gid; opts->setuid = 0; opts->setgid = 0; - opts->uid = 0; - opts->gid = 0; + opts->uid = GLOBAL_ROOT_UID; + opts->gid = GLOBAL_ROOT_GID; opts->mode = DEVPTS_DEFAULT_MODE; opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; opts->max = NR_UNIX98_PTY_MAX; @@ -184,13 +186,19 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) case Opt_uid: if (match_int(&args[0], &option)) return -EINVAL; - opts->uid = option; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) + return -EINVAL; + opts->uid = uid; opts->setuid = 1; break; case Opt_gid: if (match_int(&args[0], &option)) return -EINVAL; - opts->gid = option; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) + return -EINVAL; + opts->gid = gid; opts->setgid = 1; break; case Opt_mode: @@ -315,9 +323,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) struct pts_mount_opts *opts = &fsi->mount_opts; if (opts->setuid) - seq_printf(seq, ",uid=%u", opts->uid); + seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid)); if (opts->setgid) - seq_printf(seq, ",gid=%u", opts->gid); + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 90e5997262ea..63dc19c54d5a 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -310,6 +310,7 @@ void dlm_callback_resume(struct dlm_ls *ls) } mutex_unlock(&ls->ls_cb_mutex); - log_debug(ls, "dlm_callback_resume %d", count); + if (count) + log_debug(ls, "dlm_callback_resume %d", count); } diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 3a564d197e99..bc342f7ac3af 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -38,6 +38,7 @@ #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/idr.h> +#include <linux/ratelimit.h> #include <asm/uaccess.h> #include <linux/dlm.h> @@ -74,6 +75,13 @@ do { \ (ls)->ls_name , ##args); \ } while (0) +#define log_limit(ls, fmt, args...) \ +do { \ + if (dlm_config.ci_log_debug) \ + printk_ratelimited(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) + #define DLM_ASSERT(x, do) \ { \ if (!(x)) \ @@ -263,6 +271,8 @@ struct dlm_lkb { ktime_t lkb_last_cast_time; /* for debugging */ ktime_t lkb_last_bast_time; /* for debugging */ + uint64_t lkb_recover_seq; /* from ls_recover_seq */ + char *lkb_lvbptr; struct dlm_lksb *lkb_lksb; /* caller's status block */ void (*lkb_astfn) (void *astparam); @@ -317,7 +327,7 @@ enum rsb_flags { RSB_NEW_MASTER, RSB_NEW_MASTER2, RSB_RECOVER_CONVERT, - RSB_LOCKS_PURGED, + RSB_RECOVER_GRANT, }; static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) @@ -563,6 +573,7 @@ struct dlm_ls { struct mutex ls_requestqueue_mutex; struct dlm_rcom *ls_recover_buf; int ls_recover_nodeid; /* for debugging */ + unsigned int ls_recover_locks_in; /* for log info */ uint64_t ls_rcom_seq; spinlock_t ls_rcom_spin; struct list_head ls_recover_list; @@ -589,6 +600,7 @@ struct dlm_ls { #define LSFL_UEVENT_WAIT 5 #define LSFL_TIMEWARN 6 #define LSFL_CB_DELAY 7 +#define LSFL_NODIR 8 /* much of this is just saving user space pointers associated with the lock that we pass back to the user lib with an ast */ @@ -636,7 +648,7 @@ static inline int dlm_recovery_stopped(struct dlm_ls *ls) static inline int dlm_no_directory(struct dlm_ls *ls) { - return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; + return test_bit(LSFL_NODIR, &ls->ls_flags); } int dlm_netlink_init(void); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 4c58d4a3adc4..bdafb65a5234 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -160,11 +160,12 @@ static const int __quecvt_compat_matrix[8][8] = { void dlm_print_lkb(struct dlm_lkb *lkb) { - printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n" - " status %d rqmode %d grmode %d wait_type %d\n", + printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x " + "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n", lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, - lkb->lkb_grmode, lkb->lkb_wait_type); + lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid, + (unsigned long long)lkb->lkb_recover_seq); } static void dlm_print_rsb(struct dlm_rsb *r) @@ -251,8 +252,6 @@ static inline int is_process_copy(struct dlm_lkb *lkb) static inline int is_master_copy(struct dlm_lkb *lkb) { - if (lkb->lkb_flags & DLM_IFL_MSTCPY) - DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb);); return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0; } @@ -479,6 +478,9 @@ static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, kref_get(&r->res_ref); goto out; } + if (error == -ENOTBLK) + goto out; + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); if (error) goto out; @@ -586,6 +588,23 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, return error; } +static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash) +{ + struct rb_node *n; + struct dlm_rsb *r; + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); + if (r->res_hash == hash) + dlm_dump_rsb(r); + } + spin_unlock(&ls->ls_rsbtbl[i].lock); + } +} + /* This is only called to add a reference when the code already holds a valid reference to the rsb, so there's no need for locking. */ @@ -1064,8 +1083,9 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, goto out_del; } - log_error(ls, "remwait error %x reply %d flags %x no wait_type", - lkb->lkb_id, mstype, lkb->lkb_flags); + log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait", + lkb->lkb_id, ms ? ms->m_header.h_nodeid : 0, lkb->lkb_remid, + mstype, lkb->lkb_flags); return -1; out_del: @@ -1498,13 +1518,13 @@ static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) } lkb->lkb_rqmode = DLM_LOCK_IV; + lkb->lkb_highbast = 0; } static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) { set_lvb_lock(r, lkb); _grant_lock(r, lkb); - lkb->lkb_highbast = 0; } static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, @@ -1866,7 +1886,8 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, /* Returns the highest requested mode of all blocked conversions; sets cw if there's a blocked conversion to DLM_LOCK_CW. */ -static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw) +static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, + unsigned int *count) { struct dlm_lkb *lkb, *s; int hi, demoted, quit, grant_restart, demote_restart; @@ -1885,6 +1906,8 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw) if (can_be_granted(r, lkb, 0, &deadlk)) { grant_lock_pending(r, lkb); grant_restart = 1; + if (count) + (*count)++; continue; } @@ -1918,14 +1941,17 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw) return max_t(int, high, hi); } -static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw) +static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw, + unsigned int *count) { struct dlm_lkb *lkb, *s; list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { - if (can_be_granted(r, lkb, 0, NULL)) + if (can_be_granted(r, lkb, 0, NULL)) { grant_lock_pending(r, lkb); - else { + if (count) + (*count)++; + } else { high = max_t(int, lkb->lkb_rqmode, high); if (lkb->lkb_rqmode == DLM_LOCK_CW) *cw = 1; @@ -1954,16 +1980,20 @@ static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw) return 0; } -static void grant_pending_locks(struct dlm_rsb *r) +static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count) { struct dlm_lkb *lkb, *s; int high = DLM_LOCK_IV; int cw = 0; - DLM_ASSERT(is_master(r), dlm_dump_rsb(r);); + if (!is_master(r)) { + log_print("grant_pending_locks r nodeid %d", r->res_nodeid); + dlm_dump_rsb(r); + return; + } - high = grant_pending_convert(r, high, &cw); - high = grant_pending_wait(r, high, &cw); + high = grant_pending_convert(r, high, &cw, count); + high = grant_pending_wait(r, high, &cw, count); if (high == DLM_LOCK_IV) return; @@ -2499,7 +2529,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) before we try again to grant this one. */ if (is_demoted(lkb)) { - grant_pending_convert(r, DLM_LOCK_IV, NULL); + grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL); if (_can_be_granted(r, lkb, 1)) { grant_lock(r, lkb); queue_cast(r, lkb, 0); @@ -2527,7 +2557,7 @@ static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, { switch (error) { case 0: - grant_pending_locks(r); + grant_pending_locks(r, NULL); /* grant_pending_locks also sends basts */ break; case -EAGAIN: @@ -2550,7 +2580,7 @@ static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, int error) { - grant_pending_locks(r); + grant_pending_locks(r, NULL); } /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ @@ -2571,7 +2601,7 @@ static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, int error) { if (error) - grant_pending_locks(r); + grant_pending_locks(r, NULL); } /* @@ -3372,7 +3402,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) return error; } -static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3412,14 +3442,15 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) error = 0; if (error) dlm_put_lkb(lkb); - return; + return 0; fail: setup_stub_lkb(ls, ms); send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; } -static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3429,6 +3460,15 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) if (error) goto fail; + if (lkb->lkb_remid != ms->m_lkid) { + log_error(ls, "receive_convert %x remid %x recover_seq %llu " + "remote %d %x", lkb->lkb_id, lkb->lkb_remid, + (unsigned long long)lkb->lkb_recover_seq, + ms->m_header.h_nodeid, ms->m_lkid); + error = -ENOENT; + goto fail; + } + r = lkb->lkb_resource; hold_rsb(r); @@ -3456,14 +3496,15 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); - return; + return 0; fail: setup_stub_lkb(ls, ms); send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; } -static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3473,6 +3514,14 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) if (error) goto fail; + if (lkb->lkb_remid != ms->m_lkid) { + log_error(ls, "receive_unlock %x remid %x remote %d %x", + lkb->lkb_id, lkb->lkb_remid, + ms->m_header.h_nodeid, ms->m_lkid); + error = -ENOENT; + goto fail; + } + r = lkb->lkb_resource; hold_rsb(r); @@ -3497,14 +3546,15 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); - return; + return 0; fail: setup_stub_lkb(ls, ms); send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; } -static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3532,25 +3582,23 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); - return; + return 0; fail: setup_stub_lkb(ls, ms); send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; } -static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; int error; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_grant from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; r = lkb->lkb_resource; @@ -3570,20 +3618,18 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms) unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); + return 0; } -static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; int error; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_bast from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; r = lkb->lkb_resource; @@ -3595,10 +3641,12 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms) goto out; queue_bast(r, lkb, ms->m_bastmode); + lkb->lkb_highbast = ms->m_bastmode; out: unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); + return 0; } static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) @@ -3653,18 +3701,15 @@ static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms) do_purge(ls, ms->m_nodeid, ms->m_pid); } -static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; int error, mstype, result; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_request_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; r = lkb->lkb_resource; hold_rsb(r); @@ -3676,8 +3721,13 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) mstype = lkb->lkb_wait_type; error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); - if (error) + if (error) { + log_error(ls, "receive_request_reply %x remote %d %x result %d", + lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid, + ms->m_result); + dlm_dump_rsb(r); goto out; + } /* Optimization: the dir node was also the master, so it took our lookup as a request and sent request reply instead of lookup reply */ @@ -3755,6 +3805,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); + return 0; } static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, @@ -3793,8 +3844,11 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, break; default: - log_error(r->res_ls, "receive_convert_reply %x error %d", - lkb->lkb_id, ms->m_result); + log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d", + lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid, + ms->m_result); + dlm_print_rsb(r); + dlm_print_lkb(lkb); } } @@ -3821,20 +3875,18 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) put_rsb(r); } -static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; int error; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_convert_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; _receive_convert_reply(lkb, ms); dlm_put_lkb(lkb); + return 0; } static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) @@ -3873,20 +3925,18 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) put_rsb(r); } -static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; int error; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_unlock_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; _receive_unlock_reply(lkb, ms); dlm_put_lkb(lkb); + return 0; } static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) @@ -3925,20 +3975,18 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) put_rsb(r); } -static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; int error; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_cancel_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; _receive_cancel_reply(lkb, ms); dlm_put_lkb(lkb); + return 0; } static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) @@ -3949,7 +3997,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_lkid, &lkb); if (error) { - log_error(ls, "receive_lookup_reply no lkb"); + log_error(ls, "receive_lookup_reply no lkid %x", ms->m_lkid); return; } @@ -3993,8 +4041,11 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) dlm_put_lkb(lkb); } -static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) +static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, + uint32_t saved_seq) { + int error = 0, noent = 0; + if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { log_debug(ls, "ignore non-member message %d from %d %x %x %d", ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, @@ -4007,47 +4058,50 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) /* messages sent to a master node */ case DLM_MSG_REQUEST: - receive_request(ls, ms); + error = receive_request(ls, ms); break; case DLM_MSG_CONVERT: - receive_convert(ls, ms); + error = receive_convert(ls, ms); break; case DLM_MSG_UNLOCK: - receive_unlock(ls, ms); + error = receive_unlock(ls, ms); break; case DLM_MSG_CANCEL: - receive_cancel(ls, ms); + noent = 1; + error = receive_cancel(ls, ms); break; /* messages sent from a master node (replies to above) */ case DLM_MSG_REQUEST_REPLY: - receive_request_reply(ls, ms); + error = receive_request_reply(ls, ms); break; case DLM_MSG_CONVERT_REPLY: - receive_convert_reply(ls, ms); + error = receive_convert_reply(ls, ms); break; case DLM_MSG_UNLOCK_REPLY: - receive_unlock_reply(ls, ms); + error = receive_unlock_reply(ls, ms); break; case DLM_MSG_CANCEL_REPLY: - receive_cancel_reply(ls, ms); + error = receive_cancel_reply(ls, ms); break; /* messages sent from a master node (only two types of async msg) */ case DLM_MSG_GRANT: - receive_grant(ls, ms); + noent = 1; + error = receive_grant(ls, ms); break; case DLM_MSG_BAST: - receive_bast(ls, ms); + noent = 1; + error = receive_bast(ls, ms); break; /* messages sent to a dir node */ @@ -4075,6 +4129,37 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) default: log_error(ls, "unknown message type %d", ms->m_type); } + + /* + * When checking for ENOENT, we're checking the result of + * find_lkb(m_remid): + * + * The lock id referenced in the message wasn't found. This may + * happen in normal usage for the async messages and cancel, so + * only use log_debug for them. + * + * Some errors are expected and normal. + */ + + if (error == -ENOENT && noent) { + log_debug(ls, "receive %d no %x remote %d %x saved_seq %u", + ms->m_type, ms->m_remid, ms->m_header.h_nodeid, + ms->m_lkid, saved_seq); + } else if (error == -ENOENT) { + log_error(ls, "receive %d no %x remote %d %x saved_seq %u", + ms->m_type, ms->m_remid, ms->m_header.h_nodeid, + ms->m_lkid, saved_seq); + + if (ms->m_type == DLM_MSG_CONVERT) + dlm_dump_rsb_hash(ls, ms->m_hash); + } + + if (error == -EINVAL) { + log_error(ls, "receive %d inval from %d lkid %x remid %x " + "saved_seq %u", + ms->m_type, ms->m_header.h_nodeid, + ms->m_lkid, ms->m_remid, saved_seq); + } } /* If the lockspace is in recovery mode (locking stopped), then normal @@ -4092,16 +4177,17 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, dlm_add_requestqueue(ls, nodeid, ms); } else { dlm_wait_requestqueue(ls); - _receive_message(ls, ms); + _receive_message(ls, ms, 0); } } /* This is called by dlm_recoverd to process messages that were saved on the requestqueue. */ -void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms) +void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, + uint32_t saved_seq) { - _receive_message(ls, ms); + _receive_message(ls, ms, saved_seq); } /* This is called by the midcomms layer when something is received for @@ -4137,9 +4223,11 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) ls = dlm_find_lockspace_global(hd->h_lockspace); if (!ls) { - if (dlm_config.ci_log_debug) - log_print("invalid lockspace %x from %d cmd %d type %d", - hd->h_lockspace, nodeid, hd->h_cmd, type); + if (dlm_config.ci_log_debug) { + printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace " + "%u from %d cmd %d type %d\n", + hd->h_lockspace, nodeid, hd->h_cmd, type); + } if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) dlm_send_ls_not_ready(nodeid, &p->rcom); @@ -4187,15 +4275,13 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, /* A waiting lkb needs recovery if the master node has failed, or the master node is changing (only when no directory is used) */ -static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) +static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb, + int dir_nodeid) { - if (dlm_is_removed(ls, lkb->lkb_nodeid)) + if (dlm_no_directory(ls)) return 1; - if (!dlm_no_directory(ls)) - return 0; - - if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid) + if (dlm_is_removed(ls, lkb->lkb_wait_nodeid)) return 1; return 0; @@ -4212,6 +4298,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) struct dlm_lkb *lkb, *safe; struct dlm_message *ms_stub; int wait_type, stub_unlock_result, stub_cancel_result; + int dir_nodeid; ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); if (!ms_stub) { @@ -4223,13 +4310,21 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { + dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource); + /* exclude debug messages about unlocks because there can be so many and they aren't very interesting */ if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { - log_debug(ls, "recover_waiter %x nodeid %d " - "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid, - lkb->lkb_wait_type, lkb->lkb_wait_nodeid); + log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " + "lkb_nodeid %d wait_nodeid %d dir_nodeid %d", + lkb->lkb_id, + lkb->lkb_remid, + lkb->lkb_wait_type, + lkb->lkb_resource->res_nodeid, + lkb->lkb_nodeid, + lkb->lkb_wait_nodeid, + dir_nodeid); } /* all outstanding lookups, regardless of destination will be @@ -4240,7 +4335,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) continue; } - if (!waiter_needs_recovery(ls, lkb)) + if (!waiter_needs_recovery(ls, lkb, dir_nodeid)) continue; wait_type = lkb->lkb_wait_type; @@ -4373,8 +4468,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) ou = is_overlap_unlock(lkb); err = 0; - log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d", - lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid); + log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " + "lkb_nodeid %d wait_nodeid %d dir_nodeid %d " + "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype, + r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid, + dlm_dir_nodeid(r), oc, ou); /* At this point we assume that we won't get a reply to any previous op or overlap op on this lock. First, do a big @@ -4426,9 +4524,12 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) } } - if (err) - log_error(ls, "recover_waiters_post %x %d %x %d %d", - lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou); + if (err) { + log_error(ls, "waiter %x msg %d r_nodeid %d " + "dir_nodeid %d overlap %d %d", + lkb->lkb_id, mstype, r->res_nodeid, + dlm_dir_nodeid(r), oc, ou); + } unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -4437,112 +4538,177 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) return error; } -static void purge_queue(struct dlm_rsb *r, struct list_head *queue, - int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb)) +static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r, + struct list_head *list) { - struct dlm_ls *ls = r->res_ls; struct dlm_lkb *lkb, *safe; - list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) { - if (test(ls, lkb)) { - rsb_set_flag(r, RSB_LOCKS_PURGED); - del_lkb(r, lkb); - /* this put should free the lkb */ - if (!dlm_put_lkb(lkb)) - log_error(ls, "purged lkb not released"); - } + list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { + if (!is_master_copy(lkb)) + continue; + + /* don't purge lkbs we've added in recover_master_copy for + the current recovery seq */ + + if (lkb->lkb_recover_seq == ls->ls_recover_seq) + continue; + + del_lkb(r, lkb); + + /* this put should free the lkb */ + if (!dlm_put_lkb(lkb)) + log_error(ls, "purged mstcpy lkb not released"); } } -static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb) +void dlm_purge_mstcpy_locks(struct dlm_rsb *r) { - return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid)); -} + struct dlm_ls *ls = r->res_ls; -static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb) -{ - return is_master_copy(lkb); + purge_mstcpy_list(ls, r, &r->res_grantqueue); + purge_mstcpy_list(ls, r, &r->res_convertqueue); + purge_mstcpy_list(ls, r, &r->res_waitqueue); } -static void purge_dead_locks(struct dlm_rsb *r) +static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, + struct list_head *list, + int nodeid_gone, unsigned int *count) { - purge_queue(r, &r->res_grantqueue, &purge_dead_test); - purge_queue(r, &r->res_convertqueue, &purge_dead_test); - purge_queue(r, &r->res_waitqueue, &purge_dead_test); -} + struct dlm_lkb *lkb, *safe; -void dlm_purge_mstcpy_locks(struct dlm_rsb *r) -{ - purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test); - purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test); - purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test); + list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { + if (!is_master_copy(lkb)) + continue; + + if ((lkb->lkb_nodeid == nodeid_gone) || + dlm_is_removed(ls, lkb->lkb_nodeid)) { + + del_lkb(r, lkb); + + /* this put should free the lkb */ + if (!dlm_put_lkb(lkb)) + log_error(ls, "purged dead lkb not released"); + + rsb_set_flag(r, RSB_RECOVER_GRANT); + + (*count)++; + } + } } /* Get rid of locks held by nodes that are gone. */ -int dlm_purge_locks(struct dlm_ls *ls) +void dlm_recover_purge(struct dlm_ls *ls) { struct dlm_rsb *r; + struct dlm_member *memb; + int nodes_count = 0; + int nodeid_gone = 0; + unsigned int lkb_count = 0; - log_debug(ls, "dlm_purge_locks"); + /* cache one removed nodeid to optimize the common + case of a single node removed */ + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + nodes_count++; + nodeid_gone = memb->nodeid; + } + + if (!nodes_count) + return; down_write(&ls->ls_root_sem); list_for_each_entry(r, &ls->ls_root_list, res_root_list) { hold_rsb(r); lock_rsb(r); - if (is_master(r)) - purge_dead_locks(r); + if (is_master(r)) { + purge_dead_list(ls, r, &r->res_grantqueue, + nodeid_gone, &lkb_count); + purge_dead_list(ls, r, &r->res_convertqueue, + nodeid_gone, &lkb_count); + purge_dead_list(ls, r, &r->res_waitqueue, + nodeid_gone, &lkb_count); + } unlock_rsb(r); unhold_rsb(r); - - schedule(); + cond_resched(); } up_write(&ls->ls_root_sem); - return 0; + if (lkb_count) + log_debug(ls, "dlm_recover_purge %u locks for %u nodes", + lkb_count, nodes_count); } -static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket) +static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket) { struct rb_node *n; - struct dlm_rsb *r, *r_ret = NULL; + struct dlm_rsb *r; spin_lock(&ls->ls_rsbtbl[bucket].lock); for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) { r = rb_entry(n, struct dlm_rsb, res_hashnode); - if (!rsb_flag(r, RSB_LOCKS_PURGED)) + + if (!rsb_flag(r, RSB_RECOVER_GRANT)) + continue; + rsb_clear_flag(r, RSB_RECOVER_GRANT); + if (!is_master(r)) continue; hold_rsb(r); - rsb_clear_flag(r, RSB_LOCKS_PURGED); - r_ret = r; - break; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return r; } spin_unlock(&ls->ls_rsbtbl[bucket].lock); - return r_ret; + return NULL; } -void dlm_grant_after_purge(struct dlm_ls *ls) +/* + * Attempt to grant locks on resources that we are the master of. + * Locks may have become grantable during recovery because locks + * from departed nodes have been purged (or not rebuilt), allowing + * previously blocked locks to now be granted. The subset of rsb's + * we are interested in are those with lkb's on either the convert or + * waiting queues. + * + * Simplest would be to go through each master rsb and check for non-empty + * convert or waiting queues, and attempt to grant on those rsbs. + * Checking the queues requires lock_rsb, though, for which we'd need + * to release the rsbtbl lock. This would make iterating through all + * rsb's very inefficient. So, we rely on earlier recovery routines + * to set RECOVER_GRANT on any rsb's that we should attempt to grant + * locks for. + */ + +void dlm_recover_grant(struct dlm_ls *ls) { struct dlm_rsb *r; int bucket = 0; + unsigned int count = 0; + unsigned int rsb_count = 0; + unsigned int lkb_count = 0; while (1) { - r = find_purged_rsb(ls, bucket); + r = find_grant_rsb(ls, bucket); if (!r) { if (bucket == ls->ls_rsbtbl_size - 1) break; bucket++; continue; } + rsb_count++; + count = 0; lock_rsb(r); - if (is_master(r)) { - grant_pending_locks(r); - confirm_master(r, 0); - } + grant_pending_locks(r, &count); + lkb_count += count; + confirm_master(r, 0); unlock_rsb(r); put_rsb(r); - schedule(); + cond_resched(); } + + if (lkb_count) + log_debug(ls, "dlm_recover_grant %u locks on %u resources", + lkb_count, rsb_count); } static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid, @@ -4631,6 +4797,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; struct dlm_rsb *r; struct dlm_lkb *lkb; + uint32_t remid = 0; int error; if (rl->rl_parent_lkid) { @@ -4638,14 +4805,31 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) goto out; } - error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), - R_MASTER, &r); + remid = le32_to_cpu(rl->rl_lkid); + + /* In general we expect the rsb returned to be R_MASTER, but we don't + have to require it. Recovery of masters on one node can overlap + recovery of locks on another node, so one node can send us MSTCPY + locks before we've made ourselves master of this rsb. We can still + add new MSTCPY locks that we receive here without any harm; when + we make ourselves master, dlm_recover_masters() won't touch the + MSTCPY locks we've received early. */ + + error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), 0, &r); if (error) goto out; + if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) { + log_error(ls, "dlm_recover_master_copy remote %d %x not dir", + rc->rc_header.h_nodeid, remid); + error = -EBADR; + put_rsb(r); + goto out; + } + lock_rsb(r); - lkb = search_remid(r, rc->rc_header.h_nodeid, le32_to_cpu(rl->rl_lkid)); + lkb = search_remid(r, rc->rc_header.h_nodeid, remid); if (lkb) { error = -EEXIST; goto out_remid; @@ -4664,19 +4848,25 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) attach_lkb(r, lkb); add_lkb(r, lkb, rl->rl_status); error = 0; + ls->ls_recover_locks_in++; + + if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) + rsb_set_flag(r, RSB_RECOVER_GRANT); out_remid: /* this is the new value returned to the lock holder for saving in its process-copy lkb */ rl->rl_remid = cpu_to_le32(lkb->lkb_id); + lkb->lkb_recover_seq = ls->ls_recover_seq; + out_unlock: unlock_rsb(r); put_rsb(r); out: - if (error) - log_debug(ls, "recover_master_copy %d %x", error, - le32_to_cpu(rl->rl_lkid)); + if (error && error != -EEXIST) + log_debug(ls, "dlm_recover_master_copy remote %d %x error %d", + rc->rc_header.h_nodeid, remid, error); rl->rl_result = cpu_to_le32(error); return error; } @@ -4687,41 +4877,52 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; struct dlm_rsb *r; struct dlm_lkb *lkb; - int error; + uint32_t lkid, remid; + int error, result; + + lkid = le32_to_cpu(rl->rl_lkid); + remid = le32_to_cpu(rl->rl_remid); + result = le32_to_cpu(rl->rl_result); - error = find_lkb(ls, le32_to_cpu(rl->rl_lkid), &lkb); + error = find_lkb(ls, lkid, &lkb); if (error) { - log_error(ls, "recover_process_copy no lkid %x", - le32_to_cpu(rl->rl_lkid)); + log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d", + lkid, rc->rc_header.h_nodeid, remid, result); return error; } - DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); - - error = le32_to_cpu(rl->rl_result); - r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); - switch (error) { + if (!is_process_copy(lkb)) { + log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d", + lkid, rc->rc_header.h_nodeid, remid, result); + dlm_dump_rsb(r); + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return -EINVAL; + } + + switch (result) { case -EBADR: /* There's a chance the new master received our lock before dlm_recover_master_reply(), this wouldn't happen if we did a barrier between recover_masters and recover_locks. */ - log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id, - (unsigned long)r, r->res_name); + + log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d", + lkid, rc->rc_header.h_nodeid, remid, result); + dlm_send_rcom_lock(r, lkb); goto out; case -EEXIST: - log_debug(ls, "master copy exists %x", lkb->lkb_id); - /* fall through */ case 0: - lkb->lkb_remid = le32_to_cpu(rl->rl_remid); + lkb->lkb_remid = remid; break; default: - log_error(ls, "dlm_recover_process_copy unknown error %d %x", - error, lkb->lkb_id); + log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk", + lkid, rc->rc_header.h_nodeid, remid, result); } /* an ack for dlm_recover_locks() which waits for replies from diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 1a255307f6ff..c8b226c62807 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -15,7 +15,8 @@ void dlm_dump_rsb(struct dlm_rsb *r); void dlm_print_lkb(struct dlm_lkb *lkb); -void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); +void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, + uint32_t saved_seq); void dlm_receive_buffer(union dlm_packet *p, int nodeid); int dlm_modes_compat(int mode1, int mode2); void dlm_put_rsb(struct dlm_rsb *r); @@ -31,9 +32,9 @@ void dlm_adjust_timeouts(struct dlm_ls *ls); int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, unsigned int flags, struct dlm_rsb **r_ret); -int dlm_purge_locks(struct dlm_ls *ls); +void dlm_recover_purge(struct dlm_ls *ls); void dlm_purge_mstcpy_locks(struct dlm_rsb *r); -void dlm_grant_after_purge(struct dlm_ls *ls); +void dlm_recover_grant(struct dlm_ls *ls); int dlm_recover_waiters_post(struct dlm_ls *ls); void dlm_recover_waiters_pre(struct dlm_ls *ls); int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index a1ea25face82..ca506abbdd3b 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -74,6 +74,19 @@ static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) return len; } +static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", dlm_no_directory(ls)); +} + +static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + int val = simple_strtoul(buf, NULL, 0); + if (val == 1) + set_bit(LSFL_NODIR, &ls->ls_flags); + return len; +} + static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf) { uint32_t status = dlm_recover_status(ls); @@ -107,6 +120,12 @@ static struct dlm_attr dlm_attr_id = { .store = dlm_id_store }; +static struct dlm_attr dlm_attr_nodir = { + .attr = {.name = "nodir", .mode = S_IRUGO | S_IWUSR}, + .show = dlm_nodir_show, + .store = dlm_nodir_store +}; + static struct dlm_attr dlm_attr_recover_status = { .attr = {.name = "recover_status", .mode = S_IRUGO}, .show = dlm_recover_status_show @@ -121,6 +140,7 @@ static struct attribute *dlm_attrs[] = { &dlm_attr_control.attr, &dlm_attr_event.attr, &dlm_attr_id.attr, + &dlm_attr_nodir.attr, &dlm_attr_recover_status.attr, &dlm_attr_recover_nodeid.attr, NULL, diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 133ef6dc7cb7..5c1b0e38c7a4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -142,6 +142,7 @@ struct writequeue_entry { static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; +static int dlm_allow_conn; /* Work queues */ static struct workqueue_struct *recv_workqueue; @@ -710,6 +711,13 @@ static int tcp_accept_from_sock(struct connection *con) struct connection *newcon; struct connection *addcon; + mutex_lock(&connections_lock); + if (!dlm_allow_conn) { + mutex_unlock(&connections_lock); + return -1; + } + mutex_unlock(&connections_lock); + memset(&peeraddr, 0, sizeof(peeraddr)); result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, IPPROTO_TCP, &newsock); @@ -1503,6 +1511,7 @@ void dlm_lowcomms_stop(void) socket activity. */ mutex_lock(&connections_lock); + dlm_allow_conn = 0; foreach_conn(stop_conn); mutex_unlock(&connections_lock); @@ -1530,7 +1539,7 @@ int dlm_lowcomms_start(void) if (!dlm_local_count) { error = -ENOTCONN; log_print("no local IP address has been set"); - goto out; + goto fail; } error = -ENOMEM; @@ -1538,7 +1547,13 @@ int dlm_lowcomms_start(void) __alignof__(struct connection), 0, NULL); if (!con_cache) - goto out; + goto fail; + + error = work_start(); + if (error) + goto fail_destroy; + + dlm_allow_conn = 1; /* Start listening */ if (dlm_config.ci_protocol == 0) @@ -1548,20 +1563,17 @@ int dlm_lowcomms_start(void) if (error) goto fail_unlisten; - error = work_start(); - if (error) - goto fail_unlisten; - return 0; fail_unlisten: + dlm_allow_conn = 0; con = nodeid2con(0,0); if (con) { close_connection(con, false); kmem_cache_free(con_cache, con); } +fail_destroy: kmem_cache_destroy(con_cache); - -out: +fail: return error; } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index da64df7576e1..7cd24bccd4fe 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -21,21 +21,19 @@ static struct kmem_cache *rsb_cache; int __init dlm_memory_init(void) { - int ret = 0; - lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), __alignof__(struct dlm_lkb), 0, NULL); if (!lkb_cache) - ret = -ENOMEM; + return -ENOMEM; rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), __alignof__(struct dlm_rsb), 0, NULL); if (!rsb_cache) { kmem_cache_destroy(lkb_cache); - ret = -ENOMEM; + return -ENOMEM; } - return ret; + return 0; } void dlm_memory_exit(void) diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index ac5c616c9696..64d3e2b958c7 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -486,47 +486,50 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) return 0; } -static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +/* Called by dlm_recv; corresponds to dlm_receive_message() but special + recovery-only comms are sent through here. */ + +void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { + int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); + int stop, reply = 0, lock = 0; + uint32_t status; uint64_t seq; - int rv = 0; switch (rc->rc_type) { + case DLM_RCOM_LOCK: + lock = 1; + break; + case DLM_RCOM_LOCK_REPLY: + lock = 1; + reply = 1; + break; case DLM_RCOM_STATUS_REPLY: case DLM_RCOM_NAMES_REPLY: case DLM_RCOM_LOOKUP_REPLY: - case DLM_RCOM_LOCK_REPLY: - spin_lock(&ls->ls_recover_lock); - seq = ls->ls_recover_seq; - spin_unlock(&ls->ls_recover_lock); - if (rc->rc_seq_reply != seq) { - log_debug(ls, "ignoring old reply %x from %d " - "seq_reply %llx expect %llx", - rc->rc_type, rc->rc_header.h_nodeid, - (unsigned long long)rc->rc_seq_reply, - (unsigned long long)seq); - rv = 1; - } - } - return rv; -} - -/* Called by dlm_recv; corresponds to dlm_receive_message() but special - recovery-only comms are sent through here. */ + reply = 1; + }; -void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) -{ - int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + stop = test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); - if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) { - log_debug(ls, "ignoring recovery message %x from %d", - rc->rc_type, nodeid); + if ((stop && (rc->rc_type != DLM_RCOM_STATUS)) || + (reply && (rc->rc_seq_reply != seq)) || + (lock && !(status & DLM_RS_DIR))) { + log_limit(ls, "dlm_receive_rcom ignore msg %d " + "from %d %llu %llu recover seq %llu sts %x gen %u", + rc->rc_type, + nodeid, + (unsigned long long)rc->rc_seq, + (unsigned long long)rc->rc_seq_reply, + (unsigned long long)seq, + status, ls->ls_generation); goto out; } - if (is_old_reply(ls, rc)) - goto out; - switch (rc->rc_type) { case DLM_RCOM_STATUS: receive_rcom_status(ls, rc); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 34d5adf1fce7..7554e4dac6bb 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -339,9 +339,12 @@ static void set_lock_master(struct list_head *queue, int nodeid) { struct dlm_lkb *lkb; - list_for_each_entry(lkb, queue, lkb_statequeue) - if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) + list_for_each_entry(lkb, queue, lkb_statequeue) { + if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) { lkb->lkb_nodeid = nodeid; + lkb->lkb_remid = 0; + } + } } static void set_master_lkbs(struct dlm_rsb *r) @@ -354,18 +357,16 @@ static void set_master_lkbs(struct dlm_rsb *r) /* * Propagate the new master nodeid to locks * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider. - * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which + * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which * rsb's to consider. */ static void set_new_master(struct dlm_rsb *r, int nodeid) { - lock_rsb(r); r->res_nodeid = nodeid; set_master_lkbs(r); rsb_set_flag(r, RSB_NEW_MASTER); rsb_set_flag(r, RSB_NEW_MASTER2); - unlock_rsb(r); } /* @@ -376,9 +377,9 @@ static void set_new_master(struct dlm_rsb *r, int nodeid) static int recover_master(struct dlm_rsb *r) { struct dlm_ls *ls = r->res_ls; - int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); - - dir_nodeid = dlm_dir_nodeid(r); + int error, ret_nodeid; + int our_nodeid = dlm_our_nodeid(); + int dir_nodeid = dlm_dir_nodeid(r); if (dir_nodeid == our_nodeid) { error = dlm_dir_lookup(ls, our_nodeid, r->res_name, @@ -388,7 +389,9 @@ static int recover_master(struct dlm_rsb *r) if (ret_nodeid == our_nodeid) ret_nodeid = 0; + lock_rsb(r); set_new_master(r, ret_nodeid); + unlock_rsb(r); } else { recover_list_add(r); error = dlm_send_rcom_lookup(r, dir_nodeid); @@ -398,24 +401,33 @@ static int recover_master(struct dlm_rsb *r) } /* - * When not using a directory, most resource names will hash to a new static - * master nodeid and the resource will need to be remastered. + * All MSTCPY locks are purged and rebuilt, even if the master stayed the same. + * This is necessary because recovery can be started, aborted and restarted, + * causing the master nodeid to briefly change during the aborted recovery, and + * change back to the original value in the second recovery. The MSTCPY locks + * may or may not have been purged during the aborted recovery. Another node + * with an outstanding request in waiters list and a request reply saved in the + * requestqueue, cannot know whether it should ignore the reply and resend the + * request, or accept the reply and complete the request. It must do the + * former if the remote node purged MSTCPY locks, and it must do the later if + * the remote node did not. This is solved by always purging MSTCPY locks, in + * which case, the request reply would always be ignored and the request + * resent. */ static int recover_master_static(struct dlm_rsb *r) { - int master = dlm_dir_nodeid(r); + int dir_nodeid = dlm_dir_nodeid(r); + int new_master = dir_nodeid; - if (master == dlm_our_nodeid()) - master = 0; + if (dir_nodeid == dlm_our_nodeid()) + new_master = 0; - if (r->res_nodeid != master) { - if (is_master(r)) - dlm_purge_mstcpy_locks(r); - set_new_master(r, master); - return 1; - } - return 0; + lock_rsb(r); + dlm_purge_mstcpy_locks(r); + set_new_master(r, new_master); + unlock_rsb(r); + return 1; } /* @@ -481,7 +493,9 @@ int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) if (nodeid == dlm_our_nodeid()) nodeid = 0; + lock_rsb(r); set_new_master(r, nodeid); + unlock_rsb(r); recover_list_del(r); if (recover_list_empty(ls)) @@ -556,8 +570,6 @@ int dlm_recover_locks(struct dlm_ls *ls) struct dlm_rsb *r; int error, count = 0; - log_debug(ls, "dlm_recover_locks"); - down_read(&ls->ls_root_sem); list_for_each_entry(r, &ls->ls_root_list, res_root_list) { if (is_master(r)) { @@ -584,7 +596,7 @@ int dlm_recover_locks(struct dlm_ls *ls) } up_read(&ls->ls_root_sem); - log_debug(ls, "dlm_recover_locks %d locks", count); + log_debug(ls, "dlm_recover_locks %d out", count); error = dlm_wait_function(ls, &recover_list_empty); out: @@ -721,21 +733,19 @@ static void recover_conversion(struct dlm_rsb *r) } /* We've become the new master for this rsb and waiting/converting locks may - need to be granted in dlm_grant_after_purge() due to locks that may have + need to be granted in dlm_recover_grant() due to locks that may have existed from a removed node. */ -static void set_locks_purged(struct dlm_rsb *r) +static void recover_grant(struct dlm_rsb *r) { if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) - rsb_set_flag(r, RSB_LOCKS_PURGED); + rsb_set_flag(r, RSB_RECOVER_GRANT); } void dlm_recover_rsbs(struct dlm_ls *ls) { struct dlm_rsb *r; - int count = 0; - - log_debug(ls, "dlm_recover_rsbs"); + unsigned int count = 0; down_read(&ls->ls_root_sem); list_for_each_entry(r, &ls->ls_root_list, res_root_list) { @@ -744,7 +754,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls) if (rsb_flag(r, RSB_RECOVER_CONVERT)) recover_conversion(r); if (rsb_flag(r, RSB_NEW_MASTER2)) - set_locks_purged(r); + recover_grant(r); recover_lvb(r); count++; } @@ -754,7 +764,8 @@ void dlm_recover_rsbs(struct dlm_ls *ls) } up_read(&ls->ls_root_sem); - log_debug(ls, "dlm_recover_rsbs %d rsbs", count); + if (count) + log_debug(ls, "dlm_recover_rsbs %d done", count); } /* Create a single list of all root rsb's to be used during recovery */ diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 3780caf7ae0c..f1a9073c0835 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -54,7 +54,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) unsigned long start; int error, neg = 0; - log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq); + log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq); mutex_lock(&ls->ls_recoverd_active); @@ -84,6 +84,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) goto fail; } + ls->ls_recover_locks_in = 0; + dlm_set_recover_status(ls, DLM_RS_NODES); error = dlm_recover_members_wait(ls); @@ -130,7 +132,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * Clear lkb's for departed nodes. */ - dlm_purge_locks(ls); + dlm_recover_purge(ls); /* * Get new master nodeid's for rsb's that were mastered on @@ -161,6 +163,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) goto fail; } + log_debug(ls, "dlm_recover_locks %u in", + ls->ls_recover_locks_in); + /* * Finalize state in master rsb's now that all locks can be * checked. This includes conversion resolution and lvb @@ -225,9 +230,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) goto fail; } - dlm_grant_after_purge(ls); + dlm_recover_grant(ls); - log_debug(ls, "dlm_recover %llx generation %u done: %u ms", + log_debug(ls, "dlm_recover %llu generation %u done: %u ms", (unsigned long long)rv->seq, ls->ls_generation, jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); @@ -237,7 +242,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) fail: dlm_release_root_list(ls); - log_debug(ls, "dlm_recover %llx error %d", + log_debug(ls, "dlm_recover %llu error %d", (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); return error; diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index a44fa22890e1..1695f1b0dd45 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -19,6 +19,7 @@ struct rq_entry { struct list_head list; + uint32_t recover_seq; int nodeid; struct dlm_message request; }; @@ -41,6 +42,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) return; } + e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF; e->nodeid = nodeid; memcpy(&e->request, ms, ms->m_header.h_length); @@ -63,6 +65,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) int dlm_process_requestqueue(struct dlm_ls *ls) { struct rq_entry *e; + struct dlm_message *ms; int error = 0; mutex_lock(&ls->ls_requestqueue_mutex); @@ -76,7 +79,15 @@ int dlm_process_requestqueue(struct dlm_ls *ls) e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); mutex_unlock(&ls->ls_requestqueue_mutex); - dlm_receive_message_saved(ls, &e->request); + ms = &e->request; + + log_limit(ls, "dlm_process_requestqueue msg %d from %d " + "lkid %x remid %x result %d seq %u", + ms->m_type, ms->m_header.h_nodeid, + ms->m_lkid, ms->m_remid, ms->m_result, + e->recover_seq); + + dlm_receive_message_saved(ls, &e->request, e->recover_seq); mutex_lock(&ls->ls_requestqueue_mutex); list_del(&e->list); @@ -138,35 +149,7 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) if (!dlm_no_directory(ls)) return 0; - /* with no directory, the master is likely to change as a part of - recovery; requests to/from the defunct master need to be purged */ - - switch (type) { - case DLM_MSG_REQUEST: - case DLM_MSG_CONVERT: - case DLM_MSG_UNLOCK: - case DLM_MSG_CANCEL: - /* we're no longer the master of this resource, the sender - will resend to the new master (see waiter_needs_recovery) */ - - if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid()) - return 1; - break; - - case DLM_MSG_REQUEST_REPLY: - case DLM_MSG_CONVERT_REPLY: - case DLM_MSG_UNLOCK_REPLY: - case DLM_MSG_CANCEL_REPLY: - case DLM_MSG_GRANT: - /* this reply is from the former master of the resource, - we'll resend to the new master if needed */ - - if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid) - return 1; - break; - } - - return 0; + return 1; } void dlm_purge_requestqueue(struct dlm_ls *ls) diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index ab2248090515..a750f957b145 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -303,7 +303,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, mutex_unlock(&ecryptfs_daemon_hash_mux); goto wake_up; } - tsk_user_ns = __task_cred(msg_ctx->task)->user->user_ns; + tsk_user_ns = __task_cred(msg_ctx->task)->user_ns; ctx_euid = task_euid(msg_ctx->task); rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns); rcu_read_unlock(); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 2dd946b636d2..e879cf8ff0b1 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -133,7 +133,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) static void ecryptfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); iput(ecryptfs_inode_to_lower(inode)); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c0b3c70ee87a..079d1be65ba9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -33,6 +33,7 @@ #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/anon_inodes.h> +#include <linux/device.h> #include <asm/uaccess.h> #include <asm/io.h> #include <asm/mman.h> @@ -87,7 +88,7 @@ */ /* Epoll private bits inside the event mask */ -#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) +#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 @@ -154,6 +155,9 @@ struct epitem { /* List header used to link this item to the "struct file" items list */ struct list_head fllink; + /* wakeup_source used when EPOLLWAKEUP is set */ + struct wakeup_source *ws; + /* The structure that describe the interested events and the source fd */ struct epoll_event event; }; @@ -194,6 +198,9 @@ struct eventpoll { */ struct epitem *ovflist; + /* wakeup_source used when ep_scan_ready_list is running */ + struct wakeup_source *ws; + /* The user that created the eventpoll descriptor */ struct user_struct *user; @@ -588,8 +595,10 @@ static int ep_scan_ready_list(struct eventpoll *ep, * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ - if (!ep_is_linked(&epi->rdllink)) + if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); + __pm_stay_awake(epi->ws); + } } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after @@ -602,6 +611,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, * Quickly re-inject items left on "txlist". */ list_splice(&txlist, &ep->rdllist); + __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { /* @@ -656,6 +666,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); + wakeup_source_unregister(epi->ws); + /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); @@ -706,6 +718,7 @@ static void ep_free(struct eventpoll *ep) mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); free_uid(ep->user); + wakeup_source_unregister(ep->ws); kfree(ep); } @@ -737,6 +750,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ + __pm_relax(epi->ws); list_del_init(&epi->rdllink); } } @@ -927,13 +941,23 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; + if (epi->ws) { + /* + * Activate ep->ws since epi->ws may get + * deactivated at any time. + */ + __pm_stay_awake(ep->ws); + } + } goto out_unlock; } /* If this file is already in the ready list we exit soon */ - if (!ep_is_linked(&epi->rdllink)) + if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); + __pm_stay_awake(epi->ws); + } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() @@ -1091,6 +1115,30 @@ static int reverse_path_check(void) return error; } +static int ep_create_wakeup_source(struct epitem *epi) +{ + const char *name; + + if (!epi->ep->ws) { + epi->ep->ws = wakeup_source_register("eventpoll"); + if (!epi->ep->ws) + return -ENOMEM; + } + + name = epi->ffd.file->f_path.dentry->d_name.name; + epi->ws = wakeup_source_register(name); + if (!epi->ws) + return -ENOMEM; + + return 0; +} + +static void ep_destroy_wakeup_source(struct epitem *epi) +{ + wakeup_source_unregister(epi->ws); + epi->ws = NULL; +} + /* * Must be called with "mtx" held. */ @@ -1118,6 +1166,13 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, epi->event = *event; epi->nwait = 0; epi->next = EP_UNACTIVE_PTR; + if (epi->event.events & EPOLLWAKEUP) { + error = ep_create_wakeup_source(epi); + if (error) + goto error_create_wakeup_source; + } else { + epi->ws = NULL; + } /* Initialize the poll table using the queue callback */ epq.epi = epi; @@ -1164,6 +1219,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); + __pm_stay_awake(epi->ws); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1204,6 +1260,9 @@ error_unregister: list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); + wakeup_source_unregister(epi->ws); + +error_create_wakeup_source: kmem_cache_free(epi_cache, epi); return error; @@ -1229,6 +1288,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even epi->event.events = event->events; pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ + if (epi->event.events & EPOLLWAKEUP) { + if (!epi->ws) + ep_create_wakeup_source(epi); + } else if (epi->ws) { + ep_destroy_wakeup_source(epi); + } /* * Get current event bits. We can safely use the file* here because @@ -1244,6 +1309,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); + __pm_stay_awake(epi->ws); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1282,6 +1348,18 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, !list_empty(head) && eventcnt < esed->maxevents;) { epi = list_first_entry(head, struct epitem, rdllink); + /* + * Activate ep->ws before deactivating epi->ws to prevent + * triggering auto-suspend here (in case we reactive epi->ws + * below). + * + * This could be rearranged to delay the deactivation of epi->ws + * instead, but then epi->ws would temporarily be out of sync + * with ep_is_linked(). + */ + if (epi->ws && epi->ws->active) + __pm_stay_awake(ep->ws); + __pm_relax(epi->ws); list_del_init(&epi->rdllink); pt._key = epi->event.events; @@ -1298,6 +1376,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); + __pm_stay_awake(epi->ws); return eventcnt ? eventcnt : -EFAULT; } eventcnt++; @@ -1317,6 +1396,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); + __pm_stay_awake(epi->ws); } } } @@ -1629,6 +1709,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, if (!tfile->f_op || !tfile->f_op->poll) goto error_tgt_fput; + /* Check if EPOLLWAKEUP is allowed */ + if ((epds.events & EPOLLWAKEUP) && !capable(CAP_EPOLLWAKEUP)) + epds.events &= ~EPOLLWAKEUP; + /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit diff --git a/fs/exec.c b/fs/exec.c index d038968b54b4..52c9e2ff6e6b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1139,7 +1139,7 @@ void setup_new_exec(struct linux_binprm * bprm) /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; - if (current_euid() == current_uid() && current_egid() == current_gid()) + if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) set_dumpable(current->mm, 1); else set_dumpable(current->mm, suid_dumpable); @@ -1153,8 +1153,8 @@ void setup_new_exec(struct linux_binprm * bprm) current->mm->task_size = TASK_SIZE; /* install the new credentials */ - if (bprm->cred->uid != current_euid() || - bprm->cred->gid != current_egid()) { + if (!uid_eq(bprm->cred->uid, current_euid()) || + !gid_eq(bprm->cred->gid, current_egid())) { current->pdeath_signal = 0; } else { would_dump(bprm, bprm->file); @@ -1299,8 +1299,11 @@ int prepare_binprm(struct linux_binprm *bprm) !current->no_new_privs) { /* Set-uid? */ if (mode & S_ISUID) { + if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid)) + return -EPERM; bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = inode->i_uid; + } /* Set-gid? */ @@ -1310,6 +1313,8 @@ int prepare_binprm(struct linux_binprm *bprm) * executable. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) + return -EPERM; bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = inode->i_gid; } @@ -1938,8 +1943,21 @@ static int coredump_wait(int exit_code, struct core_state *core_state) core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); - if (core_waiters > 0) + if (core_waiters > 0) { + struct core_thread *ptr; + wait_for_completion(&core_state->startup); + /* + * Wait for all the threads to become inactive, so that + * all the thread context (extended register state, like + * fpu etc) gets copied to the memory. + */ + ptr = core_state->dumper.next; + while (ptr != NULL) { + wait_task_inactive(ptr->task, 0); + ptr = ptr->next; + } + } return core_waiters; } @@ -2129,7 +2147,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) if (__get_dumpable(cprm.mm_flags) == 2) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ - cred->fsuid = 0; /* Dump root private */ + cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ } retval = coredump_wait(exit_code, &core_state); @@ -2230,7 +2248,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) * Dont allow local users get cute and trick others to coredump * into their pre-created files. */ - if (inode->i_uid != current_fsuid()) + if (!uid_eq(inode->i_uid, current_fsuid())) goto close_fail; if (!cprm.file->f_op || !cprm.file->f_op->write) goto close_fail; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index ea5e1f97806a..5badb0c039de 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1473,7 +1473,7 @@ void exofs_evict_inode(struct inode *inode) goto no_delete; inode->i_size = 0; - end_writeback(inode); + clear_inode(inode); /* if we are deleting an obj that hasn't been created yet, wait. * This also makes sure that create_done cannot be called with an @@ -1503,5 +1503,5 @@ void exofs_evict_inode(struct inode *inode) return; no_delete: - end_writeback(inode); + clear_inode(inode); } diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index a8cbe1bc6ad4..1c3613998862 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -165,7 +165,6 @@ static void release_blocks(struct super_block *sb, int count) struct ext2_sb_info *sbi = EXT2_SB(sb); percpu_counter_add(&sbi->s_freeblocks_counter, count); - sb->s_dirt = 1; } } @@ -180,7 +179,6 @@ static void group_adjust_blocks(struct super_block *sb, int group_no, free_blocks = le16_to_cpu(desc->bg_free_blocks_count); desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count); spin_unlock(sb_bgl_lock(sbi, group_no)); - sb->s_dirt = 1; mark_buffer_dirty(bh); } } @@ -479,7 +477,7 @@ void ext2_discard_reservation(struct inode *inode) } /** - * ext2_free_blocks_sb() -- Free given blocks and update quota and i_blocks + * ext2_free_blocks() -- Free given blocks and update quota and i_blocks * @inode: inode * @block: start physcial block to free * @count: number of blocks to free @@ -1193,8 +1191,9 @@ static int ext2_has_free_blocks(struct ext2_sb_info *sbi) free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current_fsuid() && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + !uid_eq(sbi->s_resuid, current_fsuid()) && + (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) || + !in_group_p (sbi->s_resgid))) { return 0; } return 1; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 0b2b4db5bdcd..d9a17d0b124d 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -82,8 +82,8 @@ struct ext2_sb_info { struct buffer_head ** s_group_desc; unsigned long s_mount_opt; unsigned long s_sb_block; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; @@ -637,8 +637,8 @@ static inline void verify_offsets(void) */ struct ext2_mount_options { unsigned long s_mount_opt; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; }; /* diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 8b15cf8cef37..c13eb7b91a11 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -81,7 +81,6 @@ static void ext2_release_inode(struct super_block *sb, int group, int dir) spin_unlock(sb_bgl_lock(EXT2_SB(sb), group)); if (dir) percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter); - sb->s_dirt = 1; mark_buffer_dirty(bh); } @@ -543,7 +542,6 @@ got: } spin_unlock(sb_bgl_lock(sbi, group)); - sb->s_dirt = 1; mark_buffer_dirty(bh2); if (test_opt(sb, GRPID)) { inode->i_mode = mode; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 740cad8dcd8d..264d315f6c47 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -90,7 +90,7 @@ void ext2_evict_inode(struct inode * inode) } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); ext2_discard_reservation(inode); rsv = EXT2_I(inode)->i_block_alloc_info; @@ -1293,6 +1293,8 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) struct inode *inode; long ret = -EIO; int n; + uid_t i_uid; + gid_t i_gid; inode = iget_locked(sb, ino); if (!inode) @@ -1310,12 +1312,14 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (!(test_opt (inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); @@ -1413,8 +1417,8 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) struct ext2_inode_info *ei = EXT2_I(inode); struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; - uid_t uid = inode->i_uid; - gid_t gid = inode->i_gid; + uid_t uid = i_uid_read(inode); + gid_t gid = i_gid_read(inode); struct buffer_head * bh; struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh); int n; @@ -1529,8 +1533,8 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) if (is_quota_modification(inode, iattr)) dquot_initialize(inode); - if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || - (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || + (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { error = dquot_transfer(inode, iattr); if (error) return error; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e1025c7a437a..b3621cb7ea31 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -130,9 +130,6 @@ static void ext2_put_super (struct super_block * sb) dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - if (sb->s_dirt) - ext2_write_super(sb); - ext2_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; @@ -228,13 +225,15 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",grpid"); if (!test_opt(sb, GRPID) && (def_mount_opts & EXT2_DEFM_BSDGROUPS)) seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT2_DEF_RESUID || + if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT2_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT2_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); + seq_printf(seq, ",resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); } - if (sbi->s_resgid != EXT2_DEF_RESGID || + if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT2_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT2_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); + seq_printf(seq, ",resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); } if (test_opt(sb, ERRORS_RO)) { int def_errors = le16_to_cpu(es->s_errors); @@ -305,7 +304,6 @@ static const struct super_operations ext2_sops = { .write_inode = ext2_write_inode, .evict_inode = ext2_evict_inode, .put_super = ext2_put_super, - .write_super = ext2_write_super, .sync_fs = ext2_sync_fs, .statfs = ext2_statfs, .remount_fs = ext2_remount, @@ -356,11 +354,6 @@ static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid, ext2_nfs_get_inode); } -/* Yes, most of these are left as NULL!! - * A NULL value implies the default, which works with ext2-like file - * systems, but can be improved upon. - * Currently only get_parent is required. - */ static const struct export_operations ext2_export_ops = { .fh_to_dentry = ext2_fh_to_dentry, .fh_to_parent = ext2_fh_to_parent, @@ -436,6 +429,8 @@ static int parse_options(char *options, struct super_block *sb) struct ext2_sb_info *sbi = EXT2_SB(sb); substring_t args[MAX_OPT_ARGS]; int option; + kuid_t uid; + kgid_t gid; if (!options) return 1; @@ -462,12 +457,23 @@ static int parse_options(char *options, struct super_block *sb) case Opt_resuid: if (match_int(&args[0], &option)) return 0; - sbi->s_resuid = option; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { + ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option); + return -1; + + } + sbi->s_resuid = uid; break; case Opt_resgid: if (match_int(&args[0], &option)) return 0; - sbi->s_resgid = option; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { + ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option); + return -1; + } + sbi->s_resgid = gid; break; case Opt_sb: /* handled by get_sb_block() instead of here */ @@ -841,8 +847,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) else set_opt(sbi->s_mount_opt, ERRORS_RO); - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); set_opt(sbi->s_mount_opt, RESERVATION); @@ -1161,7 +1167,6 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, mark_buffer_dirty(EXT2_SB(sb)->s_sbh); if (wait) sync_dirty_buffer(EXT2_SB(sb)->s_sbh); - sb->s_dirt = 0; } /* @@ -1194,8 +1199,6 @@ void ext2_write_super(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) ext2_sync_fs(sb, 1); - else - sb->s_dirt = 0; } static int ext2_remount (struct super_block * sb, int * flags, char * data) @@ -1441,7 +1444,6 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -1471,16 +1473,13 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) { - mutex_unlock(&inode->i_mutex); + if (len == towrite) return err; - } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 6dcafc7efdfd..b6754dbbce3c 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -339,7 +339,6 @@ static void ext2_xattr_update_super_block(struct super_block *sb) spin_lock(&EXT2_SB(sb)->s_lock); EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); spin_unlock(&EXT2_SB(sb)->s_lock); - sb->s_dirt = 1; mark_buffer_dirty(EXT2_SB(sb)->s_sbh); } diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index baac1b129fba..25cd60892116 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1439,8 +1439,9 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - !use_reservation && sbi->s_resuid != current_fsuid() && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) && + (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) || + !in_group_p (sbi->s_resgid))) { return 0; } return 1; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index cc761ad8fa57..92490e9f85ca 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -21,30 +21,15 @@ * */ +#include <linux/compat.h> #include "ext3.h" static unsigned char ext3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ext3_readdir(struct file *, void *, filldir_t); static int ext3_dx_readdir(struct file * filp, void * dirent, filldir_t filldir); -static int ext3_release_dir (struct inode * inode, - struct file * filp); - -const struct file_operations ext3_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = ext3_readdir, /* we take BKL. needed?*/ - .unlocked_ioctl = ext3_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext3_compat_ioctl, -#endif - .fsync = ext3_sync_file, /* BKL held */ - .release = ext3_release_dir, -}; - static unsigned char get_dtype(struct super_block *sb, int filetype) { @@ -55,6 +40,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext3_filetype_table[filetype]); } +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which chould potentially get coverted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) + return 1; + + return 0; +} int ext3_check_dir_entry (const char * function, struct inode * dir, struct ext3_dir_entry_2 * de, @@ -94,18 +98,13 @@ static int ext3_readdir(struct file * filp, unsigned long offset; int i, stored; struct ext3_dir_entry_2 *de; - struct super_block *sb; int err; struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; int ret = 0; int dir_has_error = 0; - sb = inode->i_sb; - - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_INDEX) && - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + if (is_dx_dir(inode)) { err = ext3_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { ret = err; @@ -227,22 +226,87 @@ out: return ret; } +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif +} + /* * These functions convert from the major/minor hash to an f_pos - * value. + * value for dx directories * - * Currently we only use major hash numer. This is unfortunate, but - * on 32-bit machines, the same VFS interface is used for lseek and - * llseek, so if we use the 64 bit offset, then the 32-bit versions of - * lseek/telldir/seekdir will blow out spectacularly, and from within - * the ext2 low-level routine, we don't know if we're being called by - * a 64-bit version of the system call or the 32-bit version of the - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir - * cookie. Sigh. + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ -#define hash2pos(major, minor) (major >> 1) -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -#define pos2min_hash(pos) (0) +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext3_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT3_HTREE_EOF_32BIT; + else + return EXT3_HTREE_EOF_64BIT; +} + + +/* + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both + * non-htree and htree directories, where the "offset" is in terms + * of the filename hash value instead of the byte offset. + * + * Because we may return a 64-bit hash that is well beyond s_maxbytes, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) + * will be invalid once the directory was converted into a dx directory + */ +loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int dx_dir = is_dx_dir(inode); + + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, origin, + ext3_get_htree_eof(file)); + else + return generic_file_llseek(file, offset, origin); +} /* * This structure holds the nodes of the red-black tree used to store @@ -303,15 +367,16 @@ static void free_rb_tree_fname(struct rb_root *root) } -static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, + loff_t pos) { struct dir_private_info *p; p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->curr_hash = pos2maj_hash(pos); - p->curr_minor_hash = pos2min_hash(pos); + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); return p; } @@ -401,7 +466,7 @@ static int call_filldir(struct file * filp, void * dirent, printk("call_filldir: called with null fname?!?\n"); return 0; } - curr_pos = hash2pos(fname->hash, fname->minor_hash); + curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); while (fname) { error = filldir(dirent, fname->name, fname->name_len, curr_pos, @@ -426,13 +491,13 @@ static int ext3_dx_readdir(struct file * filp, int ret; if (!info) { - info = ext3_htree_create_dir_info(filp->f_pos); + info = ext3_htree_create_dir_info(filp, filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; } - if (filp->f_pos == EXT3_HTREE_EOF) + if (filp->f_pos == ext3_get_htree_eof(filp)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ @@ -440,8 +505,8 @@ static int ext3_dx_readdir(struct file * filp, free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp->f_pos); + info->curr_hash = pos2maj_hash(filp, filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); } /* @@ -473,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp, if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = EXT3_HTREE_EOF; + filp->f_pos = ext3_get_htree_eof(filp); break; } info->curr_node = rb_first(&info->root); @@ -493,7 +558,7 @@ static int ext3_dx_readdir(struct file * filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = EXT3_HTREE_EOF; + filp->f_pos = ext3_get_htree_eof(filp); break; } info->curr_hash = info->next_hash; @@ -512,3 +577,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp) return 0; } + +const struct file_operations ext3_dir_operations = { + .llseek = ext3_dir_llseek, + .read = generic_read_dir, + .readdir = ext3_readdir, + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .fsync = ext3_sync_file, + .release = ext3_release_dir, +}; diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h index b6515fd7e56c..e85ff15a060e 100644 --- a/fs/ext3/ext3.h +++ b/fs/ext3/ext3.h @@ -243,8 +243,8 @@ struct ext3_new_group_data { */ struct ext3_mount_options { unsigned long s_mount_opt; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; unsigned long s_commit_interval; #ifdef CONFIG_QUOTA int s_jquota_fmt; @@ -637,8 +637,8 @@ struct ext3_sb_info { struct buffer_head ** s_group_desc; unsigned long s_mount_opt; ext3_fsblk_t s_sb_block; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; @@ -920,7 +920,11 @@ struct dx_hash_info u32 *seed; }; -#define EXT3_HTREE_EOF 0x7fffffff + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + /* * Control parameters used by ext3_htree_next_block diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index d10231ddcf8a..ede315cdf126 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -198,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) return -1; } hash = hash & ~1; - if (hash == (EXT3_HTREE_EOF << 1)) - hash = (EXT3_HTREE_EOF-1) << 1; + if (hash == (EXT3_HTREE_EOF_32BIT << 1)) + hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; hinfo->hash = hash; hinfo->minor_hash = minor_hash; return 0; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index e3c39e4cec19..082afd78b107 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -180,8 +180,7 @@ error_return: * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks) or - * it's already running too large debt (max_debt). + * it has too few free blocks left (min_blocks). * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more @@ -191,21 +190,16 @@ error_return: * when we allocate an inode, within 0--255. */ -#define INODE_COST 64 -#define BLOCK_COST 256 - static int find_group_orlov(struct super_block *sb, struct inode *parent) { int parent_group = EXT3_I(parent)->i_block_group; struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; int ngroups = sbi->s_groups_count; int inodes_per_group = EXT3_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext3_fsblk_t freeb, avefreeb; - ext3_fsblk_t blocks_per_dir; unsigned int ndirs; - int max_debt, max_dirs, min_inodes; + int max_dirs, min_inodes; ext3_grpblk_t min_blocks; int group = -1, i; struct ext3_group_desc *desc; @@ -242,20 +236,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) goto fallback; } - blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; - max_dirs = ndirs / ngroups + inodes_per_group / 16; min_inodes = avefreei - inodes_per_group / 4; min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; - max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST); - if (max_debt * INODE_COST > inodes_per_group) - max_debt = inodes_per_group / INODE_COST; - if (max_debt > 255) - max_debt = 255; - if (max_debt == 0) - max_debt = 1; - for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext3_get_group_desc (sb, group, NULL); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 10d7812f6021..9a4a5c48b1c9 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -272,18 +272,18 @@ void ext3_evict_inode (struct inode *inode) if (ext3_mark_inode_dirty(handle, inode)) { /* If that failed, just dquot_drop() and be done with that */ dquot_drop(inode); - end_writeback(inode); + clear_inode(inode); } else { ext3_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); - end_writeback(inode); + clear_inode(inode); ext3_free_inode(handle, inode); } ext3_journal_stop(handle); return; no_delete: - end_writeback(inode); + clear_inode(inode); dquot_drop(inode); } @@ -2891,6 +2891,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) transaction_t *transaction; long ret; int block; + uid_t i_uid; + gid_t i_gid; inode = iget_locked(sb, ino); if (!inode) @@ -2907,12 +2909,14 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) bh = iloc.bh; raw_inode = ext3_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if(!(test_opt (inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); @@ -3068,6 +3072,8 @@ static int ext3_do_update_inode(handle_t *handle, struct ext3_inode_info *ei = EXT3_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + uid_t i_uid; + gid_t i_gid; again: /* we can't allow multiple procs in here at once, its a bit racey */ @@ -3080,27 +3086,29 @@ again: ext3_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); if(!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ if(!ei->i_dtime) { raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(inode->i_uid)); + cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(inode->i_gid)); + cpu_to_le16(high_16_bits(i_gid)); } else { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(inode->i_uid)); + cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(inode->i_gid)); + cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } @@ -3262,8 +3270,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if (is_quota_modification(inode, attr)) dquot_initialize(inode); - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, diff --git a/fs/ext3/super.c b/fs/ext3/super.c index cf0b5921cf0f..8c3a44b7c375 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -617,13 +617,15 @@ static int ext3_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",grpid"); if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS)) seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT3_DEF_RESUID || + if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); + seq_printf(seq, ",resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); } - if (sbi->s_resgid != EXT3_DEF_RESGID || + if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); + seq_printf(seq, ",resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); } if (test_opt(sb, ERRORS_RO)) { int def_errors = le16_to_cpu(es->s_errors); @@ -967,6 +969,8 @@ static int parse_options (char *options, struct super_block *sb, substring_t args[MAX_OPT_ARGS]; int data_opt = 0; int option; + kuid_t uid; + kgid_t gid; #ifdef CONFIG_QUOTA int qfmt; #endif @@ -1000,12 +1004,23 @@ static int parse_options (char *options, struct super_block *sb, case Opt_resuid: if (match_int(&args[0], &option)) return 0; - sbi->s_resuid = option; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { + ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option); + return -1; + + } + sbi->s_resuid = uid; break; case Opt_resgid: if (match_int(&args[0], &option)) return 0; - sbi->s_resgid = option; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { + ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option); + return -1; + } + sbi->s_resgid = gid; break; case Opt_sb: /* handled by get_sb_block() instead of here */ @@ -1651,8 +1666,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; - sbi->s_resuid = EXT3_DEF_RESUID; - sbi->s_resgid = EXT3_DEF_RESGID; + sbi->s_resuid = make_kuid(&init_user_ns, EXT3_DEF_RESUID); + sbi->s_resgid = make_kgid(&init_user_ns, EXT3_DEF_RESGID); sbi->s_sb_block = sb_block; blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); @@ -1716,8 +1731,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) else set_opt(sbi->s_mount_opt, ERRORS_RO); - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); /* enable barriers by default */ set_opt(sbi->s_mount_opt, BARRIER); @@ -3000,7 +3015,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); bh = ext3_bread(handle, inode, blk, 1, &err); if (!bh) goto out; @@ -3024,10 +3038,8 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, } brelse(bh); out: - if (err) { - mutex_unlock(&inode->i_mutex); + if (err) return err; - } if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT3_I(inode)->i_disksize = inode->i_size; @@ -3035,7 +3047,6 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); - mutex_unlock(&inode->i_mutex); return len; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 4bbd07a6fa18..c45c41129a35 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -461,8 +461,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ - if (sbi->s_resuid == current_fsuid() || - ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || + if (uid_eq(sbi->s_resuid, current_fsuid()) || + (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || (flags & EXT4_MB_USE_ROOT_BLOCKS)) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0e01e90add8b..c21b1de51afb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1153,8 +1153,8 @@ struct ext4_sb_info { unsigned int s_mount_flags; unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 409c2ee7750a..9f9acac6c43f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -808,8 +808,8 @@ got: } if (owner) { inode->i_mode = mode; - inode->i_uid = owner[0]; - inode->i_gid = owner[1]; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; inode->i_uid = current_fsuid(); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c77b0bd2c711..07eaf565fdcb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3630,6 +3630,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; + uid_t i_uid; + gid_t i_gid; inode = iget_locked(sb, ino); if (!inode) @@ -3645,12 +3647,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (!(test_opt(inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -3870,6 +3874,8 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + uid_t i_uid; + gid_t i_gid; /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ @@ -3878,27 +3884,27 @@ static int ext4_do_update_inode(handle_t *handle, ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); if (!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ if (!ei->i_dtime) { raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(inode->i_uid)); + cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(inode->i_gid)); + cpu_to_le16(high_16_bits(i_gid)); } else { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } } else { - raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(inode->i_uid)); - raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } @@ -4084,8 +4090,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (is_quota_modification(inode, attr)) dquot_initialize(inode); - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index f39f80f8f2c5..f1bb32ec0169 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -466,8 +466,8 @@ int ext4_ext_migrate(struct inode *inode) } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; - owner[0] = inode->i_uid; - owner[1] = inode->i_gid; + owner[0] = i_uid_read(inode); + owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e1fb1d5de58e..35b5954489ee 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1007,7 +1007,7 @@ static void destroy_inodecache(void) void ext4_clear_inode(struct inode *inode) { invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); dquot_drop(inode); ext4_discard_preallocations(inode); if (EXT4_I(inode)->jinode) { @@ -1448,6 +1448,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, { struct ext4_sb_info *sbi = EXT4_SB(sb); const struct mount_opts *m; + kuid_t uid; + kgid_t gid; int arg = 0; #ifdef CONFIG_QUOTA @@ -1474,10 +1476,20 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, "Ignoring removed %s option", opt); return 1; case Opt_resuid: - sbi->s_resuid = arg; + uid = make_kuid(current_user_ns(), arg); + if (!uid_valid(uid)) { + ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); + return -1; + } + sbi->s_resuid = uid; return 1; case Opt_resgid: - sbi->s_resgid = arg; + gid = make_kgid(current_user_ns(), arg); + if (!gid_valid(gid)) { + ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); + return -1; + } + sbi->s_resgid = gid; return 1; case Opt_abort: sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; @@ -1732,12 +1744,14 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("%s", token2str(m->token)); } - if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID || + if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) - SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid); - if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID || + SEQ_OPTS_PRINT("resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); + if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) - SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid); + SEQ_OPTS_PRINT("resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) SEQ_OPTS_PUTS("errors=remount-ro"); @@ -2980,8 +2994,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; - sbi->s_resuid = EXT4_DEF_RESUID; - sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID); + sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID); sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; if (sb->s_bdev->bd_part) @@ -3060,8 +3074,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; @@ -4213,8 +4227,8 @@ static int ext4_unfreeze(struct super_block *sb) struct ext4_mount_options { unsigned long s_mount_opt; unsigned long s_mount_opt2; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; unsigned long s_commit_interval; u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA @@ -4744,7 +4758,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); bh = ext4_bread(handle, inode, blk, 1, &err); if (!bh) goto out; @@ -4760,16 +4773,13 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: - if (err) { - mutex_unlock(&inode->i_mutex); + if (err) return err; - } if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; ext4_mark_inode_dirty(handle, inode); } - mutex_unlock(&inode->i_mutex); return len; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 21687e31acc0..b3d290c1b513 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -454,7 +454,7 @@ static void fat_evict_inode(struct inode *inode) fat_truncate_blocks(inode, 0); } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); fat_cache_inval_inode(inode); fat_detach(inode); } diff --git a/fs/fcntl.c b/fs/fcntl.c index 75e7c1f3a080..d078b75572a7 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -532,9 +532,9 @@ static inline int sigio_perm(struct task_struct *p, rcu_read_lock(); cred = __task_cred(p); - ret = ((fown->euid == 0 || - fown->euid == cred->suid || fown->euid == cred->uid || - fown->uid == cred->suid || fown->uid == cred->uid) && + ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) || + uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) || + uid_eq(fown->uid, cred->suid) || uid_eq(fown->uid, cred->uid)) && !security_file_send_sigiotask(p, fown, sig)); rcu_read_unlock(); return ret; diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index cf9ef918a2a9..ef67c95f12d4 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -355,6 +355,6 @@ void vxfs_evict_inode(struct inode *ip) { truncate_inode_pages(&ip->i_data, 0); - end_writeback(ip); + clear_inode(ip); call_rcu(&ip->i_rcu, vxfs_i_callback); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 539f36cf3e4a..8d2fb8c88cf3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -231,11 +231,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) static void inode_sync_complete(struct inode *inode) { - /* - * Prevent speculative execution through - * spin_unlock(&wb->list_lock); - */ - + inode->i_state &= ~I_SYNC; + /* Waiters must see I_SYNC cleared before being woken up */ smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } @@ -329,10 +326,12 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Wait for writeback on an inode to complete. + * Wait for writeback on an inode to complete. Called with i_lock held. + * Caller must make sure inode cannot go away when we drop i_lock. */ -static void inode_wait_for_writeback(struct inode *inode, - struct bdi_writeback *wb) +static void __inode_wait_for_writeback(struct inode *inode) + __releases(inode->i_lock) + __acquires(inode->i_lock) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; @@ -340,70 +339,119 @@ static void inode_wait_for_writeback(struct inode *inode, wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under wb->list_lock and - * inode->i_lock. Either the caller has an active reference on the inode or - * the inode has I_WILL_FREE set. - * - * If `wait' is set, wait on the writeout. - * - * The whole writeout design is quite complex and fragile. We want to avoid - * starvation of particular inodes when others are being redirtied, prevent - * livelocks, etc. + * Wait for writeback on an inode to complete. Caller must have inode pinned. */ -static int -writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) +void inode_wait_for_writeback(struct inode *inode) { - struct address_space *mapping = inode->i_mapping; - long nr_to_write = wbc->nr_to_write; - unsigned dirty; - int ret; + spin_lock(&inode->i_lock); + __inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); +} - assert_spin_locked(&wb->list_lock); - assert_spin_locked(&inode->i_lock); +/* + * Sleep until I_SYNC is cleared. This function must be called with i_lock + * held and drops it. It is aimed for callers not holding any inode reference + * so once i_lock is dropped, inode can go away. + */ +static void inode_sleep_on_writeback(struct inode *inode) + __releases(inode->i_lock) +{ + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); + int sleep; - if (!atomic_read(&inode->i_count)) - WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); - else - WARN_ON(inode->i_state & I_WILL_FREE); + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + sleep = inode->i_state & I_SYNC; + spin_unlock(&inode->i_lock); + if (sleep) + schedule(); + finish_wait(wqh, &wait); +} - if (inode->i_state & I_SYNC) { +/* + * Find proper writeback list for the inode depending on its current state and + * possibly also change of its state while we were doing writeback. Here we + * handle things such as livelock prevention or fairness of writeback among + * inodes. This function can be called only by flusher thread - noone else + * processes all inodes in writeback lists and requeueing inodes behind flusher + * thread's back can have unexpected consequences. + */ +static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + if (inode->i_state & I_FREEING) + return; + + /* + * Sync livelock prevention. Each inode is tagged and synced in one + * shot. If still dirty, it will be redirty_tail()'ed below. Update + * the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + + if (wbc->pages_skipped) { /* - * If this inode is locked for writeback and we are not doing - * writeback-for-data-integrity, move it to b_more_io so that - * writeback can proceed with the other inodes on s_io. - * - * We'll have another go at writing back this inode when we - * completed a full scan of b_io. + * writeback is not making progress due to locked + * buffers. Skip this inode for now. */ - if (wbc->sync_mode != WB_SYNC_ALL) { + redirty_tail(inode, wb); + return; + } + + if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + /* + * We didn't write back all the pages. nfs_writepages() + * sometimes bales out without doing anything. + */ + if (wbc->nr_to_write <= 0) { + /* Slice used up. Queue for next turn. */ requeue_io(inode, wb); - trace_writeback_single_inode_requeue(inode, wbc, - nr_to_write); - return 0; + } else { + /* + * Writeback blocked by something other than + * congestion. Delay the inode for some time to + * avoid spinning on the CPU (100% iowait) + * retrying writeback of the dirty page/inode + * that cannot be performed immediately. + */ + redirty_tail(inode, wb); } - + } else if (inode->i_state & I_DIRTY) { /* - * It's a data-integrity sync. We must wait. + * Filesystems can dirty the inode during writeback operations, + * such as delayed allocation during submission or metadata + * updates after data IO completion. */ - inode_wait_for_writeback(inode, wb); + redirty_tail(inode, wb); + } else { + /* The inode is clean. Remove from writeback lists. */ + list_del_init(&inode->i_wb_list); } +} - BUG_ON(inode->i_state & I_SYNC); +/* + * Write out an inode and its dirty pages. Do not update the writeback list + * linkage. That is left to the caller. The caller is also responsible for + * setting I_SYNC flag and calling inode_sync_complete() to clear it. + */ +static int +__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + struct address_space *mapping = inode->i_mapping; + long nr_to_write = wbc->nr_to_write; + unsigned dirty; + int ret; - /* Set I_SYNC, reset I_DIRTY_PAGES */ - inode->i_state |= I_SYNC; - inode->i_state &= ~I_DIRTY_PAGES; - spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); + WARN_ON(!(inode->i_state & I_SYNC)); ret = do_writepages(mapping, wbc); @@ -424,6 +472,9 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * write_inode() */ spin_lock(&inode->i_lock); + /* Clear I_DIRTY_PAGES if we've written out all dirty pages */ + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + inode->i_state &= ~I_DIRTY_PAGES; dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); spin_unlock(&inode->i_lock); @@ -433,60 +484,67 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, if (ret == 0) ret = err; } + trace_writeback_single_inode(inode, wbc, nr_to_write); + return ret; +} + +/* + * Write out an inode's dirty pages. Either the caller has an active reference + * on the inode or the inode has I_WILL_FREE set. + * + * This function is designed to be called for writing back one inode which + * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() + * and does more profound writeback list handling in writeback_sb_inodes(). + */ +static int +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + int ret = 0; - spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - inode->i_state &= ~I_SYNC; - if (!(inode->i_state & I_FREEING)) { + if (!atomic_read(&inode->i_count)) + WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); + else + WARN_ON(inode->i_state & I_WILL_FREE); + + if (inode->i_state & I_SYNC) { + if (wbc->sync_mode != WB_SYNC_ALL) + goto out; /* - * Sync livelock prevention. Each inode is tagged and synced in - * one shot. If still dirty, it will be redirty_tail()'ed below. - * Update the dirty time to prevent enqueue and sync it again. + * It's a data-integrity sync. We must wait. Since callers hold + * inode reference or inode has I_WILL_FREE set, it cannot go + * away under us. */ - if ((inode->i_state & I_DIRTY) && - (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) - inode->dirtied_when = jiffies; - - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - /* - * We didn't write back all the pages. nfs_writepages() - * sometimes bales out without doing anything. - */ - inode->i_state |= I_DIRTY_PAGES; - if (wbc->nr_to_write <= 0) { - /* - * slice used up: queue for next turn - */ - requeue_io(inode, wb); - } else { - /* - * Writeback blocked by something other than - * congestion. Delay the inode for some time to - * avoid spinning on the CPU (100% iowait) - * retrying writeback of the dirty page/inode - * that cannot be performed immediately. - */ - redirty_tail(inode, wb); - } - } else if (inode->i_state & I_DIRTY) { - /* - * Filesystems can dirty the inode during writeback - * operations, such as delayed allocation during - * submission or metadata updates after data IO - * completion. - */ - redirty_tail(inode, wb); - } else { - /* - * The inode is clean. At this point we either have - * a reference to the inode or it's on it's way out. - * No need to add it back to the LRU. - */ - list_del_init(&inode->i_wb_list); - } + __inode_wait_for_writeback(inode); } + WARN_ON(inode->i_state & I_SYNC); + /* + * Skip inode if it is clean. We don't want to mess with writeback + * lists in this function since flusher thread may be doing for example + * sync in parallel and if we move the inode, it could get skipped. So + * here we make sure inode is on some writeback list and leave it there + * unless we have completely cleaned the inode. + */ + if (!(inode->i_state & I_DIRTY)) + goto out; + inode->i_state |= I_SYNC; + spin_unlock(&inode->i_lock); + + ret = __writeback_single_inode(inode, wb, wbc); + + spin_lock(&wb->list_lock); + spin_lock(&inode->i_lock); + /* + * If inode is clean, remove it from writeback lists. Otherwise don't + * touch it. See comment above for explanation. + */ + if (!(inode->i_state & I_DIRTY)) + list_del_init(&inode->i_wb_list); + spin_unlock(&wb->list_lock); inode_sync_complete(inode); - trace_writeback_single_inode(inode, wbc, nr_to_write); +out: + spin_unlock(&inode->i_lock); return ret; } @@ -580,29 +638,57 @@ static long writeback_sb_inodes(struct super_block *sb, redirty_tail(inode, wb); continue; } - __iget(inode); + if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { + /* + * If this inode is locked for writeback and we are not + * doing writeback-for-data-integrity, move it to + * b_more_io so that writeback can proceed with the + * other inodes on s_io. + * + * We'll have another go at writing back this inode + * when we completed a full scan of b_io. + */ + spin_unlock(&inode->i_lock); + requeue_io(inode, wb); + trace_writeback_sb_inodes_requeue(inode); + continue; + } + spin_unlock(&wb->list_lock); + + /* + * We already requeued the inode if it had I_SYNC set and we + * are doing WB_SYNC_NONE writeback. So this catches only the + * WB_SYNC_ALL case. + */ + if (inode->i_state & I_SYNC) { + /* Wait for I_SYNC. This function drops i_lock... */ + inode_sleep_on_writeback(inode); + /* Inode may be gone, start again */ + continue; + } + inode->i_state |= I_SYNC; + spin_unlock(&inode->i_lock); + write_chunk = writeback_chunk_size(wb->bdi, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; - writeback_single_inode(inode, wb, &wbc); + /* + * We use I_SYNC to pin the inode in memory. While it is set + * evict_inode() will wait so the inode cannot be freed. + */ + __writeback_single_inode(inode, wb, &wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; + spin_lock(&wb->list_lock); + spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY)) wrote++; - if (wbc.pages_skipped) { - /* - * writeback is not making progress due to locked - * buffers. Skip this inode for now. - */ - redirty_tail(inode, wb); - } + requeue_inode(inode, wb, &wbc); + inode_sync_complete(inode); spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); - iput(inode); - cond_resched(); - spin_lock(&wb->list_lock); + cond_resched_lock(&wb->list_lock); /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. @@ -796,8 +882,10 @@ static long wb_writeback(struct bdi_writeback *wb, trace_writeback_wait(wb->bdi, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); - inode_wait_for_writeback(inode, wb); - spin_unlock(&inode->i_lock); + spin_unlock(&wb->list_lock); + /* This function drops i_lock... */ + inode_sleep_on_writeback(inode); + spin_lock(&wb->list_lock); } } spin_unlock(&wb->list_lock); @@ -1331,7 +1419,6 @@ EXPORT_SYMBOL(sync_inodes_sb); int write_inode_now(struct inode *inode, int sync) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - int ret; struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, @@ -1343,12 +1430,7 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&wb->list_lock); - spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, wb, &wbc); - spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); - return ret; + return writeback_single_inode(inode, wb, &wbc); } EXPORT_SYMBOL(write_inode_now); @@ -1365,15 +1447,7 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - int ret; - - spin_lock(&wb->list_lock); - spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, wb, wbc); - spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); - return ret; + return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); } EXPORT_SYMBOL(sync_inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 26783eb2b1fc..56f6dcf30768 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -122,7 +122,7 @@ static void fuse_destroy_inode(struct inode *inode) static void fuse_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (inode->i_sb->s_flags & MS_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index aa9949e5de26..67fd6beffece 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -543,7 +543,6 @@ struct gfs2_sb_host { struct lm_lockstruct { int ls_jid; unsigned int ls_first; - unsigned int ls_nodir; const struct lm_lockops *ls_ops; dlm_lockspace_t *ls_dlm; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 5f5e70e047dc..4a38db739ca0 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1209,8 +1209,6 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) fsname++; flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; - if (ls->ls_nodir) - flags |= DLM_LSFL_NODIR; /* * create/join lockspace diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c5871ae40561..b8c250fc4922 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -993,6 +993,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ls->ls_jid = option; break; case Opt_id: + case Opt_nodir: /* Obsolete, but left for backward compat purposes */ break; case Opt_first: @@ -1001,12 +1002,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) goto hostdata_error; ls->ls_first = option; break; - case Opt_nodir: - ret = match_int(&tmp[0], &option); - if (ret || (option != 0 && option != 1)) - goto hostdata_error; - ls->ls_nodir = option; - break; case Opt_err: default: hostdata_error: diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 6172fa77ad59..713e621c240b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1554,7 +1554,7 @@ out_unlock: out: /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); gfs2_dir_hash_inval(ip); ip->i_gl->gl_object = NULL; flush_delayed_work_sync(&ip->i_gl->gl_work); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d33172c291ba..9c2592b1d5ff 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -368,10 +368,7 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) struct gfs2_jdesc *jd; int rv; - rv = -ESHUTDOWN; spin_lock(&sdp->sd_jindex_spin); - if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) - goto out; rv = -EBUSY; if (sdp->sd_jdesc->jd_jid == jid) goto out; @@ -396,8 +393,13 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) if (rv != 1) return -EINVAL; - rv = gfs2_recover_set(sdp, jid); + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) { + rv = -ESHUTDOWN; + goto out; + } + rv = gfs2_recover_set(sdp, jid); +out: return rv ? rv : len; } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 737dbeb64320..761ec06354b4 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -532,7 +532,7 @@ out: void hfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; iput(HFS_I(inode)->rsrc_inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index ceb1c281eefb..a9bca4b8768b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -154,7 +154,7 @@ static void hfsplus_evict_inode(struct inode *inode) { dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; iput(HFSPLUS_I(inode)->rsrc_inode); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 07c516bfea76..2afa5bbccf9b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -240,7 +240,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) static void hostfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (HOSTFS_I(inode)->fd != -1) { close_file(&HOSTFS_I(inode)->fd); HOSTFS_I(inode)->fd = -1; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 3b2cec29972b..b43066cbdc6a 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -299,7 +299,7 @@ void hpfs_write_if_changed(struct inode *inode) void hpfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (!inode->i_nlink) { hpfs_lock(inode->i_sb); hpfs_remove_fnode(inode->i_sb, inode->i_ino); diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index a80e45a690ac..d4f93b52cec5 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -614,7 +614,7 @@ static struct inode *hppfs_alloc_inode(struct super_block *sb) void hppfs_evict_inode(struct inode *ino) { - end_writeback(ino); + clear_inode(ino); dput(HPPFS_I(ino)->proc_dentry); mntput(ino->i_sb->s_fs_info); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 001ef01d2fe2..cc9281b6c628 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -393,7 +393,7 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) static void hugetlbfs_evict_inode(struct inode *inode) { truncate_hugepages(inode, 0); - end_writeback(inode); + clear_inode(inode); } static inline void diff --git a/fs/inode.c b/fs/inode.c index 9f4f5fecc096..6bc8761cc333 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -135,8 +135,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fop = &empty_fops; inode->__i_nlink = 1; inode->i_opflags = 0; - inode->i_uid = 0; - inode->i_gid = 0; + i_uid_write(inode, 0); + i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_blocks = 0; @@ -486,7 +486,7 @@ void __remove_inode_hash(struct inode *inode) } EXPORT_SYMBOL(__remove_inode_hash); -void end_writeback(struct inode *inode) +void clear_inode(struct inode *inode) { might_sleep(); /* @@ -500,11 +500,10 @@ void end_writeback(struct inode *inode) BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); - inode_sync_wait(inode); /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } -EXPORT_SYMBOL(end_writeback); +EXPORT_SYMBOL(clear_inode); /* * Free the inode passed in, removing it from the lists it is still connected @@ -531,12 +530,20 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); + /* + * Wait for flusher thread to be done with the inode so that filesystem + * does not start destroying it while writeback is still running. Since + * the inode has I_FREEING set, flusher thread won't start new work on + * the inode. We just have to wait for running writeback to finish. + */ + inode_wait_for_writeback(inode); + if (op->evict_inode) { op->evict_inode(inode); } else { if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); } if (S_ISBLK(inode->i_mode) && inode->i_bdev) bd_forget(inode); @@ -1647,6 +1654,7 @@ void __init inode_init_early(void) HASH_EARLY, &i_hash_shift, &i_hash_mask, + 0, 0); for (loop = 0; loop < (1U << i_hash_shift); loop++) @@ -1677,6 +1685,7 @@ void __init inode_init(void) 0, &i_hash_shift, &i_hash_mask, + 0, 0); for (loop = 0; loop < (1U << i_hash_shift); loop++) @@ -1732,11 +1741,9 @@ EXPORT_SYMBOL(inode_init_owner); */ bool inode_owner_or_capable(const struct inode *inode) { - struct user_namespace *ns = inode_userns(inode); - - if (current_user_ns() == ns && current_fsuid() == inode->i_uid) + if (uid_eq(current_fsuid(), inode->i_uid)) return true; - if (ns_capable(ns, CAP_FOWNER)) + if (inode_capable(inode, CAP_FOWNER)) return true; return false; } diff --git a/fs/ioprio.c b/fs/ioprio.c index 0f1b9515213b..5e6dbe8958fc 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -37,8 +37,8 @@ int set_task_ioprio(struct task_struct *task, int ioprio) rcu_read_lock(); tcred = __task_cred(task); - if (tcred->uid != cred->euid && - tcred->uid != cred->uid && !capable(CAP_SYS_NICE)) { + if (!uid_eq(tcred->uid, cred->euid) && + !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { rcu_read_unlock(); return -EPERM; } @@ -65,6 +65,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) struct task_struct *p, *g; struct user_struct *user; struct pid *pgrp; + kuid_t uid; int ret; switch (class) { @@ -110,16 +111,19 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: + uid = make_kuid(current_user_ns(), who); + if (!uid_valid(uid)) + break; if (!who) user = current_user(); else - user = find_user(who); + user = find_user(uid); if (!user) break; do_each_thread(g, p) { - if (__task_cred(p)->uid != who) + if (!uid_eq(task_uid(p), uid)) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -174,6 +178,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) struct task_struct *g, *p; struct user_struct *user; struct pid *pgrp; + kuid_t uid; int ret = -ESRCH; int tmpio; @@ -203,16 +208,17 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: + uid = make_kuid(current_user_ns(), who); if (!who) user = current_user(); else - user = find_user(who); + user = find_user(uid); if (!user) break; do_each_thread(g, p) { - if (__task_cred(p)->uid != user->uid) + if (!uid_eq(task_uid(p), user->uid)) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 05f0754f2b46..08c03044abdd 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -508,20 +508,19 @@ int cleanup_journal_tail(journal_t *journal) /* * We need to make sure that any blocks that were recently written out * --- perhaps by log_do_checkpoint() --- are flushed out before we - * drop the transactions from the journal. It's unlikely this will be - * necessary, especially with an appropriately sized journal, but we - * need this to guarantee correctness. Fortunately - * cleanup_journal_tail() doesn't get called all that often. + * drop the transactions from the journal. Similarly we need to be sure + * superblock makes it to disk before next transaction starts reusing + * freed space (otherwise we could replay some blocks of the new + * transaction thinking they belong to the old one). So we use + * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially + * with an appropriately sized journal, but we need this to guarantee + * correctness. Fortunately cleanup_journal_tail() doesn't get called + * all that often. */ - if (journal->j_flags & JFS_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + journal_update_sb_log_tail(journal, first_tid, blocknr, + WRITE_FLUSH_FUA); spin_lock(&journal->j_state_lock); - if (!tid_gt(first_tid, journal->j_tail_sequence)) { - spin_unlock(&journal->j_state_lock); - /* Someone else cleaned up journal so return 0 */ - return 0; - } /* OK, update the superblock to recover the freed space. * Physical blocks come first: have we wrapped beyond the end of * the log? */ @@ -539,8 +538,6 @@ int cleanup_journal_tail(journal_t *journal) journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); - if (!(journal->j_flags & JFS_ABORT)) - journal_update_superblock(journal, 1); return 0; } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index f2b9a571f4cf..52c15c776029 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -298,6 +298,7 @@ void journal_commit_transaction(journal_t *journal) int tag_flag; int i; struct blk_plug plug; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -307,7 +308,16 @@ void journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); - journal_update_superblock(journal, 1); + mutex_lock(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + journal_update_sb_log_tail(journal, journal->j_tail_sequence, + journal->j_tail, WRITE_SYNC); + mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); } @@ -413,13 +423,16 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 2\n"); + if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid)) + write_op = WRITE_SYNC; + /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ blk_start_plug(&plug); err = journal_submit_data_buffers(journal, commit_transaction, - WRITE_SYNC); + write_op); blk_finish_plug(&plug); /* @@ -478,7 +491,7 @@ void journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); - journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC); + journal_write_revoke_records(journal, commit_transaction, write_op); /* * If we found any dirty or locked buffers, then we should have @@ -649,7 +662,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE_SYNC, bh); + submit_bh(write_op, bh); } cond_resched(); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 0971e9217808..425c2f2cf170 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -563,6 +563,8 @@ int log_wait_commit(journal_t *journal, tid_t tid) spin_unlock(&journal->j_state_lock); #endif spin_lock(&journal->j_state_lock); + if (!tid_geq(journal->j_commit_waited, tid)) + journal->j_commit_waited = tid; while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); @@ -921,8 +923,33 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; - /* Add the dynamic fields and write it to disk. */ - journal_update_superblock(journal, 1); + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + journal->j_flags |= JFS_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); + } return journal_start_thread(journal); } @@ -999,35 +1026,15 @@ int journal_create(journal_t *journal) return journal_reset(journal); } -/** - * void journal_update_superblock() - Update journal sb on disk. - * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. - * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. - */ -void journal_update_superblock(journal_t *journal, int wait) +static void journal_write_superblock(journal_t *journal, int write_op) { - journal_superblock_t *sb = journal->j_superblock; struct buffer_head *bh = journal->j_sb_buffer; + int ret; - /* - * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0) and there are no outstanding transactions - * in the filesystem, then we can safely defer the superblock update - * until the next commit by setting JFS_FLUSHED. This avoids - * attempting a write to a potential-readonly device. - */ - if (sb->s_start == 0 && journal->j_tail_sequence == - journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, - journal->j_errno); - goto out; - } - + trace_journal_write_superblock(journal, write_op); + if (!(journal->j_flags & JFS_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); if (buffer_write_io_error(bh)) { char b[BDEVNAME_SIZE]; /* @@ -1045,42 +1052,100 @@ void journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: Error %d detected " + "when updating journal superblock for %s.\n", + ret, journal_dev_name(journal, b)); + } +} + +/** + * journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned int tail_block, int write_op) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", + tail_block, tail_tid); + + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + + journal_write_superblock(journal, write_op); + + /* Log is no longer empty */ + spin_lock(&journal->j_state_lock); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); spin_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, journal->j_errno); + jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - sb->s_errno = cpu_to_be32(journal->j_errno); + sb->s_start = cpu_to_be32(0); spin_unlock(&journal->j_state_lock); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_write_io_error(bh)) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "JBD: I/O error detected " - "when updating journal superblock for %s.\n", - journal_dev_name(journal, b)); - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - } else - write_dirty_buffer(bh, WRITE); + journal_write_superblock(journal, WRITE_FUA); - trace_jbd_update_superblock_end(journal, wait); -out: - /* If we have just flushed the log (by marking s_start==0), then - * any future commit will have to be careful to update the - * superblock again to re-record the true start of the log. */ + spin_lock(&journal->j_state_lock); + /* Log is empty */ + journal->j_flags |= JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +static void journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; spin_lock(&journal->j_state_lock); - if (sb->s_start) - journal->j_flags &= ~JFS_FLUSHED; - else - journal->j_flags |= JFS_FLUSHED; + jbd_debug(1, "JBD: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal, WRITE_SYNC); } /* @@ -1251,6 +1316,8 @@ int journal_destroy(journal_t *journal) /* Force any old transactions to disk */ + /* We cannot race with anybody but must keep assertions happy */ + mutex_lock(&journal->j_checkpoint_mutex); /* Totally anal locking here... */ spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { @@ -1266,16 +1333,14 @@ int journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - /* We can now mark the journal as empty. */ - journal->j_tail = 0; journal->j_tail_sequence = ++journal->j_transaction_sequence; - journal_update_superblock(journal, 1); - } else { + mark_journal_empty(journal); + } else err = -EIO; - } brelse(journal->j_sb_buffer); } + mutex_unlock(&journal->j_checkpoint_mutex); if (journal->j_inode) iput(journal->j_inode); @@ -1455,7 +1520,6 @@ int journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned int old_tail; spin_lock(&journal->j_state_lock); @@ -1490,6 +1554,7 @@ int journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; + mutex_lock(&journal->j_checkpoint_mutex); cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1497,14 +1562,9 @@ int journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_state_lock); - old_tail = journal->j_tail; - journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); - journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); - journal->j_tail = old_tail; - J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); @@ -1544,8 +1604,12 @@ int journal_wipe(journal_t *journal, int write) write ? "Clearing" : "Ignoring"); err = journal_skip_recovery(journal); - if (write) - journal_update_superblock(journal, 1); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } no_recovery: return err; @@ -1613,7 +1677,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) __journal_abort_hard(journal); if (errno) - journal_update_superblock(journal, 1); + journal_update_sb_errno(journal); } /** diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index b2a7e5244e39..febc10db5ced 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1433,8 +1433,6 @@ int journal_stop(handle_t *handle) } } - if (handle->h_sync) - transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index bb6f993ebca9..3d3092eda811 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -240,7 +240,7 @@ void jffs2_evict_inode (struct inode *inode) jffs2_dbg(1, "%s(): ino #%lu mode %o\n", __func__, inode->i_ino, inode->i_mode); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); jffs2_do_clear_inode(c, f); } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 77b69b27f825..4692bf3ca8cb 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -169,7 +169,7 @@ void jfs_evict_inode(struct inode *inode) } else { truncate_inode_pages(&inode->i_data, 0); } - end_writeback(inode); + clear_inode(inode); dquot_drop(inode); } diff --git a/fs/locks.c b/fs/locks.c index 0d68f1f81799..4f441e46cef4 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1446,7 +1446,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) struct inode *inode = dentry->d_inode; int error; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) + if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) return -EACCES; if (!S_ISREG(inode->i_mode)) return -EINVAL; diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index e3ab5e5a904c..f1cb512c5019 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -2175,7 +2175,7 @@ void logfs_evict_inode(struct inode *inode) } } truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); /* Cheaper version of write_inode. All changes are concealed in * aliases, which are moved back. No write to the medium happens. diff --git a/fs/minix/inode.c b/fs/minix/inode.c index fcb05d2c6b5f..2a503ad020d5 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -32,7 +32,7 @@ static void minix_evict_inode(struct inode *inode) minix_truncate(inode); } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (!inode->i_nlink) minix_free_inode(inode); } diff --git a/fs/namei.c b/fs/namei.c index f9e883c1b856..c651f02c9fec 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -16,6 +16,7 @@ #include <linux/init.h> #include <linux/export.h> +#include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/namei.h> @@ -218,10 +219,7 @@ static int acl_permission_check(struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - if (current_user_ns() != inode_userns(inode)) - goto other_perms; - - if (likely(current_fsuid() == inode->i_uid)) + if (likely(uid_eq(current_fsuid(), inode->i_uid))) mode >>= 6; else { if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { @@ -234,7 +232,6 @@ static int acl_permission_check(struct inode *inode, int mask) mode >>= 3; } -other_perms: /* * If the DACs are ok we don't need any capability check. */ @@ -270,10 +267,10 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) + if (inode_capable(inode, CAP_DAC_OVERRIDE)) return 0; if (!(mask & MAY_WRITE)) - if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) + if (inode_capable(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } @@ -283,7 +280,7 @@ int generic_permission(struct inode *inode, int mask) * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) + if (inode_capable(inode, CAP_DAC_OVERRIDE)) return 0; /* @@ -291,7 +288,7 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) + if (inode_capable(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; @@ -1455,7 +1452,8 @@ EXPORT_SYMBOL(full_name_hash); */ static inline unsigned long hash_name(const char *name, unsigned int *hashp) { - unsigned long a, mask, hash, len; + unsigned long a, b, adata, bdata, mask, hash, len; + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; hash = a = 0; len = -sizeof(unsigned long); @@ -1463,17 +1461,18 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) hash = (hash + a) * 9; len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); - /* Do we have any NUL or '/' bytes in this word? */ - mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/')); - } while (!mask); - - /* The mask *below* the first high bit set */ - mask = (mask - 1) & ~mask; - mask >>= 7; - hash += a & mask; + b = a ^ REPEAT_BYTE('/'); + } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); + + adata = prep_zero_mask(a, adata, &constants); + bdata = prep_zero_mask(b, bdata, &constants); + + mask = create_zero_mask(adata | bdata); + + hash += a & zero_bytemask(mask); *hashp = fold_hash(hash); - return len + count_masked_bytes(mask); + return len + find_zero(mask); } #else @@ -1934,19 +1933,15 @@ static int user_path_parent(int dfd, const char __user *path, */ static inline int check_sticky(struct inode *dir, struct inode *inode) { - uid_t fsuid = current_fsuid(); + kuid_t fsuid = current_fsuid(); if (!(dir->i_mode & S_ISVTX)) return 0; - if (current_user_ns() != inode_userns(inode)) - goto other_userns; - if (inode->i_uid == fsuid) + if (uid_eq(inode->i_uid, fsuid)) return 0; - if (dir->i_uid == fsuid) + if (uid_eq(dir->i_uid, fsuid)) return 0; - -other_userns: - return !ns_capable(inode_userns(inode), CAP_FOWNER); + return !inode_capable(inode, CAP_FOWNER); } /* @@ -2534,8 +2529,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && - !ns_capable(inode_userns(dir), CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 87484fb8d177..333df07ae3bd 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -292,7 +292,7 @@ static void ncp_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (S_ISDIR(inode->i_mode)) { DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e8bbfa5b3500..c6073139b402 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -121,7 +121,7 @@ static void nfs_clear_inode(struct inode *inode) void nfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); nfs_clear_inode(inode); } @@ -1500,7 +1500,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); pnfs_return_layout(inode); pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! */ diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 79717a40daba..204438cc914e 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -1,6 +1,7 @@ /* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/sched.h> +#include <linux/user_namespace.h> #include "nfsd.h" #include "auth.h" @@ -56,8 +57,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) goto oom; for (i = 0; i < rqgi->ngroups; i++) { - if (!GROUP_AT(rqgi, i)) - GROUP_AT(gi, i) = exp->ex_anon_gid; + if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i))) + GROUP_AT(gi, i) = make_kgid(&init_user_ns, exp->ex_anon_gid); else GROUP_AT(gi, i) = GROUP_AT(rqgi, i); } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 8f7b95ac1f7e..7cc64465ec26 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -734,7 +734,7 @@ void nilfs_evict_inode(struct inode *inode) if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); nilfs_clear_inode(inode); return; } @@ -746,7 +746,7 @@ void nilfs_evict_inode(struct inode *inode) /* TODO: some of the following operations may fail. */ nilfs_truncate_bmap(ii, 0); nilfs_mark_inode_dirty(inode); - end_writeback(inode); + clear_inode(inode); ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); if (!ret) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 2eaa66652944..c6dbd3db6ca8 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2258,7 +2258,7 @@ void ntfs_evict_big_inode(struct inode *vi) ntfs_inode *ni = NTFS_I(vi); truncate_inode_pages(&vi->i_data, 0); - end_writeback(vi); + clear_inode(vi); #ifdef NTFS_RW if (NInoDirty(ni)) { diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 3b5825ef3193..e31d6ae013ab 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -367,7 +367,7 @@ static void dlmfs_evict_inode(struct inode *inode) int status; struct dlmfs_inode_private *ip; - end_writeback(inode); + clear_inode(inode); mlog(0, "inode %lu\n", inode->i_ino); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 17454a904d7b..735514ca400f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1069,7 +1069,7 @@ static void ocfs2_clear_inode(struct inode *inode) int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); - end_writeback(inode); + clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, inode->i_nlink); diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index dbc842222589..e6213b3725d1 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -184,7 +184,7 @@ int omfs_sync_inode(struct inode *inode) static void omfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (inode->i_nlink) return; diff --git a/fs/open.c b/fs/open.c index 5eccdcea2d1b..d54301219d04 100644 --- a/fs/open.c +++ b/fs/open.c @@ -316,7 +316,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ - if (override_cred->uid) + kuid_t root_uid = make_kuid(override_cred->user_ns, 0); + if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = @@ -505,15 +506,24 @@ static int chown_common(struct path *path, uid_t user, gid_t group) struct inode *inode = path->dentry->d_inode; int error; struct iattr newattrs; + kuid_t uid; + kgid_t gid; + + uid = make_kuid(current_user_ns(), user); + gid = make_kgid(current_user_ns(), group); newattrs.ia_valid = ATTR_CTIME; if (user != (uid_t) -1) { + if (!uid_valid(uid)) + return -EINVAL; newattrs.ia_valid |= ATTR_UID; - newattrs.ia_uid = user; + newattrs.ia_uid = uid; } if (group != (gid_t) -1) { + if (!gid_valid(gid)) + return -EINVAL; newattrs.ia_valid |= ATTR_GID; - newattrs.ia_gid = group; + newattrs.ia_gid = gid; } if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= diff --git a/fs/proc/array.c b/fs/proc/array.c index f9bd395b3473..dc4c5a7b9ece 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -81,6 +81,7 @@ #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/tracehook.h> +#include <linux/user_namespace.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -161,6 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk) static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { + struct user_namespace *user_ns = current_user_ns(); struct group_info *group_info; int g; struct fdtable *fdt = NULL; @@ -189,8 +191,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_tgid_nr_ns(p, ns), pid_nr_ns(pid, ns), ppid, tpid, - cred->uid, cred->euid, cred->suid, cred->fsuid, - cred->gid, cred->egid, cred->sgid, cred->fsgid); + from_kuid_munged(user_ns, cred->uid), + from_kuid_munged(user_ns, cred->euid), + from_kuid_munged(user_ns, cred->suid), + from_kuid_munged(user_ns, cred->fsuid), + from_kgid_munged(user_ns, cred->gid), + from_kgid_munged(user_ns, cred->egid), + from_kgid_munged(user_ns, cred->sgid), + from_kgid_munged(user_ns, cred->fsgid)); task_lock(p); if (p->files) @@ -205,7 +213,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_unlock(p); for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) - seq_printf(m, "%d ", GROUP_AT(group_info, g)); + seq_printf(m, "%d ", + from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); seq_putc(m, '\n'); diff --git a/fs/proc/base.c b/fs/proc/base.c index 57b8159f26f3..d2d3108a611c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -81,6 +81,7 @@ #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> +#include <linux/user_namespace.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/flex_array.h> @@ -1561,8 +1562,8 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) generic_fillattr(inode, stat); rcu_read_lock(); - stat->uid = 0; - stat->gid = 0; + stat->uid = GLOBAL_ROOT_UID; + stat->gid = GLOBAL_ROOT_GID; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(pid, task, 2)) { @@ -1622,8 +1623,8 @@ int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); @@ -1815,8 +1816,8 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } i_mode = S_IFLNK; @@ -2045,8 +2046,8 @@ static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } security_task_to_inode(task, inode); status = 1; @@ -2924,6 +2925,74 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) } #endif /* CONFIG_TASK_IO_ACCOUNTING */ +#ifdef CONFIG_USER_NS +static int proc_id_map_open(struct inode *inode, struct file *file, + struct seq_operations *seq_ops) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + struct seq_file *seq; + int ret = -EINVAL; + + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + ret = seq_open(file, seq_ops); + if (ret) + goto err_put_ns; + + seq = file->private_data; + seq->private = ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_id_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + put_user_ns(ns); + return seq_release(inode, file); +} + +static int proc_uid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_uid_seq_operations); +} + +static int proc_gid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_gid_seq_operations); +} + +static const struct file_operations proc_uid_map_operations = { + .open = proc_uid_map_open, + .write = proc_uid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static const struct file_operations proc_gid_map_operations = { + .open = proc_gid_map_open, + .write = proc_gid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; +#endif /* CONFIG_USER_NS */ + static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -3026,6 +3095,10 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), #endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), +#endif }; static int proc_tgid_base_readdir(struct file * filp, @@ -3381,6 +3454,10 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), #endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), +#endif }; static int proc_tid_base_readdir(struct file * filp, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 205c92280838..7ac817b64a71 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -33,7 +33,7 @@ static void proc_evict_inode(struct inode *inode) const struct proc_ns_operations *ns_ops; truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); /* Stop tracking associated processes */ put_pid(PROC_I(inode)->pid); @@ -108,8 +108,8 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) struct super_block *sb = root->d_sb; struct pid_namespace *pid = sb->s_fs_info; - if (pid->pid_gid) - seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); if (pid->hide_pid != 0) seq_printf(seq, ",hidepid=%u", pid->hide_pid); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 21d836f40292..3476bca8f7af 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -371,9 +371,9 @@ void register_sysctl_root(struct ctl_table_root *root) static int test_perm(int mode, int op) { - if (!current_euid()) + if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) mode >>= 6; - else if (in_egroup_p(0)) + else if (in_egroup_p(GLOBAL_ROOT_GID)) mode >>= 3; if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; diff --git a/fs/proc/root.c b/fs/proc/root.c index eed44bfc85db..7c30fce037c0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -67,7 +67,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) case Opt_gid: if (match_int(&args[0], &option)) return 0; - pid->pid_gid = option; + pid->pid_gid = make_kgid(current_user_ns(), option); break; case Opt_hidepid: if (match_int(&args[0], &option)) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 19507889bb7f..aeb19e68e086 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -85,7 +85,7 @@ static void pstore_evict_inode(struct inode *inode) struct pstore_private *p = inode->i_private; unsigned long flags; - end_writeback(inode); + clear_inode(inode); if (p) { spin_lock_irqsave(&allpstore_lock, flags); list_del(&p->list); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index d69a1d1d7e15..10cbe841cb7e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -116,15 +116,15 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > + * dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock > * dqio_mutex + * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > * dqptr_sem. But filesystem has to count with the fact that functions such as * dquot_alloc_space() acquire dqptr_sem and they usually have to be called * from inside a transaction to keep filesystem consistency after a crash. Also * filesystems usually want to do some IO on dquot from ->mark_dirty which is * called with dqptr_sem held. - * i_mutex on quota files is special (it's below dqio_mutex) */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); @@ -638,7 +638,7 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait) dqstats_inc(DQST_SYNCS); mutex_unlock(&dqopt->dqonoff_mutex); - if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) + if (!wait || (dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return 0; /* This is not very clever (and fast) but currently I don't know about @@ -652,18 +652,17 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait) * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + mutex_lock(&dqopt->dqonoff_mutex); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, - I_MUTEX_QUOTA); - truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); - mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); + mutex_lock(&dqopt->files[cnt]->i_mutex); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + mutex_unlock(&dqopt->files[cnt]->i_mutex); } - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + mutex_unlock(&dqopt->dqonoff_mutex); return 0; } @@ -907,14 +906,14 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode->i_lock); continue; } -#ifdef CONFIG_QUOTA_DEBUG - if (unlikely(inode_get_rsv_space(inode) > 0)) - reserved = 1; -#endif __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&inode_sb_list_lock); +#ifdef CONFIG_QUOTA_DEBUG + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; +#endif iput(old_inode); __dquot_initialize(inode, type); @@ -2037,8 +2036,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_loaded(sb, cnt)) { - mutex_lock_nested(&toputinode[cnt]->i_mutex, - I_MUTEX_QUOTA); + mutex_lock(&toputinode[cnt]->i_mutex); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, @@ -2133,7 +2131,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + mutex_lock(&inode->i_mutex); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; @@ -2180,7 +2178,7 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + mutex_lock(&inode->i_mutex); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 494c315c7417..59d06871a850 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -76,14 +76,14 @@ void reiserfs_evict_inode(struct inode *inode) ; } out: - end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */ + clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ dquot_drop(inode); inode->i_blocks = 0; reiserfs_write_unlock_once(inode->i_sb, depth); return; no_delete: - end_writeback(inode); + clear_inode(inode); dquot_drop(inode); } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 8b7616ef06d8..c07b7d709447 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2270,7 +2270,6 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -2302,16 +2301,13 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) { - mutex_unlock(&inode->i_mutex); + if (len == towrite) return err; - } if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/stat.c b/fs/stat.c index 0cef3366a919..b6ff11825fc8 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -138,8 +138,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - SET_UID(tmp.st_uid, stat->uid); - SET_GID(tmp.st_gid, stat->gid); + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = old_encode_dev(stat->rdev); #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -224,8 +224,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - SET_UID(tmp.st_uid, stat->uid); - SET_GID(tmp.st_gid, stat->gid); + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; @@ -355,8 +355,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) #endif tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; - tmp.st_uid = stat->uid; - tmp.st_gid = stat->gid; + tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); + tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.st_atime = stat->atime.tv_sec; tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime = stat->mtime.tv_sec; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index feb2d69396cf..0ce3ccf7f401 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -62,8 +62,8 @@ static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) /* assign default attributes */ iattrs->ia_mode = sd->s_mode; - iattrs->ia_uid = 0; - iattrs->ia_gid = 0; + iattrs->ia_uid = GLOBAL_ROOT_UID; + iattrs->ia_gid = GLOBAL_ROOT_GID; iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; return attrs; @@ -310,7 +310,7 @@ void sysfs_evict_inode(struct inode *inode) struct sysfs_dirent *sd = inode->i_private; truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); sysfs_put(sd); } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 3da5ce25faf0..08d0b2568cd3 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -316,7 +316,7 @@ static void sysv_evict_inode(struct inode *inode) sysv_truncate(inode); } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (!inode->i_nlink) sysv_free_inode(inode); } diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index f8b0160da2da..ba66d508006a 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -11,12 +11,6 @@ config UBIFS_FS help UBIFS is a file system for flash devices which works on top of UBI. -config UBIFS_FS_XATTR - bool "Extended attributes support" - depends on UBIFS_FS - help - This option enables support of extended attributes. - config UBIFS_FS_ADVANCED_COMPR bool "Advanced compression options" depends on UBIFS_FS @@ -41,20 +35,3 @@ config UBIFS_FS_ZLIB default y help Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. - -# Debugging-related stuff -config UBIFS_FS_DEBUG - bool "Enable debugging support" - depends on UBIFS_FS - select DEBUG_FS - select KALLSYMS - help - This option enables UBIFS debugging support. It makes sure various - assertions, self-checks, debugging messages and test modes are compiled - in (this all is compiled out otherwise). Assertions are light-weight - and this option also enables them. Self-checks, debugging messages and - test modes are switched off by default. Thus, it is safe and actually - recommended to have debugging support enabled, and it should not slow - down UBIFS. You can then further enable / disable individual debugging - features using UBIFS module parameters and the corresponding sysfs - interfaces. diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index 80e93c35e496..2c6f0cb816b4 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -3,7 +3,4 @@ obj-$(CONFIG_UBIFS_FS) += ubifs.o ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o -ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o - -ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o -ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o +ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index fb3b5c813a30..8eda717cb99b 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -496,7 +496,9 @@ int ubifs_gc_should_commit(struct ubifs_info *c) return ret; } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ /** * struct idx_node - hold index nodes during index tree traversal. @@ -714,14 +716,14 @@ out: return 0; out_dump: - dbg_err("dumping index node (iip=%d)", i->iip); - dbg_dump_node(c, idx); + ubifs_err("dumping index node (iip=%d)", i->iip); + ubifs_dump_node(c, idx); list_del(&i->list); kfree(i); if (!list_empty(&list)) { i = list_entry(list.prev, struct idx_node, list); - dbg_err("dumping parent index node"); - dbg_dump_node(c, &i->idx); + ubifs_err("dumping parent index node"); + ubifs_dump_node(c, &i->idx); } out_free: while (!list_empty(&list)) { @@ -734,5 +736,3 @@ out_free: err = -EINVAL; return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 1934084e2088..685a83756b2b 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -34,8 +34,6 @@ #include <linux/random.h> #include "ubifs.h" -#ifdef CONFIG_UBIFS_FS_DEBUG - static DEFINE_SPINLOCK(dbg_lock); static const char *get_key_fmt(int fmt) @@ -232,7 +230,7 @@ static void dump_ch(const struct ubifs_ch *ch) printk(KERN_ERR "\tlen %u\n", le32_to_cpu(ch->len)); } -void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) +void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); struct qstr nm = { .name = NULL }; @@ -300,7 +298,7 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) kfree(pdent); } -void dbg_dump_node(const struct ubifs_info *c, const void *node) +void ubifs_dump_node(const struct ubifs_info *c, const void *node) { int i, n; union ubifs_key key; @@ -603,7 +601,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) spin_unlock(&dbg_lock); } -void dbg_dump_budget_req(const struct ubifs_budget_req *req) +void ubifs_dump_budget_req(const struct ubifs_budget_req *req) { spin_lock(&dbg_lock); printk(KERN_ERR "Budgeting request: new_ino %d, dirtied_ino %d\n", @@ -620,7 +618,7 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req) spin_unlock(&dbg_lock); } -void dbg_dump_lstats(const struct ubifs_lp_stats *lst) +void ubifs_dump_lstats(const struct ubifs_lp_stats *lst) { spin_lock(&dbg_lock); printk(KERN_ERR "(pid %d) Lprops statistics: empty_lebs %d, " @@ -634,7 +632,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst) spin_unlock(&dbg_lock); } -void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) +void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) { int i; struct rb_node *rb; @@ -707,7 +705,7 @@ out_unlock: spin_unlock(&c->space_lock); } -void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) +void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) { int i, spc, dark = 0, dead = 0; struct rb_node *rb; @@ -801,7 +799,7 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) printk(KERN_CONT ")\n"); } -void dbg_dump_lprops(struct ubifs_info *c) +void ubifs_dump_lprops(struct ubifs_info *c) { int lnum, err; struct ubifs_lprops lp; @@ -810,20 +808,20 @@ void dbg_dump_lprops(struct ubifs_info *c) printk(KERN_ERR "(pid %d) start dumping LEB properties\n", current->pid); ubifs_get_lp_stats(c, &lst); - dbg_dump_lstats(&lst); + ubifs_dump_lstats(&lst); for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { err = ubifs_read_one_lp(c, lnum, &lp); if (err) ubifs_err("cannot read lprops for LEB %d", lnum); - dbg_dump_lprop(c, &lp); + ubifs_dump_lprop(c, &lp); } printk(KERN_ERR "(pid %d) finish dumping LEB properties\n", current->pid); } -void dbg_dump_lpt_info(struct ubifs_info *c) +void ubifs_dump_lpt_info(struct ubifs_info *c) { int i; @@ -862,8 +860,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } -void dbg_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs) +void ubifs_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) { struct ubifs_scan_node *snod; @@ -874,11 +872,11 @@ void dbg_dump_sleb(const struct ubifs_info *c, cond_resched(); printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", sleb->lnum, snod->offs, snod->len); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); } } -void dbg_dump_leb(const struct ubifs_info *c, int lnum) +void ubifs_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; @@ -909,7 +907,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) cond_resched(); printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", lnum, snod->offs, snod->len); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); } printk(KERN_ERR "(pid %d) finish dumping LEB %d\n", @@ -921,8 +919,8 @@ out: return; } -void dbg_dump_znode(const struct ubifs_info *c, - const struct ubifs_znode *znode) +void ubifs_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) { int n; const struct ubifs_zbranch *zbr; @@ -965,7 +963,7 @@ void dbg_dump_znode(const struct ubifs_info *c, spin_unlock(&dbg_lock); } -void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) +void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) { int i; @@ -981,8 +979,8 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) printk(KERN_ERR "(pid %d) finish dumping heap\n", current->pid); } -void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, - struct ubifs_nnode *parent, int iip) +void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) { int i; @@ -999,7 +997,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, } } -void dbg_dump_tnc(struct ubifs_info *c) +void ubifs_dump_tnc(struct ubifs_info *c) { struct ubifs_znode *znode; int level; @@ -1014,7 +1012,7 @@ void dbg_dump_tnc(struct ubifs_info *c) level = znode->level; printk(KERN_ERR "== Level %d ==\n", level); } - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); } printk(KERN_ERR "(pid %d) finish dumping TNC tree\n", current->pid); @@ -1023,18 +1021,18 @@ void dbg_dump_tnc(struct ubifs_info *c) static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, void *priv) { - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); return 0; } /** - * dbg_dump_index - dump the on-flash index. + * ubifs_dump_index - dump the on-flash index. * @c: UBIFS file-system description object * - * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()' + * This function dumps whole UBIFS indexing B-tree, unlike 'ubifs_dump_tnc()' * which dumps only in-memory znodes and does not read znodes which from flash. */ -void dbg_dump_index(struct ubifs_info *c) +void ubifs_dump_index(struct ubifs_info *c) { dbg_walk_index(c, NULL, dump_znode, NULL); } @@ -1120,15 +1118,15 @@ int dbg_check_space_info(struct ubifs_info *c) out: ubifs_msg("saved lprops statistics dump"); - dbg_dump_lstats(&d->saved_lst); + ubifs_dump_lstats(&d->saved_lst); ubifs_msg("saved budgeting info dump"); - dbg_dump_budg(c, &d->saved_bi); + ubifs_dump_budg(c, &d->saved_bi); ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt); ubifs_msg("current lprops statistics dump"); ubifs_get_lp_stats(c, &lst); - dbg_dump_lstats(&lst); + ubifs_dump_lstats(&lst); ubifs_msg("current budgeting info dump"); - dbg_dump_budg(c, &c->bi); + ubifs_dump_budg(c, &c->bi); dump_stack(); return -EINVAL; } @@ -1160,7 +1158,7 @@ int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode) "is clean", ui->ui_size, ui->synced_i_size); ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, inode->i_mode, i_size_read(inode)); - dbg_dump_stack(); + dump_stack(); err = -EINVAL; } spin_unlock(&ui->ui_lock); @@ -1223,14 +1221,14 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) "but calculated size is %llu", dir->i_ino, (unsigned long long)i_size_read(dir), (unsigned long long)size); - dbg_dump_inode(c, dir); + ubifs_dump_inode(c, dir); dump_stack(); return -EINVAL; } if (dir->i_nlink != nlink) { ubifs_err("directory inode %lu has nlink %u, but calculated " "nlink is %u", dir->i_ino, dir->i_nlink, nlink); - dbg_dump_inode(c, dir); + ubifs_dump_inode(c, dir); dump_stack(); return -EINVAL; } @@ -1287,25 +1285,25 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, err = 1; key_read(c, &dent1->key, &key); if (keys_cmp(c, &zbr1->key, &key)) { - dbg_err("1st entry at %d:%d has key %s", zbr1->lnum, - zbr1->offs, dbg_snprintf_key(c, &key, key_buf, - DBG_KEY_BUF_LEN)); - dbg_err("but it should have key %s according to tnc", - dbg_snprintf_key(c, &zbr1->key, key_buf, - DBG_KEY_BUF_LEN)); - dbg_dump_node(c, dent1); + ubifs_err("1st entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_err("but it should have key %s according to tnc", + dbg_snprintf_key(c, &zbr1->key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_dump_node(c, dent1); goto out_free; } key_read(c, &dent2->key, &key); if (keys_cmp(c, &zbr2->key, &key)) { - dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum, - zbr1->offs, dbg_snprintf_key(c, &key, key_buf, - DBG_KEY_BUF_LEN)); - dbg_err("but it should have key %s according to tnc", - dbg_snprintf_key(c, &zbr2->key, key_buf, - DBG_KEY_BUF_LEN)); - dbg_dump_node(c, dent2); + ubifs_err("2nd entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_err("but it should have key %s according to tnc", + dbg_snprintf_key(c, &zbr2->key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_dump_node(c, dent2); goto out_free; } @@ -1318,15 +1316,15 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, goto out_free; } if (cmp == 0 && nlen1 == nlen2) - dbg_err("2 xent/dent nodes with the same name"); + ubifs_err("2 xent/dent nodes with the same name"); else - dbg_err("bad order of colliding key %s", - dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); + ubifs_err("bad order of colliding key %s", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); - dbg_dump_node(c, dent1); + ubifs_dump_node(c, dent1); ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); - dbg_dump_node(c, dent2); + ubifs_dump_node(c, dent2); out_free: kfree(dent2); @@ -1529,10 +1527,10 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) out: ubifs_err("failed, error %d", err); ubifs_msg("dump of the znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); if (zp) { ubifs_msg("dump of the parent znode"); - dbg_dump_znode(c, zp); + ubifs_dump_znode(c, zp); } dump_stack(); return -EINVAL; @@ -1599,9 +1597,9 @@ int dbg_check_tnc(struct ubifs_info *c, int extra) return err; if (err) { ubifs_msg("first znode"); - dbg_dump_znode(c, prev); + ubifs_dump_znode(c, prev); ubifs_msg("second znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); return -EINVAL; } } @@ -1690,7 +1688,7 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, if (err) { ubifs_err("znode checking function returned " "error %d", err); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); goto out_dump; } } @@ -1758,7 +1756,7 @@ out_dump: else zbr = &c->zroot; ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); out_unlock: mutex_unlock(&c->tnc_mutex); return err; @@ -2194,7 +2192,7 @@ out: out_dump: ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_dump_node(c, node); + ubifs_dump_node(c, node); out_free: kfree(node); return err; @@ -2352,7 +2350,7 @@ out_dump: ubifs_msg("dump of the inode %lu sitting in LEB %d:%d", (unsigned long)fscki->inum, zbr->lnum, zbr->offs); - dbg_dump_node(c, ino); + ubifs_dump_node(c, ino); kfree(ino); return -EINVAL; } @@ -2423,12 +2421,12 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_DATA_NODE) { ubifs_err("bad node type %d", sa->type); - dbg_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node); return -EINVAL; } if (sb->type != UBIFS_DATA_NODE) { ubifs_err("bad node type %d", sb->type); - dbg_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node); return -EINVAL; } @@ -2459,8 +2457,8 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) return 0; error_dump: - dbg_dump_node(c, sa->node); - dbg_dump_node(c, sb->node); + ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sb->node); return -EINVAL; } @@ -2491,13 +2489,13 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && sa->type != UBIFS_XENT_NODE) { ubifs_err("bad node type %d", sa->type); - dbg_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node); return -EINVAL; } if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && sa->type != UBIFS_XENT_NODE) { ubifs_err("bad node type %d", sb->type); - dbg_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node); return -EINVAL; } @@ -2547,9 +2545,9 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) error_dump: ubifs_msg("dumping first node"); - dbg_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node); ubifs_msg("dumping second node"); - dbg_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node); return -EINVAL; return 0; } @@ -2678,7 +2676,7 @@ static void cut_data(const void *buf, unsigned int len) } int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, - int offs, int len, int dtype) + int offs, int len) { int err, failing; @@ -2688,7 +2686,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, failing = power_cut_emulated(c, lnum, 1); if (failing) cut_data(buf, len); - err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + err = ubi_leb_write(c->ubi, lnum, buf, offs, len); if (err) return err; if (failing) @@ -2697,7 +2695,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, } int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, - int len, int dtype) + int len) { int err; @@ -2705,7 +2703,7 @@ int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, return -EROFS; if (power_cut_emulated(c, lnum, 1)) return -EROFS; - err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); + err = ubi_leb_change(c->ubi, lnum, buf, len); if (err) return err; if (power_cut_emulated(c, lnum, 1)) @@ -2729,7 +2727,7 @@ int dbg_leb_unmap(struct ubifs_info *c, int lnum) return 0; } -int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype) +int dbg_leb_map(struct ubifs_info *c, int lnum) { int err; @@ -2737,7 +2735,7 @@ int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype) return -EROFS; if (power_cut_emulated(c, lnum, 0)) return -EROFS; - err = ubi_leb_map(c->ubi, lnum, dtype); + err = ubi_leb_map(c->ubi, lnum); if (err) return err; if (power_cut_emulated(c, lnum, 0)) @@ -2857,16 +2855,16 @@ static ssize_t dfs_file_write(struct file *file, const char __user *u, * 'ubifs-debug' file-system instead. */ if (file->f_path.dentry == d->dfs_dump_lprops) { - dbg_dump_lprops(c); + ubifs_dump_lprops(c); return count; } if (file->f_path.dentry == d->dfs_dump_budg) { - dbg_dump_budg(c, &c->bi); + ubifs_dump_budg(c, &c->bi); return count; } if (file->f_path.dentry == d->dfs_dump_tnc) { mutex_lock(&c->tnc_mutex); - dbg_dump_tnc(c); + ubifs_dump_tnc(c); mutex_unlock(&c->tnc_mutex); return count; } @@ -3189,5 +3187,3 @@ void ubifs_debugging_exit(struct ubifs_info *c) { kfree(c->dbg); } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 9f717655df18..486a8e024fb6 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -29,8 +29,6 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c, typedef int (*dbg_znode_callback)(struct ubifs_info *c, struct ubifs_znode *znode, void *priv); -#ifdef CONFIG_UBIFS_FS_DEBUG - /* * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi" * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. @@ -149,7 +147,7 @@ struct ubifs_global_debug_info { if (unlikely(!(expr))) { \ printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ __func__, __LINE__, current->pid); \ - dbg_dump_stack(); \ + dump_stack(); \ } \ } while (0) @@ -161,12 +159,6 @@ struct ubifs_global_debug_info { } \ } while (0) -#define dbg_dump_stack() dump_stack() - -#define dbg_err(fmt, ...) do { \ - ubifs_err(fmt, ##__VA_ARGS__); \ -} while (0) - #define ubifs_dbg_msg(type, fmt, ...) \ pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) @@ -257,27 +249,27 @@ const char *dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key); const char *dbg_snprintf_key(const struct ubifs_info *c, const union ubifs_key *key, char *buffer, int len); -void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode); -void dbg_dump_node(const struct ubifs_info *c, const void *node); -void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, - int offs); -void dbg_dump_budget_req(const struct ubifs_budget_req *req); -void dbg_dump_lstats(const struct ubifs_lp_stats *lst); -void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); -void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); -void dbg_dump_lprops(struct ubifs_info *c); -void dbg_dump_lpt_info(struct ubifs_info *c); -void dbg_dump_leb(const struct ubifs_info *c, int lnum); -void dbg_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs); -void dbg_dump_znode(const struct ubifs_info *c, - const struct ubifs_znode *znode); -void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); -void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, - struct ubifs_nnode *parent, int iip); -void dbg_dump_tnc(struct ubifs_info *c); -void dbg_dump_index(struct ubifs_info *c); -void dbg_dump_lpt_lebs(const struct ubifs_info *c); +void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode); +void ubifs_dump_node(const struct ubifs_info *c, const void *node); +void ubifs_dump_budget_req(const struct ubifs_budget_req *req); +void ubifs_dump_lstats(const struct ubifs_lp_stats *lst); +void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); +void ubifs_dump_lprop(const struct ubifs_info *c, + const struct ubifs_lprops *lp); +void ubifs_dump_lprops(struct ubifs_info *c); +void ubifs_dump_lpt_info(struct ubifs_info *c); +void ubifs_dump_leb(const struct ubifs_info *c, int lnum); +void ubifs_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs); +void ubifs_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode); +void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + int cat); +void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip); +void ubifs_dump_tnc(struct ubifs_info *c); +void ubifs_dump_index(struct ubifs_info *c); +void ubifs_dump_lpt_lebs(const struct ubifs_info *c); int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, dbg_znode_callback znode_cb, void *priv); @@ -307,11 +299,10 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, - int len, int dtype); -int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, - int dtype); + int len); +int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len); int dbg_leb_unmap(struct ubifs_info *c, int lnum); -int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype); +int dbg_leb_map(struct ubifs_info *c, int lnum); /* Debugfs-related stuff */ int dbg_debugfs_init(void); @@ -319,162 +310,4 @@ void dbg_debugfs_exit(void); int dbg_debugfs_init_fs(struct ubifs_info *c); void dbg_debugfs_exit_fs(struct ubifs_info *c); -#else /* !CONFIG_UBIFS_FS_DEBUG */ - -/* Use "if (0)" to make compiler check arguments even if debugging is off */ -#define ubifs_assert(expr) do { \ - if (0) \ - printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ - __func__, __LINE__, current->pid); \ -} while (0) - -#define dbg_err(fmt, ...) do { \ - if (0) \ - ubifs_err(fmt, ##__VA_ARGS__); \ -} while (0) - -#define DBGKEY(key) ((char *)(key)) -#define DBGKEY1(key) ((char *)(key)) - -#define ubifs_dbg_msg(fmt, ...) do { \ - if (0) \ - printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \ -} while (0) - -#define dbg_dump_stack() -#define ubifs_assert_cmt_locked(c) - -#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_jnlk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_tnck(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_mntk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) - -static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } -static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } -static inline const char *dbg_ntype(int type) { return ""; } -static inline const char *dbg_cstate(int cmt_state) { return ""; } -static inline const char *dbg_jhead(int jhead) { return ""; } -static inline const char * -dbg_get_key_dump(const struct ubifs_info *c, - const union ubifs_key *key) { return ""; } -static inline const char * -dbg_snprintf_key(const struct ubifs_info *c, - const union ubifs_key *key, char *buffer, - int len) { return ""; } -static inline void dbg_dump_inode(struct ubifs_info *c, - const struct inode *inode) { return; } -static inline void dbg_dump_node(const struct ubifs_info *c, - const void *node) { return; } -static inline void dbg_dump_lpt_node(const struct ubifs_info *c, - void *node, int lnum, - int offs) { return; } -static inline void -dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } -static inline void -dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } -static inline void -dbg_dump_budg(struct ubifs_info *c, - const struct ubifs_budg_info *bi) { return; } -static inline void dbg_dump_lprop(const struct ubifs_info *c, - const struct ubifs_lprops *lp) { return; } -static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } -static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } -static inline void dbg_dump_leb(const struct ubifs_info *c, - int lnum) { return; } -static inline void -dbg_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs) { return; } -static inline void -dbg_dump_znode(const struct ubifs_info *c, - const struct ubifs_znode *znode) { return; } -static inline void dbg_dump_heap(struct ubifs_info *c, - struct ubifs_lpt_heap *heap, - int cat) { return; } -static inline void dbg_dump_pnode(struct ubifs_info *c, - struct ubifs_pnode *pnode, - struct ubifs_nnode *parent, - int iip) { return; } -static inline void dbg_dump_tnc(struct ubifs_info *c) { return; } -static inline void dbg_dump_index(struct ubifs_info *c) { return; } -static inline void dbg_dump_lpt_lebs(const struct ubifs_info *c) { return; } - -static inline int dbg_walk_index(struct ubifs_info *c, - dbg_leaf_callback leaf_cb, - dbg_znode_callback znode_cb, - void *priv) { return 0; } -static inline void dbg_save_space_info(struct ubifs_info *c) { return; } -static inline int dbg_check_space_info(struct ubifs_info *c) { return 0; } -static inline int dbg_check_lprops(struct ubifs_info *c) { return 0; } -static inline int -dbg_old_index_check_init(struct ubifs_info *c, - struct ubifs_zbranch *zroot) { return 0; } -static inline int -dbg_check_old_index(struct ubifs_info *c, - struct ubifs_zbranch *zroot) { return 0; } -static inline int dbg_check_cats(struct ubifs_info *c) { return 0; } -static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; } -static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; } -static inline int dbg_chk_lpt_sz(struct ubifs_info *c, - int action, int len) { return 0; } -static inline int -dbg_check_synced_i_size(const struct ubifs_info *c, - struct inode *inode) { return 0; } -static inline int dbg_check_dir(struct ubifs_info *c, - const struct inode *dir) { return 0; } -static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; } -static inline int dbg_check_idx_size(struct ubifs_info *c, - long long idx_size) { return 0; } -static inline int dbg_check_filesystem(struct ubifs_info *c) { return 0; } -static inline void dbg_check_heap(struct ubifs_info *c, - struct ubifs_lpt_heap *heap, - int cat, int add_pos) { return; } -static inline int dbg_check_lpt_nodes(struct ubifs_info *c, - struct ubifs_cnode *cnode, int row, int col) { return 0; } -static inline int dbg_check_inode_size(struct ubifs_info *c, - const struct inode *inode, - loff_t size) { return 0; } -static inline int -dbg_check_data_nodes_order(struct ubifs_info *c, - struct list_head *head) { return 0; } -static inline int -dbg_check_nondata_nodes_order(struct ubifs_info *c, - struct list_head *head) { return 0; } - -static inline int dbg_leb_write(struct ubifs_info *c, int lnum, - const void *buf, int offset, - int len, int dtype) { return 0; } -static inline int dbg_leb_change(struct ubifs_info *c, int lnum, - const void *buf, int len, - int dtype) { return 0; } -static inline int dbg_leb_unmap(struct ubifs_info *c, int lnum) { return 0; } -static inline int dbg_leb_map(struct ubifs_info *c, int lnum, - int dtype) { return 0; } - -static inline int dbg_is_chk_gen(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_chk_index(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_chk_orph(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_chk_lprops(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_chk_fs(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_tst_rcvry(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_power_cut(const struct ubifs_info *c) { return 0; } - -static inline int dbg_debugfs_init(void) { return 0; } -static inline void dbg_debugfs_exit(void) { return; } -static inline int dbg_debugfs_init_fs(struct ubifs_info *c) { return 0; } -static inline int dbg_debugfs_exit_fs(struct ubifs_info *c) { return 0; } - -#endif /* !CONFIG_UBIFS_FS_DEBUG */ #endif /* !__UBIFS_DEBUG_H__ */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ec9f1870ab7f..62a2727f4ecf 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -170,8 +170,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, return inode; } -#ifdef CONFIG_UBIFS_FS_DEBUG - static int dbg_check_name(const struct ubifs_info *c, const struct ubifs_dent_node *dent, const struct qstr *nm) @@ -185,12 +183,6 @@ static int dbg_check_name(const struct ubifs_info *c, return 0; } -#else - -#define dbg_check_name(c, dent, nm) 0 - -#endif - static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { @@ -1187,12 +1179,10 @@ const struct inode_operations ubifs_dir_inode_operations = { .rename = ubifs_rename, .setattr = ubifs_setattr, .getattr = ubifs_getattr, -#ifdef CONFIG_UBIFS_FS_XATTR .setxattr = ubifs_setxattr, .getxattr = ubifs_getxattr, .listxattr = ubifs_listxattr, .removexattr = ubifs_removexattr, -#endif }; const struct file_operations ubifs_dir_operations = { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5c8f6dc1d28b..35389ca2d267 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -97,7 +97,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, dump: ubifs_err("bad data node (block %u, inode %lu)", block, inode->i_ino); - dbg_dump_node(c, dn); + ubifs_dump_node(c, dn); return -EINVAL; } @@ -1562,12 +1562,10 @@ const struct address_space_operations ubifs_file_address_operations = { const struct inode_operations ubifs_file_inode_operations = { .setattr = ubifs_setattr, .getattr = ubifs_getattr, -#ifdef CONFIG_UBIFS_FS_XATTR .setxattr = ubifs_setxattr, .getxattr = ubifs_getxattr, .listxattr = ubifs_listxattr, .removexattr = ubifs_removexattr, -#endif }; const struct inode_operations ubifs_symlink_inode_operations = { diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index ded29f6224c2..04dd6f47635e 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -109,7 +109,7 @@ static int switch_gc_head(struct ubifs_info *c) return err; c->gc_lnum = -1; - err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM); + err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0); return err; } diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 9228950a658f..e18b9889a51b 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -109,13 +109,13 @@ int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, if (err && (err != -EBADMSG || even_ebadmsg)) { ubifs_err("reading %d bytes from LEB %d:%d failed, error %d", len, lnum, offs, err); - dbg_dump_stack(); + dump_stack(); } return err; } int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, - int len, int dtype) + int len) { int err; @@ -123,20 +123,19 @@ int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, if (c->ro_error) return -EROFS; if (!dbg_is_tst_rcvry(c)) - err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + err = ubi_leb_write(c->ubi, lnum, buf, offs, len); else - err = dbg_leb_write(c, lnum, buf, offs, len, dtype); + err = dbg_leb_write(c, lnum, buf, offs, len); if (err) { ubifs_err("writing %d bytes to LEB %d:%d failed, error %d", len, lnum, offs, err); ubifs_ro_mode(c, err); - dbg_dump_stack(); + dump_stack(); } return err; } -int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, - int dtype) +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len) { int err; @@ -144,14 +143,14 @@ int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, if (c->ro_error) return -EROFS; if (!dbg_is_tst_rcvry(c)) - err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); + err = ubi_leb_change(c->ubi, lnum, buf, len); else - err = dbg_leb_change(c, lnum, buf, len, dtype); + err = dbg_leb_change(c, lnum, buf, len); if (err) { ubifs_err("changing %d bytes in LEB %d failed, error %d", len, lnum, err); ubifs_ro_mode(c, err); - dbg_dump_stack(); + dump_stack(); } return err; } @@ -170,12 +169,12 @@ int ubifs_leb_unmap(struct ubifs_info *c, int lnum) if (err) { ubifs_err("unmap LEB %d failed, error %d", lnum, err); ubifs_ro_mode(c, err); - dbg_dump_stack(); + dump_stack(); } return err; } -int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype) +int ubifs_leb_map(struct ubifs_info *c, int lnum) { int err; @@ -183,13 +182,13 @@ int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype) if (c->ro_error) return -EROFS; if (!dbg_is_tst_rcvry(c)) - err = ubi_leb_map(c->ubi, lnum, dtype); + err = ubi_leb_map(c->ubi, lnum); else - err = dbg_leb_map(c, lnum, dtype); + err = dbg_leb_map(c, lnum); if (err) { ubifs_err("mapping LEB %d failed, error %d", lnum, err); ubifs_ro_mode(c, err); - dbg_dump_stack(); + dump_stack(); } return err; } @@ -202,7 +201,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) if (err < 0) { ubifs_err("ubi_is_mapped failed for LEB %d, error %d", lnum, err); - dbg_dump_stack(); + dump_stack(); } return err; } @@ -294,8 +293,8 @@ out_len: out: if (!quiet) { ubifs_err("bad node at LEB %d:%d", lnum, offs); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_dump_node(c, buf); + dump_stack(); } return err; } @@ -523,8 +522,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) dirt = sync_len - wbuf->used; if (dirt) ubifs_pad(c, wbuf->buf + wbuf->used, dirt); - err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len, - wbuf->dtype); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len); if (err) return err; @@ -562,14 +560,12 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) * @wbuf: write-buffer * @lnum: logical eraseblock number to seek to * @offs: logical eraseblock offset to seek to - * @dtype: data type * * This function targets the write-buffer to logical eraseblock @lnum:@offs. * The write-buffer has to be empty. Returns zero in case of success and a * negative error code in case of failure. */ -int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, - int dtype) +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs) { const struct ubifs_info *c = wbuf->c; @@ -592,7 +588,6 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, wbuf->avail = wbuf->size; wbuf->used = 0; spin_unlock(&wbuf->lock); - wbuf->dtype = dtype; return 0; } @@ -719,8 +714,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("flush jhead %s wbuf to LEB %d:%d", dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, - wbuf->offs, wbuf->size, - wbuf->dtype); + wbuf->offs, wbuf->size); if (err) goto out; @@ -756,7 +750,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, - wbuf->size, wbuf->dtype); + wbuf->size); if (err) goto out; @@ -775,7 +769,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("write %d bytes to LEB %d:%d", wbuf->size, wbuf->lnum, wbuf->offs); err = ubifs_leb_write(c, wbuf->lnum, buf, wbuf->offs, - wbuf->size, wbuf->dtype); + wbuf->size); if (err) goto out; @@ -797,7 +791,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); err = ubifs_leb_write(c, wbuf->lnum, buf + written, - wbuf->offs, n, wbuf->dtype); + wbuf->offs, n); if (err) goto out; wbuf->offs += n; @@ -841,9 +835,9 @@ exit: out: ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", len, wbuf->lnum, wbuf->offs, err); - dbg_dump_node(c, buf); - dbg_dump_stack(); - dbg_dump_leb(c, wbuf->lnum); + ubifs_dump_node(c, buf); + dump_stack(); + ubifs_dump_leb(c, wbuf->lnum); return err; } @@ -854,7 +848,6 @@ out: * @len: node length * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock - * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) * * This function automatically fills node magic number, assigns sequence * number, and calculates node CRC checksum. The length of the @buf buffer has @@ -863,7 +856,7 @@ out: * success and a negative error code in case of failure. */ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, - int offs, int dtype) + int offs) { int err, buf_len = ALIGN(len, c->min_io_size); @@ -879,9 +872,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, return -EROFS; ubifs_prepare_node(c, buf, len, 1); - err = ubifs_leb_write(c, lnum, buf, offs, buf_len, dtype); + err = ubifs_leb_write(c, lnum, buf, offs, buf_len); if (err) - dbg_dump_node(c, buf); + ubifs_dump_node(c, buf); return err; } @@ -960,8 +953,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, out: ubifs_err("bad node at LEB %d:%d", lnum, offs); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_dump_node(c, buf); + dump_stack(); return -EINVAL; } @@ -1017,8 +1010,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, out: ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs, ubi_is_mapped(c->ubi, lnum)); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_dump_node(c, buf); + dump_stack(); return -EINVAL; } @@ -1056,7 +1049,6 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) */ size = c->max_write_size - (c->leb_start % c->max_write_size); wbuf->avail = wbuf->size = size; - wbuf->dtype = UBI_UNKNOWN; wbuf->sync_callback = NULL; mutex_init(&wbuf->io_mutex); spin_lock_init(&wbuf->lock); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 2f438ab2e7a2..12c0f154ca83 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -214,7 +214,7 @@ out: err = ubifs_add_bud_to_log(c, jhead, lnum, offs); if (err) goto out_return; - err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); + err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs); if (err) goto out_unlock; @@ -385,9 +385,9 @@ out: if (err == -ENOSPC) { /* This are some budgeting problems, print useful information */ down_write(&c->commit_sem); - dbg_dump_stack(); - dbg_dump_budg(c, &c->bi); - dbg_dump_lprops(c); + dump_stack(); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); cmt_retries = dbg_check_lprops(c); up_write(&c->commit_sem); } @@ -1267,7 +1267,6 @@ out_free: return err; } -#ifdef CONFIG_UBIFS_FS_XATTR /** * ubifs_jnl_delete_xattr - delete an extended attribute. @@ -1462,4 +1461,3 @@ out_free: return err; } -#endif /* CONFIG_UBIFS_FS_XATTR */ diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index f9fd068d1ae0..c80b15d6c8de 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -29,11 +29,7 @@ #include "ubifs.h" -#ifdef CONFIG_UBIFS_FS_DEBUG static int dbg_check_bud_bytes(struct ubifs_info *c); -#else -#define dbg_check_bud_bytes(c) 0 -#endif /** * ubifs_search_bud - search bud LEB. @@ -262,7 +258,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) * an unclean reboot, because the target LEB might have been * unmapped, but not yet physically erased. */ - err = ubifs_leb_map(c, bud->lnum, UBI_SHORTTERM); + err = ubifs_leb_map(c, bud->lnum); if (err) goto out_unlock; } @@ -270,7 +266,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) dbg_log("write ref LEB %d:%d", c->lhead_lnum, c->lhead_offs); err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum, - c->lhead_offs, UBI_SHORTTERM); + c->lhead_offs); if (err) goto out_unlock; @@ -422,7 +418,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) len = ALIGN(len, c->min_io_size); dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); - err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM); + err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len); if (err) goto out; @@ -623,7 +619,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, int sz = ALIGN(*offs, c->min_io_size), err; ubifs_pad(c, buf + *offs, sz - *offs); - err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, *lnum, buf, sz); if (err) return err; *lnum = ubifs_next_log_lnum(c, *lnum); @@ -702,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) int sz = ALIGN(offs, c->min_io_size); ubifs_pad(c, buf + offs, sz - offs); - err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, write_lnum, buf, sz); if (err) goto out_free; offs = ALIGN(offs, c->min_io_size); @@ -734,8 +730,6 @@ out_free: return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG - /** * dbg_check_bud_bytes - make sure bud bytes calculation are all right. * @c: UBIFS file-system description object @@ -767,5 +761,3 @@ static int dbg_check_bud_bytes(struct ubifs_info *c) return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index f8a181e647cc..86eb8e533249 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -447,7 +447,7 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) int new_cat = ubifs_categorize_lprops(c, lprops); if (old_cat == new_cat) { - struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1]; + struct ubifs_lpt_heap *heap; /* lprops on a heap now must be moved up or down */ if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT) @@ -846,7 +846,9 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c) return lprops; } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ /** * dbg_check_cats - check category heaps and lists. @@ -1001,8 +1003,8 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, out: if (err) { dbg_msg("failed cat %d hpos %d err %d", cat, i, err); - dbg_dump_stack(); - dbg_dump_heap(c, heap, cat); + dump_stack(); + ubifs_dump_heap(c, heap, cat); } } @@ -1109,8 +1111,8 @@ static int scan_check_cb(struct ubifs_info *c, if (IS_ERR(sleb)) { ret = PTR_ERR(sleb); if (ret == -EUCLEAN) { - dbg_dump_lprops(c); - dbg_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); } goto out; } @@ -1237,7 +1239,7 @@ out_print: ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " "should be free %d, dirty %d", lnum, lp->free, lp->dirty, lp->flags, free, dirty); - dbg_dump_leb(c, lnum); + ubifs_dump_leb(c, lnum); out_destroy: ubifs_scan_destroy(sleb); ret = -EINVAL; @@ -1315,5 +1317,3 @@ int dbg_check_lprops(struct ubifs_info *c) out: return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 66d59d0a1402..ce33b2beb151 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -701,8 +701,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubifs_leb_change(c, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -732,8 +731,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubifs_leb_change(c, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -780,8 +778,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubifs_leb_change(c, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -806,7 +803,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubifs_leb_change(c, lnum++, buf, alen, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -826,7 +823,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, /* Write remaining buffer */ memset(p, 0xff, alen - len); - err = ubifs_leb_change(c, lnum, buf, alen, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, buf, alen); if (err) goto out; @@ -926,7 +923,7 @@ static int check_lpt_crc(void *buf, int len) if (crc != calc_crc) { ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc, calc_crc); - dbg_dump_stack(); + dump_stack(); return -EINVAL; } return 0; @@ -949,7 +946,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type) if (node_type != type) { ubifs_err("invalid type (%d) in LPT node type %d", node_type, type); - dbg_dump_stack(); + dump_stack(); return -EINVAL; } return 0; @@ -1247,7 +1244,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) out: ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); - dbg_dump_stack(); + dump_stack(); kfree(nnode); return err; } @@ -1312,8 +1309,8 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) out: ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); - dbg_dump_pnode(c, pnode, parent, iip); - dbg_dump_stack(); + ubifs_dump_pnode(c, pnode, parent, iip); + dump_stack(); dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); kfree(pnode); return err; @@ -1740,16 +1737,20 @@ int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr) if (rd) { err = lpt_init_rd(c); if (err) - return err; + goto out_err; } if (wr) { err = lpt_init_wr(c); if (err) - return err; + goto out_err; } return 0; + +out_err: + ubifs_lpt_free(c, 0); + return err; } /** @@ -2080,8 +2081,6 @@ out: return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG - /** * dbg_chk_pnode - check a pnode. * @c: the UBIFS file-system description object @@ -2096,8 +2095,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, int i; if (pnode->num != col) { - dbg_err("pnode num %d expected %d parent num %d iip %d", - pnode->num, col, pnode->parent->num, pnode->iip); + ubifs_err("pnode num %d expected %d parent num %d iip %d", + pnode->num, col, pnode->parent->num, pnode->iip); return -EINVAL; } for (i = 0; i < UBIFS_LPT_FANOUT; i++) { @@ -2111,14 +2110,14 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, if (lnum >= c->leb_cnt) continue; if (lprops->lnum != lnum) { - dbg_err("bad LEB number %d expected %d", - lprops->lnum, lnum); + ubifs_err("bad LEB number %d expected %d", + lprops->lnum, lnum); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { if (cat != LPROPS_UNCAT) { - dbg_err("LEB %d taken but not uncat %d", - lprops->lnum, cat); + ubifs_err("LEB %d taken but not uncat %d", + lprops->lnum, cat); return -EINVAL; } continue; @@ -2130,8 +2129,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, case LPROPS_FRDI_IDX: break; default: - dbg_err("LEB %d index but cat %d", - lprops->lnum, cat); + ubifs_err("LEB %d index but cat %d", + lprops->lnum, cat); return -EINVAL; } } else { @@ -2143,8 +2142,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, case LPROPS_FREEABLE: break; default: - dbg_err("LEB %d not index but cat %d", - lprops->lnum, cat); + ubifs_err("LEB %d not index but cat %d", + lprops->lnum, cat); return -EINVAL; } } @@ -2184,24 +2183,24 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, break; } if (!found) { - dbg_err("LEB %d cat %d not found in cat heap/list", - lprops->lnum, cat); + ubifs_err("LEB %d cat %d not found in cat heap/list", + lprops->lnum, cat); return -EINVAL; } switch (cat) { case LPROPS_EMPTY: if (lprops->free != c->leb_size) { - dbg_err("LEB %d cat %d free %d dirty %d", - lprops->lnum, cat, lprops->free, - lprops->dirty); + ubifs_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); return -EINVAL; } case LPROPS_FREEABLE: case LPROPS_FRDI_IDX: if (lprops->free + lprops->dirty != c->leb_size) { - dbg_err("LEB %d cat %d free %d dirty %d", - lprops->lnum, cat, lprops->free, - lprops->dirty); + ubifs_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); return -EINVAL; } } @@ -2235,9 +2234,10 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, /* cnode is a nnode */ num = calc_nnode_num(row, col); if (cnode->num != num) { - dbg_err("nnode num %d expected %d " - "parent num %d iip %d", cnode->num, num, - (nnode ? nnode->num : 0), cnode->iip); + ubifs_err("nnode num %d expected %d " + "parent num %d iip %d", + cnode->num, num, + (nnode ? nnode->num : 0), cnode->iip); return -EINVAL; } nn = (struct ubifs_nnode *)cnode; @@ -2274,5 +2274,3 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, } return 0; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index cddd6bd214f4..4fa70734e6e7 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -30,11 +30,7 @@ #include <linux/random.h> #include "ubifs.h" -#ifdef CONFIG_UBIFS_FS_DEBUG static int dbg_populate_lsave(struct ubifs_info *c); -#else -#define dbg_populate_lsave(c) 0 -#endif /** * first_dirty_cnode - find first dirty cnode. @@ -324,11 +320,10 @@ static int layout_cnodes(struct ubifs_info *c) return 0; no_space: - ubifs_err("LPT out of space"); - dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " - "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); - dbg_dump_lpt_info(c); - dbg_dump_lpt_lebs(c); + ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " + "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); dump_stack(); return err; } @@ -421,7 +416,7 @@ static int write_cnodes(struct ubifs_info *c) alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); err = ubifs_leb_write(c, lnum, buf + from, from, - alen, UBI_SHORTTERM); + alen); if (err) return err; } @@ -479,8 +474,7 @@ static int write_cnodes(struct ubifs_info *c) wlen = offs - from; alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); - err = ubifs_leb_write(c, lnum, buf + from, from, alen, - UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); if (err) return err; dbg_chk_lpt_sz(c, 2, c->leb_size - offs); @@ -506,8 +500,7 @@ static int write_cnodes(struct ubifs_info *c) wlen = offs - from; alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); - err = ubifs_leb_write(c, lnum, buf + from, from, alen, - UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); if (err) return err; dbg_chk_lpt_sz(c, 2, c->leb_size - offs); @@ -531,7 +524,7 @@ static int write_cnodes(struct ubifs_info *c) wlen = offs - from; alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); - err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); if (err) return err; @@ -552,11 +545,10 @@ static int write_cnodes(struct ubifs_info *c) return 0; no_space: - ubifs_err("LPT out of space mismatch"); - dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " - "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); - dbg_dump_lpt_info(c); - dbg_dump_lpt_lebs(c); + ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " + "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); dump_stack(); return err; } @@ -1497,7 +1489,9 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only) kfree(c->lpt_nod_buf); } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ /** * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes. @@ -1735,7 +1729,7 @@ int dbg_check_ltab(struct ubifs_info *c) for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { err = dbg_check_ltab_lnum(c, lnum); if (err) { - dbg_err("failed at LEB %d", lnum); + ubifs_err("failed at LEB %d", lnum); return err; } } @@ -1767,10 +1761,10 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) free += c->leb_size; } if (free < c->lpt_sz) { - dbg_err("LPT space error: free %lld lpt_sz %lld", - free, c->lpt_sz); - dbg_dump_lpt_info(c); - dbg_dump_lpt_lebs(c); + ubifs_err("LPT space error: free %lld lpt_sz %lld", + free, c->lpt_sz); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); dump_stack(); return -EINVAL; } @@ -1807,13 +1801,13 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) d->chk_lpt_lebs = 0; d->chk_lpt_wastage = 0; if (c->dirty_pn_cnt > c->pnode_cnt) { - dbg_err("dirty pnodes %d exceed max %d", - c->dirty_pn_cnt, c->pnode_cnt); + ubifs_err("dirty pnodes %d exceed max %d", + c->dirty_pn_cnt, c->pnode_cnt); err = -EINVAL; } if (c->dirty_nn_cnt > c->nnode_cnt) { - dbg_err("dirty nnodes %d exceed max %d", - c->dirty_nn_cnt, c->nnode_cnt); + ubifs_err("dirty nnodes %d exceed max %d", + c->dirty_nn_cnt, c->nnode_cnt); err = -EINVAL; } return err; @@ -1830,23 +1824,23 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) chk_lpt_sz *= d->chk_lpt_lebs; chk_lpt_sz += len - c->nhead_offs; if (d->chk_lpt_sz != chk_lpt_sz) { - dbg_err("LPT wrote %lld but space used was %lld", - d->chk_lpt_sz, chk_lpt_sz); + ubifs_err("LPT wrote %lld but space used was %lld", + d->chk_lpt_sz, chk_lpt_sz); err = -EINVAL; } if (d->chk_lpt_sz > c->lpt_sz) { - dbg_err("LPT wrote %lld but lpt_sz is %lld", - d->chk_lpt_sz, c->lpt_sz); + ubifs_err("LPT wrote %lld but lpt_sz is %lld", + d->chk_lpt_sz, c->lpt_sz); err = -EINVAL; } if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) { - dbg_err("LPT layout size %lld but wrote %lld", - d->chk_lpt_sz, d->chk_lpt_sz2); + ubifs_err("LPT layout size %lld but wrote %lld", + d->chk_lpt_sz, d->chk_lpt_sz2); err = -EINVAL; } if (d->chk_lpt_sz2 && d->new_nhead_offs != len) { - dbg_err("LPT new nhead offs: expected %d was %d", - d->new_nhead_offs, len); + ubifs_err("LPT new nhead offs: expected %d was %d", + d->new_nhead_offs, len); err = -EINVAL; } lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; @@ -1855,13 +1849,13 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) if (c->big_lpt) lpt_sz += c->lsave_sz; if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) { - dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", - d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); + ubifs_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); err = -EINVAL; } if (err) { - dbg_dump_lpt_info(c); - dbg_dump_lpt_lebs(c); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); dump_stack(); } d->chk_lpt_sz2 = d->chk_lpt_sz; @@ -1880,7 +1874,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) } /** - * dbg_dump_lpt_leb - dump an LPT LEB. + * ubifs_dump_lpt_leb - dump an LPT LEB. * @c: UBIFS file-system description object * @lnum: LEB number to dump * @@ -1986,13 +1980,13 @@ out: } /** - * dbg_dump_lpt_lebs - dump LPT lebs. + * ubifs_dump_lpt_lebs - dump LPT lebs. * @c: UBIFS file-system description object * * This function dumps all LPT LEBs. The caller has to make sure the LPT is * locked. */ -void dbg_dump_lpt_lebs(const struct ubifs_info *c) +void ubifs_dump_lpt_lebs(const struct ubifs_info *c) { int i; @@ -2046,5 +2040,3 @@ static int dbg_populate_lsave(struct ubifs_info *c) return 1; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 278c2382e8c2..ab83ace9910a 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -241,7 +241,7 @@ static int validate_master(const struct ubifs_info *c) out: ubifs_err("bad master node at offset %d error %d", c->mst_offs, err); - dbg_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node); return -EINVAL; } @@ -317,7 +317,7 @@ int ubifs_read_master(struct ubifs_info *c) if (c->leb_cnt < old_leb_cnt || c->leb_cnt < UBIFS_MIN_LEB_CNT) { ubifs_err("bad leb_cnt on master node"); - dbg_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node); return -EINVAL; } @@ -379,7 +379,7 @@ int ubifs_write_master(struct ubifs_info *c) c->mst_offs = offs; c->mst_node->highest_inum = cpu_to_le64(c->highest_inum); - err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + err = ubifs_write_node(c, c->mst_node, len, lnum, offs); if (err) return err; @@ -390,7 +390,7 @@ int ubifs_write_master(struct ubifs_info *c) if (err) return err; } - err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + err = ubifs_write_node(c, c->mst_node, len, lnum, offs); return err; } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index c542c73cfa3c..b02734db187c 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -52,11 +52,7 @@ * than the maximum number of orphans allowed. */ -#ifdef CONFIG_UBIFS_FS_DEBUG static int dbg_check_orphans(struct ubifs_info *c); -#else -#define dbg_check_orphans(c) 0 -#endif /** * ubifs_add_orphan - add an orphan. @@ -92,7 +88,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) else if (inum > o->inum) p = &(*p)->rb_right; else { - dbg_err("orphaned twice"); + ubifs_err("orphaned twice"); spin_unlock(&c->orphan_lock); kfree(orphan); return 0; @@ -158,8 +154,8 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) } } spin_unlock(&c->orphan_lock); - dbg_err("missing orphan ino %lu", (unsigned long)inum); - dbg_dump_stack(); + ubifs_err("missing orphan ino %lu", (unsigned long)inum); + dump_stack(); } /** @@ -248,8 +244,7 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) ubifs_assert(c->ohead_offs == 0); ubifs_prepare_node(c, c->orph_buf, len, 1); len = ALIGN(len, c->min_io_size); - err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len, - UBI_SHORTTERM); + err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len); } else { if (c->ohead_offs == 0) { /* Ensure LEB has been unmapped */ @@ -258,7 +253,7 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) return err; } err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, - c->ohead_offs, UBI_SHORTTERM); + c->ohead_offs); } return err; } @@ -569,7 +564,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (snod->type != UBIFS_ORPH_NODE) { ubifs_err("invalid node type %d in orphan area at " "%d:%d", snod->type, sleb->lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); return -EINVAL; } @@ -597,7 +592,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ubifs_err("out of order commit number %llu in " "orphan node at %d:%d", cmt_no, sleb->lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); return -EINVAL; } dbg_rcvry("out of date LEB %d", sleb->lnum); @@ -725,7 +720,9 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ struct check_orphan { struct rb_node rb; @@ -968,5 +965,3 @@ out: kfree(ci.node); return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 2a935b317232..c30d976b4be8 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -213,10 +213,10 @@ static int write_rcvrd_mst_node(struct ubifs_info *c, mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY); ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); - err = ubifs_leb_change(c, lnum, mst, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, mst, sz); if (err) goto out; - err = ubifs_leb_change(c, lnum + 1, mst, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum + 1, mst, sz); if (err) goto out; out: @@ -362,12 +362,12 @@ out_err: out_free: ubifs_err("failed to recover master node"); if (mst1) { - dbg_err("dumping first master node"); - dbg_dump_node(c, mst1); + ubifs_err("dumping first master node"); + ubifs_dump_node(c, mst1); } if (mst2) { - dbg_err("dumping second master node"); - dbg_dump_node(c, mst2); + ubifs_err("dumping second master node"); + ubifs_dump_node(c, mst2); } vfree(buf2); vfree(buf1); @@ -555,8 +555,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ubifs_pad(c, buf, pad_len); } } - err = ubifs_leb_change(c, lnum, sleb->buf, len, - UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, sleb->buf, len); if (err) return err; } @@ -683,7 +682,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, ret, lnum, offs); break; } else { - dbg_err("unexpected return value %d", ret); + ubifs_err("unexpected return value %d", ret); err = -EINVAL; goto error; } @@ -789,7 +788,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, corrupted_rescan: /* Re-scan the corrupted data with verbose messages */ - dbg_err("corruptio %d", ret); + ubifs_err("corruptio %d", ret); ubifs_scan_a_node(c, buf, len, lnum, offs, 1); corrupted: ubifs_scanned_corruption(c, lnum, offs, buf); @@ -827,17 +826,17 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, goto out_free; ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); if (ret != SCANNED_A_NODE) { - dbg_err("Not a valid node"); + ubifs_err("Not a valid node"); goto out_err; } if (cs_node->ch.node_type != UBIFS_CS_NODE) { - dbg_err("Node a CS node, type is %d", cs_node->ch.node_type); + ubifs_err("Node a CS node, type is %d", cs_node->ch.node_type); goto out_err; } if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { - dbg_err("CS node cmt_no %llu != current cmt_no %llu", - (unsigned long long)le64_to_cpu(cs_node->cmt_no), - c->cmt_no); + ubifs_err("CS node cmt_no %llu != current cmt_no %llu", + (unsigned long long)le64_to_cpu(cs_node->cmt_no), + c->cmt_no); goto out_err; } *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum); @@ -941,7 +940,7 @@ static int recover_head(struct ubifs_info *c, int lnum, int offs, void *sbuf) err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1); if (err) return err; - return ubifs_leb_change(c, lnum, sbuf, offs, UBI_UNKNOWN); + return ubifs_leb_change(c, lnum, sbuf, offs); } return 0; @@ -1071,7 +1070,7 @@ static int clean_an_unclean_leb(struct ubifs_info *c, } /* Write back the LEB atomically */ - err = ubifs_leb_change(c, lnum, sbuf, len, UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, sbuf, len); if (err) return err; @@ -1138,9 +1137,9 @@ static int grab_empty_leb(struct ubifs_info *c) */ lnum = ubifs_find_free_leb_for_idx(c); if (lnum < 0) { - dbg_err("could not find an empty LEB"); - dbg_dump_lprops(c); - dbg_dump_budg(c, &c->bi); + ubifs_err("could not find an empty LEB"); + ubifs_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); return lnum; } @@ -1218,7 +1217,7 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) } mutex_unlock(&wbuf->io_mutex); if (err < 0) { - dbg_err("GC failed, error %d", err); + ubifs_err("GC failed, error %d", err); if (err == -EAGAIN) err = -EINVAL; return err; @@ -1472,7 +1471,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) len -= 1; len = ALIGN(len + 1, c->min_io_size); /* Atomically write the fixed LEB back again */ - err = ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, c->sbuf, len); if (err) goto out; dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index b007637f0406..3a2da7e476e5 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -154,8 +154,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) /* Make sure the journal head points to the latest bud */ err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf, - b->bud->lnum, c->leb_size - b->free, - UBI_SHORTTERM); + b->bud->lnum, c->leb_size - b->free); out: ubifs_release_lprops(c); @@ -686,7 +685,7 @@ out: out_dump: ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); ubifs_scan_destroy(sleb); return -EINVAL; } @@ -861,16 +860,16 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) * numbers. */ if (snod->type != UBIFS_CS_NODE) { - dbg_err("first log node at LEB %d:%d is not CS node", - lnum, offs); + ubifs_err("first log node at LEB %d:%d is not CS node", + lnum, offs); goto out_dump; } if (le64_to_cpu(node->cmt_no) != c->cmt_no) { - dbg_err("first CS node at LEB %d:%d has wrong " - "commit number %llu expected %llu", - lnum, offs, - (unsigned long long)le64_to_cpu(node->cmt_no), - c->cmt_no); + ubifs_err("first CS node at LEB %d:%d has wrong " + "commit number %llu expected %llu", + lnum, offs, + (unsigned long long)le64_to_cpu(node->cmt_no), + c->cmt_no); goto out_dump; } @@ -892,7 +891,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) /* Make sure the first node sits at offset zero of the LEB */ if (snod->offs != 0) { - dbg_err("first node is not at zero offset"); + ubifs_err("first node is not at zero offset"); goto out_dump; } @@ -905,8 +904,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) } if (snod->sqnum < c->cs_sqnum) { - dbg_err("bad sqnum %llu, commit sqnum %llu", - snod->sqnum, c->cs_sqnum); + ubifs_err("bad sqnum %llu, commit sqnum %llu", + snod->sqnum, c->cs_sqnum); goto out_dump; } @@ -958,7 +957,7 @@ out: out_dump: ubifs_err("log error detected while replaying the log at LEB %d:%d", lnum, offs + snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); ubifs_scan_destroy(sleb); return -EINVAL; } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 771f7fb6ce92..ef3d1ba6d992 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -130,7 +130,6 @@ static int create_default_filesystem(struct ubifs_info *c) * orphan node. */ orph_lebs = UBIFS_MIN_ORPH_LEBS; -#ifdef CONFIG_UBIFS_FS_DEBUG if (c->leb_cnt - min_leb_cnt > 1) /* * For debugging purposes it is better to have at least 2 @@ -138,7 +137,6 @@ static int create_default_filesystem(struct ubifs_info *c) * consolidations and would be stressed more. */ orph_lebs += 1; -#endif main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs; main_lebs -= orph_lebs; @@ -196,7 +194,7 @@ static int create_default_filesystem(struct ubifs_info *c) sup->rp_size = cpu_to_le64(tmp64); sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); - err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); + err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0); kfree(sup); if (err) return err; @@ -252,14 +250,13 @@ static int create_default_filesystem(struct ubifs_info *c) mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); - err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0, - UBI_UNKNOWN); + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0); if (err) { kfree(mst); return err; } - err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0, - UBI_UNKNOWN); + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, + 0); kfree(mst); if (err) return err; @@ -282,8 +279,7 @@ static int create_default_filesystem(struct ubifs_info *c) key_write_idx(c, &key, &br->key); br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); - err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0, - UBI_UNKNOWN); + err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0); kfree(idx); if (err) return err; @@ -315,8 +311,7 @@ static int create_default_filesystem(struct ubifs_info *c) ino->flags = cpu_to_le32(UBIFS_COMPR_FL); err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, - main_first + DEFAULT_DATA_LEB, 0, - UBI_UNKNOWN); + main_first + DEFAULT_DATA_LEB, 0); kfree(ino); if (err) return err; @@ -335,8 +330,7 @@ static int create_default_filesystem(struct ubifs_info *c) return -ENOMEM; cs->ch.node_type = UBIFS_CS_NODE; - err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, - 0, UBI_UNKNOWN); + err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0); kfree(cs); ubifs_msg("default file-system created"); @@ -475,7 +469,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) failed: ubifs_err("bad superblock, error %d", err); - dbg_dump_node(c, sup); + ubifs_dump_node(c, sup); return -EINVAL; } @@ -518,7 +512,7 @@ int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); - return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM); + return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len); } /** @@ -691,7 +685,7 @@ static int fixup_leb(struct ubifs_info *c, int lnum, int len) if (err) return err; - return ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN); + return ubifs_leb_change(c, lnum, c->sbuf, len); } /** diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 37383e8011b1..7c40e6025fd6 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -101,7 +101,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, if (!quiet) { ubifs_err("bad pad node at LEB %d:%d", lnum, offs); - dbg_dump_node(c, pad); + ubifs_dump_node(c, pad); } return SCANNED_A_BAD_PAD_NODE; } @@ -109,8 +109,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, /* Make the node pads to 8-byte boundary */ if ((node_len + pad_len) & 7) { if (!quiet) - dbg_err("bad padding length %d - %d", - offs, offs + node_len + pad_len); + ubifs_err("bad padding length %d - %d", + offs, offs + node_len + pad_len); return SCANNED_A_BAD_PAD_NODE; } @@ -245,7 +245,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, len = c->leb_size - offs; if (len > 8192) len = 8192; - dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); + ubifs_err("first %d bytes from LEB %d:%d", len, lnum, offs); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); } @@ -300,16 +300,16 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, switch (ret) { case SCANNED_GARBAGE: - dbg_err("garbage"); + ubifs_err("garbage"); goto corrupted; case SCANNED_A_NODE: break; case SCANNED_A_CORRUPT_NODE: case SCANNED_A_BAD_PAD_NODE: - dbg_err("bad node"); + ubifs_err("bad node"); goto corrupted; default: - dbg_err("unknown"); + ubifs_err("unknown"); err = -EINVAL; goto error; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 76e4e0566ad6..5862dd9d2784 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -246,8 +246,8 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) out_invalid: ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); - dbg_dump_node(c, ino); - dbg_dump_inode(c, inode); + ubifs_dump_node(c, ino); + ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: kfree(ino); @@ -378,7 +378,7 @@ out: smp_wmb(); } done: - end_writeback(inode); + clear_inode(inode); } static void ubifs_dirty_inode(struct inode *inode, int flags) @@ -668,8 +668,8 @@ static int init_constants_sb(struct ubifs_info *c) tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; tmp = ALIGN(tmp, c->min_io_size); if (tmp > c->leb_size) { - dbg_err("too small LEB size %d, at least %d needed", - c->leb_size, tmp); + ubifs_err("too small LEB size %d, at least %d needed", + c->leb_size, tmp); return -EINVAL; } @@ -683,8 +683,8 @@ static int init_constants_sb(struct ubifs_info *c) tmp /= c->leb_size; tmp += 1; if (c->log_lebs < tmp) { - dbg_err("too small log %d LEBs, required min. %d LEBs", - c->log_lebs, tmp); + ubifs_err("too small log %d LEBs, required min. %d LEBs", + c->log_lebs, tmp); return -EINVAL; } @@ -813,13 +813,10 @@ static int alloc_wbufs(struct ubifs_info *c) c->jheads[i].grouped = 1; } - c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; /* - * Garbage Collector head likely contains long-term data and - * does not need to be synchronized by timer. Also GC head nodes are - * not grouped. + * Garbage Collector head does not need to be synchronized by timer. + * Also GC head nodes are not grouped. */ - c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; c->jheads[GCHD].wbuf.no_timer = 1; c->jheads[GCHD].grouped = 0; @@ -863,7 +860,7 @@ static void free_orphans(struct ubifs_info *c) orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); list_del(&orph->list); kfree(orph); - dbg_err("orphan list not empty at unmount"); + ubifs_err("orphan list not empty at unmount"); } vfree(c->orph_buf); @@ -1147,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c) ubifs_assert(c->dark_wm > 0); if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { ubifs_err("insufficient free space to mount in R/W mode"); - dbg_dump_budg(c, &c->bi); - dbg_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); return -ENOSPC; } return 0; @@ -1301,7 +1298,7 @@ static int mount_ubifs(struct ubifs_info *c) if (!c->ro_mount && c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) - goto out_master; + goto out_lpt; } if (!c->ro_mount) { @@ -2126,8 +2123,8 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - dbg_err("cannot open \"%s\", error %d", - name, (int)PTR_ERR(ubi)); + ubifs_err("cannot open \"%s\", error %d", + name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index abd51331345e..349f31a30f40 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -339,8 +339,8 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, dent); if (err) { - dbg_dump_stack(); - dbg_dump_node(c, dent); + dump_stack(); + ubifs_dump_node(c, dent); return err; } @@ -372,8 +372,8 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, node); if (err) { - dbg_dump_stack(); - dbg_dump_node(c, node); + dump_stack(); + ubifs_dump_node(c, node); return err; } @@ -1733,8 +1733,8 @@ out_err: err = -EINVAL; out: ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_dump_node(c, buf); + dump_stack(); return err; } @@ -1775,7 +1775,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) if (err && err != -EBADMSG) { ubifs_err("failed to read from LEB %d:%d, error %d", lnum, offs, err); - dbg_dump_stack(); + dump_stack(); dbg_tnck(&bu->key, "key "); return err; } @@ -2403,7 +2403,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) err = ubifs_add_dirt(c, zbr->lnum, zbr->len); if (err) { - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); return err; } @@ -2649,7 +2649,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, err = ubifs_add_dirt(c, znode->zbranch[i].lnum, znode->zbranch[i].len); if (err) { - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); goto out_unlock; } dbg_tnck(key, "removing key "); @@ -3275,8 +3275,6 @@ out_unlock: return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG - /** * dbg_check_inode_size - check if inode size is correct. * @c: UBIFS file-system description object @@ -3335,13 +3333,11 @@ out_dump: (unsigned long)inode->i_ino, size, ((loff_t)block) << UBIFS_BLOCK_SHIFT); mutex_unlock(&c->tnc_mutex); - dbg_dump_inode(c, inode); - dbg_dump_stack(); + ubifs_dump_inode(c, inode); + dump_stack(); return -EINVAL; out_unlock: mutex_unlock(&c->tnc_mutex); return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 4c15f07a8bb2..523bbad69c0c 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -54,18 +54,16 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, br->len = cpu_to_le32(zbr->len); if (!zbr->lnum || !zbr->len) { ubifs_err("bad ref in znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); if (zbr->znode) - dbg_dump_znode(c, zbr->znode); + ubifs_dump_znode(c, zbr->znode); } } ubifs_prepare_node(c, idx, len, 0); -#ifdef CONFIG_UBIFS_FS_DEBUG znode->lnum = lnum; znode->offs = offs; znode->len = len; -#endif err = insert_old_idx_znode(c, znode); @@ -322,8 +320,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p) 0, 0, 0); if (err) return err; - err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len); if (err) return err; dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); @@ -388,8 +385,8 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) * option which forces in-the-gaps is enabled. */ ubifs_warn("out of space"); - dbg_dump_budg(c, &c->bi); - dbg_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); } /* Try to commit anyway */ err = 0; @@ -456,11 +453,9 @@ static int layout_in_empty_space(struct ubifs_info *c) offs = buf_offs + used; -#ifdef CONFIG_UBIFS_FS_DEBUG znode->lnum = lnum; znode->offs = offs; znode->len = len; -#endif /* Update the parent */ zp = znode->parent; @@ -536,10 +531,8 @@ static int layout_in_empty_space(struct ubifs_info *c) break; } -#ifdef CONFIG_UBIFS_FS_DEBUG c->dbg->new_ihead_lnum = lnum; c->dbg->new_ihead_offs = buf_offs; -#endif return 0; } @@ -864,9 +857,9 @@ static int write_index(struct ubifs_info *c) br->len = cpu_to_le32(zbr->len); if (!zbr->lnum || !zbr->len) { ubifs_err("bad ref in znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); if (zbr->znode) - dbg_dump_znode(c, zbr->znode); + ubifs_dump_znode(c, zbr->znode); } } len = ubifs_idx_node_sz(c, znode->child_cnt); @@ -881,13 +874,11 @@ static int write_index(struct ubifs_info *c) } offs = buf_offs + used; -#ifdef CONFIG_UBIFS_FS_DEBUG if (lnum != znode->lnum || offs != znode->offs || len != znode->len) { ubifs_err("inconsistent znode posn"); return -EINVAL; } -#endif /* Grab some stuff from znode while we still can */ cnext = znode->cnext; @@ -959,8 +950,7 @@ static int write_index(struct ubifs_info *c) } /* The buffer is full or there are no more znodes to do */ - err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen, - UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen); if (err) return err; buf_offs += blen; @@ -982,13 +972,11 @@ static int write_index(struct ubifs_info *c) break; } -#ifdef CONFIG_UBIFS_FS_DEBUG if (lnum != c->dbg->new_ihead_lnum || buf_offs != c->dbg->new_ihead_offs) { ubifs_err("inconsistent ihead"); return -EINVAL; } -#endif c->ihead_lnum = lnum; c->ihead_offs = buf_offs; diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index dc28fe6ec07a..d38ac7f9654b 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -293,10 +293,10 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, lnum, offs, znode->level, znode->child_cnt); if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { - dbg_err("current fanout %d, branch count %d", - c->fanout, znode->child_cnt); - dbg_err("max levels %d, znode level %d", - UBIFS_MAX_LEVELS, znode->level); + ubifs_err("current fanout %d, branch count %d", + c->fanout, znode->child_cnt); + ubifs_err("max levels %d, znode level %d", + UBIFS_MAX_LEVELS, znode->level); err = 1; goto out_dump; } @@ -316,7 +316,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, if (zbr->lnum < c->main_first || zbr->lnum >= c->leb_cnt || zbr->offs < 0 || zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { - dbg_err("bad branch %d", i); + ubifs_err("bad branch %d", i); err = 2; goto out_dump; } @@ -340,19 +340,19 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, type = key_type(c, &zbr->key); if (c->ranges[type].max_len == 0) { if (zbr->len != c->ranges[type].len) { - dbg_err("bad target node (type %d) length (%d)", - type, zbr->len); - dbg_err("have to be %d", c->ranges[type].len); + ubifs_err("bad target node (type %d) length (%d)", + type, zbr->len); + ubifs_err("have to be %d", c->ranges[type].len); err = 4; goto out_dump; } } else if (zbr->len < c->ranges[type].min_len || zbr->len > c->ranges[type].max_len) { - dbg_err("bad target node (type %d) length (%d)", - type, zbr->len); - dbg_err("have to be in range of %d-%d", - c->ranges[type].min_len, - c->ranges[type].max_len); + ubifs_err("bad target node (type %d) length (%d)", + type, zbr->len); + ubifs_err("have to be in range of %d-%d", + c->ranges[type].min_len, + c->ranges[type].max_len); err = 5; goto out_dump; } @@ -370,13 +370,13 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, cmp = keys_cmp(c, key1, key2); if (cmp > 0) { - dbg_err("bad key order (keys %d and %d)", i, i + 1); + ubifs_err("bad key order (keys %d and %d)", i, i + 1); err = 6; goto out_dump; } else if (cmp == 0 && !is_hash_key(c, key1)) { /* These can only be keys with colliding hash */ - dbg_err("keys %d and %d are not hashed but equivalent", - i, i + 1); + ubifs_err("keys %d and %d are not hashed but equivalent", + i, i + 1); err = 7; goto out_dump; } @@ -387,7 +387,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, out_dump: ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err); - dbg_dump_node(c, idx); + ubifs_dump_node(c, idx); kfree(idx); return -EINVAL; } @@ -486,7 +486,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, zbr->lnum, zbr->offs); dbg_tnck(key, "looked for key "); dbg_tnck(&key1, "but found node's key "); - dbg_dump_node(c, node); + ubifs_dump_node(c, node); return -EINVAL; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 93d59aceaaef..1e5a08623d11 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -650,8 +650,6 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @avail: number of bytes available in the write-buffer * @used: number of used bytes in the write-buffer * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) - * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, - * %UBI_UNKNOWN) * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep * up by 'mutex_lock_nested()). * @sync_callback: write-buffer synchronization callback @@ -685,7 +683,6 @@ struct ubifs_wbuf { int avail; int used; int size; - int dtype; int jhead; int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); struct mutex io_mutex; @@ -762,6 +759,9 @@ struct ubifs_zbranch { * @offs: offset of the corresponding indexing node * @len: length of the corresponding indexing node * @zbranch: array of znode branches (@c->fanout elements) + * + * Note! The @lnum, @offs, and @len fields are not really needed - we have them + * only for internal consistency check. They could be removed to save some RAM. */ struct ubifs_znode { struct ubifs_znode *parent; @@ -772,9 +772,9 @@ struct ubifs_znode { int child_cnt; int iip; int alt; -#ifdef CONFIG_UBIFS_FS_DEBUG - int lnum, offs, len; -#endif + int lnum; + int offs; + int len; struct ubifs_zbranch zbranch[]; }; @@ -1444,9 +1444,7 @@ struct ubifs_info { struct rb_root size_tree; struct ubifs_mount_opts mount_opts; -#ifdef CONFIG_UBIFS_FS_DEBUG struct ubifs_debug_info *dbg; -#endif }; extern struct list_head ubifs_infos; @@ -1468,22 +1466,20 @@ void ubifs_ro_mode(struct ubifs_info *c, int err); int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, int len, int even_ebadmsg); int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, - int len, int dtype); -int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, - int dtype); + int len); +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len); int ubifs_leb_unmap(struct ubifs_info *c, int lnum); -int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype); +int ubifs_leb_map(struct ubifs_info *c, int lnum); int ubifs_is_mapped(const struct ubifs_info *c, int lnum); int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); -int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, - int dtype); +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs); int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, int lnum, int offs); int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int lnum, int offs); int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, - int offs, int dtype); + int offs); int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, int offs, int quiet, int must_chk_crc); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 7a8bafa19c9f..0f7139bdb2c2 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -399,8 +399,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, if (buf) { /* If @buf is %NULL we are supposed to return the length */ if (ui->data_len > size) { - dbg_err("buffer size %zd, xattr len %d", - size, ui->data_len); + ubifs_err("buffer size %zd, xattr len %d", + size, ui->data_len); err = -ERANGE; goto out_iput; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7d7528008359..873e1bab9c4c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -80,7 +80,7 @@ void udf_evict_inode(struct inode *inode) } else truncate_inode_pages(&inode->i_data, 0); invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7cdd3953d67e..dd7c89d8a1c1 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -895,7 +895,7 @@ void ufs_evict_inode(struct inode * inode) } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (want_delete) { lock_ufs(inode->i_sb); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 0a9977983f92..d2bf974b1a2f 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -33,6 +33,7 @@ xfs-y += xfs_aops.o \ xfs_discard.o \ xfs_error.o \ xfs_export.o \ + xfs_extent_busy.o \ xfs_file.o \ xfs_filestream.o \ xfs_fsops.o \ @@ -49,7 +50,6 @@ xfs-y += xfs_aops.o \ xfs_sync.o \ xfs_xattr.o \ xfs_rename.o \ - xfs_rw.o \ xfs_utils.o \ xfs_vnodeops.o \ kmem.o \ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 4805f009f923..44d65c1533c0 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -175,24 +175,6 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that - * have been freed but whose transactions aren't committed to disk yet. - * - * Note that we use the transaction ID to record the transaction, not the - * transaction structure itself. See xfs_alloc_busy_insert() for details. - */ -struct xfs_busy_extent { - struct rb_node rb_node; /* ag by-bno indexed search tree */ - struct list_head list; /* transaction busy extent list */ - xfs_agnumber_t agno; - xfs_agblock_t bno; - xfs_extlen_t length; - unsigned int flags; -#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ -#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */ -}; - -/* * Per-ag incore structure, copies of information in agf and agi, * to improve the performance of allocation group selection. */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0f0df2759b09..229641fb8e67 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -32,6 +31,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -47,8 +47,6 @@ STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); -STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *, - xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *); /* * Lookup the record equal to [bno, len] in the btree given by cur. @@ -152,7 +150,7 @@ xfs_alloc_compute_aligned( xfs_extlen_t len; /* Trim busy sections out of found extent */ - xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); + xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); if (args->alignment > 1 && len >= args->minlen) { xfs_agblock_t aligned_bno = roundup(bno, args->alignment); @@ -536,7 +534,7 @@ xfs_alloc_ag_vextent( if (error) return error; - ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, + ASSERT(!xfs_extent_busy_search(args->mp, args->agno, args->agbno, args->len)); } @@ -603,7 +601,7 @@ xfs_alloc_ag_vextent_exact( /* * Check for overlapping busy extents. */ - xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); + xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -1391,7 +1389,7 @@ xfs_alloc_ag_vextent_small( if (error) goto error0; if (fbno != NULLAGBLOCK) { - xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, + xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, args->userdata); if (args->userdata) { @@ -2496,579 +2494,8 @@ xfs_free_extent( error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); if (!error) - xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); + xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); error0: xfs_perag_put(args.pag); return error; } - -void -xfs_alloc_busy_insert( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len, - unsigned int flags) -{ - struct xfs_busy_extent *new; - struct xfs_busy_extent *busyp; - struct xfs_perag *pag; - struct rb_node **rbp; - struct rb_node *parent = NULL; - - new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); - if (!new) { - /* - * No Memory! Since it is now not possible to track the free - * block, make this a synchronous transaction to insure that - * the block is not reused before this transaction commits. - */ - trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len); - xfs_trans_set_sync(tp); - return; - } - - new->agno = agno; - new->bno = bno; - new->length = len; - INIT_LIST_HEAD(&new->list); - new->flags = flags; - - /* trace before insert to be able to see failed inserts */ - trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); - - pag = xfs_perag_get(tp->t_mountp, new->agno); - spin_lock(&pag->pagb_lock); - rbp = &pag->pagb_tree.rb_node; - while (*rbp) { - parent = *rbp; - busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); - - if (new->bno < busyp->bno) { - rbp = &(*rbp)->rb_left; - ASSERT(new->bno + new->length <= busyp->bno); - } else if (new->bno > busyp->bno) { - rbp = &(*rbp)->rb_right; - ASSERT(bno >= busyp->bno + busyp->length); - } else { - ASSERT(0); - } - } - - rb_link_node(&new->rb_node, parent, rbp); - rb_insert_color(&new->rb_node, &pag->pagb_tree); - - list_add(&new->list, &tp->t_busy); - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); -} - -/* - * Search for a busy extent within the range of the extent we are about to - * allocate. You need to be holding the busy extent tree lock when calling - * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy - * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact - * match. This is done so that a non-zero return indicates an overlap that - * will require a synchronous transaction, but it can still be - * used to distinguish between a partial or exact match. - */ -int -xfs_alloc_busy_search( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) -{ - struct xfs_perag *pag; - struct rb_node *rbp; - struct xfs_busy_extent *busyp; - int match = 0; - - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); - - rbp = pag->pagb_tree.rb_node; - - /* find closest start bno overlap */ - while (rbp) { - busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); - if (bno < busyp->bno) { - /* may overlap, but exact start block is lower */ - if (bno + len > busyp->bno) - match = -1; - rbp = rbp->rb_left; - } else if (bno > busyp->bno) { - /* may overlap, but exact start block is higher */ - if (bno < busyp->bno + busyp->length) - match = -1; - rbp = rbp->rb_right; - } else { - /* bno matches busyp, length determines exact match */ - match = (busyp->length == len) ? 1 : -1; - break; - } - } - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - return match; -} - -/* - * The found free extent [fbno, fend] overlaps part or all of the given busy - * extent. If the overlap covers the beginning, the end, or all of the busy - * extent, the overlapping portion can be made unbusy and used for the - * allocation. We can't split a busy extent because we can't modify a - * transaction/CIL context busy list, but we can update an entries block - * number or length. - * - * Returns true if the extent can safely be reused, or false if the search - * needs to be restarted. - */ -STATIC bool -xfs_alloc_busy_update_extent( - struct xfs_mount *mp, - struct xfs_perag *pag, - struct xfs_busy_extent *busyp, - xfs_agblock_t fbno, - xfs_extlen_t flen, - bool userdata) -{ - xfs_agblock_t fend = fbno + flen; - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - /* - * This extent is currently being discarded. Give the thread - * performing the discard a chance to mark the extent unbusy - * and retry. - */ - if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) { - spin_unlock(&pag->pagb_lock); - delay(1); - spin_lock(&pag->pagb_lock); - return false; - } - - /* - * If there is a busy extent overlapping a user allocation, we have - * no choice but to force the log and retry the search. - * - * Fortunately this does not happen during normal operation, but - * only if the filesystem is very low on space and has to dip into - * the AGFL for normal allocations. - */ - if (userdata) - goto out_force_log; - - if (bbno < fbno && bend > fend) { - /* - * Case 1: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - */ - - /* - * We would have to split the busy extent to be able to track - * it correct, which we cannot do because we would have to - * modify the list of busy extents attached to the transaction - * or CIL context, which is immutable. - * - * Force out the log to clear the busy extent and retry the - * search. - */ - goto out_force_log; - } else if (bbno >= fbno && bend <= fend) { - /* - * Case 2: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------+ - * fbno fend - * - * Case 3: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Case 4: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Case 5: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------------------------+ - * fbno fend - * - */ - - /* - * The busy extent is fully covered by the extent we are - * allocating, and can simply be removed from the rbtree. - * However we cannot remove it from the immutable list - * tracking busy extents in the transaction or CIL context, - * so set the length to zero to mark it invalid. - * - * We also need to restart the busy extent search from the - * tree root, because erasing the node can rearrange the - * tree topology. - */ - rb_erase(&busyp->rb_node, &pag->pagb_tree); - busyp->length = 0; - return false; - } else if (fend < bend) { - /* - * Case 6: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - * - * Case 7: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +------------------+ - * fbno fend - * - */ - busyp->bno = fend; - } else if (bbno < fbno) { - /* - * Case 8: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 9: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +----------------------+ - * fbno fend - */ - busyp->length = fbno - busyp->bno; - } else { - ASSERT(0); - } - - trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen); - return true; - -out_force_log: - spin_unlock(&pag->pagb_lock); - xfs_log_force(mp, XFS_LOG_SYNC); - trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen); - spin_lock(&pag->pagb_lock); - return false; -} - - -/* - * For a given extent [fbno, flen], make sure we can reuse it safely. - */ -void -xfs_alloc_busy_reuse( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agblock_t fbno, - xfs_extlen_t flen, - bool userdata) -{ - struct xfs_perag *pag; - struct rb_node *rbp; - - ASSERT(flen > 0); - - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); -restart: - rbp = pag->pagb_tree.rb_node; - while (rbp) { - struct xfs_busy_extent *busyp = - rb_entry(rbp, struct xfs_busy_extent, rb_node); - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - if (fbno + flen <= bbno) { - rbp = rbp->rb_left; - continue; - } else if (fbno >= bend) { - rbp = rbp->rb_right; - continue; - } - - if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen, - userdata)) - goto restart; - } - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); -} - -/* - * For a given extent [fbno, flen], search the busy extent list to find a - * subset of the extent that is not busy. If *rlen is smaller than - * args->minlen no suitable extent could be found, and the higher level - * code needs to force out the log and retry the allocation. - */ -STATIC void -xfs_alloc_busy_trim( - struct xfs_alloc_arg *args, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_agblock_t *rbno, - xfs_extlen_t *rlen) -{ - xfs_agblock_t fbno; - xfs_extlen_t flen; - struct rb_node *rbp; - - ASSERT(len > 0); - - spin_lock(&args->pag->pagb_lock); -restart: - fbno = bno; - flen = len; - rbp = args->pag->pagb_tree.rb_node; - while (rbp && flen >= args->minlen) { - struct xfs_busy_extent *busyp = - rb_entry(rbp, struct xfs_busy_extent, rb_node); - xfs_agblock_t fend = fbno + flen; - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - if (fend <= bbno) { - rbp = rbp->rb_left; - continue; - } else if (fbno >= bend) { - rbp = rbp->rb_right; - continue; - } - - /* - * If this is a metadata allocation, try to reuse the busy - * extent instead of trimming the allocation. - */ - if (!args->userdata && - !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) { - if (!xfs_alloc_busy_update_extent(args->mp, args->pag, - busyp, fbno, flen, - false)) - goto restart; - continue; - } - - if (bbno <= fbno) { - /* start overlap */ - - /* - * Case 1: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - * - * Case 2: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 3: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 4: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------+ - * fbno fend - * - * No unbusy region in extent, return failure. - */ - if (fend <= bend) - goto fail; - - /* - * Case 5: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +----------------------+ - * fbno fend - * - * Case 6: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Needs to be trimmed to: - * +-------+ - * fbno fend - */ - fbno = bend; - } else if (bend >= fend) { - /* end overlap */ - - /* - * Case 7: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +------------------+ - * fbno fend - * - * Case 8: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Needs to be trimmed to: - * +-------+ - * fbno fend - */ - fend = bbno; - } else { - /* middle overlap */ - - /* - * Case 9: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------------------------+ - * fbno fend - * - * Can be trimmed to: - * +-------+ OR +-------+ - * fbno fend fbno fend - * - * Backward allocation leads to significant - * fragmentation of directories, which degrades - * directory performance, therefore we always want to - * choose the option that produces forward allocation - * patterns. - * Preferring the lower bno extent will make the next - * request use "fend" as the start of the next - * allocation; if the segment is no longer busy at - * that point, we'll get a contiguous allocation, but - * even if it is still busy, we will get a forward - * allocation. - * We try to avoid choosing the segment at "bend", - * because that can lead to the next allocation - * taking the segment at "fbno", which would be a - * backward allocation. We only use the segment at - * "fbno" if it is much larger than the current - * requested size, because in that case there's a - * good chance subsequent allocations will be - * contiguous. - */ - if (bbno - fbno >= args->maxlen) { - /* left candidate fits perfect */ - fend = bbno; - } else if (fend - bend >= args->maxlen * 4) { - /* right candidate has enough free space */ - fbno = bend; - } else if (bbno - fbno >= args->minlen) { - /* left candidate fits minimum requirement */ - fend = bbno; - } else { - goto fail; - } - } - - flen = fend - fbno; - } - spin_unlock(&args->pag->pagb_lock); - - if (fbno != bno || flen != len) { - trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, - fbno, flen); - } - *rbno = fbno; - *rlen = flen; - return; -fail: - /* - * Return a zero extent length as failure indications. All callers - * re-check if the trimmed extent satisfies the minlen requirement. - */ - spin_unlock(&args->pag->pagb_lock); - trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0); - *rbno = fbno; - *rlen = 0; -} - -static void -xfs_alloc_busy_clear_one( - struct xfs_mount *mp, - struct xfs_perag *pag, - struct xfs_busy_extent *busyp) -{ - if (busyp->length) { - trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno, - busyp->length); - rb_erase(&busyp->rb_node, &pag->pagb_tree); - } - - list_del_init(&busyp->list); - kmem_free(busyp); -} - -/* - * Remove all extents on the passed in list from the busy extents tree. - * If do_discard is set skip extents that need to be discarded, and mark - * these as undergoing a discard operation instead. - */ -void -xfs_alloc_busy_clear( - struct xfs_mount *mp, - struct list_head *list, - bool do_discard) -{ - struct xfs_busy_extent *busyp, *n; - struct xfs_perag *pag = NULL; - xfs_agnumber_t agno = NULLAGNUMBER; - - list_for_each_entry_safe(busyp, n, list, list) { - if (busyp->agno != agno) { - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); - agno = busyp->agno; - } - - if (do_discard && busyp->length && - !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD)) - busyp->flags = XFS_ALLOC_BUSY_DISCARDED; - else - xfs_alloc_busy_clear_one(mp, pag, busyp); - } - - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } -} - -/* - * Callback for list_sort to sort busy extents by the AG they reside in. - */ -int -xfs_busy_extent_ag_cmp( - void *priv, - struct list_head *a, - struct list_head *b) -{ - return container_of(a, struct xfs_busy_extent, list)->agno - - container_of(b, struct xfs_busy_extent, list)->agno; -} diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 3a7e7d8f8ded..93be4a667ca1 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -23,7 +23,6 @@ struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; -struct xfs_busy_extent; extern struct workqueue_struct *xfs_alloc_wq; @@ -139,33 +138,6 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag); -#ifdef __KERNEL__ -void -xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); - -void -xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list, - bool do_discard); - -int -xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); - -void -xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); - -int -xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b); - -static inline void xfs_alloc_busy_sort(struct list_head *list) -{ - list_sort(NULL, list, xfs_busy_extent_ag_cmp); -} - -#endif /* __KERNEL__ */ - /* * Compute and fill in value of m_ag_maxlevels. */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index ffb3386e45c1..f1647caace8f 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -32,6 +30,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -94,7 +93,7 @@ xfs_allocbt_alloc_block( return 0; } - xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); @@ -119,8 +118,8 @@ xfs_allocbt_free_block( if (error) return error; - xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, - XFS_ALLOC_BUSY_SKIP_DISCARD); + xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0dbb9e70fe21..ae31c313a79e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -16,9 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_trans.h" @@ -29,7 +27,6 @@ #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_iomap.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -623,7 +620,7 @@ xfs_map_at_offset( * or delayed allocate extent. */ STATIC int -xfs_is_delayed_page( +xfs_check_page_type( struct page *page, unsigned int type) { @@ -637,11 +634,11 @@ xfs_is_delayed_page( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable = (type == IO_UNWRITTEN); + acceptable += (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IO_DELALLOC); + acceptable += (type == IO_DELALLOC); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IO_OVERWRITE); + acceptable += (type == IO_OVERWRITE); else break; } while ((bh = bh->b_this_page) != head); @@ -684,7 +681,7 @@ xfs_convert_page( goto fail_unlock_page; if (page->mapping != inode->i_mapping) goto fail_unlock_page; - if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) + if (!xfs_check_page_type(page, (*ioendp)->io_type)) goto fail_unlock_page; /* @@ -834,7 +831,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_is_delayed_page(page, IO_DELALLOC)) + if (!xfs_check_page_type(page, IO_DELALLOC)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -1146,7 +1143,14 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - if (create) { + /* + * Direct I/O is usually done on preallocated files, so try getting + * a block mapping without an exclusive lock first. For buffered + * writes we already have the exclusive iolock anyway, so avoiding + * a lock roundtrip here by taking the ilock exclusive from the + * beginning is a useful micro optimization. + */ + if (create && !direct) { lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); } else { @@ -1168,23 +1172,45 @@ __xfs_get_blocks( (!nimaps || (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK))) { - if (direct) { + if (direct || xfs_get_extsz_hint(ip)) { + /* + * Drop the ilock in preparation for starting the block + * allocation transaction. It will be retaken + * exclusively inside xfs_iomap_write_direct for the + * actual allocation. + */ + xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset, size, &imap, nimaps); + if (error) + return -error; + new = 1; } else { + /* + * Delalloc reservations do not require a transaction, + * we can go on without dropping the lock here. If we + * are allocating a new delalloc block, make sure that + * we set the new flag so that we mark the buffer new so + * that we know that it is newly allocated if the write + * fails. + */ + if (nimaps && imap.br_startblock == HOLESTARTBLOCK) + new = 1; error = xfs_iomap_write_delay(ip, offset, size, &imap); + if (error) + goto out_unlock; + + xfs_iunlock(ip, lockmode); } - if (error) - goto out_unlock; trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); } else if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); goto out_unlock; } - xfs_iunlock(ip, lockmode); if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK) { @@ -1386,52 +1412,91 @@ out_destroy_ioend: return ret; } +/* + * Punch out the delalloc blocks we have already allocated. + * + * Don't bother with xfs_setattr given that nothing can have made it to disk yet + * as the page is still locked at this point. + */ +STATIC void +xfs_vm_kill_delalloc_range( + struct inode *inode, + loff_t start, + loff_t end) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + start_fsb = XFS_B_TO_FSB(ip->i_mount, start); + end_fsb = XFS_B_TO_FSB(ip->i_mount, end); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); +} + STATIC void xfs_vm_write_failed( - struct address_space *mapping, - loff_t to) + struct inode *inode, + struct page *page, + loff_t pos, + unsigned len) { - struct inode *inode = mapping->host; + loff_t block_offset = pos & PAGE_MASK; + loff_t block_start; + loff_t block_end; + loff_t from = pos & (PAGE_CACHE_SIZE - 1); + loff_t to = from + len; + struct buffer_head *bh, *head; - if (to > inode->i_size) { - /* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have - * made it to disk yet as the page is still locked at this - * point. - */ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; + ASSERT(block_offset + from == pos); - truncate_pagecache(inode, to, inode->i_size); + head = page_buffers(page); + block_start = 0; + for (bh = head; bh != head || !block_start; + bh = bh->b_this_page, block_start = block_end, + block_offset += bh->b_size) { + block_end = block_start + bh->b_size; - /* - * Check if there are any blocks that are outside of i_size - * that need to be trimmed back. - */ - start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; - end_fsb = XFS_B_TO_FSB(ip->i_mount, to); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* skip buffers before the write */ + if (block_end <= from) + continue; + + /* if the buffer is after the write, we're done */ + if (block_start >= to) + break; + + if (!buffer_delay(bh)) + continue; + + if (!buffer_new(bh) && block_offset < i_size_read(inode)) + continue; + + xfs_vm_kill_delalloc_range(inode, block_offset, + block_offset + bh->b_size); } + } +/* + * This used to call block_write_begin(), but it unlocks and releases the page + * on error, and we need that page to be able to punch stale delalloc blocks out + * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at + * the appropriate point. + */ STATIC int xfs_vm_write_begin( struct file *file, @@ -1442,15 +1507,40 @@ xfs_vm_write_begin( struct page **pagep, void **fsdata) { - int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int status; - ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS, - pagep, xfs_get_blocks); - if (unlikely(ret)) - xfs_vm_write_failed(mapping, pos + len); - return ret; + ASSERT(len <= PAGE_CACHE_SIZE); + + page = grab_cache_page_write_begin(mapping, index, + flags | AOP_FLAG_NOFS); + if (!page) + return -ENOMEM; + + status = __block_write_begin(page, pos, len, xfs_get_blocks); + if (unlikely(status)) { + struct inode *inode = mapping->host; + + xfs_vm_write_failed(inode, page, pos, len); + unlock_page(page); + + if (pos + len > i_size_read(inode)) + truncate_pagecache(inode, pos + len, i_size_read(inode)); + + page_cache_release(page); + page = NULL; + } + + *pagep = page; + return status; } +/* + * On failure, we only need to kill delalloc blocks beyond EOF because they + * will never be written. For blocks within EOF, generic_write_end() zeros them + * so they are safe to leave alone and be written with all the other valid data. + */ STATIC int xfs_vm_write_end( struct file *file, @@ -1463,9 +1553,19 @@ xfs_vm_write_end( { int ret; + ASSERT(len <= PAGE_CACHE_SIZE); + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) - xfs_vm_write_failed(mapping, pos + len); + if (unlikely(ret < len)) { + struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); + loff_t to = pos + len; + + if (to > isize) { + truncate_pagecache(inode, to, isize); + xfs_vm_kill_delalloc_range(inode, isize, to); + } + } return ret; } diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 65d61b948ead..a17ff01b5adf 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -21,7 +21,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -39,7 +38,6 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" -#include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -1987,14 +1985,12 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, XBF_LOCK | XBF_DONT_BLOCK, - &bp); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + dblkno, blkcnt, 0, &bp); if (error) return(error); - tmp = (valuelen < XFS_BUF_SIZE(bp)) - ? valuelen : XFS_BUF_SIZE(bp); + tmp = min_t(int, valuelen, BBTOB(bp->b_length)); xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); xfs_buf_relse(bp); dst += tmp; @@ -2097,6 +2093,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) lblkno = args->rmtblkno; valuelen = args->valuelen; while (valuelen > 0) { + int buflen; + /* * Try to remember where we decided to put the value. */ @@ -2114,15 +2112,16 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, - XBF_LOCK | XBF_DONT_BLOCK); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); if (!bp) return ENOMEM; - tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : - XFS_BUF_SIZE(bp); + + buflen = BBTOB(bp->b_length); + tmp = min_t(int, valuelen, buflen); xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); - if (tmp < XFS_BUF_SIZE(bp)) - xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); + if (tmp < buflen) + xfs_buf_zero(bp, tmp, buflen - tmp); + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 76d93dc953e1..7d89d800f517 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -2983,7 +2982,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, map.br_blockcount); bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, XBF_LOCK); + dblkno, dblkcnt, 0); if (!bp) return ENOMEM; xfs_trans_binval(*trans, bp); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 85e7e327bcd8..58b815ec8c91 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -41,7 +41,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_attr_leaf.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_buf_item.h" @@ -4527,7 +4526,7 @@ out_unreserve_blocks: xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) - xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? + xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); return error; } @@ -5621,8 +5620,20 @@ xfs_getbmap( XFS_FSB_TO_BB(mp, map[i].br_blockcount); out[cur_ext].bmv_unused1 = 0; out[cur_ext].bmv_unused2 = 0; - ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || - (map[i].br_startblock != DELAYSTARTBLOCK)); + + /* + * delayed allocation extents that start beyond EOF can + * occur due to speculative EOF allocation when the + * delalloc extent is larger than the largest freespace + * extent at conversion time. These extents cannot be + * converted by data writeback, so can exist here even + * if we are not supposed to be finding delalloc + * extents. + */ + if (map[i].br_startblock == DELAYSTARTBLOCK && + map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + ASSERT((iflags & BMV_IF_DELALLOC) != 0); + if (map[i].br_startblock == HOLESTARTBLOCK && whichfork == XFS_ATTR_FORK) { /* came to the end of attribute fork */ @@ -6157,3 +6168,16 @@ next_block: return error; } + +/* + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. + */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 89ee672d378a..803b56d7ce16 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -211,6 +211,9 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, int *count); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); + +xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); + #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index e2f5d59cbeaf..862084a47a7e 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 1f19f03af9d3..e53e317b1582 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6819b5163e33..172d3cc8f8cb 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -35,14 +35,12 @@ #include <linux/freezer.h> #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trace.h" static kmem_zone_t *xfs_buf_zone; -STATIC int xfsbufd(void *); static struct workqueue_struct *xfslogd_workqueue; @@ -57,11 +55,7 @@ static struct workqueue_struct *xfslogd_workqueue; #endif #define xb_to_gfp(flags) \ - ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ - ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) - -#define xb_to_km(flags) \ - (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) static inline int @@ -71,11 +65,11 @@ xfs_buf_is_vmapped( /* * Return true if the buffer is vmapped. * - * The XBF_MAPPED flag is set if the buffer should be mapped, but the - * code is clever enough to know it doesn't have to map a single page, - * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1. + * b_addr is null if the buffer is not mapped, but the code is clever + * enough to know it doesn't have to map a single page, so the check has + * to be both for b_addr and bp->b_page_count > 1. */ - return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1; + return bp->b_addr && bp->b_page_count > 1; } static inline int @@ -144,8 +138,17 @@ void xfs_buf_stale( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + bp->b_flags |= XBF_STALE; - xfs_buf_delwri_dequeue(bp); + + /* + * Clear the delwri status so that a delwri queue walker will not + * flush this buffer to disk now that it is stale. The delwri queue has + * a reference to the buffer, so this is safe to do. + */ + bp->b_flags &= ~_XBF_DELWRI_Q; + atomic_set(&(bp)->b_lru_ref, 0); if (!list_empty(&bp->b_lru)) { struct xfs_buftarg *btp = bp->b_target; @@ -164,22 +167,22 @@ xfs_buf_stale( struct xfs_buf * xfs_buf_alloc( struct xfs_buftarg *target, - xfs_off_t range_base, - size_t range_length, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { struct xfs_buf *bp; - bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); + bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) return NULL; /* - * We don't want certain flags to appear in b_flags. + * We don't want certain flags to appear in b_flags unless they are + * specifically set by later operations on the buffer. */ - flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); + flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); @@ -189,14 +192,22 @@ xfs_buf_alloc( sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; - bp->b_file_offset = range_base; + /* - * Set buffer_length and count_desired to the same value initially. - * I/O routines should use count_desired, which will be the same in + * Set length and io_length to the same value initially. + * I/O routines should use io_length, which will be the same in * most cases but may be reset (e.g. XFS recovery). */ - bp->b_buffer_length = bp->b_count_desired = range_length; + bp->b_length = numblks; + bp->b_io_length = numblks; bp->b_flags = flags; + + /* + * We do not set the block number here in the buffer because we have not + * finished initialising the buffer. We insert the buffer into the cache + * in this state, so this ensures that we are unable to do IO on a + * buffer that hasn't been fully initialised. + */ bp->b_bn = XFS_BUF_DADDR_NULL; atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -219,13 +230,12 @@ _xfs_buf_get_pages( { /* Make sure that we have a page list */ if (bp->b_pages == NULL) { - bp->b_offset = xfs_buf_poff(bp->b_file_offset); bp->b_page_count = page_count; if (page_count <= XB_PAGES) { bp->b_pages = bp->b_page_array; } else { bp->b_pages = kmem_alloc(sizeof(struct page *) * - page_count, xb_to_km(flags)); + page_count, KM_NOFS); if (bp->b_pages == NULL) return -ENOMEM; } @@ -288,11 +298,11 @@ xfs_buf_allocate_memory( xfs_buf_t *bp, uint flags) { - size_t size = bp->b_count_desired; + size_t size; size_t nbytes, offset; gfp_t gfp_mask = xb_to_gfp(flags); unsigned short page_count, i; - xfs_off_t end; + xfs_off_t start, end; int error; /* @@ -300,15 +310,15 @@ xfs_buf_allocate_memory( * the memory from the heap - there's no need for the complexity of * page arrays to keep allocation down to order 0. */ - if (bp->b_buffer_length < PAGE_SIZE) { - bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); + size = BBTOB(bp->b_length); + if (size < PAGE_SIZE) { + bp->b_addr = kmem_alloc(size, KM_NOFS); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; } - if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & - PAGE_MASK) != + if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != ((unsigned long)bp->b_addr & PAGE_MASK)) { /* b_addr spans two pages - use alloc_page instead */ kmem_free(bp->b_addr); @@ -319,13 +329,14 @@ xfs_buf_allocate_memory( bp->b_pages = bp->b_page_array; bp->b_pages[0] = virt_to_page(bp->b_addr); bp->b_page_count = 1; - bp->b_flags |= XBF_MAPPED | _XBF_KMEM; + bp->b_flags |= _XBF_KMEM; return 0; } use_alloc_page: - end = bp->b_file_offset + bp->b_buffer_length; - page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); + start = BBTOB(bp->b_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; + page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) return error; @@ -388,8 +399,9 @@ _xfs_buf_map_pages( if (bp->b_page_count == 1) { /* A single page buffer is always mappable */ bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; - bp->b_flags |= XBF_MAPPED; - } else if (flags & XBF_MAPPED) { + } else if (flags & XBF_UNMAPPED) { + bp->b_addr = NULL; + } else { int retried = 0; do { @@ -403,7 +415,6 @@ _xfs_buf_map_pages( if (!bp->b_addr) return -ENOMEM; bp->b_addr += bp->b_offset; - bp->b_flags |= XBF_MAPPED; } return 0; @@ -420,29 +431,27 @@ _xfs_buf_map_pages( */ xfs_buf_t * _xfs_buf_find( - xfs_buftarg_t *btp, /* block device target */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ + struct xfs_buftarg *btp, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags, xfs_buf_t *new_bp) { - xfs_off_t range_base; - size_t range_length; + size_t numbytes; struct xfs_perag *pag; struct rb_node **rbp; struct rb_node *parent; xfs_buf_t *bp; - range_base = (ioff << BBSHIFT); - range_length = (isize << BBSHIFT); + numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(range_length < (1 << btp->bt_sshift))); - ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < (1 << btp->bt_sshift))); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, ioff)); + xfs_daddr_to_agno(btp->bt_mount, blkno)); /* walk tree */ spin_lock(&pag->pag_buf_lock); @@ -453,20 +462,20 @@ _xfs_buf_find( parent = *rbp; bp = rb_entry(parent, struct xfs_buf, b_rbnode); - if (range_base < bp->b_file_offset) + if (blkno < bp->b_bn) rbp = &(*rbp)->rb_left; - else if (range_base > bp->b_file_offset) + else if (blkno > bp->b_bn) rbp = &(*rbp)->rb_right; else { /* - * found a block offset match. If the range doesn't + * found a block number match. If the range doesn't * match, the only way this is allowed is if the buffer * in the cache is stale and the transaction that made * it stale has not yet committed. i.e. we are * reallocating a busy extent. Skip this buffer and * continue searching to the right for an exact match. */ - if (bp->b_buffer_length != range_length) { + if (bp->b_length != numblks) { ASSERT(bp->b_flags & XBF_STALE); rbp = &(*rbp)->rb_right; continue; @@ -511,7 +520,7 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM | _XBF_PAGES; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -526,63 +535,59 @@ found: */ struct xfs_buf * xfs_buf_get( - xfs_buftarg_t *target,/* target for buffer */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ + xfs_buftarg_t *target, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; - bp = _xfs_buf_find(target, ioff, isize, flags, NULL); + bp = _xfs_buf_find(target, blkno, numblks, flags, NULL); if (likely(bp)) goto found; - new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, - flags); + new_bp = xfs_buf_alloc(target, blkno, numblks, flags); if (unlikely(!new_bp)) return NULL; - bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); - if (!bp) { + error = xfs_buf_allocate_memory(new_bp, flags); + if (error) { kmem_zone_free(xfs_buf_zone, new_bp); return NULL; } - if (bp == new_bp) { - error = xfs_buf_allocate_memory(bp, flags); - if (error) - goto no_buffer; - } else - kmem_zone_free(xfs_buf_zone, new_bp); + bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp); + if (!bp) { + xfs_buf_free(new_bp); + return NULL; + } + + if (bp != new_bp) + xfs_buf_free(new_bp); /* * Now we have a workable buffer, fill in the block number so * that we can do IO on it. */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; + bp->b_bn = blkno; + bp->b_io_length = bp->b_length; found: - if (!(bp->b_flags & XBF_MAPPED)) { + if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages\n", __func__); - goto no_buffer; + xfs_buf_relse(bp); + return NULL; } } XFS_STATS_INC(xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; - -no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; } STATIC int @@ -590,32 +595,30 @@ _xfs_buf_read( xfs_buf_t *bp, xfs_buf_flags_t flags) { - int status; - - ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); + ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); - bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD); + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - status = xfs_buf_iorequest(bp); - if (status || bp->b_error || (flags & XBF_ASYNC)) - return status; + xfs_buf_iorequest(bp); + if (flags & XBF_ASYNC) + return 0; return xfs_buf_iowait(bp); } xfs_buf_t * xfs_buf_read( xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { xfs_buf_t *bp; flags |= XBF_READ; - bp = xfs_buf_get(target, ioff, isize, flags); + bp = xfs_buf_get(target, blkno, numblks, flags); if (bp) { trace_xfs_buf_read(bp, flags, _RET_IP_); @@ -627,7 +630,8 @@ xfs_buf_read( * Read ahead call which is already satisfied, * drop the buffer */ - goto no_buffer; + xfs_buf_relse(bp); + return NULL; } else { /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; @@ -635,12 +639,6 @@ xfs_buf_read( } return bp; - - no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; } /* @@ -650,14 +648,14 @@ xfs_buf_read( void xfs_buf_readahead( xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize) + xfs_daddr_t blkno, + size_t numblks) { if (bdi_read_congested(target->bt_bdi)) return; - xfs_buf_read(target, ioff, isize, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); + xfs_buf_read(target, blkno, numblks, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); } /* @@ -666,16 +664,15 @@ xfs_buf_readahead( */ struct xfs_buf * xfs_buf_read_uncached( - struct xfs_mount *mp, struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t length, + size_t numblks, int flags) { xfs_buf_t *bp; int error; - bp = xfs_buf_get_uncached(target, length, flags); + bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) return NULL; @@ -683,9 +680,9 @@ xfs_buf_read_uncached( XFS_BUF_SET_ADDR(bp, daddr); XFS_BUF_READ(bp); - xfsbdstrat(mp, bp); + xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); - if (error || bp->b_error) { + if (error) { xfs_buf_relse(bp); return NULL; } @@ -699,7 +696,7 @@ xfs_buf_read_uncached( void xfs_buf_set_empty( struct xfs_buf *bp, - size_t len) + size_t numblks) { if (bp->b_pages) _xfs_buf_free_pages(bp); @@ -707,10 +704,9 @@ xfs_buf_set_empty( bp->b_pages = NULL; bp->b_page_count = 0; bp->b_addr = NULL; - bp->b_file_offset = 0; - bp->b_buffer_length = bp->b_count_desired = len; + bp->b_length = numblks; + bp->b_io_length = numblks; bp->b_bn = XFS_BUF_DADDR_NULL; - bp->b_flags &= ~XBF_MAPPED; } static inline struct page * @@ -749,7 +745,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); + rval = _xfs_buf_get_pages(bp, page_count, 0); if (rval) return rval; @@ -760,9 +756,8 @@ xfs_buf_associate_memory( pageaddr += PAGE_SIZE; } - bp->b_count_desired = len; - bp->b_buffer_length = buflen; - bp->b_flags |= XBF_MAPPED; + bp->b_io_length = BTOBB(len); + bp->b_length = BTOBB(buflen); return 0; } @@ -770,17 +765,18 @@ xfs_buf_associate_memory( xfs_buf_t * xfs_buf_get_uncached( struct xfs_buftarg *target, - size_t len, + size_t numblks, int flags) { - unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; + unsigned long page_count; int error, i; xfs_buf_t *bp; - bp = xfs_buf_alloc(target, 0, len, 0); + bp = xfs_buf_alloc(target, 0, numblks, 0); if (unlikely(bp == NULL)) goto fail; + page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; error = _xfs_buf_get_pages(bp, page_count, 0); if (error) goto fail_free_buf; @@ -792,7 +788,7 @@ xfs_buf_get_uncached( } bp->b_flags |= _XBF_PAGES; - error = _xfs_buf_map_pages(bp, XBF_MAPPED); + error = _xfs_buf_map_pages(bp, 0); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages\n", __func__); @@ -855,7 +851,7 @@ xfs_buf_rele( spin_unlock(&pag->pag_buf_lock); } else { xfs_buf_lru_del(bp); - ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); @@ -915,13 +911,6 @@ xfs_buf_lock( trace_xfs_buf_lock_done(bp, _RET_IP_); } -/* - * Releases the lock on the buffer object. - * If the buffer is marked delwri but is not queued, do so before we - * unlock the buffer as we need to set flags correctly. We also need to - * take a reference for the delwri queue because the unlocker is going to - * drop their's and they don't know we just queued it. - */ void xfs_buf_unlock( struct xfs_buf *bp) @@ -1008,9 +997,8 @@ xfs_buf_ioerror_alert( const char *func) { xfs_alert(bp->b_target->bt_mount, -"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", - (__uint64_t)XFS_BUF_ADDR(bp), func, - bp->b_error, XFS_BUF_COUNT(bp)); +"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", + (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); } int @@ -1019,10 +1007,11 @@ xfs_bwrite( { int error; + ASSERT(xfs_buf_islocked(bp)); + bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); - xfs_buf_delwri_dequeue(bp); xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); @@ -1181,7 +1170,7 @@ _xfs_buf_ioapply( int rw, map_i, total_nr_pages, nr_pages; struct bio *bio; int offset = bp->b_offset; - int size = bp->b_count_desired; + int size = BBTOB(bp->b_io_length); sector_t sector = bp->b_bn; total_nr_pages = bp->b_page_count; @@ -1229,7 +1218,7 @@ next_chunk: break; offset = 0; - sector += nbytes >> BBSHIFT; + sector += BTOBB(nbytes); size -= nbytes; total_nr_pages--; } @@ -1248,13 +1237,13 @@ next_chunk: } } -int +void xfs_buf_iorequest( xfs_buf_t *bp) { trace_xfs_buf_iorequest(bp, _RET_IP_); - ASSERT(!(bp->b_flags & XBF_DELWRI)); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); @@ -1269,13 +1258,12 @@ xfs_buf_iorequest( _xfs_buf_ioend(bp, 0); xfs_buf_rele(bp); - return 0; } /* - * Waits for I/O to complete on the buffer supplied. - * It returns immediately if no I/O is pending. - * It returns the I/O error code, if any, or 0 if there was no error. + * Waits for I/O to complete on the buffer supplied. It returns immediately if + * no I/O is pending or there is already a pending error on the buffer. It + * returns the I/O error code, if any, or 0 if there was no error. */ int xfs_buf_iowait( @@ -1283,7 +1271,8 @@ xfs_buf_iowait( { trace_xfs_buf_iowait(bp, _RET_IP_); - wait_for_completion(&bp->b_iowait); + if (!bp->b_error) + wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); return bp->b_error; @@ -1296,7 +1285,7 @@ xfs_buf_offset( { struct page *page; - if (bp->b_flags & XBF_MAPPED) + if (bp->b_addr) return bp->b_addr + offset; offset += bp->b_offset; @@ -1315,27 +1304,30 @@ xfs_buf_iomove( void *data, /* data address */ xfs_buf_rw_t mode) /* read/write/zero flag */ { - size_t bend, cpoff, csize; - struct page *page; + size_t bend; bend = boff + bsize; while (boff < bend) { - page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; - cpoff = xfs_buf_poff(boff + bp->b_offset); - csize = min_t(size_t, - PAGE_SIZE-cpoff, bp->b_count_desired-boff); + struct page *page; + int page_index, page_offset, csize; + + page_index = (boff + bp->b_offset) >> PAGE_SHIFT; + page_offset = (boff + bp->b_offset) & ~PAGE_MASK; + page = bp->b_pages[page_index]; + csize = min_t(size_t, PAGE_SIZE - page_offset, + BBTOB(bp->b_io_length) - boff); - ASSERT(((csize + cpoff) <= PAGE_SIZE)); + ASSERT((csize + page_offset) <= PAGE_SIZE); switch (mode) { case XBRW_ZERO: - memset(page_address(page) + cpoff, 0, csize); + memset(page_address(page) + page_offset, 0, csize); break; case XBRW_READ: - memcpy(data, page_address(page) + cpoff, csize); + memcpy(data, page_address(page) + page_offset, csize); break; case XBRW_WRITE: - memcpy(page_address(page) + cpoff, data, csize); + memcpy(page_address(page) + page_offset, data, csize); } boff += csize; @@ -1435,11 +1427,9 @@ xfs_free_buftarg( { unregister_shrinker(&btp->bt_shrinker); - xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); - kthread_stop(btp->bt_task); kmem_free(btp); } @@ -1491,20 +1481,6 @@ xfs_setsize_buftarg( return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); } -STATIC int -xfs_alloc_delwri_queue( - xfs_buftarg_t *btp, - const char *fsname) -{ - INIT_LIST_HEAD(&btp->bt_delwri_queue); - spin_lock_init(&btp->bt_delwri_lock); - btp->bt_flags = 0; - btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); - if (IS_ERR(btp->bt_task)) - return PTR_ERR(btp->bt_task); - return 0; -} - xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, @@ -1527,8 +1503,6 @@ xfs_alloc_buftarg( spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_alloc_delwri_queue(btp, fsname)) - goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; btp->bt_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&btp->bt_shrinker); @@ -1539,125 +1513,52 @@ error: return NULL; } - /* - * Delayed write buffer handling + * Add a buffer to the delayed write list. + * + * This queues a buffer for writeout if it hasn't already been. Note that + * neither this routine nor the buffer list submission functions perform + * any internal synchronization. It is expected that the lists are thread-local + * to the callers. + * + * Returns true if we queued up the buffer, or false if it already had + * been on the buffer list. */ -void +bool xfs_buf_delwri_queue( - xfs_buf_t *bp) + struct xfs_buf *bp, + struct list_head *list) { - struct xfs_buftarg *btp = bp->b_target; - - trace_xfs_buf_delwri_queue(bp, _RET_IP_); - + ASSERT(xfs_buf_islocked(bp)); ASSERT(!(bp->b_flags & XBF_READ)); - spin_lock(&btp->bt_delwri_lock); - if (!list_empty(&bp->b_list)) { - /* if already in the queue, move it to the tail */ - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_move_tail(&bp->b_list, &btp->bt_delwri_queue); - } else { - /* start xfsbufd as it is about to have something to do */ - if (list_empty(&btp->bt_delwri_queue)) - wake_up_process(bp->b_target->bt_task); - - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; - list_add_tail(&bp->b_list, &btp->bt_delwri_queue); - } - bp->b_queuetime = jiffies; - spin_unlock(&btp->bt_delwri_lock); -} - -void -xfs_buf_delwri_dequeue( - xfs_buf_t *bp) -{ - int dequeued = 0; - - spin_lock(&bp->b_target->bt_delwri_lock); - if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_del_init(&bp->b_list); - dequeued = 1; + /* + * If the buffer is already marked delwri it already is queued up + * by someone else for imediate writeout. Just ignore it in that + * case. + */ + if (bp->b_flags & _XBF_DELWRI_Q) { + trace_xfs_buf_delwri_queued(bp, _RET_IP_); + return false; } - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(&bp->b_target->bt_delwri_lock); - - if (dequeued) - xfs_buf_rele(bp); - - trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); -} - -/* - * If a delwri buffer needs to be pushed before it has aged out, then promote - * it to the head of the delwri queue so that it will be flushed on the next - * xfsbufd run. We do this by resetting the queuetime of the buffer to be older - * than the age currently needed to flush the buffer. Hence the next time the - * xfsbufd sees it is guaranteed to be considered old enough to flush. - */ -void -xfs_buf_delwri_promote( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1; - ASSERT(bp->b_flags & XBF_DELWRI); - ASSERT(bp->b_flags & _XBF_DELWRI_Q); + trace_xfs_buf_delwri_queue(bp, _RET_IP_); /* - * Check the buffer age before locking the delayed write queue as we - * don't need to promote buffers that are already past the flush age. + * If a buffer gets written out synchronously or marked stale while it + * is on a delwri list we lazily remove it. To do this, the other party + * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. + * It remains referenced and on the list. In a rare corner case it + * might get readded to a delwri list after the synchronous writeout, in + * which case we need just need to re-add the flag here. */ - if (bp->b_queuetime < jiffies - age) - return; - bp->b_queuetime = jiffies - age; - spin_lock(&btp->bt_delwri_lock); - list_move(&bp->b_list, &btp->bt_delwri_queue); - spin_unlock(&btp->bt_delwri_lock); -} - -/* - * Move as many buffers as specified to the supplied list - * idicating if we skipped any buffers to prevent deadlocks. - */ -STATIC int -xfs_buf_delwri_split( - xfs_buftarg_t *target, - struct list_head *list, - unsigned long age) -{ - xfs_buf_t *bp, *n; - int skipped = 0; - int force; - - force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); - INIT_LIST_HEAD(list); - spin_lock(&target->bt_delwri_lock); - list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { - ASSERT(bp->b_flags & XBF_DELWRI); - - if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { - if (!force && - time_before(jiffies, bp->b_queuetime + age)) { - xfs_buf_unlock(bp); - break; - } - - bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q); - bp->b_flags |= XBF_WRITE; - list_move_tail(&bp->b_list, list); - trace_xfs_buf_delwri_split(bp, _RET_IP_); - } else - skipped++; + bp->b_flags |= _XBF_DELWRI_Q; + if (list_empty(&bp->b_list)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_list, list); } - spin_unlock(&target->bt_delwri_lock); - return skipped; + return true; } /* @@ -1683,99 +1584,109 @@ xfs_buf_cmp( return 0; } -STATIC int -xfsbufd( - void *data) +static int +__xfs_buf_delwri_submit( + struct list_head *buffer_list, + struct list_head *io_list, + bool wait) { - xfs_buftarg_t *target = (xfs_buftarg_t *)data; - - current->flags |= PF_MEMALLOC; - - set_freezable(); + struct blk_plug plug; + struct xfs_buf *bp, *n; + int pinned = 0; + + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + if (!wait) { + if (xfs_buf_ispinned(bp)) { + pinned++; + continue; + } + if (!xfs_buf_trylock(bp)) + continue; + } else { + xfs_buf_lock(bp); + } - do { - long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); - long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); - struct list_head tmp; - struct blk_plug plug; + /* + * Someone else might have written the buffer synchronously or + * marked it stale in the meantime. In that case only the + * _XBF_DELWRI_Q flag got cleared, and we have to drop the + * reference and remove it from the list here. + */ + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + continue; + } - if (unlikely(freezing(current))) - try_to_freeze(); + list_move_tail(&bp->b_list, io_list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); + } - /* sleep for a long time if there is nothing to do. */ - if (list_empty(&target->bt_delwri_queue)) - tout = MAX_SCHEDULE_TIMEOUT; - schedule_timeout_interruptible(tout); + list_sort(NULL, io_list, xfs_buf_cmp); - xfs_buf_delwri_split(target, &tmp, age); - list_sort(NULL, &tmp, xfs_buf_cmp); + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, io_list, b_list) { + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags |= XBF_WRITE; - blk_start_plug(&plug); - while (!list_empty(&tmp)) { - struct xfs_buf *bp; - bp = list_first_entry(&tmp, struct xfs_buf, b_list); + if (!wait) { + bp->b_flags |= XBF_ASYNC; list_del_init(&bp->b_list); - xfs_bdstrat_cb(bp); } - blk_finish_plug(&plug); - } while (!kthread_should_stop()); + xfs_bdstrat_cb(bp); + } + blk_finish_plug(&plug); - return 0; + return pinned; } /* - * Go through all incore buffers, and release buffers if they belong to - * the given device. This is used in filesystem error handling to - * preserve the consistency of its metadata. + * Write out a buffer list asynchronously. + * + * This will take the @buffer_list, write all non-locked and non-pinned buffers + * out and not wait for I/O completion on any of the buffers. This interface + * is only safely useable for callers that can track I/O completion by higher + * level means, e.g. AIL pushing as the @buffer_list is consumed in this + * function. */ int -xfs_flush_buftarg( - xfs_buftarg_t *target, - int wait) +xfs_buf_delwri_submit_nowait( + struct list_head *buffer_list) { - xfs_buf_t *bp; - int pincount = 0; - LIST_HEAD(tmp_list); - LIST_HEAD(wait_list); - struct blk_plug plug; + LIST_HEAD (io_list); + return __xfs_buf_delwri_submit(buffer_list, &io_list, false); +} - flush_workqueue(xfslogd_workqueue); +/* + * Write out a buffer list synchronously. + * + * This will take the @buffer_list, write all buffers out and wait for I/O + * completion on all of the buffers. @buffer_list is consumed by the function, + * so callers must have some other way of tracking buffers if they require such + * functionality. + */ +int +xfs_buf_delwri_submit( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + int error = 0, error2; + struct xfs_buf *bp; - set_bit(XBT_FORCE_FLUSH, &target->bt_flags); - pincount = xfs_buf_delwri_split(target, &tmp_list, 0); + __xfs_buf_delwri_submit(buffer_list, &io_list, true); - /* - * Dropped the delayed write list lock, now walk the temporary list. - * All I/O is issued async and then if we need to wait for completion - * we do that after issuing all the IO. - */ - list_sort(NULL, &tmp_list, xfs_buf_cmp); + /* Wait for IO to complete. */ + while (!list_empty(&io_list)) { + bp = list_first_entry(&io_list, struct xfs_buf, b_list); - blk_start_plug(&plug); - while (!list_empty(&tmp_list)) { - bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); - ASSERT(target == bp->b_target); list_del_init(&bp->b_list); - if (wait) { - bp->b_flags &= ~XBF_ASYNC; - list_add(&bp->b_list, &wait_list); - } - xfs_bdstrat_cb(bp); - } - blk_finish_plug(&plug); - - if (wait) { - /* Wait for IO to complete. */ - while (!list_empty(&wait_list)) { - bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - - list_del_init(&bp->b_list); - xfs_buf_iowait(bp); - xfs_buf_relse(bp); - } + error2 = xfs_buf_iowait(bp); + xfs_buf_relse(bp); + if (!error) + error = error2; } - return pincount; + return error; } int __init diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5bf3be45f543..7f1d1392ce37 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -32,11 +32,6 @@ #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) -#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) -#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) -#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) - typedef enum { XBRW_READ = 1, /* transfer into target memory */ XBRW_WRITE = 2, /* transfer from target memory */ @@ -46,11 +41,9 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ -#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ /* I/O hints for the BIO layer */ #define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ @@ -58,14 +51,13 @@ typedef enum { #define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ /* flags used only as arguments to access routines */ -#define XBF_LOCK (1 << 15)/* lock requested */ #define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_DONT_BLOCK (1 << 17)/* do not block in current thread */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ /* flags used only internally */ #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ typedef unsigned int xfs_buf_flags_t; @@ -73,25 +65,18 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ - { XBF_MAPPED, "MAPPED" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ - { XBF_DELWRI, "DELWRI" }, \ { XBF_STALE, "STALE" }, \ { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ - { XBF_LOCK, "LOCK" }, /* should never be set */\ - { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ - { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" } -typedef enum { - XBT_FORCE_FLUSH = 0, -} xfs_buftarg_flags_t; - typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; @@ -101,12 +86,6 @@ typedef struct xfs_buftarg { unsigned int bt_sshift; size_t bt_smask; - /* per device delwri queue */ - struct task_struct *bt_task; - struct list_head bt_delwri_queue; - spinlock_t bt_delwri_lock; - unsigned long bt_flags; - /* LRU control structures */ struct shrinker bt_shrinker; struct list_head bt_lru; @@ -128,8 +107,8 @@ typedef struct xfs_buf { * fast-path on locking. */ struct rb_node b_rbnode; /* rbtree node */ - xfs_off_t b_file_offset; /* offset in file */ - size_t b_buffer_length;/* size of buffer in bytes */ + xfs_daddr_t b_bn; /* block number for I/O */ + int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ @@ -140,8 +119,6 @@ typedef struct xfs_buf { struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ xfs_buftarg_t *b_target; /* buffer target (device) */ - xfs_daddr_t b_bn; /* block number for I/O */ - size_t b_count_desired;/* desired transfer size */ void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; xfs_buf_iodone_t b_iodone; /* I/O completion function */ @@ -150,7 +127,7 @@ typedef struct xfs_buf { struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ - unsigned long b_queuetime; /* time buffer was queued */ + int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ @@ -163,26 +140,30 @@ typedef struct xfs_buf { /* Finding and Reading Buffers */ -extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t, xfs_buf_t *); +struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags, + struct xfs_buf *new_bp); #define xfs_incore(buftarg,blkno,len,lockit) \ _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) -extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); -extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); - -struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, - xfs_buf_flags_t); -extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); -extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); -extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); -extern void xfs_buf_hold(xfs_buf_t *); -extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t); -struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, - struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t length, int flags); +struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks); + +struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); +struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); +int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); + +struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + int flags); +struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, + xfs_daddr_t daddr, size_t numblks, int flags); +void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ extern void xfs_buf_free(xfs_buf_t *); @@ -204,7 +185,7 @@ extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); -extern int xfs_buf_iorequest(xfs_buf_t *); +extern void xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); @@ -220,24 +201,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Delayed Write Buffer Routines */ -extern void xfs_buf_delwri_queue(struct xfs_buf *); -extern void xfs_buf_delwri_dequeue(struct xfs_buf *); -extern void xfs_buf_delwri_promote(struct xfs_buf *); +extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +extern int xfs_buf_delwri_submit(struct list_head *); +extern int xfs_buf_delwri_submit_nowait(struct list_head *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) \ - ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ + ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) - #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) @@ -256,12 +235,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_ADDR(bp) ((bp)->b_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) -#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) -#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) -#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) -#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt)) -#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) -#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { @@ -287,7 +260,6 @@ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); -extern int xfs_flush_buftarg(xfs_buftarg_t *, int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index eac97ef81e2a..45df2b857d48 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -123,11 +122,11 @@ xfs_buf_item_log_check( ASSERT(bip->bli_logged != NULL); bp = bip->bli_buf; - ASSERT(XFS_BUF_COUNT(bp) > 0); + ASSERT(bp->b_length > 0); ASSERT(bp->b_addr != NULL); orig = bip->bli_orig; buffer = bp->b_addr; - for (x = 0; x < XFS_BUF_COUNT(bp); x++) { + for (x = 0; x < BBTOB(bp->b_length); x++) { if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { xfs_emerg(bp->b_mount, "%s: bip %x buffer %x orig %x index %d", @@ -418,7 +417,6 @@ xfs_buf_item_unpin( if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); @@ -455,42 +453,42 @@ xfs_buf_item_unpin( bp->b_iodone = NULL; } else { spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_fspriv == NULL); } xfs_buf_relse(bp); + } else if (freed && remove) { + xfs_buf_lock(bp); + xfs_buf_ioerror(bp, EIO); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); + xfs_buf_ioend(bp, 0); } } -/* - * This is called to attempt to lock the buffer associated with this - * buf log item. Don't sleep on the buffer lock. If we can't get - * the lock right away, return 0. If we can get the lock, take a - * reference to the buffer. If this is a delayed write buffer that - * needs AIL help to be written back, invoke the pushbuf routine - * rather than the normal success path. - */ STATIC uint -xfs_buf_item_trylock( - struct xfs_log_item *lip) +xfs_buf_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + uint rval = XFS_ITEM_SUCCESS; if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; - /* take a reference to the buffer. */ - xfs_buf_hold(bp); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - trace_xfs_buf_item_trylock(bip); - if (XFS_BUF_ISDELAYWRITE(bp)) - return XFS_ITEM_PUSHBUF; - return XFS_ITEM_SUCCESS; + + trace_xfs_buf_item_push(bip); + + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_unlock(bp); + return rval; } /* @@ -603,49 +601,6 @@ xfs_buf_item_committed( return lsn; } -/* - * The buffer is locked, but is not a delayed write buffer. This happens - * if we race with IO completion and hence we don't want to try to write it - * again. Just release the buffer. - */ -STATIC void -xfs_buf_item_push( - struct xfs_log_item *lip) -{ - struct xfs_buf_log_item *bip = BUF_ITEM(lip); - struct xfs_buf *bp = bip->bli_buf; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - - trace_xfs_buf_item_push(bip); - - xfs_buf_relse(bp); -} - -/* - * The buffer is locked and is a delayed write buffer. Promote the buffer - * in the delayed write queue as the caller knows that they must invoke - * the xfsbufd to get this buffer written. We have to unlock the buffer - * to allow the xfsbufd to write it, too. - */ -STATIC bool -xfs_buf_item_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_buf_log_item *bip = BUF_ITEM(lip); - struct xfs_buf *bp = bip->bli_buf; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(XFS_BUF_ISDELAYWRITE(bp)); - - trace_xfs_buf_item_pushbuf(bip); - - xfs_buf_delwri_promote(bp); - xfs_buf_relse(bp); - return true; -} - STATIC void xfs_buf_item_committing( struct xfs_log_item *lip, @@ -661,11 +616,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, .iop_unpin = xfs_buf_item_unpin, - .iop_trylock = xfs_buf_item_trylock, .iop_unlock = xfs_buf_item_unlock, .iop_committed = xfs_buf_item_committed, .iop_push = xfs_buf_item_push, - .iop_pushbuf = xfs_buf_item_pushbuf, .iop_committing = xfs_buf_item_committing }; @@ -703,7 +656,8 @@ xfs_buf_item_init( * truncate any pieces. map_size is the size of the * bitmap needed to describe the chunks of the buffer. */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); + chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >> + XFS_BLF_SHIFT); map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, @@ -713,7 +667,7 @@ xfs_buf_item_init( xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); - bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); + bip->bli_format.blf_len = (ushort)bp->b_length; bip->bli_format.blf_map_size = map_size; #ifdef XFS_TRANS_DEBUG @@ -725,9 +679,9 @@ xfs_buf_item_init( * the buffer to indicate which bytes the callers have asked * to have logged. */ - bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); - memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp)); - bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); + bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP); + memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length)); + bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP); #endif /* @@ -984,20 +938,27 @@ xfs_buf_iodone_callbacks( * If the write was asynchronous then no one will be looking for the * error. Clear the error state and write the buffer out again. * - * During sync or umount we'll write all pending buffers again - * synchronous, which will catch these errors if they keep hanging - * around. + * XXX: This helps against transient write errors, but we need to find + * a way to shut the filesystem down if the writes keep failing. + * + * In practice we'll shut the filesystem down soon as non-transient + * erorrs tend to affect the whole device and a failing log write + * will make us give up. But we really ought to do better here. */ if (XFS_BUF_ISASYNC(bp)) { + ASSERT(bp->b_iodone != NULL); + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { - xfs_buf_delwri_queue(bp); - XFS_BUF_DONE(bp); + bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; + xfs_bdstrat_cb(bp); + } else { + xfs_buf_relse(bp); } - ASSERT(bp->b_iodone != NULL); - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); + return; } @@ -1045,6 +1006,6 @@ xfs_buf_iodone( * Either way, AIL is useless if we're forcing a shutdown. */ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 7f1a6f5b05a6..015b946c5808 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -2277,20 +2276,20 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) if (nbuf == 1) { dabuf->nbuf = 1; bp = bps[0]; - dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); + dabuf->bbcount = bp->b_length; dabuf->data = bp->b_addr; dabuf->bps[0] = bp; } else { dabuf->nbuf = nbuf; for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { dabuf->bps[i] = bp = bps[i]; - dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp)); + dabuf->bbcount += bp->b_length; } dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); - for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { + for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) { bp = bps[i]; memcpy((char *)dabuf->data + off, bp->b_addr, - XFS_BUF_COUNT(bp)); + BBTOB(bp->b_length)); } } return dabuf; @@ -2310,10 +2309,10 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf) ASSERT(dabuf->nbuf > 1); dabuf->dirty = 0; for (i = off = 0; i < dabuf->nbuf; - i++, off += XFS_BUF_COUNT(bp)) { + i++, off += BBTOB(bp->b_length)) { bp = dabuf->bps[i]; memcpy(bp->b_addr, dabuf->data + off, - XFS_BUF_COUNT(bp)); + BBTOB(bp->b_length)); } } } @@ -2356,10 +2355,10 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) } dabuf->dirty = 1; ASSERT(first <= last); - for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { + for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) { bp = dabuf->bps[i]; f = off; - l = f + XFS_BUF_COUNT(bp) - 1; + l = f + BBTOB(bp->b_length) - 1; if (f < first) f = first; if (l > last) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 1137bbc5eccb..e00de08dc8ac 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index a2e27010c7fb..67a250c36d41 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index d3b63aefd01d..586732f2d80d 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 5bbe2a8a023f..2046988e9eb2 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 66e108f561a3..397ffbcbab1d 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 0179a41d9e5a..b0f26780449d 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 79d05e84e296..19bf0c5e38f4 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 1ad3a4b8ca40..f9c3fe304a17 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -30,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_extent_busy.h" #include "xfs_discard.h" #include "xfs_trace.h" @@ -118,7 +118,7 @@ xfs_trim_extents( * If any blocks in the range are still busy, skip the * discard and try again the next time. */ - if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + if (xfs_extent_busy_search(mp, agno, fbno, flen)) { trace_xfs_discard_busy(mp, agno, fbno, flen); goto next_extent; } @@ -212,7 +212,7 @@ xfs_discard_extents( struct xfs_mount *mp, struct list_head *list) { - struct xfs_busy_extent *busyp; + struct xfs_extent_busy *busyp; int error = 0; list_for_each_entry(busyp, list, list) { diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1155208fa830..bf27fcca4843 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -857,7 +856,7 @@ xfs_qm_dqflush_done( /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); if (lip->li_lsn == qip->qli_flush_lsn) - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); else spin_unlock(&ailp->xa_lock); } @@ -878,8 +877,8 @@ xfs_qm_dqflush_done( */ int xfs_qm_dqflush( - xfs_dquot_t *dqp, - uint flags) + struct xfs_dquot *dqp, + struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; @@ -891,25 +890,30 @@ xfs_qm_dqflush( trace_xfs_dqflush(dqp); - /* - * If not dirty, or it's pinned and we are not supposed to block, nada. - */ - if (!XFS_DQ_IS_DIRTY(dqp) || - ((flags & SYNC_TRYLOCK) && atomic_read(&dqp->q_pincount) > 0)) { - xfs_dqfunlock(dqp); - return 0; - } + *bpp = NULL; + xfs_qm_dqunpin_wait(dqp); /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this dquot - * to disk, because the log record didn't make it to disk! + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an emptry AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; - xfs_dqfunlock(dqp); - return XFS_ERROR(EIO); + + spin_lock(&mp->m_ail->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(mp->m_ail, lip, + SHUTDOWN_CORRUPT_INCORE); + else + spin_unlock(&mp->m_ail->xa_lock); + error = XFS_ERROR(EIO); + goto out_unlock; } /* @@ -917,11 +921,8 @@ xfs_qm_dqflush( */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp); - if (error) { - ASSERT(error != ENOENT); - xfs_dqfunlock(dqp); - return error; - } + if (error) + goto out_unlock; /* * Calculate the location of the dquot inside the buffer. @@ -967,20 +968,13 @@ xfs_qm_dqflush( xfs_log_force(mp, 0); } - if (flags & SYNC_WAIT) - error = xfs_bwrite(bp); - else - xfs_buf_delwri_queue(bp); - - xfs_buf_relse(bp); - trace_xfs_dqflush_done(dqp); + *bpp = bp; + return 0; - /* - * dqp is still locked, but caller is free to unlock it now. - */ - return error; - +out_unlock: + xfs_dqfunlock(dqp); + return XFS_ERROR(EIO); } /* @@ -1011,39 +1005,6 @@ xfs_dqlock2( } } -/* - * Give the buffer a little push if it is incore and - * wait on the flush lock. - */ -void -xfs_dqflock_pushbuf_wait( - xfs_dquot_t *dqp) -{ - xfs_mount_t *mp = dqp->q_mount; - xfs_buf_t *bp; - - /* - * Check to see if the dquot has been flushed delayed - * write. If so, grab its buffer and send it - * out immediately. We'll be able to acquire - * the flush lock when the I/O completes. - */ - bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); - if (!bp) - goto out_lock; - - if (XFS_BUF_ISDELAYWRITE(bp)) { - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - xfs_buf_delwri_promote(bp); - wake_up_process(bp->b_target->bt_task); - } - xfs_buf_relse(bp); -out_lock: - xfs_dqflock(dqp); -} - int __init xfs_qm_init(void) { diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index ef9190bd8b30..7d20af27346d 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -141,7 +141,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); -extern int xfs_qm_dqflush(xfs_dquot_t *, uint); +extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); @@ -152,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); -extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 34baeae45265..57aa4b03720c 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -108,38 +106,6 @@ xfs_qm_dquot_logitem_unpin( wake_up(&dqp->q_pinwait); } -/* - * Given the logitem, this writes the corresponding dquot entry to disk - * asynchronously. This is called with the dquot entry securely locked; - * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot - * at the end. - */ -STATIC void -xfs_qm_dquot_logitem_push( - struct xfs_log_item *lip) -{ - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - int error; - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(!completion_done(&dqp->q_flush)); - - /* - * Since we were able to lock the dquot's flush lock and - * we found it on the AIL, the dquot must be dirty. This - * is because the dquot is removed from the AIL while still - * holding the flush lock in xfs_dqflush_done(). Thus, if - * we found it in the AIL and were able to obtain the flush - * lock without sleeping, then there must not have been - * anyone in the process of flushing the dquot. - */ - error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK); - if (error) - xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", - __func__, error, dqp); - xfs_dqunlock(dqp); -} - STATIC xfs_lsn_t xfs_qm_dquot_logitem_committed( struct xfs_log_item *lip, @@ -171,67 +137,15 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } -/* - * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that - * the dquot is locked by us, but the flush lock isn't. So, here we are - * going to see if the relevant dquot buffer is incore, waiting on DELWRI. - * If so, we want to push it out to help us take this item off the AIL as soon - * as possible. - * - * We must not be holding the AIL lock at this point. Calling incore() to - * search the buffer cache can be a time consuming thing, and AIL lock is a - * spinlock. - */ -STATIC bool -xfs_qm_dquot_logitem_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - struct xfs_dquot *dqp = qlip->qli_dquot; - struct xfs_buf *bp; - bool ret = true; - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - - /* - * If flushlock isn't locked anymore, chances are that the - * inode flush completed and the inode was taken off the AIL. - * So, just get out. - */ - if (completion_done(&dqp->q_flush) || - !(lip->li_flags & XFS_LI_IN_AIL)) { - xfs_dqunlock(dqp); - return true; - } - - bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, - dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); - xfs_dqunlock(dqp); - if (!bp) - return true; - if (XFS_BUF_ISDELAYWRITE(bp)) - xfs_buf_delwri_promote(bp); - if (xfs_buf_ispinned(bp)) - ret = false; - xfs_buf_relse(bp); - return ret; -} - -/* - * This is called to attempt to lock the dquot associated with this - * dquot log item. Don't sleep on the dquot lock or the flush lock. - * If the flush lock is already held, indicating that the dquot has - * been or is in the process of being flushed, then see if we can - * find the dquot's buffer in the buffer cache without sleeping. If - * we can and it is marked delayed write, then we want to send it out. - * We delay doing so until the push routine, though, to avoid sleeping - * in any device strategy routines. - */ STATIC uint -xfs_qm_dquot_logitem_trylock( - struct xfs_log_item *lip) +xfs_qm_dquot_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; @@ -239,16 +153,41 @@ xfs_qm_dquot_logitem_trylock( if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; + /* + * Re-check the pincount now that we stabilized the value by + * taking the quota lock. + */ + if (atomic_read(&dqp->q_pincount) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the dquot. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ if (!xfs_dqflock_nowait(dqp)) { - /* - * dquot has already been flushed to the backing buffer, - * leave it locked, pushbuf routine will unlock it. - */ - return XFS_ITEM_PUSHBUF; + rval = XFS_ITEM_FLUSHING; + goto out_unlock; } - ASSERT(lip->li_flags & XFS_LI_IN_AIL); - return XFS_ITEM_SUCCESS; + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + __func__, error, dqp); + } else { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); + } + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_dqunlock(dqp); + return rval; } /* @@ -299,11 +238,9 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, .iop_unpin = xfs_qm_dquot_logitem_unpin, - .iop_trylock = xfs_qm_dquot_logitem_trylock, .iop_unlock = xfs_qm_dquot_logitem_unlock, .iop_committed = xfs_qm_dquot_logitem_committed, .iop_push = xfs_qm_dquot_logitem_push, - .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf, .iop_committing = xfs_qm_dquot_logitem_committing }; @@ -398,11 +335,13 @@ xfs_qm_qoff_logitem_unpin( } /* - * Quotaoff items have no locking, so just return success. + * There isn't much you can do to push a quotaoff item. It is simply + * stuck waiting for the log to be flushed to disk. */ STATIC uint -xfs_qm_qoff_logitem_trylock( - struct xfs_log_item *lip) +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { return XFS_ITEM_LOCKED; } @@ -429,17 +368,6 @@ xfs_qm_qoff_logitem_committed( return lsn; } -/* - * There isn't much you can do to push on an quotaoff item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -STATIC void -xfs_qm_qoff_logitem_push( - struct xfs_log_item *lip) -{ -} - - STATIC xfs_lsn_t xfs_qm_qoffend_logitem_committed( struct xfs_log_item *lip, @@ -454,7 +382,7 @@ xfs_qm_qoffend_logitem_committed( * xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); + xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); kmem_free(qfs); kmem_free(qfe); @@ -487,7 +415,6 @@ static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, .iop_unpin = xfs_qm_qoff_logitem_unpin, - .iop_trylock = xfs_qm_qoff_logitem_trylock, .iop_unlock = xfs_qm_qoff_logitem_unlock, .iop_committed = xfs_qm_qoffend_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, @@ -502,7 +429,6 @@ static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, .iop_unpin = xfs_qm_qoff_logitem_unpin, - .iop_trylock = xfs_qm_qoff_logitem_trylock, .iop_unlock = xfs_qm_qoff_logitem_unlock, .iop_committed = xfs_qm_qoff_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 39f06336b99d..610456054dc2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 558910f5e3c0..2d25d19c4ea1 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_types.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c new file mode 100644 index 000000000000..85e9f87a1a7c --- /dev/null +++ b/fs/xfs/xfs_extent_busy.c @@ -0,0 +1,603 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_inode.h" +#include "xfs_extent_busy.h" +#include "xfs_trace.h" + +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + struct xfs_extent_busy *new; + struct xfs_extent_busy *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent = NULL; + + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. + */ + trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); + xfs_trans_set_sync(tp); + return; + } + + new->agno = agno; + new->bno = bno; + new->length = len; + INIT_LIST_HEAD(&new->list); + new->flags = flags; + + /* trace before insert to be able to see failed inserts */ + trace_xfs_extent_busy(tp->t_mountp, agno, bno, len); + + pag = xfs_perag_get(tp->t_mountp, new->agno); + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + while (*rbp) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); + + if (new->bno < busyp->bno) { + rbp = &(*rbp)->rb_left; + ASSERT(new->bno + new->length <= busyp->bno); + } else if (new->bno > busyp->bno) { + rbp = &(*rbp)->rb_right; + ASSERT(bno >= busyp->bno + busyp->length); + } else { + ASSERT(0); + } + } + + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_extent_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +int +xfs_extent_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_extent_busy *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + return match; +} + +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entries block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_extent_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * This extent is currently being discarded. Give the thread + * performing the discard a chance to mark the extent unbusy + * and retry. + */ + if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { + spin_unlock(&pag->pagb_lock); + delay(1); + spin_lock(&pag->pagb_lock); + return false; + } + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correct, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. + */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely. + */ +void +xfs_extent_busy_reuse( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + + ASSERT(flen > 0); + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +void +xfs_extent_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata && + !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { + if (!xfs_extent_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. + */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. + */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfect */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as failure indications. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +STATIC void +xfs_extent_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp) +{ + if (busyp->length) { + trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + + list_del_init(&busyp->list); + kmem_free(busyp); +} + +/* + * Remove all extents on the passed in list from the busy extents tree. + * If do_discard is set skip extents that need to be discarded, and mark + * these as undergoing a discard operation instead. + */ +void +xfs_extent_busy_clear( + struct xfs_mount *mp, + struct list_head *list, + bool do_discard) +{ + struct xfs_extent_busy *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + if (do_discard && busyp->length && + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) + busyp->flags = XFS_EXTENT_BUSY_DISCARDED; + else + xfs_extent_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_extent_busy_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_extent_busy, list)->agno - + container_of(b, struct xfs_extent_busy, list)->agno; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h new file mode 100644 index 000000000000..985412d65ba5 --- /dev/null +++ b/fs/xfs/xfs_extent_busy.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTENT_BUSY_H__ +#define __XFS_EXTENT_BUSY_H__ + +/* + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_extent_busy_insert() for details. + */ +struct xfs_extent_busy { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + unsigned int flags; +#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ +#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ +}; + +void +xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + +void +xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, + bool do_discard); + +int +xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +void +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, + xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); + +int +xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_extent_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_extent_busy_ag_cmp); +} + +#endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 35c2aff38b20..feb36d7551ae 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_sb.h" @@ -64,7 +63,8 @@ __xfs_efi_release( if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { spin_lock(&ailp->xa_lock); /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &efip->efi_item); + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); } } @@ -147,22 +147,20 @@ xfs_efi_item_unpin( } /* - * Efi items have no locking or pushing. However, since EFIs are - * pulled from the AIL when their corresponding EFDs are committed - * to disk, their situation is very similar to being pinned. Return - * XFS_ITEM_PINNED so that the caller will eventually flush the log. - * This should help in getting the EFI out of the AIL. + * Efi items have no locking or pushing. However, since EFIs are pulled from + * the AIL when their corresponding EFDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the EFI out of + * the AIL. */ STATIC uint -xfs_efi_item_trylock( - struct xfs_log_item *lip) +xfs_efi_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { return XFS_ITEM_PINNED; } -/* - * Efi items have no locking, so just return. - */ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) @@ -190,17 +188,6 @@ xfs_efi_item_committed( } /* - * There isn't much you can do to push on an efi item. It is simply - * stuck waiting for all of its corresponding efd items to be - * committed to disk. - */ -STATIC void -xfs_efi_item_push( - struct xfs_log_item *lip) -{ -} - -/* * The EFI dependency tracking op doesn't do squat. It can't because * it doesn't know where the free extent is coming from. The dependency * tracking has to be handled by the "enclosing" metadata object. For @@ -222,7 +209,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, .iop_unpin = xfs_efi_item_unpin, - .iop_trylock = xfs_efi_item_trylock, .iop_unlock = xfs_efi_item_unlock, .iop_committed = xfs_efi_item_committed, .iop_push = xfs_efi_item_push, @@ -404,19 +390,17 @@ xfs_efd_item_unpin( } /* - * Efd items have no locking, so just return success. + * There isn't much you can do to push on an efd item. It is simply stuck + * waiting for the log to be flushed to disk. */ STATIC uint -xfs_efd_item_trylock( - struct xfs_log_item *lip) +xfs_efd_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { - return XFS_ITEM_LOCKED; + return XFS_ITEM_PINNED; } -/* - * Efd items have no locking or pushing, so return failure - * so that the caller doesn't bother with us. - */ STATIC void xfs_efd_item_unlock( struct xfs_log_item *lip) @@ -451,16 +435,6 @@ xfs_efd_item_committed( } /* - * There isn't much you can do to push on an efd item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -STATIC void -xfs_efd_item_push( - struct xfs_log_item *lip) -{ -} - -/* * The EFD dependency tracking op doesn't do squat. It can't because * it doesn't know where the free extent is coming from. The dependency * tracking has to be handled by the "enclosing" metadata object. For @@ -482,7 +456,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = { .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, .iop_unpin = xfs_efd_item_unpin, - .iop_trylock = xfs_efd_item_trylock, .iop_unlock = xfs_efd_item_unlock, .iop_committed = xfs_efd_item_committed, .iop_push = xfs_efd_item_push, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 54a67dd9ac0a..8d214b87f6bb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_trans.h" @@ -396,114 +394,96 @@ xfs_file_splice_write( } /* - * This routine is called to handle zeroing any space in the last - * block of the file that is beyond the EOF. We do this since the - * size is being increased without writing anything to that block - * and we don't want anyone to read the garbage on the disk. + * This routine is called to handle zeroing any space in the last block of the + * file that is beyond the EOF. We do this since the size is being increased + * without writing anything to that block and we don't want to read the + * garbage on the disk. */ STATIC int /* error (positive) */ xfs_zero_last_block( - xfs_inode_t *ip, - xfs_fsize_t offset, - xfs_fsize_t isize) + struct xfs_inode *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) { - xfs_fileoff_t last_fsb; - xfs_mount_t *mp = ip->i_mount; - int nimaps; - int zero_offset; - int zero_len; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - zero_offset = XFS_B_FSB_OFFSET(mp, isize); - if (zero_offset == 0) { - /* - * There are no extra bytes in the last block on disk to - * zero, so return. - */ - return 0; - } + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); + int zero_offset = XFS_B_FSB_OFFSET(mp, isize); + int zero_len; + int nimaps = 1; + int error = 0; + struct xfs_bmbt_irec imap; - last_fsb = XFS_B_TO_FSBT(mp, isize); - nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; + ASSERT(nimaps > 0); + /* * If the block underlying isize is just a hole, then there * is nothing to zero. */ - if (imap.br_startblock == HOLESTARTBLOCK) { + if (imap.br_startblock == HOLESTARTBLOCK) return 0; - } - /* - * Zero the part of the last block beyond the EOF, and write it - * out sync. We need to drop the ilock while we do this so we - * don't deadlock when the buffer cache calls back to us. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); zero_len = mp->m_sb.sb_blocksize - zero_offset; if (isize + zero_len > offset) zero_len = offset - isize; - error = xfs_iozero(ip, isize, zero_len); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; + return xfs_iozero(ip, isize, zero_len); } /* - * Zero any on disk space between the current EOF and the new, - * larger EOF. This handles the normal case of zeroing the remainder - * of the last block in the file and the unusual case of zeroing blocks - * out beyond the size of the file. This second case only happens - * with fixed size extents and when the system crashes before the inode - * size was updated but after blocks were allocated. If fill is set, - * then any holes in the range are filled and zeroed. If not, the holes - * are left alone as holes. + * Zero any on disk space between the current EOF and the new, larger EOF. + * + * This handles the normal case of zeroing the remainder of the last block in + * the file and the unusual case of zeroing blocks out beyond the size of the + * file. This second case only happens with fixed size extents and when the + * system crashes before the inode size was updated but after blocks were + * allocated. + * + * Expects the iolock to be held exclusive, and will take the ilock internally. */ - int /* error (positive) */ xfs_zero_eof( - xfs_inode_t *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ + struct xfs_inode *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize) /* current inode size */ { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + struct xfs_bmbt_irec imap; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); /* * First handle zeroing the block on which isize resides. + * * We only zero a part of that block so it is handled specially. */ - error = xfs_zero_last_block(ip, offset, isize); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - return error; + if (XFS_B_FSB_OFFSET(mp, isize) != 0) { + error = xfs_zero_last_block(ip, offset, isize); + if (error) + return error; } /* - * Calculate the range between the new size and the old - * where blocks needing to be zeroed may exist. To get the - * block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back - * to a block boundary. We subtract 1 in case the size is - * exactly on a block boundary. + * Calculate the range between the new size and the old where blocks + * needing to be zeroed may exist. + * + * To get the block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back to a block + * boundary. We subtract 1 in case the size is exactly on a block + * boundary. */ last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); @@ -521,23 +501,18 @@ xfs_zero_eof( while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, &imap, &nimaps, 0); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) return error; - } + ASSERT(nimaps > 0); if (imap.br_state == XFS_EXT_UNWRITTEN || imap.br_startblock == HOLESTARTBLOCK) { - /* - * This loop handles initializing pages that were - * partially initialized by the code below this - * loop. It basically zeroes the part of the page - * that sits on a hole and sets the page as P_HOLE - * and calls remapf if it is a mapped file. - */ start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); continue; @@ -545,11 +520,7 @@ xfs_zero_eof( /* * There are blocks we need to zero. - * Drop the inode lock while we're doing the I/O. - * We'll still have the iolock to protect us. */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); @@ -557,22 +528,14 @@ xfs_zero_eof( zero_len = offset - zero_off; error = xfs_iozero(ip, zero_off, zero_len); - if (error) { - goto out_lock; - } + if (error) + return error; start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - - xfs_ilock(ip, XFS_ILOCK_EXCL); } return 0; - -out_lock: - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; } /* @@ -593,35 +556,29 @@ xfs_file_aio_write_checks( struct xfs_inode *ip = XFS_I(inode); int error = 0; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); - if (error) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + if (error) return error; - } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which involves - * dropping all locks and relocking to maintain correct locking order. - * If we do this, restart the function to ensure all checks and values - * are still valid. + * iolock shared, we need to update it to exclusive which implies + * having to redo all checks before. */ if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); goto restart; } error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); + if (error) + return error; } - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; /* * Updating the timestamps will grab the ilock again from @@ -638,7 +595,6 @@ restart: * people from modifying setuid and setgid binaries. */ return file_remove_suid(file); - } /* @@ -1007,8 +963,149 @@ xfs_vm_page_mkwrite( return block_page_mkwrite(vma, vmf, xfs_get_blocks); } +STATIC loff_t +xfs_seek_data( + struct file *file, + loff_t start, + u32 type) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec map[2]; + int nmap = 2; + loff_t uninitialized_var(offset); + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + xfs_filblks_t end; + uint lock; + int error; + + lock = xfs_ilock_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + + /* + * Try to read extents from the first block indicated + * by fsbno to the end block of the file. + */ + end = XFS_B_TO_FSB(mp, isize); + + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + /* + * Treat unwritten extent as data extent since it might + * contains dirty data in page cache. + */ + if (map[0].br_startblock != HOLESTARTBLOCK) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[0].br_startoff)); + } else { + if (nmap == 1) { + error = ENXIO; + goto out_unlock; + } + + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[1].br_startoff)); + } + + if (offset != file->f_pos) + file->f_pos = offset; + +out_unlock: + xfs_iunlock_map_shared(ip, lock); + + if (error) + return -error; + return offset; +} + +STATIC loff_t +xfs_seek_hole( + struct file *file, + loff_t start, + u32 type) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + loff_t uninitialized_var(offset); + loff_t holeoff; + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + uint lock; + int error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + lock = xfs_ilock_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK); + if (error) + goto out_unlock; + + holeoff = XFS_FSB_TO_B(mp, fsbno); + if (holeoff <= start) + offset = start; + else { + /* + * xfs_bmap_first_unused() could return a value bigger than + * isize if there are no more holes past the supplied offset. + */ + offset = min_t(loff_t, holeoff, isize); + } + + if (offset != file->f_pos) + file->f_pos = offset; + +out_unlock: + xfs_iunlock_map_shared(ip, lock); + + if (error) + return -error; + return offset; +} + +STATIC loff_t +xfs_file_llseek( + struct file *file, + loff_t offset, + int origin) +{ + switch (origin) { + case SEEK_END: + case SEEK_CUR: + case SEEK_SET: + return generic_file_llseek(file, offset, origin); + case SEEK_DATA: + return xfs_seek_data(file, offset, origin); + case SEEK_HOLE: + return xfs_seek_hole(file, offset, origin); + default: + return -EINVAL; + } +} + const struct file_operations xfs_file_operations = { - .llseek = generic_file_llseek, + .llseek = xfs_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = xfs_file_aio_read, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 1c6fdeb702ff..c25b094efbf7 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -18,8 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -39,7 +37,6 @@ #include "xfs_itable.h" #include "xfs_trans_space.h" #include "xfs_rtalloc.h" -#include "xfs_rw.h" #include "xfs_filestream.h" #include "xfs_trace.h" @@ -147,9 +144,9 @@ xfs_growfs_data_private( if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; dpct = pct - mp->m_sb.sb_imax_pct; - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) return EIO; xfs_buf_relse(bp); @@ -193,7 +190,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; @@ -230,7 +227,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; @@ -259,8 +256,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -286,8 +282,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -314,8 +309,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -405,7 +399,7 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_read_buf(mp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); if (error) { @@ -693,3 +687,63 @@ xfs_fs_goingdown( return 0; } + +/* + * Force a shutdown of the filesystem instantly while keeping the filesystem + * consistent. We don't do an unmount here; just shutdown the shop, make sure + * that absolutely nothing persistent happens to this filesystem after this + * point. + */ +void +xfs_do_force_shutdown( + xfs_mount_t *mp, + int flags, + char *fname, + int lnnum) +{ + int logerror; + + logerror = flags & SHUTDOWN_LOG_IO_ERROR; + + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_notice(mp, + "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + __func__, flags, lnnum, fname, __return_address); + } + /* + * No need to duplicate efforts. + */ + if (XFS_FORCED_SHUTDOWN(mp) && !logerror) + return; + + /* + * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't + * queue up anybody new on the log reservations, and wakes up + * everybody who's sleeping on log reservations to tell them + * the bad news. + */ + if (xfs_log_force_umount(mp, logerror)) + return; + + if (flags & SHUTDOWN_CORRUPT_INCORE) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, + "Corruption of in-memory data detected. Shutting down filesystem"); + if (XFS_ERRLEVEL_HIGH <= xfs_error_level) + xfs_stack_trace(); + } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); + } + } + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_alert(mp, + "Please umount the filesystem and rectify the problem(s)"); + } +} diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index dad1a31aa4fc..177a21a7ac49 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -200,8 +200,7 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, - XBF_LOCK); + mp->m_bsize * blks_per_cluster, 0); if (!fbuf) return ENOMEM; /* @@ -610,6 +609,13 @@ xfs_ialloc_get_rec( /* * Visible inode allocation functions. */ +/* + * Find a free (set) bit in the inode bitmask. + */ +static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) +{ + return xfs_lowbit64(*fp); +} /* * Allocate an inode on disk. diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 666a037398d6..65ac57c8063c 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -47,15 +47,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) } /* - * Find a free (set) bit in the inode bitmask. - */ -static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) -{ - return xfs_lowbit64(*fp); -} - - -/* * Allocate an inode on disk. * Mode is used to tell whether the new inode will need space, and whether * it is a directory. diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index c6a75815aea0..2b8b7a37aa18 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index bcc6c249b2c7..1bb4365e8c25 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_acl.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -123,23 +122,7 @@ xfs_inode_free( xfs_idestroy_fork(ip, XFS_ATTR_FORK); if (ip->i_itemp) { - /* - * Only if we are shutting down the fs will we see an - * inode still in the AIL. If it is there, we should remove - * it to prevent a use-after-free from occurring. - */ - xfs_log_item_t *lip = &ip->i_itemp->ili_item; - struct xfs_ail *ailp = lip->li_ailp; - - ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, lip); - else - spin_unlock(&ailp->xa_lock); - } + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); xfs_inode_item_destroy(ip); ip->i_itemp = NULL; } @@ -334,9 +317,10 @@ xfs_iget_cache_miss( /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload - * region. + * region. Since we can be called from transaction context, don't + * recurse into the file system. */ - if (radix_tree_preload(GFP_KERNEL)) { + if (radix_tree_preload(GFP_NOFS)) { error = EAGAIN; goto out_destroy; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bc46c0a133d3..a59eea09930a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -20,7 +20,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -61,6 +60,20 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); +/* + * helper function to extract extent size hint from inode + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} + #ifdef DEBUG /* * Make sure that the extents in the given memory buffer @@ -137,6 +150,7 @@ xfs_imap_to_bp( int ni; xfs_buf_t *bp; + buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, &bp); if (error) { @@ -226,7 +240,7 @@ xfs_inotobp( if (error) return error; - error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags); if (error) return error; @@ -782,8 +796,7 @@ xfs_iread( /* * Get pointers to the on-disk inode and the buffer containing it. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, - XBF_LOCK, iget_flags); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags); if (error) return error; dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -1342,7 +1355,7 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) return error; @@ -1423,7 +1436,7 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) { xfs_warn(mp, "%s: xfs_itobp() returned error %d.", __func__, error); @@ -1484,7 +1497,7 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) { xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", __func__, error); @@ -1566,8 +1579,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, - XBF_LOCK); + mp->m_bsize * blks_per_cluster, 0); if (!bp) return ENOMEM; @@ -1737,7 +1749,7 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0); if (error) return error; @@ -2347,11 +2359,11 @@ cluster_corrupt_out: */ rcu_read_unlock(); /* - * Clean up the buffer. If it was B_DELWRI, just release it -- + * Clean up the buffer. If it was delwri, just release it -- * brelse can handle it with no problems. If not, shut down the * filesystem before releasing the buffer. */ - bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); + bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); if (bufwasdelwri) xfs_buf_relse(bp); @@ -2377,30 +2389,29 @@ cluster_corrupt_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(iq); + xfs_iflush_abort(iq, false); kmem_free(ilist); xfs_perag_put(pag); return XFS_ERROR(EFSCORRUPTED); } /* - * xfs_iflush() will write a modified inode's changes out to the - * inode's on disk home. The caller must have the inode lock held - * in at least shared mode and the inode flush completion must be - * active as well. The inode lock will still be held upon return from - * the call and the caller is free to unlock it. - * The inode flush will be completed when the inode reaches the disk. - * The flags indicate how the inode's buffer should be written out. + * Flush dirty inode metadata into the backing buffer. + * + * The caller must have the inode lock and the inode flush lock held. The + * inode lock will still be held upon return to the caller, and the inode + * flush lock will be released after the inode has reached the disk. + * + * The caller must write out the buffer returned in *bpp and release it. */ int xfs_iflush( - xfs_inode_t *ip, - uint flags) + struct xfs_inode *ip, + struct xfs_buf **bpp) { - xfs_inode_log_item_t *iip; - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_mount_t *mp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + struct xfs_dinode *dip; int error; XFS_STATS_INC(xs_iflush_count); @@ -2410,25 +2421,8 @@ xfs_iflush( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - iip = ip->i_itemp; - mp = ip->i_mount; + *bpp = NULL; - /* - * We can't flush the inode until it is unpinned, so wait for it if we - * are allowed to block. We know no one new can pin it, because we are - * holding the inode lock shared and you need to hold it exclusively to - * pin the inode. - * - * If we are not allowed to block, force the log out asynchronously so - * that when we come back the inode will be unpinned. If other inodes - * in the same cluster are dirty, they will probably write the inode - * out for us if they occur after the log force completes. - */ - if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { - xfs_iunpin(ip); - xfs_ifunlock(ip); - return EAGAIN; - } xfs_iunpin_wait(ip); /* @@ -2447,20 +2441,20 @@ xfs_iflush( /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk! + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an empty AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { - if (iip) - iip->ili_fields = 0; - xfs_ifunlock(ip); - return XFS_ERROR(EIO); + error = XFS_ERROR(EIO); + goto abort_out; } /* * Get the buffer containing the on-disk inode. */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, - (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK); if (error || !bp) { xfs_ifunlock(ip); return error; @@ -2488,23 +2482,20 @@ xfs_iflush( if (error) goto cluster_corrupt_out; - if (flags & SYNC_WAIT) - error = xfs_bwrite(bp); - else - xfs_buf_delwri_queue(bp); - - xfs_buf_relse(bp); - return error; + *bpp = bp; + return 0; corrupt_out: xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: + error = XFS_ERROR(EFSCORRUPTED); +abort_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(ip); - return XFS_ERROR(EFSCORRUPTED); + xfs_iflush_abort(ip, false); + return error; } @@ -2706,27 +2697,6 @@ corrupt_out: return XFS_ERROR(EFSCORRUPTED); } -void -xfs_promote_inode( - struct xfs_inode *ip) -{ - struct xfs_buf *bp; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - - bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, - ip->i_imap.im_len, XBF_TRYLOCK); - if (!bp) - return; - - if (XFS_BUF_ISDELAYWRITE(bp)) { - xfs_buf_delwri_promote(bp); - wake_up_process(ip->i_mount->m_ddev_targp->bt_task); - } - - xfs_buf_relse(bp); -} - /* * Return a pointer to the extent record at file index idx. */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7fee3387e1c8..1efff36a75b6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -529,11 +529,12 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); -int xfs_iflush(xfs_inode_t *, uint); -void xfs_promote_inode(struct xfs_inode *); +int xfs_iflush(struct xfs_inode *, struct xfs_buf **); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); +xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); + #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 05d924efceaf..6cdbf90c6f7b 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -480,25 +478,16 @@ xfs_inode_item_unpin( wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } -/* - * This is called to attempt to lock the inode associated with this - * inode log item, in preparation for the push routine which does the actual - * iflush. Don't sleep on the inode lock or the flush lock. - * - * If the flush lock is already held, indicating that the inode has - * been or is in the process of being flushed, then (ideally) we'd like to - * see if the inode's buffer is still incore, and if so give it a nudge. - * We delay doing so until the pushbuf routine, though, to avoid holding - * the AIL lock across a call to the blackhole which is the buffer cache. - * Also we don't want to sleep in any device strategy routines, which can happen - * if we do the subsequent bawrite in here. - */ STATIC uint -xfs_inode_item_trylock( - struct xfs_log_item *lip) +xfs_inode_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; @@ -506,30 +495,50 @@ xfs_inode_item_trylock( if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; + /* + * Re-check the pincount now that we stabilized the value by + * taking the ilock. + */ + if (xfs_ipincount(ip) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the inode. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ if (!xfs_iflock_nowait(ip)) { - /* - * inode has already been flushed to the backing buffer, - * leave it locked in shared mode, pushbuf routine will - * unlock it. - */ - return XFS_ITEM_PUSHBUF; + rval = XFS_ITEM_FLUSHING; + goto out_unlock; } - /* Stale items should force out the iclog */ + /* + * Stale inode items should force out the iclog. + */ if (ip->i_flags & XFS_ISTALE) { xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); return XFS_ITEM_PINNED; } -#ifdef DEBUG - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - ASSERT(iip->ili_fields != 0); - ASSERT(iip->ili_logged == 0); - ASSERT(lip->li_flags & XFS_LI_IN_AIL); + ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_iflush(ip, &bp); + if (!error) { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); } -#endif - return XFS_ITEM_SUCCESS; + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return rval; } /* @@ -614,86 +623,6 @@ xfs_inode_item_committed( } /* - * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK - * failed to get the inode flush lock but did get the inode locked SHARED. - * Here we're trying to see if the inode buffer is incore, and if so whether it's - * marked delayed write. If that's the case, we'll promote it and that will - * allow the caller to write the buffer by triggering the xfsbufd to run. - */ -STATIC bool -xfs_inode_item_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - struct xfs_inode *ip = iip->ili_inode; - struct xfs_buf *bp; - bool ret = true; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - - /* - * If a flush is not in progress anymore, chances are that the - * inode was taken off the AIL. So, just get out. - */ - if (!xfs_isiflocked(ip) || - !(lip->li_flags & XFS_LI_IN_AIL)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return true; - } - - bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, - iip->ili_format.ilf_len, XBF_TRYLOCK); - - xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (!bp) - return true; - if (XFS_BUF_ISDELAYWRITE(bp)) - xfs_buf_delwri_promote(bp); - if (xfs_buf_ispinned(bp)) - ret = false; - xfs_buf_relse(bp); - return ret; -} - -/* - * This is called to asynchronously write the inode associated with this - * inode log item out to disk. The inode will already have been locked by - * a successful call to xfs_inode_item_trylock(). - */ -STATIC void -xfs_inode_item_push( - struct xfs_log_item *lip) -{ - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - struct xfs_inode *ip = iip->ili_inode; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); - - /* - * Since we were able to lock the inode's flush lock and - * we found it on the AIL, the inode must be dirty. This - * is because the inode is removed from the AIL while still - * holding the flush lock in xfs_iflush_done(). Thus, if - * we found it in the AIL and were able to obtain the flush - * lock without sleeping, then there must not have been - * anyone in the process of flushing the inode. - */ - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0); - - /* - * Push the inode to it's backing buffer. This will not remove the - * inode from the AIL - a further push will be required to trigger a - * buffer push. However, this allows all the dirty inodes to be pushed - * to the buffer before it is pushed to disk. The buffer IO completion - * will pull the inode from the AIL, mark it clean and unlock the flush - * lock. - */ - (void) xfs_iflush(ip, SYNC_TRYLOCK); - xfs_iunlock(ip, XFS_ILOCK_SHARED); -} - -/* * XXX rcc - this one really has to do something. Probably needs * to stamp in a new field in the incore inode. */ @@ -713,11 +642,9 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, .iop_unpin = xfs_inode_item_unpin, - .iop_trylock = xfs_inode_item_trylock, .iop_unlock = xfs_inode_item_unlock, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_pushbuf = xfs_inode_item_pushbuf, .iop_committing = xfs_inode_item_committing }; @@ -848,7 +775,8 @@ xfs_iflush_done( ASSERT(i <= need_ail); } /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ - xfs_trans_ail_delete_bulk(ailp, log_items, i); + xfs_trans_ail_delete_bulk(ailp, log_items, i, + SHUTDOWN_CORRUPT_INCORE); } @@ -869,16 +797,15 @@ xfs_iflush_done( } /* - * This is the inode flushing abort routine. It is called - * from xfs_iflush when the filesystem is shutting down to clean - * up the inode state. - * It is responsible for removing the inode item - * from the AIL if it has not been re-logged, and unlocking the inode's - * flush lock. + * This is the inode flushing abort routine. It is called from xfs_iflush when + * the filesystem is shutting down to clean up the inode state. It is + * responsible for removing the inode item from the AIL if it has not been + * re-logged, and unlocking the inode's flush lock. */ void xfs_iflush_abort( - xfs_inode_t *ip) + xfs_inode_t *ip, + bool stale) { xfs_inode_log_item_t *iip = ip->i_itemp; @@ -888,7 +815,10 @@ xfs_iflush_abort( spin_lock(&ailp->xa_lock); if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip); + xfs_trans_ail_delete(ailp, &iip->ili_item, + stale ? + SHUTDOWN_LOG_IO_ERROR : + SHUTDOWN_CORRUPT_INCORE); } else spin_unlock(&ailp->xa_lock); } @@ -915,7 +845,7 @@ xfs_istale_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true); } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 41d61c3b7a36..376d4d0b2635 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -165,7 +165,7 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); -extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h index b253c0ea5bec..90efdaf1706f 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/xfs_inum.h @@ -26,11 +26,6 @@ * high agno_log-agblklog-inopblog bits - 0 */ -typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */ - -#define NULLFSINO ((xfs_ino_t)-1) -#define NULLAGINO ((xfs_agino_t)-1) - struct xfs_mount; #define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 91f8ff547ab3..3a05a41b5d76 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index a849a5473aff..c4f2da0d2bf5 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -22,9 +22,7 @@ #include <asm/uaccess.h> #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 71a464503c43..aadfce6681ee 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -37,7 +35,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -142,11 +139,7 @@ xfs_iomap_write_direct( int committed; int error; - /* - * Make sure that the dquots are there. This doesn't hold - * the ilock across a disk read. - */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach(ip, 0); if (error) return XFS_ERROR(error); @@ -158,7 +151,7 @@ xfs_iomap_write_direct( if ((offset + count) > XFS_ISIZE(ip)) { error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) - goto error_out; + return XFS_ERROR(error); } else { if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) @@ -190,7 +183,6 @@ xfs_iomap_write_direct( /* * Allocate and setup the transaction */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), resrtextents, @@ -199,15 +191,16 @@ xfs_iomap_write_direct( /* * Check for running out of space, note: need lock to return */ - if (error) + if (error) { xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + xfs_ilock(ip, XFS_ILOCK_EXCL); - if (error) - goto error_out; error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) - goto error1; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); @@ -224,42 +217,39 @@ xfs_iomap_write_direct( error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, &firstfsb, 0, imap, &nimaps, &free_list); if (error) - goto error0; + goto out_bmap_cancel; /* * Complete the transaction */ error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) - goto error0; + goto out_bmap_cancel; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) - goto error_out; + goto out_unlock; /* * Copy any maps to caller's array and return any error. */ if (nimaps == 0) { - error = ENOSPC; - goto error_out; + error = XFS_ERROR(ENOSPC); + goto out_unlock; } - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) error = xfs_alert_fsblock_zero(ip, imap); - goto error_out; - } - return 0; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; -error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ +out_bmap_cancel: xfs_bmap_cancel(&free_list); - xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); - -error1: /* Just cancel transaction */ + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); +out_trans_cancel: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - -error_out: - return XFS_ERROR(error); + goto out_unlock; } /* @@ -422,6 +412,15 @@ retry: return error; } + /* + * Make sure preallocation does not create extents beyond the range we + * actually support in this filesystem. + */ + if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset)) + last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset); + + ASSERT(last_fsb > offset_fsb); + nimaps = XFS_WRITE_IMAPS; error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, imap, &nimaps, XFS_BMAPI_ENTIRE); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 3011b879f850..1a25fd802798 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_acl.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -34,7 +32,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -700,7 +697,7 @@ xfs_setattr_size( xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; - uint lock_flags; + uint lock_flags = 0; uint commit_flags = 0; trace_xfs_setattr(ip); @@ -720,10 +717,10 @@ xfs_setattr_size( ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); - lock_flags = XFS_ILOCK_EXCL; - if (!(flags & XFS_ATTR_NOLOCK)) + if (!(flags & XFS_ATTR_NOLOCK)) { lock_flags |= XFS_IOLOCK_EXCL; - xfs_ilock(ip, lock_flags); + xfs_ilock(ip, lock_flags); + } oldsize = inode->i_size; newsize = iattr->ia_size; @@ -746,7 +743,7 @@ xfs_setattr_size( /* * Make sure that the dquots are attached to the inode. */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach(ip, 0); if (error) goto out_unlock; @@ -768,8 +765,6 @@ xfs_setattr_size( if (error) goto out_unlock; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - lock_flags &= ~XFS_ILOCK_EXCL; /* * We are going to log the inode size change in this transaction so diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index acc2bf264dab..eff577a9b67f 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 6db1fef38bff..6b965bf450e4 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -35,7 +33,6 @@ #include "xfs_trans_priv.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_rw.h" #include "xfs_trace.h" kmem_zone_t *xfs_log_ticket_zone; @@ -916,27 +913,42 @@ xfs_log_need_covered(xfs_mount_t *mp) * We may be holding the log iclog lock upon entering this routine. */ xfs_lsn_t -xlog_assign_tail_lsn( +xlog_assign_tail_lsn_locked( struct xfs_mount *mp) { - xfs_lsn_t tail_lsn; struct log *log = mp->m_log; + struct xfs_log_item *lip; + xfs_lsn_t tail_lsn; + + assert_spin_locked(&mp->m_ail->xa_lock); /* * To make sure we always have a valid LSN for the log tail we keep * track of the last LSN which was committed in log->l_last_sync_lsn, - * and use that when the AIL was empty and xfs_ail_min_lsn returns 0. - * - * If the AIL has been emptied we also need to wake any process - * waiting for this condition. + * and use that when the AIL was empty. */ - tail_lsn = xfs_ail_min_lsn(mp->m_ail); - if (!tail_lsn) + lip = xfs_ail_min(mp->m_ail); + if (lip) + tail_lsn = lip->li_lsn; + else tail_lsn = atomic64_read(&log->l_last_sync_lsn); atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; } +xfs_lsn_t +xlog_assign_tail_lsn( + struct xfs_mount *mp) +{ + xfs_lsn_t tail_lsn; + + spin_lock(&mp->m_ail->xa_lock); + tail_lsn = xlog_assign_tail_lsn_locked(mp); + spin_unlock(&mp->m_ail->xa_lock); + + return tail_lsn; +} + /* * Return the space in the log between the tail and the head. The head * is passed in the cycle/bytes formal parms. In the special case where @@ -1172,7 +1184,7 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); error = ENOMEM; - bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); + bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; @@ -1182,9 +1194,6 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_icloglock); init_waitqueue_head(&log->l_flush_wait); - /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ - ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); - iclogp = &log->l_iclog; /* * The amount of memory to allocate for the iclog structure is @@ -1204,7 +1213,7 @@ xlog_alloc_log(xfs_mount_t *mp, prev_iclog = iclog; bp = xfs_buf_get_uncached(mp->m_logdev_targp, - log->l_iclog_size, 0); + BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_iclog; @@ -1224,7 +1233,7 @@ xlog_alloc_log(xfs_mount_t *mp, head->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); - iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; + iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); @@ -1475,7 +1484,7 @@ xlog_sync(xlog_t *log, } else { iclog->ic_bwritecnt = 1; } - XFS_BUF_SET_COUNT(bp, count); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); XFS_BUF_ASYNC(bp); @@ -1573,7 +1582,7 @@ xlog_dealloc_log(xlog_t *log) * always need to ensure that the extra buffer does not point to memory * owned by another log buffer before we free it. */ - xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size); + xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; @@ -2932,6 +2941,7 @@ xfs_log_force( { int error; + trace_xfs_log_force(mp, 0); error = _xfs_log_force(mp, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3080,6 +3090,7 @@ xfs_log_force_lsn( { int error; + trace_xfs_log_force(mp, lsn); error = _xfs_log_force_lsn(mp, lsn, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 2c622bedb302..748d312850e2 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -152,6 +152,7 @@ int xfs_log_mount(struct xfs_mount *mp, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); +xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); int xfs_log_notify(struct xfs_mount *mp, struct xlog_in_core *iclog, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index d4fadbe8ac90..7d6197c58493 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log_priv.h" @@ -29,61 +27,10 @@ #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_discard.h" /* - * Perform initial CIL structure initialisation. - */ -int -xlog_cil_init( - struct log *log) -{ - struct xfs_cil *cil; - struct xfs_cil_ctx *ctx; - - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); - if (!cil) - return ENOMEM; - - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); - if (!ctx) { - kmem_free(cil); - return ENOMEM; - } - - INIT_LIST_HEAD(&cil->xc_cil); - INIT_LIST_HEAD(&cil->xc_committing); - spin_lock_init(&cil->xc_cil_lock); - init_rwsem(&cil->xc_ctx_lock); - init_waitqueue_head(&cil->xc_commit_wait); - - INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); - ctx->sequence = 1; - ctx->cil = cil; - cil->xc_ctx = ctx; - cil->xc_current_sequence = ctx->sequence; - - cil->xc_log = log; - log->l_cilp = cil; - return 0; -} - -void -xlog_cil_destroy( - struct log *log) -{ - if (log->l_cilp->xc_ctx) { - if (log->l_cilp->xc_ctx->ticket) - xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); - kmem_free(log->l_cilp->xc_ctx); - } - - ASSERT(list_empty(&log->l_cilp->xc_cil)); - kmem_free(log->l_cilp); -} - -/* * Allocate a new ticket. Failing to get a new ticket makes it really hard to * recover, so we don't allow failure here. Also, we allocate in a context that * we don't want to be issuing transactions from, so we need to tell the @@ -390,8 +337,8 @@ xlog_cil_committed( xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort); - xfs_alloc_busy_sort(&ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, + xfs_extent_busy_sort(&ctx->busy_extents); + xfs_extent_busy_clear(mp, &ctx->busy_extents, (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); spin_lock(&ctx->cil->xc_cil_lock); @@ -404,7 +351,7 @@ xlog_cil_committed( ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); xfs_discard_extents(mp, &ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); + xfs_extent_busy_clear(mp, &ctx->busy_extents, false); } kmem_free(ctx); @@ -426,8 +373,7 @@ xlog_cil_committed( */ STATIC int xlog_cil_push( - struct log *log, - xfs_lsn_t push_seq) + struct log *log) { struct xfs_cil *cil = log->l_cilp; struct xfs_log_vec *lv; @@ -443,39 +389,36 @@ xlog_cil_push( struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; xfs_lsn_t commit_lsn; + xfs_lsn_t push_seq; if (!cil) return 0; - ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); - /* - * Lock out transaction commit, but don't block for background pushes - * unless we are well over the CIL space limit. See the definition of - * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic - * used here. - */ - if (!down_write_trylock(&cil->xc_ctx_lock)) { - if (!push_seq && - cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) - goto out_free_ticket; - down_write(&cil->xc_ctx_lock); - } + down_write(&cil->xc_ctx_lock); ctx = cil->xc_ctx; - /* check if we've anything to push */ - if (list_empty(&cil->xc_cil)) - goto out_skip; + spin_lock(&cil->xc_cil_lock); + push_seq = cil->xc_push_seq; + ASSERT(push_seq <= ctx->sequence); - /* check for spurious background flush */ - if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + /* + * Check if we've anything to push. If there is nothing, then we don't + * move on to a new sequence number and so we have to be able to push + * this sequence again later. + */ + if (list_empty(&cil->xc_cil)) { + cil->xc_push_seq = 0; + spin_unlock(&cil->xc_cil_lock); goto out_skip; + } + spin_unlock(&cil->xc_cil_lock); + /* check for a previously pushed seqeunce */ - if (push_seq && push_seq < cil->xc_ctx->sequence) + if (push_seq < cil->xc_ctx->sequence) goto out_skip; /* @@ -629,7 +572,6 @@ restart: out_skip: up_write(&cil->xc_ctx_lock); -out_free_ticket: xfs_log_ticket_put(new_ctx->ticket); kmem_free(new_ctx); return 0; @@ -641,6 +583,82 @@ out_abort: return XFS_ERROR(EIO); } +static void +xlog_cil_push_work( + struct work_struct *work) +{ + struct xfs_cil *cil = container_of(work, struct xfs_cil, + xc_push_work); + xlog_cil_push(cil->xc_log); +} + +/* + * We need to push CIL every so often so we don't cache more than we can fit in + * the log. The limit really is that a checkpoint can't be more than half the + * log (the current checkpoint is not allowed to overwrite the previous + * checkpoint), but commit latency and memory usage limit this to a smaller + * size. + */ +static void +xlog_cil_push_background( + struct log *log) +{ + struct xfs_cil *cil = log->l_cilp; + + /* + * The cil won't be empty because we are called while holding the + * context lock so whatever we added to the CIL will still be there + */ + ASSERT(!list_empty(&cil->xc_cil)); + + /* + * don't do a background push if we haven't used up all the + * space available yet. + */ + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + return; + + spin_lock(&cil->xc_cil_lock); + if (cil->xc_push_seq < cil->xc_current_sequence) { + cil->xc_push_seq = cil->xc_current_sequence; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + } + spin_unlock(&cil->xc_cil_lock); + +} + +static void +xlog_cil_push_foreground( + struct log *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + + if (!cil) + return; + + ASSERT(push_seq && push_seq <= cil->xc_current_sequence); + + /* start on any pending background push to minimise wait time on it */ + flush_work(&cil->xc_push_work); + + /* + * If the CIL is empty or we've already pushed the sequence then + * there's no work we need to do. + */ + spin_lock(&cil->xc_cil_lock); + if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { + spin_unlock(&cil->xc_cil_lock); + return; + } + + cil->xc_push_seq = push_seq; + spin_unlock(&cil->xc_cil_lock); + + /* do the push now */ + xlog_cil_push(log); +} + /* * Commit a transaction with the given vector to the Committed Item List. * @@ -667,7 +685,6 @@ xfs_log_commit_cil( { struct log *log = mp->m_log; int log_flags = 0; - int push = 0; struct xfs_log_vec *log_vector; if (flags & XFS_TRANS_RELEASE_LOG_RES) @@ -719,21 +736,9 @@ xfs_log_commit_cil( */ xfs_trans_free_items(tp, *commit_lsn, 0); - /* check for background commit before unlock */ - if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) - push = 1; + xlog_cil_push_background(log); up_read(&log->l_cilp->xc_ctx_lock); - - /* - * We need to push CIL every so often so we don't cache more than we - * can fit in the log. The limit really is that a checkpoint can't be - * more than half the log (the current checkpoint is not allowed to - * overwrite the previous checkpoint), but commit latency and memory - * usage limit this to a smaller size in most cases. - */ - if (push) - xlog_cil_push(log, 0); return 0; } @@ -746,9 +751,6 @@ xfs_log_commit_cil( * * We return the current commit lsn to allow the callers to determine if a * iclog flush is necessary following this call. - * - * XXX: Initially, just push the CIL unconditionally and return whatever - * commit lsn is there. It'll be empty, so this is broken for now. */ xfs_lsn_t xlog_cil_force_lsn( @@ -766,8 +768,7 @@ xlog_cil_force_lsn( * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ - if (sequence == cil->xc_current_sequence) - xlog_cil_push(log, sequence); + xlog_cil_push_foreground(log, sequence); /* * See if we can find a previous sequence still committing. @@ -826,3 +827,57 @@ xfs_log_item_in_current_chkpt( return false; return true; } + +/* + * Perform initial CIL structure initialisation. + */ +int +xlog_cil_init( + struct log *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_commit_wait); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct log *log) +{ + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2152900b79d4..735ff1ee53da 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -417,6 +417,8 @@ struct xfs_cil { struct list_head xc_committing; wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; + xfs_lsn_t xc_push_seq; }; /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8ecad5bad66c..ca386909131a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -40,7 +40,6 @@ #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" #include "xfs_quota.h" -#include "xfs_rw.h" #include "xfs_utils.h" #include "xfs_trace.h" @@ -120,7 +119,7 @@ xlog_get_bp( nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0); + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); if (bp) xfs_buf_unlock(bp); return bp; @@ -146,7 +145,7 @@ xlog_align( { xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); - ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(offset + nbblks <= bp->b_length); return bp->b_addr + BBTOB(offset); } @@ -174,11 +173,12 @@ xlog_bread_noalign( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + bp->b_io_length = nbblks; + bp->b_error = 0; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); @@ -218,7 +218,7 @@ xlog_bread_offset( xfs_caddr_t offset) { xfs_caddr_t orig_offset = bp->b_addr; - int orig_len = bp->b_buffer_length; + int orig_len = BBTOB(bp->b_length); int error, error2; error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); @@ -259,13 +259,14 @@ xlog_bwrite( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); xfs_buf_hold(bp); xfs_buf_lock(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + bp->b_io_length = nbblks; + bp->b_error = 0; error = xfs_bwrite(bp); if (error) @@ -440,6 +441,8 @@ xlog_find_verify_cycle( * a log sector, or we're out of luck. */ bufblks = 1 << ffs(nbblks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < log->l_sectBBsize) @@ -1225,6 +1228,8 @@ xlog_write_log_records( * log sector, or we're out of luck. */ bufblks = 1 << ffs(blocks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < sectbb) @@ -1772,7 +1777,7 @@ xlog_recover_do_inode_buffer( trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + offsetof(xfs_dinode_t, di_next_unlinked); @@ -1814,7 +1819,8 @@ xlog_recover_do_inode_buffer( ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + ASSERT((reg_buf_offset + reg_buf_bytes) <= + BBTOB(bp->b_io_length)); /* * The current logged region contains a copy of the @@ -1873,8 +1879,8 @@ xlog_recover_do_reg_buffer( ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); - ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); + ASSERT(BBTOB(bp->b_io_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* * Do a sanity check if this is a dquot buffer. Just checking @@ -2103,6 +2109,7 @@ xlog_recover_do_dquot_buffer( STATIC int xlog_recover_buffer_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; @@ -2123,9 +2130,9 @@ xlog_recover_buffer_pass2( trace_xfs_log_recover_buf_recover(log, buf_f); - buf_flags = XBF_LOCK; - if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) - buf_flags |= XBF_MAPPED; + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, buf_flags); @@ -2166,14 +2173,14 @@ xlog_recover_buffer_pass2( */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, + (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); } xfs_buf_relse(bp); @@ -2183,6 +2190,7 @@ xlog_recover_buffer_pass2( STATIC int xlog_recover_inode_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_inode_log_format_t *in_f; @@ -2220,8 +2228,7 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - XBF_LOCK); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); if (!bp) { error = ENOMEM; goto error; @@ -2436,7 +2443,7 @@ xlog_recover_inode_pass2( write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); error: if (need_free) @@ -2477,6 +2484,7 @@ xlog_recover_quotaoff_pass1( STATIC int xlog_recover_dquot_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_mount_t *mp = log->l_mp; @@ -2530,14 +2538,11 @@ xlog_recover_dquot_pass2( return XFS_ERROR(EIO); ASSERT(dq_f->qlf_len == 1); - error = xfs_read_buf(mp, mp->m_ddev_targp, - dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), - 0, &bp); - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); + if (error) return error; - } + ASSERT(bp); ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); @@ -2558,7 +2563,7 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); return (0); @@ -2642,7 +2647,8 @@ xlog_recover_efd_pass2( * xfs_trans_ail_delete() drops the * AIL lock. */ - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, + SHUTDOWN_CORRUPT_INCORE); xfs_efi_item_free(efip); spin_lock(&ailp->xa_lock); break; @@ -2712,21 +2718,22 @@ STATIC int xlog_recover_commit_pass2( struct log *log, struct xlog_recover *trans, + struct list_head *buffer_list, xlog_recover_item_t *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, item); + return xlog_recover_buffer_pass2(log, buffer_list, item); case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, item); + return xlog_recover_inode_pass2(log, buffer_list, item); case XFS_LI_EFI: return xlog_recover_efi_pass2(log, item, trans->r_lsn); case XFS_LI_EFD: return xlog_recover_efd_pass2(log, item); case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, item); + return xlog_recover_dquot_pass2(log, buffer_list, item); case XFS_LI_QUOTAOFF: /* nothing to do in pass2 */ return 0; @@ -2750,8 +2757,9 @@ xlog_recover_commit_trans( struct xlog_recover *trans, int pass) { - int error = 0; + int error = 0, error2; xlog_recover_item_t *item; + LIST_HEAD (buffer_list); hlist_del(&trans->r_list); @@ -2760,16 +2768,27 @@ xlog_recover_commit_trans( return error; list_for_each_entry(item, &trans->r_itemq, ri_list) { - if (pass == XLOG_RECOVER_PASS1) + switch (pass) { + case XLOG_RECOVER_PASS1: error = xlog_recover_commit_pass1(log, trans, item); - else - error = xlog_recover_commit_pass2(log, trans, item); + break; + case XLOG_RECOVER_PASS2: + error = xlog_recover_commit_pass2(log, trans, + &buffer_list, item); + break; + default: + ASSERT(0); + } + if (error) - return error; + goto out; } xlog_recover_free_trans(trans); - return 0; + +out: + error2 = xfs_buf_delwri_submit(&buffer_list); + return error ? error : error2; } STATIC int @@ -3079,7 +3098,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0); if (error) goto fail_iput; @@ -3639,11 +3658,8 @@ xlog_do_recover( * First replay the images in the log. */ error = xlog_do_log_recovery(log, head_blk, tail_blk); - if (error) { + if (error) return error; - } - - xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); /* * If IO errors happened during recovery, bail out. @@ -3670,7 +3686,6 @@ xlog_do_recover( bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); xfsbdstrat(log->l_mp, bp); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index bd672def95ac..331cd9f83a7f 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 1ffead4b2296..536021fb3d4e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -22,6 +22,7 @@ #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" @@ -37,7 +38,6 @@ #include "xfs_rtalloc.h" #include "xfs_bmap.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_fsops.h" #include "xfs_utils.h" @@ -683,8 +683,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); reread: - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, - XFS_SB_DADDR, sector_size, 0); + bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), 0); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -1032,9 +1032,9 @@ xfs_check_sizes(xfs_mount_t *mp) xfs_warn(mp, "filesystem size mismatch detected"); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "last sector read failed"); return EIO; @@ -1047,9 +1047,9 @@ xfs_check_sizes(xfs_mount_t *mp) xfs_warn(mp, "log size mismatch detected"); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, + bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "log device read failed"); return EIO; @@ -1288,7 +1288,7 @@ xfs_mountfs( XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); if (error) { xfs_warn(mp, "log mount failed"); - goto out_free_perag; + goto out_fail_wait; } /* @@ -1315,7 +1315,7 @@ xfs_mountfs( !mp->m_sb.sb_inprogress) { error = xfs_initialize_perag_data(mp, sbp->sb_agcount); if (error) - goto out_free_perag; + goto out_fail_wait; } /* @@ -1439,6 +1439,10 @@ xfs_mountfs( IRELE(rip); out_log_dealloc: xfs_log_unmount(mp); + out_fail_wait: + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_wait_buftarg(mp->m_logdev_targp); + xfs_wait_buftarg(mp->m_ddev_targp); out_free_perag: xfs_free_perag(mp); out_remove_uuid: @@ -1475,15 +1479,15 @@ xfs_unmountfs( xfs_log_force(mp, XFS_LOG_SYNC); /* - * Do a delwri reclaim pass first so that as many dirty inodes are - * queued up for IO as possible. Then flush the buffers before making - * a synchronous path to catch all the remaining inodes are reclaimed. - * This makes the reclaim process as quick as possible by avoiding - * synchronous writeout and blocking on inodes already in the delwri - * state as much as possible. + * Flush all pending changes from the AIL. + */ + xfs_ail_push_all_sync(mp->m_ail); + + /* + * And reclaim all inodes. At this point there should be no dirty + * inode, and none should be pinned or locked, but use synchronous + * reclaim just to be sure. */ - xfs_reclaim_inodes(mp, 0); - xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); @@ -1519,15 +1523,12 @@ xfs_unmountfs( if (error) xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); - xfs_unmountfs_writesb(mp); /* - * Make sure all buffers have been flushed and completed before - * unmounting the log. + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. */ - error = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (error) - xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_ail_push_all_sync(mp->m_ail); xfs_wait_buftarg(mp->m_ddev_targp); xfs_log_unmount_write(mp); @@ -1588,36 +1589,6 @@ xfs_log_sbcount(xfs_mount_t *mp) return error; } -int -xfs_unmountfs_writesb(xfs_mount_t *mp) -{ - xfs_buf_t *sbp; - int error = 0; - - /* - * skip superblock write if fs is read-only, or - * if we are doing a forced umount. - */ - if (!((mp->m_flags & XFS_MOUNT_RDONLY) || - XFS_FORCED_SHUTDOWN(mp))) { - - sbp = xfs_getsb(mp, 0); - - XFS_BUF_UNDONE(sbp); - XFS_BUF_UNREAD(sbp); - xfs_buf_delwri_dequeue(sbp); - XFS_BUF_WRITE(sbp); - XFS_BUF_UNASYNC(sbp); - ASSERT(sbp->b_target == mp->m_ddev_targp); - xfsbdstrat(mp, sbp); - error = xfs_buf_iowait(sbp); - if (error) - xfs_buf_ioerror_alert(sbp, __func__); - xfs_buf_relse(sbp); - } - return error; -} - /* * xfs_mod_sb() can be used to copy arbitrary changes to the * in-core superblock into the superblock buffer to be logged. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9eba73887829..8b89c5ac72d9 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -214,6 +214,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; + struct workqueue_struct *m_cil_workqueue; } xfs_mount_t; /* @@ -378,7 +379,6 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 55c6afedc879..249db1987764 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -65,7 +64,8 @@ STATIC int xfs_qm_dquot_walk( struct xfs_mount *mp, int type, - int (*execute)(struct xfs_dquot *dqp)) + int (*execute)(struct xfs_dquot *dqp, void *data), + void *data) { struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); @@ -97,7 +97,7 @@ restart: next_index = be32_to_cpu(dqp->q_core.d_id) + 1; - error = execute(batch[i]); + error = execute(batch[i], data); if (error == EAGAIN) { skipped++; continue; @@ -129,7 +129,8 @@ restart: */ STATIC int xfs_qm_dqpurge( - struct xfs_dquot *dqp) + struct xfs_dquot *dqp, + void *data) { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -153,21 +154,7 @@ xfs_qm_dqpurge( dqp->dq_flags |= XFS_DQ_FREEING; - /* - * If we're turning off quotas, we have to make sure that, for - * example, we don't delete quota disk blocks while dquots are - * in the process of getting written to those disk blocks. - * This dquot might well be on AIL, and we can't leave it there - * if we're turning off quotas. Basically, we need this flush - * lock, and are willing to block on it. - */ - if (!xfs_dqflock_nowait(dqp)) { - /* - * Block on the flush lock after nudging dquot buffer, - * if it is incore. - */ - xfs_dqflock_pushbuf_wait(dqp); - } + xfs_dqflock(dqp); /* * If we are turning this type of quotas off, we don't care @@ -175,16 +162,21 @@ xfs_qm_dqpurge( * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { - int error; + struct xfs_buf *bp = NULL; + int error; /* * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_qm_dqflush(dqp, SYNC_WAIT); - if (error) + error = xfs_qm_dqflush(dqp, &bp); + if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); + } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } xfs_dqflock(dqp); } @@ -226,11 +218,11 @@ xfs_qm_dqpurge_all( uint flags) { if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge); + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_GQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge); + xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge); + xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); } /* @@ -483,6 +475,23 @@ done: xfs_dqunlock(udq); } +static bool +xfs_qm_need_dqattach( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return false; + if (!XFS_IS_QUOTA_ON(mp)) + return false; + if (!XFS_NOT_DQATTACHED(mp, ip)) + return false; + if (ip->i_ino == mp->m_sb.sb_uquotino || + ip->i_ino == mp->m_sb.sb_gquotino) + return false; + return true; +} /* * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON @@ -500,11 +509,7 @@ xfs_qm_dqattach_locked( uint nquotas = 0; int error = 0; - if (!XFS_IS_QUOTA_RUNNING(mp) || - !XFS_IS_QUOTA_ON(mp) || - !XFS_NOT_DQATTACHED(mp, ip) || - ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + if (!xfs_qm_need_dqattach(ip)) return 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -575,6 +580,9 @@ xfs_qm_dqattach( { int error; + if (!xfs_qm_need_dqattach(ip)) + return 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqattach_locked(ip, flags); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -855,15 +863,16 @@ xfs_qm_reset_dqcounts( STATIC int xfs_qm_dqiter_bufs( - xfs_mount_t *mp, - xfs_dqid_t firstid, - xfs_fsblock_t bno, - xfs_filblks_t blkcnt, - uint flags) + struct xfs_mount *mp, + xfs_dqid_t firstid, + xfs_fsblock_t bno, + xfs_filblks_t blkcnt, + uint flags, + struct list_head *buffer_list) { - xfs_buf_t *bp; - int error; - int type; + struct xfs_buf *bp; + int error; + int type; ASSERT(blkcnt > 0); type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : @@ -887,7 +896,7 @@ xfs_qm_dqiter_bufs( break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); /* * goto the next block. @@ -895,6 +904,7 @@ xfs_qm_dqiter_bufs( bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } + return error; } @@ -904,11 +914,12 @@ xfs_qm_dqiter_bufs( */ STATIC int xfs_qm_dqiterate( - xfs_mount_t *mp, - xfs_inode_t *qip, - uint flags) + struct xfs_mount *mp, + struct xfs_inode *qip, + uint flags, + struct list_head *buffer_list) { - xfs_bmbt_irec_t *map; + struct xfs_bmbt_irec *map; int i, nmaps; /* number of map entries */ int error; /* return value */ xfs_fileoff_t lblkno; @@ -975,21 +986,17 @@ xfs_qm_dqiterate( * Iterate thru all the blks in the extent and * reset the counters of all the dquots inside them. */ - if ((error = xfs_qm_dqiter_bufs(mp, - firstid, - map[i].br_startblock, - map[i].br_blockcount, - flags))) { - break; - } + error = xfs_qm_dqiter_bufs(mp, firstid, + map[i].br_startblock, + map[i].br_blockcount, + flags, buffer_list); + if (error) + goto out; } - - if (error) - break; } while (nmaps > 0); +out: kmem_free(map); - return error; } @@ -1182,8 +1189,11 @@ error0: STATIC int xfs_qm_flush_one( - struct xfs_dquot *dqp) + struct xfs_dquot *dqp, + void *data) { + struct list_head *buffer_list = data; + struct xfs_buf *bp = NULL; int error = 0; xfs_dqlock(dqp); @@ -1192,11 +1202,13 @@ xfs_qm_flush_one( if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; - if (!xfs_dqflock_nowait(dqp)) - xfs_dqflock_pushbuf_wait(dqp); - - error = xfs_qm_dqflush(dqp, 0); + xfs_dqflock(dqp); + error = xfs_qm_dqflush(dqp, &bp); + if (error) + goto out_unlock; + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); out_unlock: xfs_dqunlock(dqp); return error; @@ -1215,6 +1227,7 @@ xfs_qm_quotacheck( size_t structsz; xfs_inode_t *uip, *gip; uint flags; + LIST_HEAD (buffer_list); count = INT_MAX; structsz = 1; @@ -1233,7 +1246,8 @@ xfs_qm_quotacheck( */ uip = mp->m_quotainfo->qi_uquotaip; if (uip) { - error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, + &buffer_list); if (error) goto error_return; flags |= XFS_UQUOTA_CHKD; @@ -1242,7 +1256,8 @@ xfs_qm_quotacheck( gip = mp->m_quotainfo->qi_gquotaip; if (gip) { error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA, + &buffer_list); if (error) goto error_return; flags |= XFS_OQUOTA_CHKD; @@ -1265,19 +1280,27 @@ xfs_qm_quotacheck( * We've made all the changes that we need to make incore. Flush them * down to disk buffers if everything was updated successfully. */ - if (XFS_IS_UQUOTA_ON(mp)) - error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one); + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, + &buffer_list); + } if (XFS_IS_GQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one); + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, + &buffer_list); if (!error) error = error2; } if (XFS_IS_PQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one); + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, + &buffer_list); if (!error) error = error2; } + error2 = xfs_buf_delwri_submit(&buffer_list); + if (!error) + error = error2; + /* * We can get this error if we couldn't do a dquot allocation inside * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the @@ -1291,15 +1314,6 @@ xfs_qm_quotacheck( } /* - * We didn't log anything, because if we crashed, we'll have to - * start the quotacheck from scratch anyway. However, we must make - * sure that our dquot changes are secure before we put the - * quotacheck'd stamp on the superblock. So, here we do a synchronous - * flush. - */ - xfs_flush_buftarg(mp->m_ddev_targp, 1); - - /* * If one type of quotas is off, then it will lose its * quotachecked status, since we won't be doing accounting for * that type anymore. @@ -1308,6 +1322,13 @@ xfs_qm_quotacheck( mp->m_qflags |= flags; error_return: + while (!list_empty(&buffer_list)) { + struct xfs_buf *bp = + list_first_entry(&buffer_list, struct xfs_buf, b_list); + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } + if (error) { xfs_warn(mp, "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", @@ -1424,6 +1445,7 @@ xfs_qm_dqfree_one( STATIC void xfs_qm_dqreclaim_one( struct xfs_dquot *dqp, + struct list_head *buffer_list, struct list_head *dispose_list) { struct xfs_mount *mp = dqp->q_mount; @@ -1456,25 +1478,20 @@ xfs_qm_dqreclaim_one( if (!xfs_dqflock_nowait(dqp)) goto out_busy; - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + trace_xfs_dqreclaim_dirty(dqp); - /* - * We flush it delayed write, so don't bother releasing the - * freelist lock. - */ - error = xfs_qm_dqflush(dqp, 0); + error = xfs_qm_dqflush(dqp, &bp); if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); + goto out_busy; } + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); /* * Give the dquot another try on the freelist, as the * flushing will take some time. @@ -1518,8 +1535,10 @@ xfs_qm_shake( struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); int nr_to_scan = sc->nr_to_scan; + LIST_HEAD (buffer_list); LIST_HEAD (dispose_list); struct xfs_dquot *dqp; + int error; if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; @@ -1532,15 +1551,20 @@ xfs_qm_shake( break; dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, q_lru); - xfs_qm_dqreclaim_one(dqp, &dispose_list); + xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list); } mutex_unlock(&qi->qi_lru_lock); + error = xfs_buf_delwri_submit(&buffer_list); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); + while (!list_empty(&dispose_list)) { dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); list_del_init(&dqp->q_lru); xfs_qm_dqfree_one(dqp); } + out: return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; } diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index e6986b5d80d8..6b39115bf145 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index c4f396e437a8..858a3b186110 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -22,7 +22,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 7e76f537abb7..fed504fc2999 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index e44ef7ee8ce8..30ff5f401d28 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ca4f31534a0a..92d4331cd4f1 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -34,7 +33,6 @@ #include "xfs_rtalloc.h" #include "xfs_fsops.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_inode_item.h" #include "xfs_trans_space.h" #include "xfs_utils.h" @@ -1872,9 +1870,9 @@ xfs_growfs_rt( /* * Read in the last block of the device, make sure it exists. */ - bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + bp = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) return EIO; xfs_buf_relse(bp); @@ -2219,9 +2217,9 @@ xfs_rtmount_init( (unsigned long long) mp->m_sb.sb_rblocks); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "realtime device size check failed"); return EIO; diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c deleted file mode 100644 index 597d044a09a1..000000000000 --- a/fs/xfs/xfs_rw.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_error.h" -#include "xfs_rw.h" - -/* - * Force a shutdown of the filesystem instantly while keeping - * the filesystem consistent. We don't do an unmount here; just shutdown - * the shop, make sure that absolutely nothing persistent happens to - * this filesystem after this point. - */ -void -xfs_do_force_shutdown( - xfs_mount_t *mp, - int flags, - char *fname, - int lnnum) -{ - int logerror; - - logerror = flags & SHUTDOWN_LOG_IO_ERROR; - - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_notice(mp, - "%s(0x%x) called from line %d of file %s. Return address = 0x%p", - __func__, flags, lnnum, fname, __return_address); - } - /* - * No need to duplicate efforts. - */ - if (XFS_FORCED_SHUTDOWN(mp) && !logerror) - return; - - /* - * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't - * queue up anybody new on the log reservations, and wakes up - * everybody who's sleeping on log reservations to tell them - * the bad news. - */ - if (xfs_log_force_umount(mp, logerror)) - return; - - if (flags & SHUTDOWN_CORRUPT_INCORE) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, - "Corruption of in-memory data detected. Shutting down filesystem"); - if (XFS_ERRLEVEL_HIGH <= xfs_error_level) - xfs_stack_trace(); - } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - if (logerror) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, - "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "I/O Error Detected. Shutting down filesystem"); - } - } - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_alert(mp, - "Please umount the filesystem and rectify the problem(s)"); - } -} - -/* - * This isn't an absolute requirement, but it is - * just a good idea to call xfs_read_buf instead of - * directly doing a read_buf call. For one, we shouldn't - * be doing this disk read if we are in SHUTDOWN state anyway, - * so this stops that from happening. Secondly, this does all - * the error checking stuff and the brelse if appropriate for - * the caller, so the code can be a little leaner. - */ - -int -xfs_read_buf( - struct xfs_mount *mp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len, - uint flags, - xfs_buf_t **bpp) -{ - xfs_buf_t *bp; - int error; - - if (!flags) - flags = XBF_LOCK | XBF_MAPPED; - - bp = xfs_buf_read(target, blkno, len, flags); - if (!bp) - return XFS_ERROR(EIO); - error = bp->b_error; - if (!error && !XFS_FORCED_SHUTDOWN(mp)) { - *bpp = bp; - } else { - *bpp = NULL; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - } else { - error = XFS_ERROR(EIO); - } - if (bp) { - XFS_BUF_UNDONE(bp); - xfs_buf_stale(bp); - /* - * brelse clears B_ERROR and b_error - */ - xfs_buf_relse(bp); - } - } - return (error); -} - -/* - * helper function to extract extent size hint from inode - */ -xfs_extlen_t -xfs_get_extsz_hint( - struct xfs_inode *ip) -{ - if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) - return ip->i_d.di_extsize; - if (XFS_IS_REALTIME_INODE(ip)) - return ip->i_mount->m_sb.sb_rextsize; - return 0; -} diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h deleted file mode 100644 index bbdb9ad6a4ba..000000000000 --- a/fs/xfs/xfs_rw.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_RW_H__ -#define __XFS_RW_H__ - -struct xfs_buf; -struct xfs_inode; -struct xfs_mount; - -/* - * Convert the given file system block to a disk block. - * We have to treat it differently based on whether the - * file is a real time file or not, because the bmap code - * does. - */ -static inline xfs_daddr_t -xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) -{ - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); -} - -/* - * Prototypes for functions in xfs_rw.c. - */ -extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, - xfs_daddr_t blkno, int len, uint flags, - struct xfs_buf **bpp); -extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); - -#endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index dab9a5f6dfd6..0d9de41a7151 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -17,7 +17,6 @@ */ #include "xfs.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -622,7 +621,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); } STATIC void @@ -773,8 +772,14 @@ xfs_init_mount_workqueues( if (!mp->m_unwritten_workqueue) goto out_destroy_data_iodone_queue; + mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_cil_workqueue) + goto out_destroy_unwritten; return 0; +out_destroy_unwritten: + destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: destroy_workqueue(mp->m_data_workqueue); out: @@ -785,6 +790,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); } @@ -926,7 +932,7 @@ xfs_fs_evict_inode( trace_xfs_evict_inode(ip); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); @@ -981,18 +987,9 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - xfs_syncd_stop(mp); - - /* - * Blow away any referenced inode in the filestreams cache. - * This can and will cause log traffic as inodes go inactive - * here. - */ xfs_filestream_unmount(mp); - - xfs_flush_buftarg(mp->m_ddev_targp, 1); - xfs_unmountfs(mp); + xfs_syncd_stop(mp); xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1072,7 +1069,7 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); - if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || + if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) xfs_qm_statvfs(ip, statp); @@ -1362,31 +1359,32 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_mountfs(mp); + error = xfs_syncd_init(mp); if (error) goto out_filestream_unmount; - error = xfs_syncd_init(mp); + error = xfs_mountfs(mp); if (error) - goto out_unmount; + goto out_syncd_stop; root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = ENOENT; - goto out_syncd_stop; + goto out_unmount; } if (is_bad_inode(root)) { error = EINVAL; - goto out_syncd_stop; + goto out_unmount; } sb->s_root = d_make_root(root); if (!sb->s_root) { error = ENOMEM; - goto out_syncd_stop; + goto out_unmount; } return 0; - + out_syncd_stop: + xfs_syncd_stop(mp); out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: @@ -1403,19 +1401,10 @@ out_destroy_workqueues: out: return -error; - out_syncd_stop: - xfs_syncd_stop(mp); out_unmount: - /* - * Blow away any referenced inode in the filestreams cache. - * This can and will cause log traffic as inodes go inactive - * here. - */ xfs_filestream_unmount(mp); - - xfs_flush_buftarg(mp->m_ddev_targp, 1); - xfs_unmountfs(mp); + xfs_syncd_stop(mp); goto out_free_sb; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 205ebcb34d9e..c9d3409c5ca3 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -241,45 +240,6 @@ xfs_sync_inode_data( return error; } -STATIC int -xfs_sync_inode_attr( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - int error = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_inode_clean(ip)) - goto out_unlock; - if (!xfs_iflock_nowait(ip)) { - if (!(flags & SYNC_WAIT)) - goto out_unlock; - xfs_iflock(ip); - } - - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - goto out_unlock; - } - - error = xfs_iflush(ip, flags); - - /* - * We don't want to try again on non-blocking flushes that can't run - * again immediately. If an inode really must be written, then that's - * what the SYNC_WAIT flag is for. - */ - if (error == EAGAIN) { - ASSERT(!(flags & SYNC_WAIT)); - error = 0; - } - - out_unlock: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} - /* * Write out pagecache data for the whole filesystem. */ @@ -300,19 +260,6 @@ xfs_sync_data( return 0; } -/* - * Write out inode metadata (attributes) for the whole filesystem. - */ -STATIC int -xfs_sync_attr( - struct xfs_mount *mp, - int flags) -{ - ASSERT((flags & ~SYNC_WAIT) == 0); - - return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags); -} - STATIC int xfs_sync_fsdata( struct xfs_mount *mp) @@ -350,7 +297,7 @@ xfs_sync_fsdata( * First stage of freeze - no writers will make progress now we are here, * so we flush delwri and delalloc buffers here, then wait for all I/O to * complete. Data is frozen at that point. Metadata is not frozen, - * transactions can still occur here so don't bother flushing the buftarg + * transactions can still occur here so don't bother emptying the AIL * because it'll just get dirty again. */ int @@ -365,47 +312,13 @@ xfs_quiesce_data( /* write superblock and hoover up shutdown errors */ error = xfs_sync_fsdata(mp); - /* make sure all delwri buffers are written out */ - xfs_flush_buftarg(mp->m_ddev_targp, 1); - /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) error2 = xfs_fs_log_dummy(mp); - /* flush data-only devices */ - if (mp->m_rtdev_targp) - xfs_flush_buftarg(mp->m_rtdev_targp, 1); - return error ? error : error2; } -STATIC void -xfs_quiesce_fs( - struct xfs_mount *mp) -{ - int count = 0, pincount; - - xfs_reclaim_inodes(mp, 0); - xfs_flush_buftarg(mp->m_ddev_targp, 0); - - /* - * This loop must run at least twice. The first instance of the loop - * will flush most meta data but that will generate more meta data - * (typically directory updates). Which then must be flushed and - * logged before we can write the unmount record. We also so sync - * reclaim of inodes to catch any that the above delwri flush skipped. - */ - do { - xfs_reclaim_inodes(mp, SYNC_WAIT); - xfs_sync_attr(mp, SYNC_WAIT); - pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (!pincount) { - delay(50); - count++; - } - } while (count < 2); -} - /* * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to @@ -421,8 +334,12 @@ xfs_quiesce_attr( while (atomic_read(&mp->m_active_trans) > 0) delay(100); - /* flush inodes and push all remaining buffers out to disk */ - xfs_quiesce_fs(mp); + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* flush all pending changes from the AIL */ + xfs_ail_push_all_sync(mp->m_ail); /* * Just warn here till VFS can correctly support @@ -436,7 +353,12 @@ xfs_quiesce_attr( xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); + + /* + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. + */ + xfs_ail_push_all_sync(mp->m_ail); } static void @@ -460,16 +382,27 @@ xfs_sync_worker( struct xfs_mount, m_sync_work); int error; - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - /* dgc: errors ignored here */ - if (mp->m_super->s_frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); - - /* start pushing all the metadata that is currently dirty */ - xfs_ail_push_all(mp->m_ail); + /* + * We shouldn't write/force the log if we are in the mount/unmount + * process or on a read only filesystem. The workqueue still needs to be + * active in both cases, however, because it is used for inode reclaim + * during these times. Use the s_umount semaphore to provide exclusion + * with unmount. + */ + if (down_read_trylock(&mp->m_super->s_umount)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + /* dgc: errors ignored here */ + if (mp->m_super->s_frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently + * dirty */ + xfs_ail_push_all(mp->m_ail); + } + up_read(&mp->m_super->s_umount); } /* queue us up again */ @@ -488,14 +421,6 @@ xfs_syncd_queue_reclaim( struct xfs_mount *mp) { - /* - * We can have inodes enter reclaim after we've shut down the syncd - * workqueue during unmount, so don't allow reclaim work to be queued - * during unmount. - */ - if (!(mp->m_super->s_flags & MS_ACTIVE)) - return; - rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, @@ -564,7 +489,6 @@ xfs_syncd_init( INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); xfs_syncd_queue_sync(mp); - xfs_syncd_queue_reclaim(mp); return 0; } @@ -702,11 +626,8 @@ xfs_reclaim_inode_grab( } /* - * Inodes in different states need to be treated differently, and the return - * value of xfs_iflush is not sufficient to get this right. The following table - * lists the inode states and the reclaim actions necessary for non-blocking - * reclaim: - * + * Inodes in different states need to be treated differently. The following + * table lists the inode states and the reclaim actions necessary: * * inode state iflush ret required action * --------------- ---------- --------------- @@ -716,39 +637,31 @@ xfs_reclaim_inode_grab( * stale, unpinned 0 reclaim * clean, pinned(*) 0 requeue * stale, pinned EAGAIN requeue - * dirty, delwri ok 0 requeue - * dirty, delwri blocked EAGAIN requeue - * dirty, sync flush 0 reclaim + * dirty, async - requeue + * dirty, sync 0 reclaim * * (*) dgc: I don't think the clean, pinned state is possible but it gets * handled anyway given the order of checks implemented. * - * As can be seen from the table, the return value of xfs_iflush() is not - * sufficient to correctly decide the reclaim action here. The checks in - * xfs_iflush() might look like duplicates, but they are not. - * * Also, because we get the flush lock first, we know that any inode that has * been flushed delwri has had the flush completed by the time we check that - * the inode is clean. The clean inode check needs to be done before flushing - * the inode delwri otherwise we would loop forever requeuing clean inodes as - * we cannot tell apart a successful delwri flush and a clean inode from the - * return value of xfs_iflush(). + * the inode is clean. * - * Note that because the inode is flushed delayed write by background - * writeback, the flush lock may already be held here and waiting on it can - * result in very long latencies. Hence for sync reclaims, where we wait on the - * flush lock, the caller should push out delayed write inodes first before - * trying to reclaim them to minimise the amount of time spent waiting. For - * background relaim, we just requeue the inode for the next pass. + * Note that because the inode is flushed delayed write by AIL pushing, the + * flush lock may already be held here and waiting on it can result in very + * long latencies. Hence for sync reclaims, where we wait on the flush lock, + * the caller should push the AIL first before trying to reclaim inodes to + * minimise the amount of time spent waiting. For background relaim, we only + * bother to reclaim clean inodes anyway. * * Hence the order of actions after gaining the locks should be: * bad => reclaim * shutdown => unpin and reclaim - * pinned, delwri => requeue + * pinned, async => requeue * pinned, sync => unpin * stale => reclaim * clean => reclaim - * dirty, delwri => flush and requeue + * dirty, async => requeue * dirty, sync => flush, wait and reclaim */ STATIC int @@ -757,7 +670,8 @@ xfs_reclaim_inode( struct xfs_perag *pag, int sync_mode) { - int error; + struct xfs_buf *bp = NULL; + int error; restart: error = 0; @@ -765,17 +679,6 @@ restart: if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; - - /* - * If we only have a single dirty inode in a cluster there is - * a fair chance that the AIL push may have pushed it into - * the buffer, but xfsbufd won't touch it until 30 seconds - * from now, and thus we will lock up here. - * - * Promote the inode buffer to the front of the delwri list - * and wake up xfsbufd now. - */ - xfs_promote_inode(ip); xfs_iflock(ip); } @@ -783,13 +686,12 @@ restart: goto reclaim; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); + xfs_iflush_abort(ip, false); goto reclaim; } if (xfs_ipincount(ip)) { - if (!(sync_mode & SYNC_WAIT)) { - xfs_ifunlock(ip); - goto out; - } + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; xfs_iunpin_wait(ip); } if (xfs_iflags_test(ip, XFS_ISTALE)) @@ -798,60 +700,42 @@ restart: goto reclaim; /* + * Never flush out dirty data during non-blocking reclaim, as it would + * just contend with AIL pushing trying to do the same job. + */ + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + + /* * Now we have an inode that needs flushing. * - * We do a nonblocking flush here even if we are doing a SYNC_WAIT - * reclaim as we can deadlock with inode cluster removal. + * Note that xfs_iflush will never block on the inode buffer lock, as * xfs_ifree_cluster() can lock the inode buffer before it locks the - * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_itobp() to get the cluster buffer will result + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_itobp() to get the cluster buffer would result * in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluser() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want * to do IO here - we want the inode marked stale so we can simply - * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, - * just unlock the inode, back off and try again. Hopefully the next - * pass through will see the stale flag set on the inode. + * reclaim it. Hence if we get an EAGAIN error here, just unlock the + * inode, back off and try again. Hopefully the next pass through will + * see the stale flag set on the inode. */ - error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); - if (sync_mode & SYNC_WAIT) { - if (error == EAGAIN) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* backoff longer than in xfs_ifree_cluster */ - delay(2); - goto restart; - } - xfs_iflock(ip); - goto reclaim; + error = xfs_iflush(ip, &bp); + if (error == EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; } - /* - * When we have to flush an inode but don't have SYNC_WAIT set, we - * flush the inode out using a delwri buffer and wait for the next - * call into reclaim to find it in a clean state instead of waiting for - * it now. We also don't return errors here - if the error is transient - * then the next reclaim pass will flush the inode, and if the error - * is permanent then the next sync reclaim will reclaim the inode and - * pass on the error. - */ - if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_warn(ip->i_mount, - "inode 0x%llx background reclaim flush failed with %d", - (long long)ip->i_ino, error); + if (!error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); } -out: - xfs_iflags_clear(ip, XFS_IRECLAIM); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * We could return EAGAIN here to make reclaim rescan the inode tree in - * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and xfssyncd never goes back to the idle - * state. Instead, return 0 to let the next scheduled background reclaim - * attempt to reclaim the inode again. - */ - return 0; + xfs_iflock(ip); reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -884,8 +768,21 @@ reclaim: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_inode_free(ip); - return error; + +out_ifunlock: + xfs_ifunlock(ip); +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and xfssyncd never goes back to the idle + * state. Instead, return 0 to let the next scheduled background reclaim + * attempt to reclaim the inode again. + */ + return 0; } /* diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 9010ce885e6a..624bedd81357 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 06838c42b2a0..7cf9d3529e51 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -281,7 +281,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) - __field(size_t, buffer_length) + __field(int, nblks) __field(int, hold) __field(int, pincount) __field(unsigned, lockval) @@ -291,18 +291,18 @@ DECLARE_EVENT_CLASS(xfs_buf_class, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->nblks = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " "lock %d flags %s caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, - __entry->buffer_length, + __entry->nblks, __entry->hold, __entry->pincount, __entry->lockval, @@ -328,7 +328,7 @@ DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); -DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); +DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); @@ -362,7 +362,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->buffer_length = BBTOB(bp->b_length); __entry->flags = flags; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); @@ -406,7 +406,7 @@ TRACE_EVENT(xfs_buf_ioerror, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->buffer_length = BBTOB(bp->b_length); __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; @@ -450,7 +450,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->bli_recur = bip->bli_recur; __entry->bli_refcount = atomic_read(&bip->bli_refcount); __entry->buf_bno = bip->bli_buf->b_bn; - __entry->buf_len = bip->bli_buf->b_buffer_length; + __entry->buf_len = BBTOB(bip->bli_buf->b_length); __entry->buf_flags = bip->bli_buf->b_flags; __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); @@ -486,12 +486,10 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); @@ -876,15 +874,30 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) ) +TRACE_EVENT(xfs_log_force, + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), + TP_ARGS(mp, lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->lsn = lsn; + ), + TP_printk("dev %d:%d lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lsn) +) + #define DEFINE_LOG_ITEM_EVENT(name) \ DEFINE_EVENT(xfs_log_item_class, name, \ TP_PROTO(struct xfs_log_item *lip), \ TP_ARGS(lip)) DEFINE_LOG_ITEM_EVENT(xfs_ail_push); -DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf); -DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); +DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); DECLARE_EVENT_CLASS(xfs_file_class, @@ -1145,7 +1158,7 @@ TRACE_EVENT(xfs_bunmap, ); -DECLARE_EVENT_CLASS(xfs_busy_class, +DECLARE_EVENT_CLASS(xfs_extent_busy_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len), TP_ARGS(mp, agno, agbno, len), @@ -1168,17 +1181,17 @@ DECLARE_EVENT_CLASS(xfs_busy_class, __entry->len) ); #define DEFINE_BUSY_EVENT(name) \ -DEFINE_EVENT(xfs_busy_class, name, \ +DEFINE_EVENT(xfs_extent_busy_class, name, \ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ xfs_agblock_t agbno, xfs_extlen_t len), \ TP_ARGS(mp, agno, agbno, len)) -DEFINE_BUSY_EVENT(xfs_alloc_busy); -DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem); -DEFINE_BUSY_EVENT(xfs_alloc_busy_force); -DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse); -DEFINE_BUSY_EVENT(xfs_alloc_busy_clear); +DEFINE_BUSY_EVENT(xfs_extent_busy); +DEFINE_BUSY_EVENT(xfs_extent_busy_enomem); +DEFINE_BUSY_EVENT(xfs_extent_busy_force); +DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); +DEFINE_BUSY_EVENT(xfs_extent_busy_clear); -TRACE_EVENT(xfs_alloc_busy_trim, +TRACE_EVENT(xfs_extent_busy_trim, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, xfs_agblock_t tbno, xfs_extlen_t tlen), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 103b00c90004..cdf896fcbfa4 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -19,9 +19,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -36,6 +34,7 @@ #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_bmap.h" #include "xfs_quota.h" #include "xfs_trans_priv.h" @@ -608,8 +607,8 @@ STATIC void xfs_trans_free( struct xfs_trans *tp) { - xfs_alloc_busy_sort(&tp->t_busy); - xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); + xfs_extent_busy_sort(&tp->t_busy); + xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); xfs_trans_free_dqinfo(tp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f6118703f20d..7ab99e1898c8 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -345,11 +345,9 @@ struct xfs_item_ops { void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); void (*iop_unpin)(xfs_log_item_t *, int remove); - uint (*iop_trylock)(xfs_log_item_t *); + uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); - void (*iop_push)(xfs_log_item_t *); - bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); }; @@ -357,20 +355,18 @@ struct xfs_item_ops { #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) #define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) #define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) -#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) +#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list) #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) #define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) -#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) -#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) #define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) /* - * Return values for the IOP_TRYLOCK() routines. + * Return values for the IOP_PUSH() routines. */ -#define XFS_ITEM_SUCCESS 0 -#define XFS_ITEM_PINNED 1 -#define XFS_ITEM_LOCKED 2 -#define XFS_ITEM_PUSHBUF 3 +#define XFS_ITEM_SUCCESS 0 +#define XFS_ITEM_PINNED 1 +#define XFS_ITEM_LOCKED 2 +#define XFS_ITEM_FLUSHING 3 /* * This is the type of function which can be given to xfs_trans_callback() diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 1dead07f092c..9c514483e599 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -20,7 +20,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -79,7 +78,7 @@ xfs_ail_check( * Return a pointer to the first item in the AIL. If the AIL is empty, then * return NULL. */ -static xfs_log_item_t * +xfs_log_item_t * xfs_ail_min( struct xfs_ail *ailp) { @@ -364,30 +363,31 @@ xfsaild_push( xfs_log_item_t *lip; xfs_lsn_t lsn; xfs_lsn_t target; - long tout = 10; + long tout; int stuck = 0; + int flushing = 0; int count = 0; - int push_xfsbufd = 0; /* - * If last time we ran we encountered pinned items, force the log first - * and wait for it before pushing again. + * If we encountered pinned items or did not finish writing out all + * buffers the last time we ran, force the log first and wait for it + * before pushing again. */ - spin_lock(&ailp->xa_lock); - if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && - !list_empty(&ailp->xa_ail)) { + if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 && + (!list_empty_careful(&ailp->xa_buf_list) || + xfs_ail_min_lsn(ailp))) { ailp->xa_log_flush = 0; - spin_unlock(&ailp->xa_lock); + XFS_STATS_INC(xs_push_ail_flush); xfs_log_force(mp, XFS_LOG_SYNC); - spin_lock(&ailp->xa_lock); } - target = ailp->xa_target; + spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); - if (!lip || XFS_FORCED_SHUTDOWN(mp)) { + if (!lip) { /* - * AIL is empty or our push has reached the end. + * If the AIL is empty or our push has reached the end we are + * done now. */ xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); @@ -396,54 +396,42 @@ xfsaild_push( XFS_STATS_INC(xs_push_ail); - /* - * While the item we are looking at is below the given threshold - * try to flush it out. We'd like not to stop until we've at least - * tried to push on everything in the AIL with an LSN less than - * the given threshold. - * - * However, we will stop after a certain number of pushes and wait - * for a reduced timeout to fire before pushing further. This - * prevents use from spinning when we can't do anything or there is - * lots of contention on the AIL lists. - */ lsn = lip->li_lsn; + target = ailp->xa_target; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; + /* - * If we can lock the item without sleeping, unlock the AIL - * lock and flush the item. Then re-grab the AIL lock so we - * can look for the next item on the AIL. List changes are - * handled by the AIL lookup functions internally - * - * If we can't lock the item, either its holder will flush it - * or it is already being flushed or it is being relogged. In - * any of these case it is being taken care of and we can just - * skip to the next item in the list. + * Note that IOP_PUSH may unlock and reacquire the AIL lock. We + * rely on the AIL cursor implementation to be able to deal with + * the dropped lock. */ - lock_result = IOP_TRYLOCK(lip); - spin_unlock(&ailp->xa_lock); + lock_result = IOP_PUSH(lip, &ailp->xa_buf_list); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); trace_xfs_ail_push(lip); - IOP_PUSH(lip); ailp->xa_last_pushed_lsn = lsn; break; - case XFS_ITEM_PUSHBUF: - XFS_STATS_INC(xs_push_ail_pushbuf); - trace_xfs_ail_pushbuf(lip); - - if (!IOP_PUSHBUF(lip)) { - trace_xfs_ail_pushbuf_pinned(lip); - stuck++; - ailp->xa_log_flush++; - } else { - ailp->xa_last_pushed_lsn = lsn; - } - push_xfsbufd = 1; + case XFS_ITEM_FLUSHING: + /* + * The item or its backing buffer is already beeing + * flushed. The typical reason for that is that an + * inode buffer is locked because we already pushed the + * updates to it as part of inode clustering. + * + * We do not want to to stop flushing just because lots + * of items are already beeing flushed, but we need to + * re-try the flushing relatively soon if most of the + * AIL is beeing flushed. + */ + XFS_STATS_INC(xs_push_ail_flushing); + trace_xfs_ail_flushing(lip); + + flushing++; + ailp->xa_last_pushed_lsn = lsn; break; case XFS_ITEM_PINNED: @@ -453,28 +441,22 @@ xfsaild_push( stuck++; ailp->xa_log_flush++; break; - case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); trace_xfs_ail_locked(lip); + stuck++; break; - default: ASSERT(0); break; } - spin_lock(&ailp->xa_lock); - /* should we bother continuing? */ - if (XFS_FORCED_SHUTDOWN(mp)) - break; - ASSERT(mp->m_log); - count++; /* * Are there too many items we can't do anything with? + * * If we we are skipping too many items because we can't flush * them or they are already being flushed, we back off and * given them time to complete whatever operation is being @@ -496,42 +478,36 @@ xfsaild_push( xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); - if (push_xfsbufd) { - /* we've got delayed write buffers to flush */ - wake_up_process(mp->m_ddev_targp->bt_task); - } + if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) + ailp->xa_log_flush++; - /* assume we have more work to do in a short while */ + if (!count || XFS_LSN_CMP(lsn, target) >= 0) { out_done: - if (!count) { - /* We're past our target or empty, so idle */ - ailp->xa_last_pushed_lsn = 0; - ailp->xa_log_flush = 0; - - tout = 50; - } else if (XFS_LSN_CMP(lsn, target) >= 0) { /* - * We reached the target so wait a bit longer for I/O to - * complete and remove pushed items from the AIL before we - * start the next scan from the start of the AIL. + * We reached the target or the AIL is empty, so wait a bit + * longer for I/O to complete and remove pushed items from the + * AIL before we start the next scan from the start of the AIL. */ tout = 50; ailp->xa_last_pushed_lsn = 0; - } else if ((stuck * 100) / count > 90) { + } else if (((stuck + flushing) * 100) / count > 90) { /* - * Either there is a lot of contention on the AIL or we - * are stuck due to operations in progress. "Stuck" in this - * case is defined as >90% of the items we tried to push - * were stuck. + * Either there is a lot of contention on the AIL or we are + * stuck due to operations in progress. "Stuck" in this case + * is defined as >90% of the items we tried to push were stuck. * * Backoff a bit more to allow some I/O to complete before - * restarting from the start of the AIL. This prevents us - * from spinning on the same items, and if they are pinned will - * all the restart to issue a log force to unpin the stuck - * items. + * restarting from the start of the AIL. This prevents us from + * spinning on the same items, and if they are pinned will all + * the restart to issue a log force to unpin the stuck items. */ tout = 20; ailp->xa_last_pushed_lsn = 0; + } else { + /* + * Assume we have more work to do in a short while. + */ + tout = 10; } return tout; @@ -544,6 +520,8 @@ xfsaild( struct xfs_ail *ailp = data; long tout = 0; /* milliseconds */ + current->flags |= PF_MEMALLOC; + while (!kthread_should_stop()) { if (tout && tout <= 20) __set_current_state(TASK_KILLABLE); @@ -611,6 +589,30 @@ xfs_ail_push_all( } /* + * Push out all items in the AIL immediately and wait until the AIL is empty. + */ +void +xfs_ail_push_all_sync( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip; + DEFINE_WAIT(wait); + + spin_lock(&ailp->xa_lock); + while ((lip = xfs_ail_max(ailp)) != NULL) { + prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE); + ailp->xa_target = lip->li_lsn; + wake_up_process(ailp->xa_task); + spin_unlock(&ailp->xa_lock); + schedule(); + spin_lock(&ailp->xa_lock); + } + spin_unlock(&ailp->xa_lock); + + finish_wait(&ailp->xa_empty, &wait); +} + +/* * xfs_trans_ail_update - bulk AIL insertion operation. * * @xfs_trans_ail_update takes an array of log items that all need to be @@ -667,11 +669,15 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); - spin_unlock(&ailp->xa_lock); - if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { - xlog_assign_tail_lsn(ailp->xa_mount); + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + spin_unlock(&ailp->xa_lock); + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); } } @@ -700,7 +706,8 @@ void xfs_trans_ail_delete_bulk( struct xfs_ail *ailp, struct xfs_log_item **log_items, - int nr_items) __releases(ailp->xa_lock) + int nr_items, + int shutdown_type) __releases(ailp->xa_lock) { xfs_log_item_t *mlip; int mlip_changed = 0; @@ -718,7 +725,7 @@ xfs_trans_ail_delete_bulk( xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_force_shutdown(mp, shutdown_type); } return; } @@ -729,28 +736,20 @@ xfs_trans_ail_delete_bulk( if (mlip == lip) mlip_changed = 1; } - spin_unlock(&ailp->xa_lock); - if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { - xlog_assign_tail_lsn(ailp->xa_mount); + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + spin_unlock(&ailp->xa_lock); + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); } } -/* - * The active item list (AIL) is a doubly linked list of log - * items sorted by ascending lsn. The base of the list is - * a forw/back pointer pair embedded in the xfs mount structure. - * The base is initialized with both pointers pointing to the - * base. This case always needs to be distinguished, because - * the base has no lsn to look at. We almost always insert - * at the end of the list, so on inserts we search from the - * end of the list to find where the new item belongs. - */ - -/* - * Initialize the doubly linked list to point only to itself. - */ int xfs_trans_ail_init( xfs_mount_t *mp) @@ -765,6 +764,8 @@ xfs_trans_ail_init( INIT_LIST_HEAD(&ailp->xa_ail); INIT_LIST_HEAD(&ailp->xa_cursors); spin_lock_init(&ailp->xa_lock); + INIT_LIST_HEAD(&ailp->xa_buf_list); + init_waitqueue_head(&ailp->xa_empty); ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", ailp->xa_mount->m_fsname); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 1302d1d95a58..21c5a5e3700d 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -33,7 +31,6 @@ #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_trace.h" /* @@ -56,7 +53,7 @@ xfs_trans_buf_item_match( if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && XFS_BUF_ADDR(blip->bli_buf) == blkno && - XFS_BUF_COUNT(blip->bli_buf) == len) + BBTOB(blip->bli_buf->b_length) == len) return blip->bli_buf; } @@ -141,15 +138,11 @@ xfs_trans_get_buf(xfs_trans_t *tp, xfs_buf_t *bp; xfs_buf_log_item_t *bip; - if (flags == 0) - flags = XBF_LOCK | XBF_MAPPED; - /* * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) - return xfs_buf_get(target_dev, blkno, len, - flags | XBF_DONT_BLOCK); + return xfs_buf_get(target_dev, blkno, len, flags); /* * If we find the buffer in the cache with this transaction @@ -165,14 +158,6 @@ xfs_trans_get_buf(xfs_trans_t *tp, XFS_BUF_DONE(bp); } - /* - * If the buffer is stale then it was binval'ed - * since last read. This doesn't matter since the - * caller isn't allowed to use the data anyway. - */ - else if (XFS_BUF_ISSTALE(bp)) - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - ASSERT(bp->b_transp == tp); bip = bp->b_fspriv; ASSERT(bip != NULL); @@ -182,15 +167,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, return (bp); } - /* - * We always specify the XBF_DONT_BLOCK flag within a transaction - * so that get_buf does not try to push out a delayed write buffer - * which might cause another transaction to take place (if the - * buffer was delayed alloc). Such recursive transactions can - * easily deadlock with our current transaction as well as cause - * us to run out of stack space. - */ - bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK); + bp = xfs_buf_get(target_dev, blkno, len, flags); if (bp == NULL) { return NULL; } @@ -282,14 +259,13 @@ xfs_trans_read_buf( xfs_buf_log_item_t *bip; int error; - if (flags == 0) - flags = XBF_LOCK | XBF_MAPPED; + *bpp = NULL; /* * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) { - bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); + bp = xfs_buf_read(target, blkno, len, flags); if (!bp) return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -297,6 +273,8 @@ xfs_trans_read_buf( if (bp->b_error) { error = bp->b_error; xfs_buf_ioerror_alert(bp, __func__); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); return error; } @@ -371,15 +349,7 @@ xfs_trans_read_buf( return 0; } - /* - * We always specify the XBF_DONT_BLOCK flag within a transaction - * so that get_buf does not try to push out a delayed write buffer - * which might cause another transaction to take place (if the - * buffer was delayed alloc). Such recursive transactions can - * easily deadlock with our current transaction as well as cause - * us to run out of stack space. - */ - bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); + bp = xfs_buf_read(target, blkno, len, flags); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? @@ -418,19 +388,6 @@ xfs_trans_read_buf( return 0; shutdown_abort: - /* - * the theory here is that buffer is good but we're - * bailing out because the filesystem is being forcibly - * shut down. So we should leave the b_flags alone since - * the buffer's not staled and just get out. - */ -#if defined(DEBUG) - if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) - xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); -#endif - ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) != - (XBF_STALE|XBF_DELWRI)); - trace_xfs_trans_read_buf_shut(bp, _RET_IP_); xfs_buf_relse(bp); *bpp = NULL; @@ -606,7 +563,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); + ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -626,8 +583,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; - xfs_buf_delwri_queue(bp); - trace_xfs_trans_log_buf(bip); /* @@ -651,22 +606,33 @@ xfs_trans_log_buf(xfs_trans_t *tp, /* - * This called to invalidate a buffer that is being used within - * a transaction. Typically this is because the blocks in the - * buffer are being freed, so we need to prevent it from being - * written out when we're done. Allowing it to be written again - * might overwrite data in the free blocks if they are reallocated - * to a file. + * Invalidate a buffer that is being used within a transaction. + * + * Typically this is because the blocks in the buffer are being freed, so we + * need to prevent it from being written out when we're done. Allowing it + * to be written again might overwrite data in the free blocks if they are + * reallocated to a file. + * + * We prevent the buffer from being written out by marking it stale. We can't + * get rid of the buf log item at this point because the buffer may still be + * pinned by another transaction. If that is the case, then we'll wait until + * the buffer is committed to disk for the last time (we can tell by the ref + * count) and free it in xfs_buf_item_unpin(). Until that happens we will + * keep the buffer locked so that the buffer and buf log item are not reused. + * + * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log + * the buf item. This will be used at recovery time to determine that copies + * of the buffer in the log before this should not be replayed. * - * We prevent the buffer from being written out by clearing the - * B_DELWRI flag. We can't always - * get rid of the buf log item at this point, though, because - * the buffer may still be pinned by another transaction. If that - * is the case, then we'll wait until the buffer is committed to - * disk for the last time (we can tell by the ref count) and - * free it in xfs_buf_item_unpin(). Until it is cleaned up we - * will keep the buffer locked so that the buffer and buf log item - * are not reused. + * We mark the item descriptor and the transaction dirty so that we'll hold + * the buffer until after the commit. + * + * Since we're invalidating the buffer, we also clear the state about which + * parts of the buffer have been logged. We also clear the flag indicating + * that this is an inode buffer since the data in the buffer will no longer + * be valid. + * + * We set the stale bit in the buffer as well since we're getting rid of it. */ void xfs_trans_binval( @@ -686,7 +652,6 @@ xfs_trans_binval( * If the buffer is already invalidated, then * just return. */ - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); @@ -696,27 +661,8 @@ xfs_trans_binval( return; } - /* - * Clear the dirty bit in the buffer and set the STALE flag - * in the buf log item. The STALE flag will be used in - * xfs_buf_item_unpin() to determine if it should clean up - * when the last reference to the buf item is given up. - * We set the XFS_BLF_CANCEL flag in the buf log format structure - * and log the buf item. This will be used at recovery time - * to determine that copies of the buffer in the log before - * this should not be replayed. - * We mark the item descriptor and the transaction dirty so - * that we'll hold the buffer until after the commit. - * - * Since we're invalidating the buffer, we also clear the state - * about which parts of the buffer have been logged. We also - * clear the flag indicating that this is an inode buffer since - * the data in the buffer will no longer be valid. - * - * We set the stale bit in the buffer as well since we're getting - * rid of it. - */ xfs_buf_stale(bp); + bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 279099717ed2..bcb60542fcf1 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index f7590f5badea..8d71b16eccae 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 7a7442c03f2b..d2eee20d5f5b 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 8ab2ced415f1..fb62377d1cbc 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -71,6 +71,8 @@ struct xfs_ail { spinlock_t xa_lock; xfs_lsn_t xa_last_pushed_lsn; int xa_log_flush; + struct list_head xa_buf_list; + wait_queue_head_t xa_empty; }; /* @@ -90,18 +92,22 @@ xfs_trans_ail_update( } void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, - struct xfs_log_item **log_items, int nr_items) + struct xfs_log_item **log_items, int nr_items, + int shutdown_type) __releases(ailp->xa_lock); static inline void xfs_trans_ail_delete( struct xfs_ail *ailp, - xfs_log_item_t *lip) __releases(ailp->xa_lock) + xfs_log_item_t *lip, + int shutdown_type) __releases(ailp->xa_lock) { - xfs_trans_ail_delete_bulk(ailp, &lip, 1); + xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); } void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_ail_push_all(struct xfs_ail *); +void xfs_ail_push_all_sync(struct xfs_ail *); +struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 65584b55607d..398cf681d025 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -57,6 +57,7 @@ typedef __uint64_t __psunsigned_t; #endif /* __KERNEL__ */ typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ typedef __uint32_t xfs_agnumber_t; /* allocation group number */ typedef __int32_t xfs_extnum_t; /* # of extents in a file */ @@ -101,6 +102,7 @@ typedef __uint64_t xfs_fileoff_t; /* block number in a file */ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ + /* * Null values for the types. */ @@ -120,6 +122,9 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ #define NULLCOMMITLSN ((xfs_lsn_t)-1) +#define NULLFSINO ((xfs_ino_t)-1) +#define NULLAGINO ((xfs_agino_t)-1) + /* * Max values for extlen, extnum, aextnum. */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 79c05ac85bfe..4e5b9ad5cb97 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 64981d7e7375..b6a82d817a82 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -21,7 +21,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -39,7 +38,6 @@ #include "xfs_bmap.h" #include "xfs_acl.h" #include "xfs_attr.h" -#include "xfs_rw.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_utils.h" @@ -81,8 +79,7 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), - XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; @@ -1919,7 +1916,7 @@ xfs_alloc_file_space( error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_bmap_cancel(&free_list); - xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); @@ -1966,7 +1963,7 @@ xfs_zero_remaining_bytes( bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, - mp->m_sb.sb_blocksize, XBF_DONT_BLOCK); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) return XFS_ERROR(ENOMEM); @@ -2315,17 +2312,33 @@ xfs_change_file_space( case XFS_IOC_ALLOCSP64: case XFS_IOC_FREESP: case XFS_IOC_FREESP64: + /* + * These operations actually do IO when extending the file, but + * the allocation is done seperately to the zeroing that is + * done. This set of operations need to be serialised against + * other IO operations, such as truncate and buffered IO. We + * need to take the IOLOCK here to serialise the allocation and + * zeroing IO to prevent other IOLOCK holders (e.g. getbmap, + * truncate, direct IO) from racing against the transient + * allocated but not written state we can have here. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); if (startoffset > fsize) { error = xfs_alloc_file_space(ip, fsize, - startoffset - fsize, 0, attr_flags); - if (error) + startoffset - fsize, 0, + attr_flags | XFS_ATTR_NOLOCK); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); break; + } } iattr.ia_valid = ATTR_SIZE; iattr.ia_size = startoffset; - error = xfs_setattr_size(ip, &iattr, attr_flags); + error = xfs_setattr_size(ip, &iattr, + attr_flags | XFS_ATTR_NOLOCK); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); if (error) return error; |