diff options
Diffstat (limited to 'fs')
318 files changed, 7118 insertions, 5028 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 7bf835f85bc8..eadc894faea2 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -449,14 +449,14 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); retval = p9_client_wstat(fid, &wstat); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } @@ -472,13 +472,13 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; retval = p9_client_fsync(fid, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/affs/file.c b/fs/affs/file.c index 659c579c4588..22fc7c802d69 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -33,11 +33,11 @@ affs_file_release(struct inode *inode, struct file *filp) inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (inode->i_size != AFFS_I(inode)->mmu_private) affs_truncate(inode); affs_free_prealloc(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -511,8 +511,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to) pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, page->index, to); BUG_ON(to > PAGE_CACHE_SIZE); - kmap(page); - data = page_address(page); bsize = AFFS_SB(sb)->s_data_blksize; tmp = page->index << PAGE_CACHE_SHIFT; bidx = tmp / bsize; @@ -524,14 +522,15 @@ affs_do_readpage_ofs(struct page *page, unsigned to) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); BUG_ON(pos + tmp > to || tmp > bsize); + data = kmap_atomic(page); memcpy(data + pos, AFFS_DATA(bh) + boff, tmp); + kunmap_atomic(data); affs_brelse(bh); bidx++; pos += tmp; boff = 0; } flush_dcache_page(page); - kunmap(page); return 0; } @@ -958,12 +957,12 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = write_inode_now(inode, 0); err = sync_blockdev(inode->i_sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } const struct file_operations affs_file_operations = { diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 4baf1d2b39e4..d91a9c9cfbd0 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -483,7 +483,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) fl->fl_type = F_UNLCK; - mutex_lock(&vnode->vfs_inode.i_mutex); + inode_lock(&vnode->vfs_inode); /* check local lock records first */ ret = 0; @@ -505,7 +505,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) } error: - mutex_unlock(&vnode->vfs_inode.i_mutex); + inode_unlock(&vnode->vfs_inode); _leave(" = %d [%hd]", ret, fl->fl_type); return ret; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 0714abcd7f32..dfef94f70667 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -693,7 +693,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* use a writeback record as a marker in the queue - when this reaches * the front of the queue, all the outstanding writes are either @@ -735,7 +735,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) afs_put_writeback(wb); _leave(" = %d", ret); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/attr.c b/fs/attr.c index 6530ced19697..25b24d0f6c88 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -195,7 +195,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de struct timespec now; unsigned int ia_valid = attr->ia_valid; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index c37149b929be..f0d268b97d19 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -1,15 +1,11 @@ -/* -*- c -*- ------------------------------------------------------------- * - * - * linux/fs/autofs/autofs_i.h - * - * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved - * Copyright 2005-2006 Ian Kent <raven@themaw.net> +/* + * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved + * Copyright 2005-2006 Ian Kent <raven@themaw.net> * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ + */ /* Internal header file for autofs */ @@ -35,28 +31,23 @@ #include <linux/mount.h> #include <linux/namei.h> #include <asm/current.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* #define DEBUG */ -#define DPRINTK(fmt, ...) \ - pr_debug("pid %d: %s: " fmt "\n", \ - current->pid, __func__, ##__VA_ARGS__) - -#define AUTOFS_WARN(fmt, ...) \ - printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ - current->pid, __func__, ##__VA_ARGS__) - -#define AUTOFS_ERROR(fmt, ...) \ - printk(KERN_ERR "pid %d: %s: " fmt "\n", \ - current->pid, __func__, ##__VA_ARGS__) - -/* Unified info structure. This is pointed to by both the dentry and - inode structures. Each file in the filesystem has an instance of this - structure. It holds a reference to the dentry, so dentries are never - flushed while the file exists. All name lookups are dealt with at the - dentry level, although the filesystem can interfere in the validation - process. Readdir is implemented by traversing the dentry lists. */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__ + +/* + * Unified info structure. This is pointed to by both the dentry and + * inode structures. Each file in the filesystem has an instance of this + * structure. It holds a reference to the dentry, so dentries are never + * flushed while the file exists. All name lookups are dealt with at the + * dentry level, although the filesystem can interfere in the validation + * process. Readdir is implemented by traversing the dentry lists. + */ struct autofs_info { struct dentry *dentry; struct inode *inode; @@ -78,7 +69,7 @@ struct autofs_info { kgid_t gid; }; -#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ +#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */ #define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered * for expiry, so RCU_walk is * not permitted @@ -140,10 +131,11 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) } /* autofs4_oz_mode(): do we see the man behind the curtain? (The - processes which do manipulations for us in user space sees the raw - filesystem without "magic".) */ - -static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { + * processes which do manipulations for us in user space sees the raw + * filesystem without "magic".) + */ +static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) +{ return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } @@ -154,12 +146,12 @@ void autofs4_free_ino(struct autofs_info *); int is_autofs4_dentry(struct dentry *); int autofs4_expire_wait(struct dentry *dentry, int rcu_walk); int autofs4_expire_run(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, - struct autofs_packet_expire __user *); + struct autofs_sb_info *, + struct autofs_packet_expire __user *); int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int when); int autofs4_expire_multi(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, int __user *); + struct autofs_sb_info *, int __user *); struct dentry *autofs4_expire_direct(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int how); @@ -224,8 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe) /* Queue management functions */ -int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); -int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); +int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify); +int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); void autofs4_catatonic_mode(struct autofs_sb_info *); static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) @@ -242,37 +234,37 @@ static inline void __autofs4_add_expiring(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { if (list_empty(&ino->expiring)) list_add(&ino->expiring, &sbi->expiring_list); } - return; } static inline void autofs4_add_expiring(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { spin_lock(&sbi->lookup_lock); if (list_empty(&ino->expiring)) list_add(&ino->expiring, &sbi->expiring_list); spin_unlock(&sbi->lookup_lock); } - return; } static inline void autofs4_del_expiring(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino) { spin_lock(&sbi->lookup_lock); if (!list_empty(&ino->expiring)) list_del_init(&ino->expiring); spin_unlock(&sbi->lookup_lock); } - return; } extern void autofs4_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index ac7d921ed984..c7fcc7438843 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -72,13 +72,13 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) { int err = 0; - if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) || - (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) { - AUTOFS_WARN("ioctl control interface version mismatch: " - "kernel(%u.%u), user(%u.%u), cmd(%d)", - AUTOFS_DEV_IOCTL_VERSION_MAJOR, - AUTOFS_DEV_IOCTL_VERSION_MINOR, - param->ver_major, param->ver_minor, cmd); + if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || + (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { + pr_warn("ioctl control interface version mismatch: " + "kernel(%u.%u), user(%u.%u), cmd(%d)\n", + AUTOFS_DEV_IOCTL_VERSION_MAJOR, + AUTOFS_DEV_IOCTL_VERSION_MINOR, + param->ver_major, param->ver_minor, cmd); err = -EINVAL; } @@ -93,7 +93,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) * Copy parameter control struct, including a possible path allocated * at the end of the struct. */ -static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +static struct autofs_dev_ioctl * + copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { struct autofs_dev_ioctl tmp, *res; @@ -116,7 +117,6 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) { kfree(param); - return; } /* @@ -129,24 +129,24 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) err = check_dev_ioctl_version(cmd, param); if (err) { - AUTOFS_WARN("invalid device control module version " - "supplied for cmd(0x%08x)", cmd); + pr_warn("invalid device control module version " + "supplied for cmd(0x%08x)\n", cmd); goto out; } if (param->size > sizeof(*param)) { err = invalid_str(param->path, param->size - sizeof(*param)); if (err) { - AUTOFS_WARN( - "path string terminator missing for cmd(0x%08x)", + pr_warn( + "path string terminator missing for cmd(0x%08x)\n", cmd); goto out; } err = check_name(param->path); if (err) { - AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", - cmd); + pr_warn("invalid path supplied for cmd(0x%08x)\n", + cmd); goto out; } } @@ -197,7 +197,9 @@ static int find_autofs_mount(const char *pathname, void *data) { struct path path; - int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); + int err; + + err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); if (err) return err; err = -ENOENT; @@ -225,6 +227,7 @@ static int test_by_dev(struct path *path, void *p) static int test_by_type(struct path *path, void *p) { struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + return ino && ino->sbi->type & *(unsigned *)p; } @@ -370,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, new_pid = get_task_pid(current, PIDTYPE_PGID); if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { - AUTOFS_WARN("Not allowed to change PID namespace"); + pr_warn("not allowed to change PID namespace\n"); err = -EINVAL; goto out; } @@ -456,8 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp, err = 0; autofs4_expire_wait(path.dentry, 0); spin_lock(&sbi->fs_lock); - param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); - param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); + param->requester.uid = + from_kuid_munged(current_user_ns(), ino->uid); + param->requester.gid = + from_kgid_munged(current_user_ns(), ino->gid); spin_unlock(&sbi->fs_lock); } path_put(&path); @@ -619,7 +624,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd) } /* ioctl dispatcher */ -static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) +static int _autofs_dev_ioctl(unsigned int command, + struct autofs_dev_ioctl __user *user) { struct autofs_dev_ioctl *param; struct file *fp; @@ -655,7 +661,7 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use fn = lookup_dev_ioctl(cmd); if (!fn) { - AUTOFS_WARN("unknown command 0x%08x", command); + pr_warn("unknown command 0x%08x\n", command); return -ENOTTY; } @@ -711,6 +717,7 @@ out: static long autofs_dev_ioctl(struct file *file, uint command, ulong u) { int err; + err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); return (long) err; } @@ -733,8 +740,8 @@ static const struct file_operations _dev_ioctl_fops = { static struct miscdevice _autofs_dev_ioctl_misc = { .minor = AUTOFS_MINOR, - .name = AUTOFS_DEVICE_NAME, - .fops = &_dev_ioctl_fops + .name = AUTOFS_DEVICE_NAME, + .fops = &_dev_ioctl_fops }; MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); @@ -747,7 +754,7 @@ int __init autofs_dev_ioctl_init(void) r = misc_register(&_autofs_dev_ioctl_misc); if (r) { - AUTOFS_ERROR("misc_register failed for control device"); + pr_err("misc_register failed for control device\n"); return r; } @@ -757,6 +764,4 @@ int __init autofs_dev_ioctl_init(void) void autofs_dev_ioctl_exit(void) { misc_deregister(&_autofs_dev_ioctl_misc); - return; } - diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 1cebc3c52fa5..9510d8d2e9cd 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -1,16 +1,12 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/expire.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org> - * Copyright 2001-2006 Ian Kent <raven@themaw.net> +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org> + * Copyright 2001-2006 Ian Kent <raven@themaw.net> * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include "autofs_i.h" @@ -18,7 +14,7 @@ static unsigned long now; /* Check if a dentry can be expired */ static inline int autofs4_can_expire(struct dentry *dentry, - unsigned long timeout, int do_now) + unsigned long timeout, int do_now) { struct autofs_info *ino = autofs4_dentry_ino(dentry); @@ -41,7 +37,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) struct path path = {.mnt = mnt, .dentry = dentry}; int status = 1; - DPRINTK("dentry %p %pd", dentry, dentry); + pr_debug("dentry %p %pd\n", dentry, dentry); path_get(&path); @@ -58,14 +54,16 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) /* Update the expiry counter if fs is busy */ if (!may_umount_tree(path.mnt)) { - struct autofs_info *ino = autofs4_dentry_ino(top); + struct autofs_info *ino; + + ino = autofs4_dentry_ino(top); ino->last_used = jiffies; goto done; } status = 0; done: - DPRINTK("returning = %d", status); + pr_debug("returning = %d\n", status); path_put(&path); return status; } @@ -74,7 +72,7 @@ done: * Calculate and dget next entry in the subdirs list under root. */ static struct dentry *get_next_positive_subdir(struct dentry *prev, - struct dentry *root) + struct dentry *root) { struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); struct list_head *next; @@ -121,7 +119,7 @@ cont: * Calculate and dget next entry in top down tree traversal. */ static struct dentry *get_next_positive_dentry(struct dentry *prev, - struct dentry *root) + struct dentry *root) { struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); struct list_head *next; @@ -187,15 +185,17 @@ again: * autofs submounts. */ static int autofs4_direct_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) + struct dentry *top, + unsigned long timeout, + int do_now) { - DPRINTK("top %p %pd", top, top); + pr_debug("top %p %pd\n", top, top); /* If it's busy update the expiry counters */ if (!may_umount_tree(mnt)) { - struct autofs_info *ino = autofs4_dentry_ino(top); + struct autofs_info *ino; + + ino = autofs4_dentry_ino(top); if (ino) ino->last_used = jiffies; return 1; @@ -208,7 +208,8 @@ static int autofs4_direct_busy(struct vfsmount *mnt, return 0; } -/* Check a directory tree of mount points for busyness +/* + * Check a directory tree of mount points for busyness * The tree is not busy iff no mountpoints are busy */ static int autofs4_tree_busy(struct vfsmount *mnt, @@ -219,7 +220,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, struct autofs_info *top_ino = autofs4_dentry_ino(top); struct dentry *p; - DPRINTK("top %p %pd", top, top); + pr_debug("top %p %pd\n", top, top); /* Negative dentry - give up */ if (!simple_positive(top)) @@ -227,7 +228,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, p = NULL; while ((p = get_next_positive_dentry(p, top))) { - DPRINTK("dentry %p %pd", p, p); + pr_debug("dentry %p %pd\n", p, p); /* * Is someone visiting anywhere in the subtree ? @@ -273,11 +274,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, { struct dentry *p; - DPRINTK("parent %p %pd", parent, parent); + pr_debug("parent %p %pd\n", parent, parent); p = NULL; while ((p = get_next_positive_dentry(p, parent))) { - DPRINTK("dentry %p %pd", p, p); + pr_debug("dentry %p %pd\n", p, p); if (d_mountpoint(p)) { /* Can we umount this guy */ @@ -362,7 +363,7 @@ static struct dentry *should_expire(struct dentry *dentry, * offset (autofs-5.0+). */ if (d_mountpoint(dentry)) { - DPRINTK("checking mountpoint %p %pd", dentry, dentry); + pr_debug("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ if (autofs4_mount_busy(mnt, dentry)) @@ -375,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry, } if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { - DPRINTK("checking symlink %p %pd", dentry, dentry); + pr_debug("checking symlink %p %pd\n", dentry, dentry); /* * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. @@ -404,6 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry, } else { /* Path walk currently on this dentry? */ struct dentry *expired; + ino_count = atomic_read(&ino->count) + 1; if (d_count(dentry) > ino_count) return NULL; @@ -471,7 +473,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, return NULL; found: - DPRINTK("returning %p %pd", expired, expired); + pr_debug("returning %p %pd\n", expired, expired); ino->flags |= AUTOFS_INF_EXPIRING; smp_mb(); ino->flags &= ~AUTOFS_INF_NO_RCU; @@ -503,12 +505,12 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); - DPRINTK("waiting for expire %p name=%pd", dentry, dentry); + pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); status = autofs4_wait(sbi, dentry, NFY_NONE); wait_for_completion(&ino->expire_complete); - DPRINTK("expire done status=%d", status); + pr_debug("expire done status=%d\n", status); if (d_unhashed(dentry)) return -EAGAIN; @@ -522,21 +524,22 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) /* Perform an expiry operation */ int autofs4_expire_run(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - struct autofs_packet_expire __user *pkt_p) + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + struct autofs_packet_expire __user *pkt_p) { struct autofs_packet_expire pkt; struct autofs_info *ino; struct dentry *dentry; int ret = 0; - memset(&pkt,0,sizeof pkt); + memset(&pkt, 0, sizeof(pkt)); pkt.hdr.proto_version = sbi->version; pkt.hdr.type = autofs_ptype_expire; - if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL) + dentry = autofs4_expire_indirect(sb, mnt, sbi, 0); + if (!dentry) return -EAGAIN; pkt.len = dentry->d_name.len; @@ -544,7 +547,7 @@ int autofs4_expire_run(struct super_block *sb, pkt.name[pkt.len] = '\0'; dput(dentry); - if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) ) + if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) ret = -EFAULT; spin_lock(&sbi->fs_lock); @@ -573,7 +576,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_info *ino = autofs4_dentry_ino(dentry); /* This is synchronous because it makes the daemon a - little easier */ + * little easier + */ ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); spin_lock(&sbi->fs_lock); @@ -588,8 +592,10 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, return ret; } -/* Call repeatedly until it returns -EAGAIN, meaning there's nothing - more to be done */ +/* + * Call repeatedly until it returns -EAGAIN, meaning there's nothing + * more to be done. + */ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int __user *arg) { diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index b3db517e89ec..8cf0e63389ae 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -1,14 +1,10 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/init.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include <linux/module.h> #include <linux/init.h> diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index a3ae0b2aeb5a..61b21051bd5a 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -1,15 +1,11 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/inode.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 2005-2006 Ian Kent <raven@themaw.net> +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2005-2006 Ian Kent <raven@themaw.net> * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include <linux/kernel.h> #include <linux/slab.h> @@ -24,7 +20,9 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) { - struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL); + struct autofs_info *ino; + + ino = kzalloc(sizeof(*ino), GFP_KERNEL); if (ino) { INIT_LIST_HEAD(&ino->active); INIT_LIST_HEAD(&ino->expiring); @@ -62,7 +60,7 @@ void autofs4_kill_sb(struct super_block *sb) put_pid(sbi->oz_pgrp); } - DPRINTK("shutting down"); + pr_debug("shutting down\n"); kill_litter_super(sb); if (sbi) kfree_rcu(sbi, rcu); @@ -94,7 +92,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",direct"); else seq_printf(m, ",indirect"); - +#ifdef CONFIG_CHECKPOINT_RESTORE + if (sbi->pipe) + seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino); + else + seq_printf(m, ",pipe_ino=-1"); +#endif return 0; } @@ -147,6 +150,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; @@ -204,9 +208,9 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, int autofs4_fill_super(struct super_block *s, void *data, int silent) { - struct inode * root_inode; - struct dentry * root; - struct file * pipe; + struct inode *root_inode; + struct dentry *root; + struct file *pipe; int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; @@ -217,7 +221,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; - DPRINTK("starting up, sbi = %p",sbi); + pr_debug("starting up, sbi = %p\n", sbi); s->s_fs_info = sbi; sbi->magic = AUTOFS_SBI_MAGIC; @@ -266,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, &sbi->max_proto)) { - printk("autofs: called with bogus options\n"); + pr_err("called with bogus options\n"); goto fail_dput; } if (pgrp_set) { sbi->oz_pgrp = find_get_pid(pgrp); if (!sbi->oz_pgrp) { - pr_warn("autofs: could not find process group %d\n", + pr_err("could not find process group %d\n", pgrp); goto fail_dput; } @@ -290,10 +294,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) /* Couldn't this be tested earlier? */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { - printk("autofs: kernel does not match daemon version " + pr_err("kernel does not match daemon version " "daemon (%d, %d) kernel (%d, %d)\n", - sbi->min_proto, sbi->max_proto, - AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); + sbi->min_proto, sbi->max_proto, + AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); goto fail_dput; } @@ -304,11 +308,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp)); + pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); if (!pipe) { - printk("autofs: could not open pipe file descriptor\n"); + pr_err("could not open pipe file descriptor\n"); goto fail_dput; } ret = autofs_prepare_pipe(pipe); @@ -323,12 +327,12 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) */ s->s_root = root; return 0; - + /* * Failure ... clean up. */ fail_fput: - printk("autofs: pipe file descriptor does not contain proper ops\n"); + pr_err("pipe file descriptor does not contain proper ops\n"); fput(pipe); /* fall through */ fail_dput: diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index c6d7d3dbd52a..7ab923940d18 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -1,16 +1,12 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/root.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org> - * Copyright 2001-2006 Ian Kent <raven@themaw.net> +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org> + * Copyright 2001-2006 Ian Kent <raven@themaw.net> * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include <linux/capability.h> #include <linux/errno.h> @@ -23,16 +19,18 @@ #include "autofs_i.h" -static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); -static int autofs4_dir_unlink(struct inode *,struct dentry *); -static int autofs4_dir_rmdir(struct inode *,struct dentry *); -static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t); -static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); +static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *); +static int autofs4_dir_unlink(struct inode *, struct dentry *); +static int autofs4_dir_rmdir(struct inode *, struct dentry *); +static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t); +static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT -static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); +static long autofs4_root_compat_ioctl(struct file *, + unsigned int, unsigned long); #endif static int autofs4_dir_open(struct inode *inode, struct file *file); -static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int); +static struct dentry *autofs4_lookup(struct inode *, + struct dentry *, unsigned int); static struct vfsmount *autofs4_d_automount(struct path *); static int autofs4_d_manage(struct dentry *, bool); static void autofs4_dentry_release(struct dentry *); @@ -74,7 +72,9 @@ const struct dentry_operations autofs4_dentry_operations = { static void autofs4_add_active(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino; + + ino = autofs4_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); if (!ino->active_count) { @@ -84,13 +84,14 @@ static void autofs4_add_active(struct dentry *dentry) ino->active_count++; spin_unlock(&sbi->lookup_lock); } - return; } static void autofs4_del_active(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino; + + ino = autofs4_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); ino->active_count--; @@ -100,7 +101,6 @@ static void autofs4_del_active(struct dentry *dentry) } spin_unlock(&sbi->lookup_lock); } - return; } static int autofs4_dir_open(struct inode *inode, struct file *file) @@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry); + pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); if (autofs4_oz_mode(sbi)) goto out; @@ -138,7 +138,7 @@ static void autofs4_dentry_release(struct dentry *de) struct autofs_info *ino = autofs4_dentry_ino(de); struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); - DPRINTK("releasing %p", de); + pr_debug("releasing %p\n", de); if (!ino) return; @@ -278,9 +278,9 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; - DPRINTK("waiting for mount name=%pd", dentry); + pr_debug("waiting for mount name=%pd\n", dentry); status = autofs4_wait(sbi, dentry, NFY_MOUNT); - DPRINTK("mount wait done status=%d", status); + pr_debug("mount wait done status=%d\n", status); } ino->last_used = jiffies; return status; @@ -320,7 +320,9 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; struct autofs_info *ino; - struct dentry *new = d_lookup(parent, &dentry->d_name); + struct dentry *new; + + new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; ino = autofs4_dentry_ino(new); @@ -338,7 +340,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; - DPRINTK("dentry=%p %pd", dentry, dentry); + pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never triggers a mount. */ if (autofs4_oz_mode(sbi)) @@ -425,7 +427,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; - DPRINTK("dentry=%p %pd", dentry, dentry); + pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { @@ -455,6 +457,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * a mount-trap. */ struct inode *inode; + if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)) return 0; if (d_mountpoint(dentry)) @@ -494,13 +497,14 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) } /* Lookups in the root directory */ -static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +static struct dentry *autofs4_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) { struct autofs_sb_info *sbi; struct autofs_info *ino; struct dentry *active; - DPRINTK("name = %pd", dentry); + pr_debug("name = %pd\n", dentry); /* File name too long to exist */ if (dentry->d_name.len > NAME_MAX) @@ -508,14 +512,14 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u sbi = autofs4_sbi(dir->i_sb); - DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", - current->pid, task_pgrp_nr(current), sbi->catatonic, - autofs4_oz_mode(sbi)); + pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", + current->pid, task_pgrp_nr(current), sbi->catatonic, + autofs4_oz_mode(sbi)); active = autofs4_lookup_active(dentry); - if (active) { + if (active) return active; - } else { + else { /* * A dentry that is not within the root can never trigger a * mount operation, unless the directory already exists, so we @@ -526,7 +530,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u return ERR_PTR(-ENOENT); /* Mark entries in the root as mount triggers */ - if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent)) + if (IS_ROOT(dentry->d_parent) && + autofs_type_indirect(sbi->type)) __managed_dentry_set_managed(dentry); ino = autofs4_new_ino(sbi); @@ -537,8 +542,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u ino->dentry = dentry; autofs4_add_active(dentry); - - d_instantiate(dentry, NULL); } return NULL; } @@ -554,7 +557,7 @@ static int autofs4_dir_symlink(struct inode *dir, size_t size = strlen(symname); char *cp; - DPRINTK("%s <- %pd", symname, dentry); + pr_debug("%s <- %pd\n", symname, dentry); if (!autofs4_oz_mode(sbi)) return -EACCES; @@ -613,7 +616,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); struct autofs_info *p_ino; - + /* This allows root to remove symlinks */ if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -664,7 +667,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry) if (IS_ROOT(parent->d_parent)) return; managed_dentry_clear_managed(parent); - return; } static void autofs_clear_leaf_automount_flags(struct dentry *dentry) @@ -687,7 +689,6 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry) if (d_child->next == &parent->d_subdirs && d_child->prev == &parent->d_subdirs) managed_dentry_set_managed(parent); - return; } static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) @@ -695,8 +696,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); struct autofs_info *p_ino; - - DPRINTK("dentry %p, removing %pd", dentry, dentry); + + pr_debug("dentry %p, removing %pd\n", dentry, dentry); if (!autofs4_oz_mode(sbi)) return -EACCES; @@ -728,7 +729,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int autofs4_dir_mkdir(struct inode *dir, + struct dentry *dentry, umode_t mode) { struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); @@ -738,7 +740,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m if (!autofs4_oz_mode(sbi)) return -EACCES; - DPRINTK("dentry %p, creating %pd", dentry, dentry); + pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); @@ -768,14 +770,18 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m /* Get/set timeout ioctl() operation */ #ifdef CONFIG_COMPAT static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, - compat_ulong_t __user *p) + compat_ulong_t __user *p) { - int rv; unsigned long ntimeout; + int rv; + + rv = get_user(ntimeout, p); + if (rv) + goto error; - if ((rv = get_user(ntimeout, p)) || - (rv = put_user(sbi->exp_timeout/HZ, p))) - return rv; + rv = put_user(sbi->exp_timeout/HZ, p); + if (rv) + goto error; if (ntimeout > UINT_MAX/HZ) sbi->exp_timeout = 0; @@ -783,18 +789,24 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, sbi->exp_timeout = ntimeout * HZ; return 0; +error: + return rv; } #endif static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, - unsigned long __user *p) + unsigned long __user *p) { - int rv; unsigned long ntimeout; + int rv; + + rv = get_user(ntimeout, p); + if (rv) + goto error; - if ((rv = get_user(ntimeout, p)) || - (rv = put_user(sbi->exp_timeout/HZ, p))) - return rv; + rv = put_user(sbi->exp_timeout/HZ, p); + if (rv) + goto error; if (ntimeout > ULONG_MAX/HZ) sbi->exp_timeout = 0; @@ -802,16 +814,20 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, sbi->exp_timeout = ntimeout * HZ; return 0; +error: + return rv; } /* Return protocol version */ -static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p) +static inline int autofs4_get_protover(struct autofs_sb_info *sbi, + int __user *p) { return put_user(sbi->version, p); } /* Return protocol sub version */ -static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p) +static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, + int __user *p) { return put_user(sbi->sub_version, p); } @@ -826,7 +842,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) if (may_umount(mnt)) status = 1; - DPRINTK("returning %d", status); + pr_debug("returning %d\n", status); status = put_user(status, p); @@ -834,9 +850,9 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) } /* Identify autofs4_dentries - this is so we can tell if there's - an extra dentry refcount or not. We only hold a refcount on the - dentry if its non-negative (ie, d_inode != NULL) -*/ + * an extra dentry refcount or not. We only hold a refcount on the + * dentry if its non-negative (ie, d_inode != NULL) + */ int is_autofs4_dentry(struct dentry *dentry) { return dentry && d_really_is_positive(dentry) && @@ -854,21 +870,21 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); void __user *p = (void __user *)arg; - DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u", - cmd,arg,sbi,task_pgrp_nr(current)); + pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", + cmd, arg, sbi, task_pgrp_nr(current)); if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - + if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; - - switch(cmd) { + + switch (cmd) { case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ - return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0); + return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0); case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ - return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT); + return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ autofs4_catatonic_mode(sbi); return 0; @@ -888,13 +904,15 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, /* return a single thing to expire */ case AUTOFS_IOC_EXPIRE: - return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p); + return autofs4_expire_run(inode->i_sb, + filp->f_path.mnt, sbi, p); /* same as above, but can send multiple expires through pipe */ case AUTOFS_IOC_EXPIRE_MULTI: - return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p); + return autofs4_expire_multi(inode->i_sb, + filp->f_path.mnt, sbi, p); default: - return -ENOSYS; + return -EINVAL; } } @@ -902,12 +920,13 @@ static long autofs4_root_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); + return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); } #ifdef CONFIG_COMPAT static long autofs4_root_compat_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg) + unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); int ret; @@ -916,7 +935,7 @@ static long autofs4_root_compat_ioctl(struct file *filp, ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); else ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, - (unsigned long)compat_ptr(arg)); + (unsigned long) compat_ptr(arg)); return ret; } diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index 84e037d1d129..99aab00dc217 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -1,14 +1,10 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/symlink.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include "autofs_i.h" @@ -18,6 +14,7 @@ static const char *autofs4_get_link(struct dentry *dentry, { struct autofs_sb_info *sbi; struct autofs_info *ino; + if (!dentry) return ERR_PTR(-ECHILD); sbi = autofs4_sbi(dentry->d_sb); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 35b755e79c2d..0146d911f468 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -1,15 +1,11 @@ -/* -*- c -*- --------------------------------------------------------------- * - * - * linux/fs/autofs/waitq.c - * - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 2001-2006 Ian Kent <raven@themaw.net> +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2001-2006 Ian Kent <raven@themaw.net> * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. - * - * ------------------------------------------------------------------------- */ + */ #include <linux/slab.h> #include <linux/time.h> @@ -18,7 +14,8 @@ #include "autofs_i.h" /* We make this a static variable rather than a part of the superblock; it - is better if we don't reassign numbers easily even across filesystems */ + * is better if we don't reassign numbers easily even across filesystems + */ static autofs_wqt_t autofs4_next_wait_queue = 1; /* These are the signals we allow interrupting a pending mount */ @@ -34,7 +31,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) return; } - DPRINTK("entering catatonic mode"); + pr_debug("entering catatonic mode\n"); sbi->catatonic = 1; wq = sbi->queues; @@ -69,17 +66,19 @@ static int autofs4_write(struct autofs_sb_info *sbi, set_fs(KERNEL_DS); mutex_lock(&sbi->pipe_mutex); - while (bytes && - (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) { + wr = __vfs_write(file, data, bytes, &file->f_pos); + while (bytes && wr) { data += wr; bytes -= wr; + wr = __vfs_write(file, data, bytes, &file->f_pos); } mutex_unlock(&sbi->pipe_mutex); set_fs(fs); /* Keep the currently executing process from receiving a - SIGPIPE unless it was already supposed to get one */ + * SIGPIPE unless it was already supposed to get one + */ if (wr == -EPIPE && !sigpipe) { spin_lock_irqsave(¤t->sighand->siglock, flags); sigdelset(¤t->pending.signal, SIGPIPE); @@ -89,7 +88,7 @@ static int autofs4_write(struct autofs_sb_info *sbi, return (bytes > 0); } - + static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) @@ -102,10 +101,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct file *pipe = NULL; size_t pktsz; - DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", - (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); + pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", + (unsigned long) wq->wait_queue_token, + wq->name.len, wq->name.name, type); - memset(&pkt,0,sizeof pkt); /* For security reasons */ + memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; @@ -126,7 +126,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } case autofs_ptype_expire_multi: { - struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi; + struct autofs_packet_expire_multi *ep = + &pkt.v4_pkt.expire_multi; pktsz = sizeof(*ep); @@ -163,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, break; } default: - printk("autofs4_notify_daemon: bad type %d!\n", type); + pr_warn("bad type %d!\n", type); mutex_unlock(&sbi->wq_mutex); return; } @@ -231,7 +232,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) if (wq->name.hash == qstr->hash && wq->name.len == qstr->len && wq->name.name && - !memcmp(wq->name.name, qstr->name, qstr->len)) + !memcmp(wq->name.name, qstr->name, qstr->len)) break; } return wq; @@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) static int validate_request(struct autofs_wait_queue **wait, struct autofs_sb_info *sbi, struct qstr *qstr, - struct dentry*dentry, enum autofs_notify notify) + struct dentry *dentry, enum autofs_notify notify) { struct autofs_wait_queue *wq; struct autofs_info *ino; @@ -322,8 +323,10 @@ static int validate_request(struct autofs_wait_queue **wait, * continue on and create a new request. */ if (!IS_ROOT(dentry)) { - if (d_really_is_positive(dentry) && d_unhashed(dentry)) { + if (d_unhashed(dentry) && + d_really_is_positive(dentry)) { struct dentry *parent = dentry->d_parent; + new = d_lookup(parent, &dentry->d_name); if (new) dentry = new; @@ -340,8 +343,8 @@ static int validate_request(struct autofs_wait_queue **wait, return 1; } -int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, - enum autofs_notify notify) +int autofs4_wait(struct autofs_sb_info *sbi, + struct dentry *dentry, enum autofs_notify notify) { struct autofs_wait_queue *wq; struct qstr qstr; @@ -411,7 +414,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, if (!wq) { /* Create a new wait queue */ - wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); + wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL); if (!wq) { kfree(qstr.name); mutex_unlock(&sbi->wq_mutex); @@ -450,17 +453,19 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, autofs_ptype_expire_indirect; } - DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, - wq->name.name, notify); + pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); - /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */ + /* + * autofs4_notify_daemon() may block; it will unlock ->wq_mutex + */ autofs4_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; - DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", - (unsigned long) wq->wait_queue_token, wq->name.len, - wq->name.name, notify); + pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); } @@ -471,12 +476,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, */ if (wq->name.name) { /* Block all but "shutdown" signals while waiting */ - sigset_t oldset; + unsigned long shutdown_sigs_mask; unsigned long irqflags; + sigset_t oldset; spin_lock_irqsave(¤t->sighand->siglock, irqflags); oldset = current->blocked; - siginitsetinv(¤t->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]); + shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0]; + siginitsetinv(¤t->blocked, shutdown_sigs_mask); recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); @@ -487,7 +494,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); } else { - DPRINTK("skipped sleeping"); + pr_debug("skipped sleeping\n"); } status = wq->status; @@ -562,4 +569,3 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok return 0; } - diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3a93755e880f..7d914c67a9d0 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -491,6 +491,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr, * arch_check_elf() - check an ELF executable * @ehdr: The main ELF header * @has_interp: True if the ELF has an interpreter, else false. + * @interp_ehdr: The interpreter's ELF header * @state: Architecture-specific state preserved throughout the process * of loading the ELF. * @@ -502,6 +503,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr, * with that return code. */ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, + struct elfhdr *interp_ehdr, struct arch_elf_state *state) { /* Dummy implementation, always proceed */ @@ -651,7 +653,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top) if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - random_variable = (unsigned long) get_random_int(); + random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } @@ -829,7 +831,9 @@ static int load_elf_binary(struct linux_binprm *bprm) * still possible to return an error to the code that invoked * the exec syscall. */ - retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state); + retval = arch_check_elf(&loc->elf_ex, + !!interpreter, &loc->interp_elf_ex, + &arch_state); if (retval) goto out_free_dentry; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 78f005f37847..3a3ced779fc7 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 3: /* Delete this handler. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); kill_node(e); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: @@ -675,7 +675,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); if (err) { @@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, case 3: /* Delete all handlers. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: diff --git a/fs/block_dev.c b/fs/block_dev.c index ba762ea07f67..3172c4e2f502 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; invalidate_bh_lrus(); @@ -346,9 +346,9 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) struct inode *bd_inode = bdev_file_inode(file); loff_t retval; - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); return retval; } @@ -575,7 +575,11 @@ static const struct super_operations bdev_sops = { static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); + struct dentry *dent; + dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); + if (dent) + dent->d_sb->s_iflags |= SB_I_CGROUPWB; + return dent; } static struct file_system_type bd_type = { @@ -1142,9 +1146,9 @@ void bd_set_size(struct block_device *bdev, loff_t size) { unsigned bsize = bdev_logical_block_size(bdev); - mutex_lock(&bdev->bd_inode->i_mutex); + inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); - mutex_unlock(&bdev->bd_inode->i_mutex); + inode_unlock(bdev->bd_inode); while (bsize < PAGE_CACHE_SIZE) { if (size & bsize) break; @@ -1201,7 +1205,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0; + if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access) + bdev->bd_inode->i_flags = S_DAX; + else + bdev->bd_inode->i_flags = 0; + if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); @@ -1693,13 +1701,24 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + if (dax_mapping(mapping)) { + struct block_device *bdev = I_BDEV(mapping->host); + + return dax_writeback_mapping_range(mapping, bdev, wbc); + } + return generic_writepages(mapping, wbc); +} + static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .readpages = blkdev_readpages, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .writepages = generic_writepages, + .writepages = blkdev_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, .is_dirty_writeback = buffer_check_dirty_writeback, @@ -1730,43 +1749,25 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return __dax_fault(vma, vmf, blkdev_get_block, NULL); } -static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) -{ - return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); -} - -static void blkdev_vm_open(struct vm_area_struct *vma) +static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) { - struct inode *bd_inode = bdev_file_inode(vma->vm_file); - struct block_device *bdev = I_BDEV(bd_inode); - - mutex_lock(&bd_inode->i_mutex); - bdev->bd_map_count++; - mutex_unlock(&bd_inode->i_mutex); + return dax_pfn_mkwrite(vma, vmf); } -static void blkdev_vm_close(struct vm_area_struct *vma) +static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) { - struct inode *bd_inode = bdev_file_inode(vma->vm_file); - struct block_device *bdev = I_BDEV(bd_inode); - - mutex_lock(&bd_inode->i_mutex); - bdev->bd_map_count--; - mutex_unlock(&bd_inode->i_mutex); + return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); } static const struct vm_operations_struct blkdev_dax_vm_ops = { - .open = blkdev_vm_open, - .close = blkdev_vm_close, .fault = blkdev_dax_fault, .pmd_fault = blkdev_dax_pmd_fault, - .pfn_mkwrite = blkdev_dax_fault, + .pfn_mkwrite = blkdev_dax_pfn_mkwrite, }; static const struct vm_operations_struct blkdev_default_vm_ops = { - .open = blkdev_vm_open, - .close = blkdev_vm_close, .fault = filemap_fault, .map_pages = filemap_map_pages, }; @@ -1774,18 +1775,14 @@ static const struct vm_operations_struct blkdev_default_vm_ops = { static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *bd_inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(bd_inode); file_accessed(file); - mutex_lock(&bd_inode->i_mutex); - bdev->bd_map_count++; if (IS_DAX(bd_inode)) { vma->vm_ops = &blkdev_dax_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; } else { vma->vm_ops = &blkdev_default_vm_ops; } - mutex_unlock(&bd_inode->i_mutex); return 0; } diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 88d9af3d4581..5fb60ea7eee2 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -328,8 +328,8 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, list_add_tail(&work->ordered_list, &wq->ordered_list); spin_unlock_irqrestore(&wq->list_lock, flags); } - queue_work(wq->normal_wq, &work->normal_work); trace_btrfs_work_queued(work); + queue_work(wq->normal_wq, &work->normal_work); } void btrfs_queue_work(struct btrfs_workqueue *wq, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 08405a3da6b1..f6dac40f87ff 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -560,13 +560,13 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, */ static void __merge_refs(struct list_head *head, int mode) { - struct __prelim_ref *ref1; + struct __prelim_ref *pos1; - list_for_each_entry(ref1, head, list) { - struct __prelim_ref *ref2 = ref1, *tmp; + list_for_each_entry(pos1, head, list) { + struct __prelim_ref *pos2 = pos1, *tmp; - list_for_each_entry_safe_continue(ref2, tmp, head, list) { - struct __prelim_ref *xchg; + list_for_each_entry_safe_continue(pos2, tmp, head, list) { + struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; struct extent_inode_elem *eie; if (!ref_for_same_block(ref1, ref2)) @@ -1406,7 +1406,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, read_extent_buffer(eb, dest + bytes_left, name_off, name_len); if (eb != eb_in) { - btrfs_tree_read_unlock_blocking(eb); + if (!path->skip_locking) + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } ret = btrfs_find_item(fs_root, path, parent, 0, @@ -1426,9 +1427,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ if (eb != eb_in) { - atomic_inc(&eb->refs); - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + if (!path->skip_locking) + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->nodes[0] = NULL; + path->locks[0] = 0; } btrfs_release_path(path); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c473c42d7d6c..3346cd8f9910 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -637,11 +637,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, faili = nr_pages - 1; cb->nr_pages = nr_pages; - /* In the parent-locked case, we only locked the range we are - * interested in. In all other cases, we can opportunistically - * cache decompressed data that goes beyond the requested range. */ - if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED)) - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 97ad9bbeb35d..bfe4a337fb4d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1614,7 +1614,7 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; - struct rw_semaphore delayed_iput_sem; + struct mutex cleaner_delayed_iput_mutex; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -3641,6 +3641,7 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, const u64 type); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0be47e4b8136..b57daa895cea 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1689,7 +1689,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, * */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list) + struct list_head *ins_list, bool *emitted) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; @@ -1733,6 +1733,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, if (over) return 1; + *emitted = true; } return 0; } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f70119f25421..0167853c84ae 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list); + struct list_head *ins_list, bool *emitted); /* for init */ int __init btrfs_delayed_inode_init(void); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1e668fb7dd4c..cbb7dbfb3fff 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -614,7 +614,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( em = lookup_extent_mapping(em_tree, start, (u64)-1); if (!em) break; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e99ccd6ffb2c..5699bbc23feb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,6 +55,12 @@ #include <asm/cpufeature.h> #endif +#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ + BTRFS_HEADER_FLAG_RELOC |\ + BTRFS_SUPER_FLAG_ERROR |\ + BTRFS_SUPER_FLAG_SEEDING |\ + BTRFS_SUPER_FLAG_METADUMP) + static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); @@ -176,6 +182,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, + { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" }, { .id = 0, .name_stem = "tree" }, }; @@ -924,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) + if (static_cpu_has(X86_FEATURE_XMM4_2)) return 0; #endif return 1; @@ -1583,8 +1590,23 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) goto free_writers; + + mutex_lock(&root->objectid_mutex); + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); + if (ret) { + mutex_unlock(&root->objectid_mutex); + goto free_root_dev; + } + + ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&root->objectid_mutex); + return 0; +free_root_dev: + free_anon_bdev(root->anon_dev); free_writers: btrfs_free_subvolume_writers(root->subv_writers); fail: @@ -1766,7 +1788,6 @@ static int cleaner_kthread(void *arg) int again; struct btrfs_trans_handle *trans; - set_freezable(); do { again = 0; @@ -1786,7 +1807,10 @@ static int cleaner_kthread(void *arg) goto sleep; } + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); + again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -2556,8 +2580,8 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->cleaner_delayed_iput_mutex); seqlock_init(&fs_info->profiles_lock); - init_rwsem(&fs_info->delayed_iput_sem); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); @@ -2742,26 +2766,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - /* - * Leafsize and nodesize were always equal, this is only a sanity check. - */ - if (le32_to_cpu(disk_super->__unused_leafsize) != - btrfs_super_nodesize(disk_super)) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksizes don't match. node %d leaf %d\n", - btrfs_super_nodesize(disk_super), - le32_to_cpu(disk_super->__unused_leafsize)); - err = -EINVAL; - goto fail_alloc; - } - if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksize (%d) was too large\n", - btrfs_super_nodesize(disk_super)); - err = -EINVAL; - goto fail_alloc; - } - features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) @@ -2833,17 +2837,6 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); - goto fail_sb_buffer; - } - - if (sectorsize != PAGE_SIZE) { - printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " - "found on %s\n", (unsigned long)sectorsize, sb->s_id); - goto fail_sb_buffer; - } - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); @@ -2915,6 +2908,18 @@ retry_root_backup: tree_root->commit_root = btrfs_root_node(tree_root); btrfs_set_root_refs(&tree_root->root_item, 1); + mutex_lock(&tree_root->objectid_mutex); + ret = btrfs_find_highest_objectid(tree_root, + &tree_root->highest_objectid); + if (ret) { + mutex_unlock(&tree_root->objectid_mutex); + goto recovery_tree_root; + } + + ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&tree_root->objectid_mutex); + ret = btrfs_read_roots(fs_info, tree_root); if (ret) goto recovery_tree_root; @@ -4018,8 +4023,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { struct btrfs_super_block *sb = fs_info->super_copy; + u64 nodesize = btrfs_super_nodesize(sb); + u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; + if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + printk(KERN_ERR "BTRFS: no valid FS found\n"); + ret = -EINVAL; + } + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) + printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); @@ -4037,31 +4051,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } /* - * The common minimum, we don't know if we can trust the nodesize/sectorsize - * items yet, they'll be verified later. Issue just a warning. + * Check sectorsize and nodesize first, other check will need it. + * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) + if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize); + ret = -EINVAL; + } + /* Only PAGE SIZE is supported yet */ + if (sectorsize != PAGE_CACHE_SIZE) { + printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n", + sectorsize, PAGE_CACHE_SIZE); + ret = -EINVAL; + } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize); + ret = -EINVAL; + } + if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n", + le32_to_cpu(sb->__unused_leafsize), + nodesize); + ret = -EINVAL; + } + + /* Root alignment check */ + if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", btrfs_super_root(sb)); - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", btrfs_super_chunk_root(sb)); - if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", - btrfs_super_log_root(sb)); - - /* - * Check the lower bound, the alignment and other constraints are - * checked later. - */ - if (btrfs_super_nodesize(sb) < 4096) { - printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", - btrfs_super_nodesize(sb)); ret = -EINVAL; } - if (btrfs_super_sectorsize(sb) < 4096) { - printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", - btrfs_super_sectorsize(sb)); + if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", + btrfs_super_log_root(sb)); ret = -EINVAL; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 60cc1399c64f..e2287c7c10be 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4139,8 +4139,10 @@ commit_trans: !atomic_read(&root->fs_info->open_ioctl_trans)) { need_commit--; - if (need_commit > 0) + if (need_commit > 0) { + btrfs_start_delalloc_roots(fs_info, 0, -1); btrfs_wait_ordered_roots(fs_info, -1); + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -4153,11 +4155,12 @@ commit_trans: if (ret) return ret; /* - * make sure that all running delayed iput are - * done + * The cleaner kthread might still be doing iput + * operations. Wait for it to finish so that + * more space is released. */ - down_write(&root->fs_info->delayed_iput_sem); - up_write(&root->fs_info->delayed_iput_sem); + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); goto again; } else { btrfs_end_transaction(trans, root); @@ -10399,7 +10402,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). */ - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; num_items = 3 + map->num_stripes; free_extent_map(em); @@ -10586,7 +10589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) - return 1; + return -EINVAL; features = btrfs_super_incompat_flags(disk_super); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) @@ -10816,3 +10819,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root) } return 1; } + +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2e7c97a3f344..392592dc7010 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2897,12 +2897,11 @@ static int __do_readpage(struct extent_io_tree *tree, struct block_device *bdev; int ret; int nr = 0; - int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; size_t pg_offset = 0; size_t iosize; size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; + unsigned long this_bio_flag = 0; set_page_extent_mapped(page); @@ -2942,18 +2941,16 @@ static int __do_readpage(struct extent_io_tree *tree, kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - if (!parent_locked) - unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, get_extent, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -3038,12 +3035,9 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - if (parent_locked) - free_extent_state(cached); - else - unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; pg_offset += iosize; continue; @@ -3052,8 +3046,7 @@ static int __do_readpage(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3063,8 +3056,7 @@ static int __do_readpage(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3083,8 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree, *bio_flags = this_bio_flag; } else { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); } cur = cur + iosize; pg_offset += iosize; @@ -3213,20 +3204,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } -int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; - int ret; - - ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, - &bio_flags, READ, NULL); - if (bio) - ret = submit_one_bio(READ, bio, mirror_num, bio_flags); - return ret; -} - static noinline void update_nr_written(struct page *page, struct writeback_control *wbc, unsigned long nr_written) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0377413bd4b9..880d5292e972 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -29,7 +29,6 @@ */ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_TREE_LOG 2 -#define EXTENT_BIO_PARENT_LOCKED 4 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ @@ -210,8 +209,6 @@ static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); -int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6a98bddd8f33..84fb56d5c018 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em) WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->bdev); + kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index b2991fd8583e..eb8b8fae036b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -32,7 +32,15 @@ struct extent_map { u64 block_len; u64 generation; unsigned long flags; - struct block_device *bdev; + union { + struct block_device *bdev; + + /* + * used for chunk mappings + * flags & EXTENT_FLAG_FS_MAPPING must be set + */ + struct map_lookup *map_lookup; + }; atomic_t refs; unsigned int compress_type; struct list_head list; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 83d7859d7619..098bb8f690c9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -406,8 +406,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ -static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, - size_t write_bytes, +static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, struct page **prepared_pages, struct iov_iter *i) { @@ -1588,8 +1587,7 @@ again: ret = 0; } - copied = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, i); + copied = btrfs_copy_from_user(pos, write_bytes, pages, i); /* * if we have trouble faulting in the pages, fall @@ -1764,17 +1762,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t pos; size_t count; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = generic_write_checks(iocb, from); if (err <= 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -1785,7 +1783,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * to stop this write operation to ensure FS consistency. */ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EROFS; goto out; } @@ -1806,7 +1804,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -1822,7 +1820,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, iocb->ki_pos = pos + num_written; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * We also have to set last_sub_trans to the current log transid, @@ -1911,7 +1909,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -1963,7 +1961,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = start_ordered_ops(inode, start, end); } if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } atomic_inc(&root->log_batch); @@ -2009,7 +2007,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -2033,7 +2031,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } trans->sync = true; @@ -2056,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If any of the ordered extents had an error, just return it to user @@ -2305,7 +2303,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); ret = find_first_non_hole(inode, &offset, &len); if (ret < 0) @@ -2345,7 +2343,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2421,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = btrfs_wait_ordered_range(inode, lockstart, lockend - lockstart + 1); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2576,7 +2574,7 @@ out_only_mutex: ret = btrfs_end_transaction(trans, root); } } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret && !err) err = ret; return err; @@ -2660,7 +2658,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = inode_newsize_ok(inode, alloc_end); if (ret) goto out; @@ -2818,7 +2816,7 @@ out: * So this is completely used as cleanup. */ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_start, alloc_end - alloc_start); @@ -2894,7 +2892,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: case SEEK_CUR: @@ -2903,20 +2901,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: if (offset >= i_size_read(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } ret = find_desired_extent(inode, &offset, whence); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 393e36bd5845..53dbeaf6ce94 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -153,6 +153,20 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) static unsigned long *alloc_bitmap(u32 bitmap_size) { + void *mem; + + /* + * The allocation size varies, observed numbers were < 4K up to 16K. + * Using vmalloc unconditionally would be too heavy, we'll try + * contiguous allocations first. + */ + if (bitmap_size <= PAGE_SIZE) + return kzalloc(bitmap_size, GFP_NOFS); + + mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN); + if (mem) + return mem; + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); } @@ -289,7 +303,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = 0; out: - vfree(bitmap); + kvfree(bitmap); if (ret) btrfs_abort_transaction(trans, root, ret); return ret; @@ -438,7 +452,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, ret = 0; out: - vfree(bitmap); + kvfree(bitmap); if (ret) btrfs_abort_transaction(trans, root, ret); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 8b57c17b3fb3..e50316c4af15 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -515,7 +515,7 @@ out: return ret; } -static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) { struct btrfs_path *path; int ret; @@ -555,13 +555,6 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) int ret; mutex_lock(&root->objectid_mutex); - if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); - if (ret) - goto out; - } - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { ret = -ENOSPC; goto out; diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index ddb347bfee23..c8e864b2d530 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -9,5 +9,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans); int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 247830107686..d96f5cf38a2d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3134,7 +3134,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - down_read(&fs_info->delayed_iput_sem); spin_lock(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; @@ -3153,7 +3152,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) spin_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); - up_read(&root->fs_info->delayed_iput_sem); } /* @@ -4874,26 +4872,6 @@ next: return err; } -static int wait_snapshoting_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - -static void wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshoting(root); - if (ret) - break; - wait_on_atomic_t(&root->will_be_snapshoted, - wait_snapshoting_atomic_t, - TASK_UNINTERRUPTIBLE); - } -} - static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4925,7 +4903,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * truncation, it must capture all writes that happened before * this truncation. */ - wait_for_snapshot_creation(root); + btrfs_wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { btrfs_end_write_no_snapshoting(root); @@ -5739,6 +5717,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) char *name_ptr; int name_len; int is_curr = 0; /* ctx->pos points to the current index? */ + bool emitted; /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) @@ -5767,6 +5746,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (ret < 0) goto err; + emitted = false; while (1) { leaf = path->nodes[0]; slot = path->slots[0]; @@ -5846,6 +5826,7 @@ skip: if (over) goto nopos; + emitted = true; di_len = btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di) + sizeof(*di); di_cur += di_len; @@ -5858,11 +5839,20 @@ next: if (key_type == BTRFS_DIR_INDEX_KEY) { if (is_curr) ctx->pos++; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted); if (ret) goto nopos; } + /* + * If we haven't emitted any dir entry, we must not touch ctx->pos as + * it was was set to the termination value in previous call. We assume + * that "." and ".." were emitted if we reach this point and set the + * termination value as well for an empty directory. + */ + if (ctx->pos > 2 && !emitted) + goto nopos; + /* Reached end of directory/root. Bump pos past the last item. */ ctx->pos++; @@ -7138,21 +7128,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return em; - } - + /* + * Create the ordered extent before the extent map. This is to avoid + * races with the fast fsync path that would lead to it logging file + * extent items that point to disk extents that were not yet written to. + * The fast fsync path collects ordered extents into a local list and + * then collects all the new extent maps, so we must create the ordered + * extent first and make sure the fast fsync path collects any new + * ordered extents after collecting new extent maps as well. + * The fsync path simply can not rely on inode_dio_wait() because it + * causes deadlock with AIO. + */ ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - free_extent_map(em); return ERR_PTR(ret); } + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, + ins.offset, ins.offset, ins.offset, 0); + if (IS_ERR(em)) { + struct btrfs_ordered_extent *oe; + + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + oe = btrfs_lookup_ordered_extent(inode, start); + ASSERT(oe); + if (WARN_ON(!oe)) + return em; + set_bit(BTRFS_ORDERED_IOERR, &oe->flags); + set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); + btrfs_remove_ordered_extent(inode, oe); + /* Once for our lookup and once for the ordered extents tree. */ + btrfs_put_ordered_extent(oe); + btrfs_put_ordered_extent(oe); + } return em; } @@ -7976,6 +7986,7 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); + dio_bio->bi_error = bio->bi_error; dio_end_io(dio_bio, bio->bi_error); if (io_bio->end_io) @@ -8030,6 +8041,7 @@ static void btrfs_endio_direct_write(struct bio *bio) kfree(dip); + dio_bio->bi_error = bio->bi_error; dio_end_io(dio_bio, bio->bi_error); bio_put(bio); } @@ -8469,7 +8481,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); relock = true; } ret = btrfs_delalloc_reserve_space(inode, offset, count); @@ -8526,7 +8538,7 @@ out: if (wakeup) inode_dio_end(inode); if (relock) - mutex_lock(&inode->i_mutex); + inode_lock(inode); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2a47a3148ec8..48aee9846329 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -240,7 +240,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ip_oldflags = ip->flags; i_oldflags = inode->i_flags; @@ -358,7 +358,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); return ret; } @@ -568,6 +568,10 @@ static noinline int create_subvol(struct inode *dir, goto fail; } + mutex_lock(&new_root->objectid_mutex); + new_root->highest_objectid = new_dirid; + mutex_unlock(&new_root->objectid_mutex); + /* * insert the directory item */ @@ -877,7 +881,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return error; } @@ -1389,18 +1393,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += cluster; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (newer_than) { if (newer_off == (u64)-1) @@ -1461,9 +1465,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, out_ra: if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (!file) kfree(ra); @@ -2426,7 +2430,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Don't allow to delete a subvolume with send in progress. This is @@ -2539,7 +2543,7 @@ out_up_write: spin_unlock(&dest->root_item_lock); } out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) { d_invalidate(dentry); btrfs_invalidate_inodes(dest); @@ -2555,7 +2559,7 @@ out_unlock_inode: out_dput: dput(dentry); out_unlock_dir: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); out_drop_write: mnt_drop_write_file(file); out: @@ -2790,24 +2794,29 @@ out: static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) { struct page *page; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; page = grab_cache_page(inode->i_mapping, index); if (!page) - return NULL; + return ERR_PTR(-ENOMEM); if (!PageUptodate(page)) { - if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, - 0)) - return NULL; + int ret; + + ret = btrfs_readpage(NULL, page); + if (ret) + return ERR_PTR(ret); lock_page(page); if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); - return NULL; + return ERR_PTR(-EIO); + } + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + return ERR_PTR(-EAGAIN); } } - unlock_page(page); return page; } @@ -2819,17 +2828,31 @@ static int gather_extent_pages(struct inode *inode, struct page **pages, pgoff_t index = off >> PAGE_CACHE_SHIFT; for (i = 0; i < num_pages; i++) { +again: pages[i] = extent_same_get_page(inode, index + i); - if (!pages[i]) - return -ENOMEM; + if (IS_ERR(pages[i])) { + int err = PTR_ERR(pages[i]); + + if (err == -EAGAIN) + goto again; + pages[i] = NULL; + return err; + } } return 0; } -static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) +static int lock_extent_range(struct inode *inode, u64 off, u64 len, + bool retry_range_locking) { - /* do any pending delalloc/csum calc on src, one way or - another, and lock file content */ + /* + * Do any pending delalloc/csum calculations on inode, one way or + * another, and lock file content. + * The locking order is: + * + * 1) pages + * 2) range in the inode's io tree + */ while (1) { struct btrfs_ordered_extent *ordered; lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); @@ -2847,14 +2870,17 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); + if (!retry_range_locking) + return -EAGAIN; btrfs_wait_ordered_range(inode, off, len); } + return 0; } static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - mutex_unlock(&inode1->i_mutex); - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode1); + inode_unlock(inode2); } static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) @@ -2862,8 +2888,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) if (inode1 < inode2) swap(inode1, inode2); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode1, I_MUTEX_PARENT); + inode_lock_nested(inode2, I_MUTEX_CHILD); } static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, @@ -2873,15 +2899,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); } -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len, + bool retry_range_locking) { + int ret; + if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); } - lock_extent_range(inode1, loff1, len); - lock_extent_range(inode2, loff2, len); + ret = lock_extent_range(inode1, loff1, len, retry_range_locking); + if (ret) + return ret; + ret = lock_extent_range(inode2, loff2, len, retry_range_locking); + if (ret) + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, + loff1 + len - 1); + return ret; } struct cmp_pages { @@ -2897,11 +2932,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp) for (i = 0; i < cmp->num_pages; i++) { pg = cmp->src_pages[i]; - if (pg) + if (pg) { + unlock_page(pg); page_cache_release(pg); + } pg = cmp->dst_pages[i]; - if (pg) + if (pg) { + unlock_page(pg); page_cache_release(pg); + } } kfree(cmp->src_pages); kfree(cmp->dst_pages); @@ -2962,6 +3001,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, src_page = cmp->src_pages[i]; dst_page = cmp->dst_pages[i]; + ASSERT(PageLocked(src_page)); + ASSERT(PageLocked(dst_page)); addr = kmap_atomic(src_page); dst_addr = kmap_atomic(dst_page); @@ -3022,7 +3063,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, return 0; if (same_inode) { - mutex_lock(&src->i_mutex); + inode_lock(src); ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) @@ -3074,14 +3115,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, goto out_unlock; } +again: ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); if (ret) goto out_unlock; if (same_inode) - lock_extent_range(src, same_lock_start, same_lock_len); + ret = lock_extent_range(src, same_lock_start, same_lock_len, + false); else - btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, + false); + /* + * If one of the inodes has dirty pages in the respective range or + * ordered extents, we need to flush dellaloc and wait for all ordered + * extents in the range. We must unlock the pages and the ranges in the + * io trees to avoid deadlocks when flushing delalloc (requires locking + * pages) and when waiting for ordered extents to complete (they require + * range locking). + */ + if (ret == -EAGAIN) { + /* + * Ranges in the io trees already unlocked. Now unlock all + * pages before waiting for all IO to complete. + */ + btrfs_cmp_data_free(&cmp); + if (same_inode) { + btrfs_wait_ordered_range(src, same_lock_start, + same_lock_len); + } else { + btrfs_wait_ordered_range(src, loff, len); + btrfs_wait_ordered_range(dst, dst_loff, len); + } + goto again; + } + ASSERT(ret == 0); + if (WARN_ON(ret)) { + /* ranges in the io trees already unlocked */ + btrfs_cmp_data_free(&cmp); + return ret; + } /* pass original length for comparison so we stay within i_size */ ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); @@ -3097,7 +3170,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, btrfs_cmp_data_free(&cmp); out_unlock: if (same_inode) - mutex_unlock(&src->i_mutex); + inode_unlock(src); else btrfs_double_inode_unlock(src, dst); @@ -3745,7 +3818,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, if (!same_inode) { btrfs_double_inode_lock(src, inode); } else { - mutex_lock(&src->i_mutex); + inode_lock(src); } /* determine range to clone */ @@ -3791,9 +3864,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, u64 lock_start = min_t(u64, off, destoff); u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - lock_extent_range(src, lock_start, lock_len); + ret = lock_extent_range(src, lock_start, lock_len, true); } else { - btrfs_double_extent_lock(src, off, inode, destoff, len); + ret = btrfs_double_extent_lock(src, off, inode, destoff, len, + true); + } + ASSERT(ret == 0); + if (WARN_ON(ret)) { + /* ranges in the io trees already unlocked */ + goto out_unlock; } ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); @@ -3816,7 +3895,7 @@ out_unlock: if (!same_inode) btrfs_double_inode_unlock(src, inode); else - mutex_unlock(&src->i_mutex); + inode_unlock(src); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6d707545f775..55161369fab1 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -609,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } +static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return stripe * rbio->stripe_npages + index; +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; +} + /* * helper to index into the pstripe */ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) { - index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data, index); } /* @@ -626,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; - - index += ((rbio->nr_data + 1) * rbio->stripe_len) >> - PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data + 1, index); } /* @@ -889,6 +901,7 @@ static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; int err = bio->bi_error; + int max_errors; if (err) fail_bio_stripe(rbio, bio); @@ -901,7 +914,9 @@ static void raid_write_end_io(struct bio *bio) err = 0; /* OK, we have read all the stripes we need to. */ - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? + 0 : rbio->bbio->max_errors; + if (atomic_read(&rbio->error) > max_errors) err = -EIO; rbio_orig_end_io(rbio, err); @@ -947,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, */ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) { - unsigned long nr = stripe_len * nr_stripes; - return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); + return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes; } /* @@ -966,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, void *p; rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + - DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), - GFP_NOFS); + DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) * + sizeof(long), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1021,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[i] = page; - ClearPageUptodate(page); } return 0; } -/* allocate pages for just the p/q stripes */ +/* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { int i; struct page *page; - i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); for (; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) @@ -1121,18 +1134,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) } /* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) -{ - int index; - index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); - index += page; - return rbio->stripe_pages[index]; -} - -/* * helper function to walk our bio list and populate the bio_pages array with * the result. This seems expensive, but it is faster than constantly * searching through the bio list as we setup the IO in finish_rmw or stripe @@ -1175,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_bio *bbio = rbio->bbio; void *pointers[rbio->real_stripes]; - int stripe_len = rbio->stripe_len; int nr_data = rbio->nr_data; int stripe; int pagenr; @@ -1183,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) int q_stripe = -1; struct bio_list bio_list; struct bio *bio; - int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; int ret; bio_list_init(&bio_list); @@ -1226,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { @@ -1268,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * everything else. */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1292,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) if (!bbio->tgtdev_map[stripe]) continue; - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1506,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -1525,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; /* * we want to find all the pages missing from @@ -1801,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int pagenr, stripe; void **pointers; int faila = -1, failb = -1; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); struct page *page; int err; int i; @@ -1824,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. @@ -1935,7 +1932,7 @@ pstripe: * other endio functions will fiddle the uptodate bits */ if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < rbio->stripe_npages; i++) { if (faila != -1) { page = rbio_stripe_page(rbio, faila, i); SetPageUptodate(page); @@ -2031,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -2055,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) continue; } - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* @@ -2279,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[index] = page; - ClearPageUptodate(page); } } return 0; } -/* - * end io function used by finish_rmw. When we finally - * get here, we've written a full stripe - */ -static void raid_write_parity_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; - - if (bio->bi_error) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = 0; - - if (atomic_read(&rbio->error)) - err = -EIO; - - rbio_orig_end_io(rbio, err); -} - static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { @@ -2462,7 +2432,7 @@ submit_write: break; bio->bi_private = rbio; - bio->bi_end_io = raid_write_parity_end_io; + bio->bi_end_io = raid_write_end_io; submit_bio(WRITE, bio); } return; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ef6d8fc85853..2bd0011450df 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -575,7 +575,8 @@ static int is_cowonly_root(u64 root_objectid) root_objectid == BTRFS_TREE_LOG_OBJECTID || root_objectid == BTRFS_CSUM_TREE_OBJECTID || root_objectid == BTRFS_UUID_TREE_OBJECTID || - root_objectid == BTRFS_QUOTA_TREE_OBJECTID) + root_objectid == BTRFS_QUOTA_TREE_OBJECTID || + root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) return 1; return 0; } @@ -3030,7 +3031,7 @@ int prealloc_file_extent_cluster(struct inode *inode, int ret = 0; BUG_ON(cluster->start != cluster->boundary[0]); - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = btrfs_check_data_free_space(inode, cluster->start, cluster->end + 1 - cluster->start); @@ -3057,7 +3058,7 @@ int prealloc_file_extent_cluster(struct inode *inode, btrfs_free_reserved_data_space(inode, cluster->start, cluster->end + 1 - cluster->start); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7cf8509deda7..2c849b08a91b 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -310,8 +310,16 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); + /* + * The root might have been inserted already, as before we look + * for orphan roots, log replay might have happened, which + * triggers a transaction commit and qgroup accounting, which + * in turn reads and inserts fs roots while doing backref + * walking. + */ + if (err == -EEXIST) + err = 0; if (err) { - BUG_ON(err == -EEXIST); btrfs_free_fs_root(root); break; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0c981ebe2acb..92bf5ee732fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2813,7 +2813,7 @@ out: static inline int scrub_calc_parity_bitmap_len(int nsectors) { - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); + return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); } static void scrub_parity_get(struct scrub_parity *sparity) @@ -3458,7 +3458,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (em->start != chunk_offset) goto out; @@ -4279,7 +4279,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, return PTR_ERR(inode); /* Avoid truncate/dio/punch hole.. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode_dio_wait(inode); physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; @@ -4358,7 +4358,7 @@ next_page: } ret = COPY_COMPLETE; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9b9eab6d048e..d41e09fe8e38 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -383,6 +383,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; + enum btrfs_compression_type saved_compress_type; + bool saved_compress_force; + int no_compress = 0; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE)) @@ -462,6 +465,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) /* Fallthrough */ case Opt_compress: case Opt_compress_type: + saved_compress_type = btrfs_test_opt(root, COMPRESS) ? + info->compress_type : BTRFS_COMPRESS_NONE; + saved_compress_force = + btrfs_test_opt(root, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -470,6 +477,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); + no_compress = 0; } else if (strcmp(args[0].from, "lzo") == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; @@ -477,25 +485,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); + no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; + no_compress++; } else { ret = -EINVAL; goto out; } if (compress_force) { - btrfs_set_and_info(root, FORCE_COMPRESS, - "force %s compression", - compress_type); + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); } else { - if (!btrfs_test_opt(root, COMPRESS)) - btrfs_info(root->fs_info, - "btrfs: use %s compression", - compress_type); /* * If we remount from compress-force=xxx to * compress=xxx, we need clear FORCE_COMPRESS @@ -504,6 +508,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } + if ((btrfs_test_opt(root, COMPRESS) && + (info->compress_type != saved_compress_type || + compress_force != saved_compress_force)) || + (!btrfs_test_opt(root, COMPRESS) && + no_compress == 1)) { + btrfs_info(root->fs_info, + "%s %s compression", + (compress_force) ? "force" : "use", + compress_type); + } + compress_force = false; break; case Opt_ssd: btrfs_set_and_info(root, SSD, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e0ac85949067..539e7b5e3f86 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -202,6 +202,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); +BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), @@ -213,6 +214,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), BTRFS_FEAT_ATTR_PTR(no_holes), + BTRFS_FEAT_ATTR_PTR(free_space_tree), NULL }; @@ -780,6 +782,39 @@ failure: return error; } + +/* + * Change per-fs features in /sys/fs/btrfs/UUID/features to match current + * values in superblock. Call after any changes to incompat/compat_ro flags + */ +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, + u64 bit, enum btrfs_feature_set set) +{ + struct btrfs_fs_devices *fs_devs; + struct kobject *fsid_kobj; + u64 features; + int ret; + + if (!fs_info) + return; + + features = get_features(fs_info, set); + ASSERT(bit & supported_feature_masks[set]); + + fs_devs = fs_info->fs_devices; + fsid_kobj = &fs_devs->fsid_kobj; + + if (!fsid_kobj->state_initialized) + return; + + /* + * FIXME: this is too heavy to update just one value, ideally we'd like + * to use sysfs_update_group but some refactoring is needed first. + */ + sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); + ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); +} + static int btrfs_init_debugfs(void) { #ifdef CONFIG_DEBUG_FS diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 9c09522125a6..d7da1a4c2f6c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -56,7 +56,7 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) #define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature) + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) #define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) @@ -90,4 +90,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, + u64 bit, enum btrfs_feature_set set); + #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b1d920b30070..1c76d73e06dc 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -82,18 +82,18 @@ void btrfs_destroy_test_fs(void) struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) { struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), - GFP_NOFS); + GFP_KERNEL); if (!fs_info) return fs_info; fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), - GFP_NOFS); + GFP_KERNEL); if (!fs_info->fs_devices) { kfree(fs_info); return NULL; } fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), - GFP_NOFS); + GFP_KERNEL); if (!fs_info->super_copy) { kfree(fs_info->fs_devices); kfree(fs_info); @@ -137,7 +137,6 @@ static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) void **slot; spin_lock(&fs_info->buffer_lock); -restart: radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { struct extent_buffer *eb; @@ -147,7 +146,7 @@ restart: /* Shouldn't happen but that kind of thinking creates CVE's */ if (radix_tree_exception(eb)) { if (radix_tree_deref_retry(eb)) - goto restart; + slot = radix_tree_iter_retry(&iter); continue; } spin_unlock(&fs_info->buffer_lock); @@ -180,11 +179,11 @@ btrfs_alloc_dummy_block_group(unsigned long length) { struct btrfs_block_group_cache *cache; - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (!cache) return NULL; cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); + GFP_KERNEL); if (!cache->free_space_ctl) { kfree(cache); return NULL; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index e29fa297e053..669b58201e36 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -94,7 +94,7 @@ static int test_find_delalloc(void) * test. */ for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) { - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { test_msg("Failed to allocate test page\n"); ret = -ENOMEM; @@ -113,7 +113,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -144,7 +144,7 @@ static int test_find_delalloc(void) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -199,7 +199,7 @@ static int test_find_delalloc(void) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -262,7 +262,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL); out: if (locked_page) page_cache_release(locked_page); @@ -360,7 +360,7 @@ static int test_eb_bitmaps(void) test_msg("Running extent buffer bitmap tests\n"); - bitmap = kmalloc(len, GFP_NOFS); + bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_msg("Couldn't allocate test bitmap\n"); return -ENOMEM; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 5de55fdd28bc..e2d3da02deee 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -974,7 +974,7 @@ static int test_extent_accounting(void) (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1045,7 +1045,7 @@ static int test_extent_accounting(void) BTRFS_MAX_EXTENT_SIZE+8191, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1079,7 +1079,7 @@ static int test_extent_accounting(void) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1096,7 +1096,7 @@ out: clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 323e12cc9d2f..978c3a810893 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, struct list_head *logged_list, - struct btrfs_log_ctx *ctx) + struct btrfs_log_ctx *ctx, + const u64 start, + const u64 end) { struct extent_map *em, *n; struct list_head extents; @@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - + /* + * Collect any new ordered extents within the range. This is to + * prevent logging file extent items without waiting for the disk + * location they point to being written. We do this only to deal + * with races against concurrent lockless direct IO writes. + */ + btrfs_get_logged_extents(inode, logged_list, start, end); process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4701,7 +4709,7 @@ log_extents: goto out_unlock; } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - &logged_list, ctx); + &logged_list, ctx, start, end); if (ret) { err = ret; goto out_unlock; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c32abbca9d77..366b335946fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { +const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, @@ -233,6 +233,7 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); + btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); @@ -1183,7 +1184,7 @@ again: struct map_lookup *map; int i; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { u64 end; @@ -2755,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, free_extent_map(em); return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; lock_chunks(root->fs_info->chunk_root); check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); @@ -3751,7 +3752,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { btrfs_warn(fs_info, - "metatdata profile 0x%llx has lower redundancy than data profile 0x%llx", + "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", bctl->meta.target, bctl->data.target); } @@ -4718,7 +4719,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, goto error; } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = start; em->len = num_bytes; em->block_start = 0; @@ -4813,7 +4814,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); stripe_size = em->orig_block_len; @@ -4968,7 +4969,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->missing) { miss_ndevs++; @@ -5048,7 +5049,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) @@ -5084,7 +5085,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); free_extent_map(em); @@ -5105,7 +5106,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; free_extent_map(em); @@ -5264,7 +5265,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; offset = logical - em->start; stripe_len = map->stripe_len; @@ -5378,35 +5379,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * target drive. */ for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - tmp_bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = - tmp_bbio->stripes[i].physical; - } + if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add + * the mirror with the lowest physical address + */ + if (found && + physical_of_found <= tmp_bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = tmp_bbio->stripes[i].physical; } - if (found) { - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; - } else { + btrfs_put_bbio(tmp_bbio); + + if (!found) { WARN_ON(1); ret = -EIO; - btrfs_put_bbio(tmp_bbio); goto out; } - btrfs_put_bbio(tmp_bbio); + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } @@ -5806,7 +5805,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, free_extent_map(em); return -EIO; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; length = em->len; rmap_len = map->stripe_len; @@ -6069,7 +6068,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); - if (bbio->raid_map) { + if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + ((rw & WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { @@ -6210,6 +6210,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_map *em; u64 logical; u64 length; + u64 stripe_len; u64 devid; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; @@ -6218,6 +6219,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + /* Validation check */ + if (!num_stripes) { + btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", + num_stripes); + return -EIO; + } + if (!IS_ALIGNED(logical, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk logical %llu", logical); + return -EIO; + } + if (!length || !IS_ALIGNED(length, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk length %llu", length); + return -EIO; + } + if (!is_power_of_2(stripe_len)) { + btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", + stripe_len); + return -EIO; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)) { + btrfs_err(root->fs_info, "unrecognized chunk type: %llu", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); + return -EIO; + } read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6234,7 +6266,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em = alloc_extent_map(); if (!em) return -ENOMEM; - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { free_extent_map(em); @@ -6242,7 +6273,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = logical; em->len = length; em->orig_start = 0; @@ -6944,7 +6975,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, /* In order to kick the device replace finish process */ lock_chunks(root); list_for_each_entry(em, &transaction->pending_chunks, list) { - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { dev = map->stripes[i].dev; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fd953c361a43..6c68d6356197 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(mutex_is_locked(&inode->i_mutex)); + ASSERT(inode_is_locked(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, name_len, 0); if (!di) diff --git a/fs/buffer.c b/fs/buffer.c index e1632abb4ca9..33be29675358 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -621,17 +621,17 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. * - * The caller must hold mem_cgroup_begin_page_stat() lock. + * The caller must hold lock_page_memcg(). */ static void __set_page_dirty(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg, int warn) + int warn) { unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } @@ -666,7 +666,6 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping, int __set_page_dirty_buffers(struct page *page) { int newly_dirty; - struct mem_cgroup *memcg; struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) @@ -683,17 +682,17 @@ int __set_page_dirty_buffers(struct page *page) } while (bh != head); } /* - * Use mem_group_begin_page_stat() to keep PageDirty synchronized with - * per-memcg dirty page counters. + * Lock out page->mem_cgroup migration to keep PageDirty + * synchronized with per-memcg dirty page counters. */ - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); if (newly_dirty) - __set_page_dirty(page, mapping, memcg, 1); + __set_page_dirty(page, mapping, 1); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1167,15 +1166,14 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!test_set_buffer_dirty(bh)) { struct page *page = bh->b_page; struct address_space *mapping = NULL; - struct mem_cgroup *memcg; - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (!TestSetPageDirty(page)) { mapping = page_mapping(page); if (mapping) - __set_page_dirty(page, mapping, memcg, 0); + __set_page_dirty(page, mapping, 0); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 452e98dd7560..1ee54ffd3a24 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -162,6 +162,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, size_t buflen, loff_t *pos) { struct cachefiles_cache *cache = file->private_data; + unsigned long long b_released; + unsigned f_released; char buffer[256]; int n; @@ -174,6 +176,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, cachefiles_has_space(cache, 0, 0); /* summarise */ + f_released = atomic_xchg(&cache->f_released, 0); + b_released = atomic_long_xchg(&cache->b_released, 0); clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags); n = snprintf(buffer, sizeof(buffer), @@ -183,15 +187,18 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer, " fstop=%llx" " brun=%llx" " bcull=%llx" - " bstop=%llx", + " bstop=%llx" + " freleased=%x" + " breleased=%llx", test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0', (unsigned long long) cache->frun, (unsigned long long) cache->fcull, (unsigned long long) cache->fstop, (unsigned long long) cache->brun, (unsigned long long) cache->bcull, - (unsigned long long) cache->bstop - ); + (unsigned long long) cache->bstop, + f_released, + b_released); if (n > buflen) return -EMSGSIZE; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index afa023dded5b..861d611b8c05 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -291,15 +291,8 @@ static void cachefiles_drop_object(struct fscache_object *_object) } /* note that the object is now inactive */ - if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { - write_lock(&cache->active_lock); - if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE, - &object->flags)) - BUG(); - rb_erase(&object->active_node, &cache->active_nodes); - wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); - write_unlock(&cache->active_lock); - } + if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) + cachefiles_mark_object_inactive(cache, object); dput(object->dentry); object->dentry = NULL; @@ -446,7 +439,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) return 0; cachefiles_begin_secure(cache, &saved_cred); - mutex_lock(&d_inode(object->backer)->i_mutex); + inode_lock(d_inode(object->backer)); /* if there's an extension to a partial page at the end of the backing * file, we need to discard the partial page so that we pick up new @@ -465,7 +458,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) ret = notify_change(object->backer, &newattrs, NULL); truncate_failed: - mutex_unlock(&d_inode(object->backer)->i_mutex); + inode_unlock(d_inode(object->backer)); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) { diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 9c4b737a54df..2fcde1a34b7c 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -66,6 +66,8 @@ struct cachefiles_cache { struct rb_root active_nodes; /* active nodes (can't be culled) */ rwlock_t active_lock; /* lock for active_nodes */ atomic_t gravecounter; /* graveyard uniquifier */ + atomic_t f_released; /* number of objects released lately */ + atomic_long_t b_released; /* number of blocks released lately */ unsigned frun_percent; /* when to stop culling (% files) */ unsigned fcull_percent; /* when to start culling (% files) */ unsigned fstop_percent; /* when to stop allocating (% files) */ @@ -157,6 +159,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); /* * namei.c */ +extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, + struct cachefiles_object *object); extern int cachefiles_delete_object(struct cachefiles_cache *cache, struct cachefiles_object *object); extern int cachefiles_walk_to_object(struct cachefiles_object *parent, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index c4b893453e0e..4ae75006e73b 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -258,6 +258,28 @@ requeue: } /* + * Mark an object as being inactive. + */ +void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, + struct cachefiles_object *object) +{ + write_lock(&cache->active_lock); + rb_erase(&object->active_node, &cache->active_nodes); + clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); + write_unlock(&cache->active_lock); + + wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); + + /* This object can now be culled, so we need to let the daemon know + * that there is something it can remove if it needs to. + */ + atomic_long_add(d_backing_inode(object->dentry)->i_blocks, + &cache->b_released); + if (atomic_inc_return(&cache->f_released)) + cachefiles_state_changed(cache); +} + +/* * delete an object representation from the cache * - file backed objects are unlinked * - directory backed objects are stuffed into the graveyard for userspace to @@ -295,7 +317,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, cachefiles_mark_object_buried(cache, rep, why); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); @@ -306,7 +328,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -423,13 +445,13 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } else { /* we need to check that our parent is _still_ our parent - it @@ -442,7 +464,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore * it */ - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } } @@ -501,7 +523,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); @@ -585,7 +607,7 @@ lookup_again: /* process the next component */ if (key) { _debug("advance"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = next; next = NULL; @@ -623,7 +645,7 @@ lookup_again: /* note that we're now using this object */ ret = cachefiles_mark_object_active(cache, object); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = NULL; @@ -684,11 +706,7 @@ mark_active_timed_out: check_error: _debug("check error %d", ret); - write_lock(&cache->active_lock); - rb_erase(&object->active_node, &cache->active_nodes); - clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); - wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); - write_unlock(&cache->active_lock); + cachefiles_mark_object_inactive(cache, object); release_dentry: dput(object->dentry); object->dentry = NULL; @@ -705,7 +723,7 @@ lookup_error: cachefiles_io_error(cache, "Lookup failed"); next = NULL; error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(next); error_out2: dput(dir); @@ -729,7 +747,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); @@ -768,7 +786,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, d_backing_inode(subdir)->i_ino); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); /* we need to make sure the subdir is a directory */ ASSERT(d_backing_inode(subdir)); @@ -800,19 +818,19 @@ check_error: return ERR_PTR(ret); mkdir_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } @@ -837,7 +855,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir, filename); /* look up the victim */ - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); @@ -852,7 +870,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, * at the netfs's request whilst the cull was in progress */ if (d_is_negative(victim)) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); _leave(" = -ENOENT [absent]"); return ERR_PTR(-ENOENT); @@ -881,13 +899,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, object_in_use: read_unlock(&cache->active_lock); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = -EBUSY [in use]"); return ERR_PTR(-EBUSY); lookup_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); if (ret == -ENOENT) { /* file or dir now absent - probably retired by netfs */ @@ -947,7 +965,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return 0; error_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); error: dput(victim); if (ret == -ENOENT) { @@ -982,7 +1000,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = 0"); return 0; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b7d218a168fb..19adeb0ef82a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1108,7 +1108,7 @@ retry_locked: return 0; /* past end of file? */ - i_size = inode->i_size; /* caller holds i_mutex */ + i_size = i_size_read(inode); if (page_off >= i_size || (pos_in_page == 0 && (pos+len) >= i_size && @@ -1149,7 +1149,6 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, page = grab_cache_page_write_begin(mapping, index, 0); if (!page) return -ENOMEM; - *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); @@ -1184,8 +1183,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, zero_user_segment(page, from+copied, len); /* did file size increase? */ - /* (no need for i_size_read(); we caller holds i_mutex */ - if (pos+copied > inode->i_size) + if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); if (!PageUptodate(page)) @@ -1378,11 +1376,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; if ((off > size) || - (page->mapping != inode->i_mapping)) + (page->mapping != inode->i_mapping)) { + unlock_page(page); goto out; + } ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret == 0) { + if (ret >= 0) { /* success. we'll keep the page locked. */ set_page_dirty(page); ret = VM_FAULT_LOCKED; @@ -1393,8 +1393,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - if (ret != VM_FAULT_LOCKED) - unlock_page(page); if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; @@ -1758,6 +1756,10 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) u32 pool; int ret, flags; + /* does not support pool namespace yet */ + if (ci->i_pool_ns_len) + return -EIO; + if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a4766ded1ba7..a351480dbabc 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -106,7 +106,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, memset(&aux, 0, sizeof(aux)); aux.mtime = inode->i_mtime; - aux.size = inode->i_size; + aux.size = i_size_read(inode); memcpy(buffer, &aux, sizeof(aux)); @@ -117,9 +117,7 @@ static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) { const struct ceph_inode_info* ci = cookie_netfs_data; - const struct inode* inode = &ci->vfs_inode; - - *size = inode->i_size; + *size = i_size_read(&ci->vfs_inode); } static enum fscache_checkaux ceph_fscache_inode_check_aux( @@ -134,7 +132,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( memset(&aux, 0, sizeof(aux)); aux.mtime = inode->i_mtime; - aux.size = inode->i_size; + aux.size = i_size_read(inode); if (memcmp(data, &aux, sizeof(aux)) != 0) return FSCACHE_CHECKAUX_OBSOLETE; @@ -197,7 +195,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, return; /* Avoid multiple racing open requests */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ci->fscache) goto done; @@ -207,7 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci, true); fscache_check_consistency(ci->fscache); done: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c69e1253b47b..6fe0ad26a7df 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2030,7 +2030,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (datasync) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -2046,7 +2046,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; @@ -2753,7 +2753,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, void *inline_data, int inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, int issued) + struct ceph_cap *cap, int issued, + u32 pool_ns_len) __releases(ci->i_ceph_lock) __releases(mdsc->snap_rwsem) { @@ -2873,6 +2874,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ ci->i_layout = grant->layout; + ci->i_pool_ns_len = pool_ns_len; + /* size/truncate_seq? */ queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(grant->truncate_seq), @@ -3411,6 +3414,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, u32 inline_len = 0; void *snaptrace; size_t snaptrace_len; + u32 pool_ns_len = 0; void *p, *end; dout("handle_caps from mds%d\n", mds); @@ -3463,6 +3467,21 @@ void ceph_handle_caps(struct ceph_mds_session *session, p += inline_len; } + if (le16_to_cpu(msg->hdr.version) >= 8) { + u64 flush_tid; + u32 caller_uid, caller_gid; + u32 osd_epoch_barrier; + /* version >= 5 */ + ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad); + /* version >= 6 */ + ceph_decode_64_safe(&p, end, flush_tid, bad); + /* version >= 7 */ + ceph_decode_32_safe(&p, end, caller_uid, bad); + ceph_decode_32_safe(&p, end, caller_gid, bad); + /* version >= 8 */ + ceph_decode_32_safe(&p, end, pool_ns_len, bad); + } + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); @@ -3518,7 +3537,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, &cap, &issued); handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued); + msg->middle, session, cap, issued, + pool_ns_len); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; @@ -3542,7 +3562,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, issued |= __ceph_caps_dirty(ci); handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued); + msg->middle, session, cap, issued, + pool_ns_len); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 9314b4ea2375..fd11fb231a2e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -507,7 +507,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = -EINVAL; switch (whence) { case SEEK_CUR: @@ -542,7 +542,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index fe02ae7f056a..3b3172357326 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -215,7 +215,7 @@ static int ceph_get_name(struct dentry *parent, char *name, if (IS_ERR(req)) return PTR_ERR(req); - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); req->r_inode = d_inode(child); ihold(d_inode(child)); @@ -224,7 +224,7 @@ static int ceph_get_name(struct dentry *parent, char *name, req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!err) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3c68e6aee2f0..eb9028e8cfc5 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -397,8 +397,9 @@ int ceph_release(struct inode *inode, struct file *file) } enum { - CHECK_EOF = 1, - READ_INLINE = 2, + HAVE_RETRIED = 1, + CHECK_EOF = 2, + READ_INLINE = 3, }; /* @@ -411,17 +412,15 @@ enum { static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool o_direct, - unsigned long buf_align) + int *checkeof) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len, left; - int io_align, page_align; - int pages_left; - int read; + loff_t i_size; + int page_align, pages_left; + int read, ret; struct page **page_pos; - int ret; bool hit_stripe, was_short; /* @@ -432,13 +431,9 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; - io_align = off & ~PAGE_MASK; more: - if (o_direct) - page_align = (pos - io_align + buf_align) & ~PAGE_MASK; - else - page_align = pos & ~PAGE_MASK; + page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, @@ -452,13 +447,12 @@ more: dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); + i_size = i_size_read(inode); if (ret >= 0) { int didpages; - if (was_short && (pos + ret < inode->i_size)) { - int zlen = min(this_len - ret, - inode->i_size - pos - ret); - int zoff = (o_direct ? buf_align : io_align) + - read + ret; + if (was_short && (pos + ret < i_size)) { + int zlen = min(this_len - ret, i_size - pos - ret); + int zoff = (off & ~PAGE_MASK) + read + ret; dout(" zero gap %llu to %llu\n", pos + ret, pos + ret + zlen); ceph_zero_page_vector_range(zoff, zlen, pages); @@ -473,14 +467,14 @@ more: pages_left -= didpages; /* hit stripe and need continue*/ - if (left && hit_stripe && pos < inode->i_size) + if (left && hit_stripe && pos < i_size) goto more; } if (read > 0) { ret = read; /* did we bounce off eof? */ - if (pos + left > inode->i_size) + if (pos + left > i_size) *checkeof = CHECK_EOF; } @@ -521,54 +515,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, if (ret < 0) return ret; - if (iocb->ki_flags & IOCB_DIRECT) { - while (iov_iter_count(i)) { - size_t start; - ssize_t n; - - n = dio_get_pagev_size(i); - pages = dio_get_pages_alloc(i, n, &start, &num_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = striped_read(inode, off, n, - pages, num_pages, checkeof, - 1, start); - - ceph_put_page_vector(pages, num_pages, true); - - if (ret <= 0) - break; - off += ret; - iov_iter_advance(i, ret); - if (ret < n) + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + size_t page_off = off & ~PAGE_MASK; + size_t copy = min_t(size_t, left, + PAGE_SIZE - page_off); + l = copy_page_to_iter(pages[k++], page_off, copy, i); + off += l; + left -= l; + if (l < copy) break; } - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - ret = striped_read(inode, off, len, pages, - num_pages, checkeof, 0, 0); - if (ret > 0) { - int l, k = 0; - size_t left = ret; - - while (left) { - size_t page_off = off & ~PAGE_MASK; - size_t copy = min_t(size_t, - PAGE_SIZE - page_off, left); - l = copy_page_to_iter(pages[k++], page_off, - copy, i); - off += l; - left -= l; - if (l < copy) - break; - } - } - ceph_release_page_vector(pages, num_pages); } + ceph_release_page_vector(pages, num_pages); if (off > iocb->ki_pos) { ret = off - iocb->ki_pos; @@ -579,6 +547,193 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, return ret; } +struct ceph_aio_request { + struct kiocb *iocb; + size_t total_len; + int write; + int error; + struct list_head osd_reqs; + unsigned num_reqs; + atomic_t pending_reqs; + struct timespec mtime; + struct ceph_cap_flush *prealloc_cf; +}; + +struct ceph_aio_work { + struct work_struct work; + struct ceph_osd_request *req; +}; + +static void ceph_aio_retry_work(struct work_struct *work); + +static void ceph_aio_complete(struct inode *inode, + struct ceph_aio_request *aio_req) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!atomic_dec_and_test(&aio_req->pending_reqs)) + return; + + ret = aio_req->error; + if (!ret) + ret = aio_req->total_len; + + dout("ceph_aio_complete %p rc %d\n", inode, ret); + + if (ret >= 0 && aio_req->write) { + int dirty; + + loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; + if (endoff > i_size_read(inode)) { + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + } + + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &aio_req->prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + + } + + ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD)); + + aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); + + ceph_free_cap_flush(aio_req->prealloc_cf); + kfree(aio_req); +} + +static void ceph_aio_complete_req(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + int rc = req->r_result; + struct inode *inode = req->r_inode; + struct ceph_aio_request *aio_req = req->r_priv; + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + int num_pages = calc_pages_for((u64)osd_data->alignment, + osd_data->length); + + dout("ceph_aio_complete_req %p rc %d bytes %llu\n", + inode, rc, osd_data->length); + + if (rc == -EOLDSNAPC) { + struct ceph_aio_work *aio_work; + BUG_ON(!aio_req->write); + + aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS); + if (aio_work) { + INIT_WORK(&aio_work->work, ceph_aio_retry_work); + aio_work->req = req; + queue_work(ceph_inode_to_client(inode)->wb_wq, + &aio_work->work); + return; + } + rc = -ENOMEM; + } else if (!aio_req->write) { + if (rc == -ENOENT) + rc = 0; + if (rc >= 0 && osd_data->length > rc) { + int zoff = osd_data->alignment + rc; + int zlen = osd_data->length - rc; + /* + * If read is satisfied by single OSD request, + * it can pass EOF. Otherwise read is within + * i_size. + */ + if (aio_req->num_reqs == 1) { + loff_t i_size = i_size_read(inode); + loff_t endoff = aio_req->iocb->ki_pos + rc; + if (endoff < i_size) + zlen = min_t(size_t, zlen, + i_size - endoff); + aio_req->total_len = rc + zlen; + } + + if (zlen > 0) + ceph_zero_page_vector_range(zoff, zlen, + osd_data->pages); + } + } + + ceph_put_page_vector(osd_data->pages, num_pages, false); + ceph_osdc_put_request(req); + + if (rc < 0) + cmpxchg(&aio_req->error, 0, rc); + + ceph_aio_complete(inode, aio_req); + return; +} + +static void ceph_aio_retry_work(struct work_struct *work) +{ + struct ceph_aio_work *aio_work = + container_of(work, struct ceph_aio_work, work); + struct ceph_osd_request *orig_req = aio_work->req; + struct ceph_aio_request *aio_req = orig_req->r_priv; + struct inode *inode = orig_req->r_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + struct ceph_osd_request *req; + int ret; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + + req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, + false, GFP_NOFS); + if (!req) { + ret = -ENOMEM; + req = orig_req; + goto out; + } + + req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + req->r_base_oloc = orig_req->r_base_oloc; + req->r_base_oid = orig_req->r_base_oid; + + req->r_ops[0] = orig_req->r_ops[0]; + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + + ceph_osdc_build_request(req, req->r_ops[0].extent.offset, + snapc, CEPH_NOSNAP, &aio_req->mtime); + + ceph_osdc_put_request(orig_req); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + + ret = ceph_osdc_start_request(req->r_osdc, req, false); +out: + if (ret < 0) { + BUG_ON(ret == -EOLDSNAPC); + req->r_result = ret; + ceph_aio_complete_req(req, NULL); + } + + ceph_put_snap_context(snapc); + kfree(aio_work); +} + /* * Write commit request unsafe callback, called to tell us when a * request is unsafe (that is, in flight--has been handed to the @@ -612,16 +767,10 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } -/* - * Synchronous write, straight from __user pointer or user pages. - * - * If write spans object boundary, just do multiple writes. (For a - * correct atomic write, we should e.g. take write locks on all - * objects, rollback on failure, etc.) - */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, - struct ceph_snap_context *snapc) +ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, + struct ceph_snap_context *snapc, + struct ceph_cap_flush **pcf) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -630,44 +779,52 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; - int num_pages; - int written = 0; + struct ceph_aio_request *aio_req = NULL; + int num_pages = 0; int flags; - int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; - size_t count = iov_iter_count(from); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos; + bool write = iov_iter_rw(iter) == WRITE; - if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_direct_write on file %p %lld~%u\n", file, pos, - (unsigned)count); + dout("sync_direct_read_write (%s) on file %p %lld~%u\n", + (write ? "write" : "read"), file, pos, (unsigned)count); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; - ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + count) >> PAGE_CACHE_SHIFT); - if (ret < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); + if (write) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + } else { + flags = CEPH_OSD_FLAG_READ; + } - while (iov_iter_count(from) > 0) { - u64 len = dio_get_pagev_size(from); - size_t start; - ssize_t n; + while (iov_iter_count(iter) > 0) { + u64 size = dio_get_pagev_size(iter); + size_t start = 0; + ssize_t len; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 0, - 2,/*include a 'startsync' command*/ - CEPH_OSD_OP_WRITE, flags, snapc, + vino, pos, &size, 0, + /*include a 'startsync' command*/ + write ? 2 : 1, + write ? CEPH_OSD_OP_WRITE : + CEPH_OSD_OP_READ, + flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -676,10 +833,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, break; } - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - - n = len; - pages = dio_get_pages_alloc(from, len, &start, &num_pages); + len = size; + pages = dio_get_pages_alloc(iter, len, &start, &num_pages); if (IS_ERR(pages)) { ceph_osdc_put_request(req); ret = PTR_ERR(pages); @@ -687,47 +842,128 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, } /* - * throw out any page cache pages in this range. this - * may block. + * To simplify error handling, allow AIO when IO within i_size + * or IO can be satisfied by single OSD request. */ - truncate_inode_pages_range(inode->i_mapping, pos, - (pos+n) | (PAGE_CACHE_SIZE-1)); - osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, - false, false); + if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) && + (len == count || pos + count <= i_size_read(inode))) { + aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL); + if (aio_req) { + aio_req->iocb = iocb; + aio_req->write = write; + INIT_LIST_HEAD(&aio_req->osd_reqs); + if (write) { + aio_req->mtime = mtime; + swap(aio_req->prealloc_cf, *pcf); + } + } + /* ignore error */ + } + + if (write) { + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+len) | (PAGE_CACHE_SIZE - 1)); + + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + } + + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, + false, false); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (aio_req) { + aio_req->total_len += len; + aio_req->num_reqs++; + atomic_inc(&aio_req->pending_reqs); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); + + pos += len; + iov_iter_advance(iter, len); + continue; + } + + ret = ceph_osdc_start_request(req->r_osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + size = i_size_read(inode); + if (!write) { + if (ret == -ENOENT) + ret = 0; + if (ret >= 0 && ret < len && pos + ret < size) { + int zlen = min_t(size_t, len - ret, + size - pos - ret); + ceph_zero_page_vector_range(start + ret, zlen, + pages); + ret += zlen; + } + if (ret >= 0) + len = ret; + } + ceph_put_page_vector(pages, num_pages, false); ceph_osdc_put_request(req); - if (ret) + if (ret < 0) + break; + + pos += len; + iov_iter_advance(iter, len); + + if (!write && pos >= size) break; - pos += n; - written += n; - iov_iter_advance(from, n); - if (pos > i_size_read(inode)) { - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) + if (write && pos > size) { + if (ceph_inode_set_size(inode, pos)) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); } } - if (ret != -EOLDSNAPC && written > 0) { + if (aio_req) { + if (aio_req->num_reqs == 0) { + kfree(aio_req); + return ret; + } + + ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD); + + while (!list_empty(&aio_req->osd_reqs)) { + req = list_first_entry(&aio_req->osd_reqs, + struct ceph_osd_request, + r_unsafe_item); + list_del_init(&req->r_unsafe_item); + if (ret >= 0) + ret = ceph_osdc_start_request(req->r_osdc, + req, false); + if (ret < 0) { + BUG_ON(ret == -EOLDSNAPC); + req->r_result = ret; + ceph_aio_complete_req(req, NULL); + } + } + return -EIOCBQUEUED; + } + + if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { + ret = pos - iocb->ki_pos; iocb->ki_pos = pos; - ret = written; } return ret; } - /* * Synchronous write, straight from __user pointer or user pages. * @@ -897,8 +1133,14 @@ again: ceph_cap_string(got)); if (ci->i_inline_version == CEPH_INLINE_NONE) { - /* hmm, this isn't really async... */ - ret = ceph_sync_read(iocb, to, &retry_op); + if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ret = ceph_direct_read_write(iocb, to, + NULL, NULL); + if (ret >= 0 && ret < len) + retry_op = CHECK_EOF; + } else { + ret = ceph_sync_read(iocb, to, &retry_op); + } } else { retry_op = READ_INLINE; } @@ -916,7 +1158,7 @@ again: pinned_page = NULL; } ceph_put_cap_refs(ci, got); - if (retry_op && ret >= 0) { + if (retry_op > HAVE_RETRIED && ret >= 0) { int statret; struct page *page = NULL; loff_t i_size; @@ -968,12 +1210,11 @@ again: if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && ret < len) { dout("sync_read hit hole, ppos %lld < size %lld" - ", reading more\n", iocb->ki_pos, - inode->i_size); + ", reading more\n", iocb->ki_pos, i_size); read += ret; len -= ret; - retry_op = 0; + retry_op = HAVE_RETRIED; goto again; } } @@ -1014,7 +1255,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1052,7 +1293,7 @@ retry_snap: } dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, count, inode->i_size); + inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else @@ -1070,7 +1311,7 @@ retry_snap: (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { struct ceph_snap_context *snapc; struct iov_iter data; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1088,8 +1329,8 @@ retry_snap: /* we might need to revert back to that point */ data = *from; if (iocb->ki_flags & IOCB_DIRECT) - written = ceph_sync_direct_write(iocb, &data, pos, - snapc); + written = ceph_direct_read_write(iocb, &data, snapc, + &prealloc_cf); else written = ceph_sync_write(iocb, &data, pos, snapc); if (written == -EOLDSNAPC) { @@ -1097,14 +1338,14 @@ retry_snap: "got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)count); - mutex_lock(&inode->i_mutex); + inode_lock(inode); goto retry_snap; } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); } else { - loff_t old_size = inode->i_size; + loff_t old_size = i_size_read(inode); /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -1115,9 +1356,9 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - if (inode->i_size > old_size) + if (i_size_read(inode) > old_size) ceph_fscache_update_objectsize(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (written >= 0) { @@ -1147,7 +1388,7 @@ retry_snap: goto out_unlocked; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; @@ -1160,9 +1401,10 @@ out_unlocked: static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; + loff_t i_size; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); @@ -1172,9 +1414,10 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) } } + i_size = i_size_read(inode); switch (whence) { case SEEK_END: - offset += inode->i_size; + offset += i_size; break; case SEEK_CUR: /* @@ -1190,24 +1433,24 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) offset += file->f_pos; break; case SEEK_DATA: - if (offset >= inode->i_size) { + if (offset >= i_size) { ret = -ENXIO; goto out; } break; case SEEK_HOLE: - if (offset >= inode->i_size) { + if (offset >= i_size) { ret = -ENXIO; goto out; } - offset = inode->i_size; + offset = i_size; break; } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -1363,7 +1606,7 @@ static long ceph_fallocate(struct file *file, int mode, if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ceph_snap(inode) != CEPH_NOSNAP) { ret = -EROFS; @@ -1418,7 +1661,7 @@ static long ceph_fallocate(struct file *file, int mode, ceph_put_cap_refs(ci, got); unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ceph_free_cap_flush(prealloc_cf); return ret; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index da55eb8bcffa..e48fd8b23257 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + ci->i_pool_ns_len = 0; ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); @@ -548,7 +549,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { dout("size %lld -> %llu\n", inode->i_size, size); - inode->i_size = size; + i_size_write(inode, size); inode->i_blocks = (size + (1<<9) - 1) >> 9; ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { @@ -756,6 +757,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; ci->i_layout = info->layout; + ci->i_pool_ns_len = iinfo->pool_ns_len; queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), @@ -808,7 +810,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); err = -EINVAL; - if (WARN_ON(symlen != inode->i_size)) + if (WARN_ON(symlen != i_size_read(inode))) goto out; err = -ENOMEM; @@ -975,13 +977,8 @@ out_unlock: /* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. - * - * we will only rehash the resulting dentry if @prehash is - * true; @prehash will be set to false (for the benefit of - * the caller) if we fail. */ -static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash) +static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) { struct dentry *realdn; @@ -994,8 +991,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, if (IS_ERR(realdn)) { pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", PTR_ERR(realdn), dn, in, ceph_vinop(in)); - if (prehash) - *prehash = false; /* don't rehash on error */ dn = realdn; /* note realdn contains the error */ goto out; } else if (realdn) { @@ -1011,8 +1006,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, dout("dn %p attached to %p ino %llx.%llx\n", dn, d_inode(dn), ceph_vinop(d_inode(dn))); } - if ((!prehash || *prehash) && d_unhashed(dn)) - d_rehash(dn); out: return dn; } @@ -1245,10 +1238,8 @@ retry_lookup: dout("d_delete %p\n", dn); d_delete(dn); } else { - dout("d_instantiate %p NULL\n", dn); - d_instantiate(dn, NULL); if (have_lease && d_unhashed(dn)) - d_rehash(dn); + d_add(dn, NULL); update_dentry_lease(dn, rinfo->dlease, session, req->r_request_started); @@ -1260,7 +1251,7 @@ retry_lookup: if (d_really_is_negative(dn)) { ceph_dir_clear_ordered(dir); ihold(in); - dn = splice_dentry(dn, in, &have_lease); + dn = splice_dentry(dn, in); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1290,7 +1281,7 @@ retry_lookup: dout(" linking snapped dir %p to dn %p\n", in, dn); ceph_dir_clear_ordered(dir); ihold(in); - dn = splice_dentry(dn, in, NULL); + dn = splice_dentry(dn, in); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1501,7 +1492,7 @@ retry_lookup: } if (d_really_is_negative(dn)) { - struct dentry *realdn = splice_dentry(dn, in, NULL); + struct dentry *realdn = splice_dentry(dn, in); if (IS_ERR(realdn)) { err = PTR_ERR(realdn); d_drop(dn); @@ -1549,7 +1540,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); - inode->i_size = size; + i_size_write(inode, size); inode->i_blocks = (size + (1 << 9) - 1) >> 9; /* tell the MDS if we are approaching max_size */ @@ -1911,7 +1902,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) inode->i_size, attr->ia_size); if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { - inode->i_size = attr->ia_size; + i_size_write(inode, attr->ia_size); inode->i_blocks = (attr->ia_size + (1 << 9) - 1) >> 9; inode->i_ctime = attr->ia_ctime; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e7b130a637f9..911d64d865f1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -100,6 +100,14 @@ static int parse_reply_info_in(void **p, void *end, } else info->inline_version = CEPH_INLINE_NONE; + if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + ceph_decode_need(p, end, info->pool_ns_len, bad); + *p += info->pool_ns_len; + } else { + info->pool_ns_len = 0; + } + return 0; bad: return err; @@ -2298,6 +2306,14 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); + /* deny access to directories with pool_ns layouts */ + if (req->r_inode && S_ISDIR(req->r_inode->i_mode) && + ceph_inode(req->r_inode)->i_pool_ns_len) + return -EIO; + if (req->r_locked_dir && + ceph_inode(req->r_locked_dir)->i_pool_ns_len) + return -EIO; + /* issue */ mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ccf11ef0ca87..37712ccffcc6 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -44,6 +44,7 @@ struct ceph_mds_reply_info_in { u64 inline_version; u32 inline_len; char *inline_data; + u32 pool_ns_len; }; /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 75b7d125ce66..9c458eb52245 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -287,6 +287,7 @@ struct ceph_inode_info { struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; + size_t i_pool_ns_len; char *i_symlink; /* for dirs */ diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 7febcf2475c5..788e19195991 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -50,7 +50,7 @@ void cifs_vfs_err(const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - pr_err("CIFS VFS: %pV", &vaf); + pr_err_ratelimited("CIFS VFS: %pV", &vaf); va_end(args); } @@ -255,7 +255,6 @@ static const struct file_operations cifs_debug_data_proc_fops = { static ssize_t cifs_stats_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char c; bool bv; int rc; struct list_head *tmp1, *tmp2, *tmp3; @@ -263,11 +262,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, struct cifs_ses *ses; struct cifs_tcon *tcon; - rc = get_user(c, buffer); - if (rc) - return rc; - - if (strtobool(&c, &bv) == 0) { + rc = kstrtobool_from_user(buffer, count, &bv); + if (rc == 0) { #ifdef CONFIG_CIFS_STATS2 atomic_set(&totBufAllocCount, 0); atomic_set(&totSmBufAllocCount, 0); @@ -290,6 +286,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, } } spin_unlock(&cifs_tcp_ses_lock); + } else { + return rc; } return count; @@ -433,17 +431,17 @@ static int cifsFYI_proc_open(struct inode *inode, struct file *file) static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char c; + char c[2] = { '\0' }; bool bv; int rc; - rc = get_user(c, buffer); + rc = get_user(c[0], buffer); if (rc) return rc; - if (strtobool(&c, &bv) == 0) + if (strtobool(c, &bv) == 0) cifsFYI = bv; - else if ((c > '1') && (c <= '9')) - cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */ + else if ((c[0] > '1') && (c[0] <= '9')) + cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */ return count; } @@ -471,20 +469,12 @@ static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file) static ssize_t cifs_linux_ext_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char c; - bool bv; int rc; - rc = get_user(c, buffer); + rc = kstrtobool_from_user(buffer, count, &linuxExtEnabled); if (rc) return rc; - rc = strtobool(&c, &bv); - if (rc) - return rc; - - linuxExtEnabled = bv; - return count; } @@ -511,20 +501,12 @@ static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file) static ssize_t cifs_lookup_cache_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char c; - bool bv; int rc; - rc = get_user(c, buffer); + rc = kstrtobool_from_user(buffer, count, &lookupCacheEnabled); if (rc) return rc; - rc = strtobool(&c, &bv); - if (rc) - return rc; - - lookupCacheEnabled = bv; - return count; } @@ -551,20 +533,12 @@ static int traceSMB_proc_open(struct inode *inode, struct file *file) static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char c; - bool bv; int rc; - rc = get_user(c, buffer); + rc = kstrtobool_from_user(buffer, count, &traceSMB); if (rc) return rc; - rc = strtobool(&c, &bv); - if (rc) - return rc; - - traceSMB = bv; - return count; } @@ -622,7 +596,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, int rc; unsigned int flags; char flags_string[12]; - char c; bool bv; if ((count < 1) || (count > 11)) @@ -635,11 +608,10 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, if (count < 3) { /* single char or single char followed by null */ - c = flags_string[0]; - if (strtobool(&c, &bv) == 0) { + if (strtobool(flags_string, &bv) == 0) { global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF; return count; - } else if (!isdigit(c)) { + } else if (!isdigit(flags_string[0])) { cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string); return -EINVAL; diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index f40fbaca1b2a..c611ca2339d7 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -25,7 +25,7 @@ void cifs_dump_mem(char *label, void *data, int length); void cifs_dump_detail(void *); void cifs_dump_mids(struct TCP_Server_Info *); -extern int traceSMB; /* flag which enables the function below */ +extern bool traceSMB; /* flag which enables the function below */ void dump_smb(void *, int); #define CIFS_INFO 0x01 #define CIFS_RC 0x02 @@ -51,14 +51,13 @@ __printf(1, 2) void cifs_vfs_err(const char *fmt, ...); /* information message: e.g., configuration, major event */ #define cifs_dbg(type, fmt, ...) \ do { \ - if (type == FYI) { \ - if (cifsFYI & CIFS_INFO) { \ - pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__); \ - } \ + if (type == FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ratelimited("%s: " \ + fmt, __FILE__, ##__VA_ARGS__); \ } else if (type == VFS) { \ cifs_vfs_err(fmt, ##__VA_ARGS__); \ } else if (type == NOISY && type != 0) { \ - pr_debug(fmt, ##__VA_ARGS__); \ + pr_debug_ratelimited(fmt, ##__VA_ARGS__); \ } \ } while (0) diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 7dc886c9a78f..e956cba94338 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -175,7 +175,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, * string to the length of the original string to allow for worst case. */ md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN; - mountdata = kzalloc(md_len + 1, GFP_KERNEL); + mountdata = kzalloc(md_len + sizeof("ip=") + 1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; goto compose_mount_options_err; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index afa09fce8151..4897dacf8944 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -33,6 +33,7 @@ #include <linux/ctype.h> #include <linux/random.h> #include <linux/highmem.h> +#include <crypto/skcipher.h> static int cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server) @@ -714,7 +715,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL); if (!ses->auth_key.response) { - rc = ENOMEM; + rc = -ENOMEM; ses->auth_key.len = 0; goto setup_ntlmv2_rsp_ret; } @@ -789,38 +790,46 @@ int calc_seckey(struct cifs_ses *ses) { int rc; - struct crypto_blkcipher *tfm_arc4; + struct crypto_skcipher *tfm_arc4; struct scatterlist sgin, sgout; - struct blkcipher_desc desc; + struct skcipher_request *req; unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */ get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); - tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_arc4)) { rc = PTR_ERR(tfm_arc4); cifs_dbg(VFS, "could not allocate crypto API arc4\n"); return rc; } - desc.tfm = tfm_arc4; - - rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, + rc = crypto_skcipher_setkey(tfm_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set response as a key\n", __func__); - return rc; + goto out_free_cipher; + } + + req = skcipher_request_alloc(tfm_arc4, GFP_KERNEL); + if (!req) { + rc = -ENOMEM; + cifs_dbg(VFS, "could not allocate crypto API arc4 request\n"); + goto out_free_cipher; } sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); - rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sgin, &sgout, CIFS_CPHTXT_SIZE, NULL); + + rc = crypto_skcipher_encrypt(req); + skcipher_request_free(req); if (rc) { cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc); - crypto_free_blkcipher(tfm_arc4); - return rc; + goto out_free_cipher; } /* make secondary_key/nonce as session key */ @@ -828,7 +837,8 @@ calc_seckey(struct cifs_ses *ses) /* and make len as that of session key only */ ses->auth_key.len = CIFS_SESS_KEY_SIZE; - crypto_free_blkcipher(tfm_arc4); +out_free_cipher: + crypto_free_skcipher(tfm_arc4); return rc; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c4c1169814b2..1d86fc620e5c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -54,10 +54,10 @@ #endif int cifsFYI = 0; -int traceSMB = 0; +bool traceSMB; bool enable_oplocks = true; -unsigned int linuxExtEnabled = 1; -unsigned int lookupCacheEnabled = 1; +bool linuxExtEnabled = true; +bool lookupCacheEnabled = true; unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; @@ -507,6 +507,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->rsize); seq_printf(s, ",wsize=%u", cifs_sb->wsize); + seq_printf(s, ",echo_interval=%lu", + tcon->ses->server->echo_interval / HZ); /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); @@ -640,9 +642,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) while (*s && *s != sep) s++; - mutex_lock(&dir->i_mutex); - child = lookup_one_len(p, dentry, s - p); - mutex_unlock(&dir->i_mutex); + child = lookup_one_len_unlocked(p, dentry, s - p); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); @@ -752,6 +752,9 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); + if (iocb->ki_filp->f_flags & O_DIRECT) + return cifs_user_readv(iocb, iter); + rc = cifs_revalidate_mapping(inode); if (rc) return rc; @@ -766,6 +769,18 @@ static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written; int rc; + if (iocb->ki_filp->f_flags & O_DIRECT) { + written = cifs_user_writev(iocb, from); + if (written > 0 && CIFS_CACHE_READ(cinode)) { + cifs_zap_mapping(inode); + cifs_dbg(FYI, + "Set no oplock for inode=%p after a write operation\n", + inode); + cinode->oplock = 0; + } + return written; + } + written = cifs_get_writer(cinode); if (written) return written; @@ -996,7 +1011,6 @@ const struct file_operations cifs_file_strict_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .clone_file_range = cifs_clone_file_range, - .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 68c4547528c4..83aac8ba50b0 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -31,19 +31,15 @@ * so that it will fit. We use hash_64 to convert the value to 31 bits, and * then add 1, to ensure that we don't end up with a 0 as the value. */ -#if BITS_PER_LONG == 64 static inline ino_t cifs_uniqueid_to_ino_t(u64 fileid) { + if ((sizeof(ino_t)) < (sizeof(u64))) + return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1; + return (ino_t)fileid; + } -#else -static inline ino_t -cifs_uniqueid_to_ino_t(u64 fileid) -{ - return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1; -} -#endif extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 2b510c537a0d..d21da9f05bae 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -70,8 +70,10 @@ #define SERVER_NAME_LENGTH 40 #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) -/* SMB echo "timeout" -- FIXME: tunable? */ -#define SMB_ECHO_INTERVAL (60 * HZ) +/* echo interval in seconds */ +#define SMB_ECHO_INTERVAL_MIN 1 +#define SMB_ECHO_INTERVAL_MAX 600 +#define SMB_ECHO_INTERVAL_DEFAULT 60 #include "cifspdu.h" @@ -225,7 +227,7 @@ struct smb_version_operations { void (*print_stats)(struct seq_file *m, struct cifs_tcon *); void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *); /* verify the message */ - int (*check_message)(char *, unsigned int); + int (*check_message)(char *, unsigned int, struct TCP_Server_Info *); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *, struct cifsInodeInfo *, bool); @@ -507,6 +509,7 @@ struct smb_vol { struct sockaddr_storage dstaddr; /* destination address */ struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; + unsigned int echo_interval; /* echo interval in secs */ }; #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ @@ -627,7 +630,9 @@ struct TCP_Server_Info { #ifdef CONFIG_CIFS_SMB2 unsigned int max_read; unsigned int max_write; + __u8 preauth_hash[512]; #endif /* CONFIG_CIFS_SMB2 */ + unsigned long echo_interval; }; static inline unsigned int @@ -809,7 +814,10 @@ struct cifs_ses { bool need_reconnect:1; /* connection reset, uid now invalid */ #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; - char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ + __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 preauth_hash[512]; #endif /* CONFIG_CIFS_SMB2 */ }; @@ -1588,11 +1596,11 @@ GLOBAL_EXTERN atomic_t midCount; /* Misc globals */ GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */ -GLOBAL_EXTERN unsigned int lookupCacheEnabled; +GLOBAL_EXTERN bool lookupCacheEnabled; GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ -GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ +GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c63fd1dde25b..eed7ff50faf0 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -102,7 +102,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *out_buf, int *bytes_returned); extern int cifs_reconnect(struct TCP_Server_Info *server); -extern int checkSMB(char *buf, unsigned int length); +extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr); extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *); extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); @@ -439,7 +439,8 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); extern int calc_seckey(struct cifs_ses *); -extern int generate_smb3signingkey(struct cifs_ses *); +extern int generate_smb30signingkey(struct cifs_ses *); +extern int generate_smb311signingkey(struct cifs_ses *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern int calc_lanman_hash(const char *password, const char *cryptkey, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 90b4f9f7de66..76fcb50295a3 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1396,11 +1396,10 @@ openRetry: * current bigbuf. */ static int -cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +discard_remaining_data(struct TCP_Server_Info *server) { unsigned int rfclen = get_rfc1002_length(server->smallbuf); int remaining = rfclen + 4 - server->total_read; - struct cifs_readdata *rdata = mid->callback_data; while (remaining > 0) { int length; @@ -1414,10 +1413,20 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) remaining -= length; } - dequeue_mid(mid, rdata->result); return 0; } +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length; + struct cifs_readdata *rdata = mid->callback_data; + + length = discard_remaining_data(server); + dequeue_mid(mid, rdata->result); + return length; +} + int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { @@ -1446,6 +1455,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return length; server->total_read += length; + if (server->ops->is_status_pending && + server->ops->is_status_pending(buf, server, 0)) { + discard_remaining_data(server); + return -1; + } + /* Was the SMB read successful? */ rdata->result = server->ops->map_error(buf, false); if (rdata->result != 0) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ecb0803bdb0e..a763cd3d9e7c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -95,6 +95,7 @@ enum { Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, + Opt_echo_interval, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -188,6 +189,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_rsize, "rsize=%s" }, { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, + { Opt_echo_interval, "echo_interval=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -368,7 +370,6 @@ cifs_reconnect(struct TCP_Server_Info *server) server->session_key.response = NULL; server->session_key.len = 0; server->lstrp = jiffies; - mutex_unlock(&server->srv_mutex); /* mark submitted MIDs for retry and issue callback */ INIT_LIST_HEAD(&retry_list); @@ -381,6 +382,7 @@ cifs_reconnect(struct TCP_Server_Info *server) list_move(&mid_entry->qhead, &retry_list); } spin_unlock(&GlobalMid_Lock); + mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_safe(tmp, tmp2, &retry_list) { @@ -418,6 +420,7 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); + unsigned long echo_interval = server->echo_interval; /* * We cannot send an echo if it is disabled or until the @@ -427,7 +430,7 @@ cifs_echo_request(struct work_struct *work) */ if (!server->ops->need_neg || server->ops->need_neg(server) || (server->ops->can_echo && !server->ops->can_echo(server)) || - time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + time_before(jiffies, server->lstrp + echo_interval - HZ)) goto requeue_echo; rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; @@ -436,7 +439,7 @@ cifs_echo_request(struct work_struct *work) server->hostname); requeue_echo: - queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &server->echo, echo_interval); } static bool @@ -487,9 +490,9 @@ server_unresponsive(struct TCP_Server_Info *server) * a response in >60s. */ if (server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) { - cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n", - server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ); + time_after(jiffies, server->lstrp + 2 * server->echo_interval)) { + cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n", + server->hostname, (2 * server->echo_interval) / HZ); cifs_reconnect(server); wake_up(&server->response_q); return true; @@ -828,7 +831,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) * 48 bytes is enough to display the header and a little bit * into the payload for debugging purposes. */ - length = server->ops->check_message(buf, server->total_read); + length = server->ops->check_message(buf, server->total_read, server); if (length != 0) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); @@ -1624,6 +1627,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } break; + case Opt_echo_interval: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid echo interval value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->echo_interval = option; + break; /* String Arguments */ @@ -2089,6 +2100,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (!match_security(server, vol)) return 0; + if (server->echo_interval != vol->echo_interval) + return 0; + return 1; } @@ -2208,6 +2222,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->tcpStatus = CifsNew; ++tcp_ses->srv_count; + if (volume_info->echo_interval >= SMB_ECHO_INTERVAL_MIN && + volume_info->echo_interval <= SMB_ECHO_INTERVAL_MAX) + tcp_ses->echo_interval = volume_info->echo_interval * HZ; + else + tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; + rc = ip_connect(tcp_ses); if (rc < 0) { cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n"); @@ -2237,7 +2257,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cifs_fscache_get_client_cookie(tcp_ses); /* queue echo request delayed work */ - queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); return tcp_ses; @@ -2979,8 +2999,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) if (ses_init_buf) { ses_init_buf->trailer.session_req.called_len = 32; - if (server->server_RFC1001_name && - server->server_RFC1001_name[0] != 0) + if (server->server_RFC1001_name[0] != 0) rfc1002mangle(ses_init_buf->trailer. session_req.called_name, server->server_RFC1001_name, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a2752b79e72..ff882aeaccc6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2267,7 +2267,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2292,7 +2292,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2309,7 +2309,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2326,7 +2326,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2672,7 +2672,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); - mutex_lock(&inode->i_mutex); + inode_lock(inode); rc = generic_write_checks(iocb, from); if (rc <= 0) @@ -2685,7 +2685,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) else rc = -EACCES; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (rc > 0) { ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a329f5ba35aa..aeb26dbfa1bf 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -814,8 +814,21 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } } else fattr.cf_uniqueid = iunique(sb, ROOT_I); - } else - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + validinum == false && server->ops->get_srv_inum) { + /* + * Pass a NULL tcon to ensure we don't make a round + * trip to the server. This only works for SMB2+. + */ + tmprc = server->ops->get_srv_inum(xid, + NULL, cifs_sb, full_path, + &fattr.cf_uniqueid, data); + if (tmprc) + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } /* query for SFU type info if supported and needed */ if (fattr.cf_cifsattrs & ATTR_SYSTEM && @@ -856,6 +869,13 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else { /* we already have inode, update it */ + /* if uniqueid is different, return error */ + if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && + CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { + rc = -ESTALE; + goto cgii_exit; + } + /* if filetype is different, return error */ if (unlikely(((*inode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 8442b8b8e0be..813fe13c2ae1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -310,7 +310,7 @@ check_smb_hdr(struct smb_hdr *smb) } int -checkSMB(char *buf, unsigned int total_read) +checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) { struct smb_hdr *smb = (struct smb_hdr *)buf; __u32 rfclen = be32_to_cpu(smb->smb_buf_length); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 0557c45e9c33..b30a4a6d98a0 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -847,6 +847,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) * if buggy server returns . and .. late do we want to * check for that here? */ + *tmp_buf = 0; rc = cifs_filldir(current_entry, file, ctx, tmp_buf, max_len); if (rc) { diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1c5907019045..389fb9f8c84e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -38,7 +38,7 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) * Make sure that this really is an SMB, that it is a response, * and that the message ids match. */ - if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) && + if ((hdr->ProtocolId == SMB2_PROTO_NUMBER) && (mid == wire_mid)) { if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) return 0; @@ -50,9 +50,9 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) cifs_dbg(VFS, "Received Request not response\n"); } } else { /* bad signature or mid */ - if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) + if (hdr->ProtocolId != SMB2_PROTO_NUMBER) cifs_dbg(VFS, "Bad protocol string signature header %x\n", - *(unsigned int *) hdr->ProtocolId); + le32_to_cpu(hdr->ProtocolId)); if (mid != wire_mid) cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", mid, wire_mid); @@ -93,11 +93,11 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { }; int -smb2_check_message(char *buf, unsigned int length) +smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) { struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; - __u64 mid = le64_to_cpu(hdr->MessageId); + __u64 mid; __u32 len = get_rfc1002_length(buf); __u32 clc_len; /* calculated length */ int command; @@ -111,6 +111,30 @@ smb2_check_message(char *buf, unsigned int length) * ie Validate the wct via smb2_struct_sizes table above */ + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + struct smb2_transform_hdr *thdr = + (struct smb2_transform_hdr *)buf; + struct cifs_ses *ses = NULL; + struct list_head *tmp; + + /* decrypt frame now that it is completely read in */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &srvr->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + if (ses->Suid == thdr->SessionId) + break; + + ses = NULL; + } + spin_unlock(&cifs_tcp_ses_lock); + if (ses == NULL) { + cifs_dbg(VFS, "no decryption - session id not found\n"); + return 1; + } + } + + + mid = le64_to_cpu(hdr->MessageId); if (length < sizeof(struct smb2_pdu)) { if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) { pdu->StructureSize2 = 0; @@ -322,7 +346,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) /* return pointer to beginning of data area, ie offset from SMB start */ if ((*off != 0) && (*len != 0)) - return (char *)(&hdr->ProtocolId[0]) + *off; + return (char *)(&hdr->ProtocolId) + *off; else return NULL; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 53ccdde6ff18..3525ed756173 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -182,6 +182,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) struct smb2_hdr *hdr = (struct smb2_hdr *)buf; __u64 wire_mid = le64_to_cpu(hdr->MessageId); + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + cifs_dbg(VFS, "encrypted frame parsing not supported yet"); + return NULL; + } + spin_lock(&GlobalMid_Lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { if ((mid->mid == wire_mid) && @@ -1692,7 +1697,7 @@ struct smb_version_operations smb30_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .generate_signingkey = generate_smb3signingkey, + .generate_signingkey = generate_smb30signingkey, .calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, @@ -1779,7 +1784,7 @@ struct smb_version_operations smb311_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .generate_signingkey = generate_smb3signingkey, + .generate_signingkey = generate_smb311signingkey, .calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, @@ -1838,7 +1843,7 @@ struct smb_version_values smb21_values = { struct smb_version_values smb30_values = { .version_string = SMB30_VERSION_STRING, .protocol_id = SMB30_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, @@ -1858,7 +1863,7 @@ struct smb_version_values smb30_values = { struct smb_version_values smb302_values = { .version_string = SMB302_VERSION_STRING, .protocol_id = SMB302_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 767555518d40..42e1f440eb1e 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -97,10 +97,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr) - 4 /* RFC 1001 length field itself not counted */); - hdr->ProtocolId[0] = 0xFE; - hdr->ProtocolId[1] = 'S'; - hdr->ProtocolId[2] = 'M'; - hdr->ProtocolId[3] = 'B'; + hdr->ProtocolId = SMB2_PROTO_NUMBER; hdr->StructureSize = cpu_to_le16(64); hdr->Command = smb2_cmd; hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ @@ -1109,21 +1106,25 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, { char *data_offset; struct create_context *cc; - unsigned int next = 0; + unsigned int next; + unsigned int remaining; char *name; data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset); + remaining = le32_to_cpu(rsp->CreateContextsLength); cc = (struct create_context *)data_offset; - do { - cc = (struct create_context *)((char *)cc + next); + while (remaining >= sizeof(struct create_context)) { name = le16_to_cpu(cc->NameOffset) + (char *)cc; - if (le16_to_cpu(cc->NameLength) != 4 || - strncmp(name, "RqLs", 4)) { - next = le32_to_cpu(cc->Next); - continue; - } - return server->ops->parse_lease_buf(cc, epoch); - } while (next != 0); + if (le16_to_cpu(cc->NameLength) == 4 && + strncmp(name, "RqLs", 4) == 0) + return server->ops->parse_lease_buf(cc, epoch); + + next = le32_to_cpu(cc->Next); + if (!next) + break; + remaining -= next; + cc = (struct create_context *)((char *)cc + next); + } return 0; } @@ -1573,7 +1574,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), + memcpy(*out_data, + (char *)&rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), *plen); ioctl_exit: free_rsp_buf(resp_buftype, rsp); @@ -2093,7 +2095,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, } if (*buf) { - memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset, + memcpy(*buf, (char *)&rsp->hdr.ProtocolId + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, iov[0].iov_base); } else if (resp_buftype != CIFS_NO_BUFFER) { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 4af52780ec35..ff88d9feb01e 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -86,6 +86,7 @@ #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) +#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) /* * SMB2 Header Definition @@ -102,7 +103,7 @@ struct smb2_hdr { __be32 smb2_buf_length; /* big endian on wire */ /* length is only two or three bytes - with one or two byte type preceding it that MBZ */ - __u8 ProtocolId[4]; /* 0xFE 'S' 'M' 'B' */ + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ __le16 StructureSize; /* 64 */ __le16 CreditCharge; /* MBZ */ __le32 Status; /* Error from server */ @@ -128,11 +129,10 @@ struct smb2_transform_hdr { one or two byte type preceding it that MBZ */ __u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */ __u8 Signature[16]; - __u8 Nonce[11]; - __u8 Reserved[5]; + __u8 Nonce[16]; __le32 OriginalMessageSize; __u16 Reserved1; - __le16 EncryptionAlgorithm; + __le16 Flags; /* EncryptionAlgorithm */ __u64 SessionId; } __packed; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 79dc650c18b2..4f07dc93608d 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -34,7 +34,8 @@ struct smb_rqst; ***************************************************************** */ extern int map_smb2_to_linux_error(char *buf, bool log_err); -extern int smb2_check_message(char *buf, unsigned int length); +extern int smb2_check_message(char *buf, unsigned int length, + struct TCP_Server_Info *server); extern unsigned int smb2_calc_size(void *buf); extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index d4c5b6f109a7..8732a43b1008 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -222,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } -int -generate_smb3signingkey(struct cifs_ses *ses) +static int generate_key(struct cifs_ses *ses, struct kvec label, + struct kvec context, __u8 *key, unsigned int key_size) { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; @@ -233,7 +233,7 @@ generate_smb3signingkey(struct cifs_ses *ses) unsigned char *hashptr = prfhash; memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); - memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); + memset(key, 0x0, key_size); rc = smb3_crypto_shash_allocate(ses->server); if (rc) { @@ -262,7 +262,7 @@ generate_smb3signingkey(struct cifs_ses *ses) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, - "SMB2AESCMAC", 12); + label.iov_base, label.iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; @@ -276,7 +276,7 @@ generate_smb3signingkey(struct cifs_ses *ses) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, - "SmbSign", 8); + context.iov_base, context.iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; @@ -296,12 +296,102 @@ generate_smb3signingkey(struct cifs_ses *ses) goto smb3signkey_ret; } - memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); + memcpy(key, hashptr, key_size); smb3signkey_ret: return rc; } +struct derivation { + struct kvec label; + struct kvec context; +}; + +struct derivation_triplet { + struct derivation signing; + struct derivation encryption; + struct derivation decryption; +}; + +static int +generate_smb3signingkey(struct cifs_ses *ses, + const struct derivation_triplet *ptriplet) +{ + int rc; + + rc = generate_key(ses, ptriplet->signing.label, + ptriplet->signing.context, ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + rc = generate_key(ses, ptriplet->encryption.label, + ptriplet->encryption.context, ses->smb3encryptionkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + return generate_key(ses, ptriplet->decryption.label, + ptriplet->decryption.context, + ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); +} + +int +generate_smb30signingkey(struct cifs_ses *ses) + +{ + struct derivation_triplet triplet; + struct derivation *d; + + d = &triplet.signing; + d->label.iov_base = "SMB2AESCMAC"; + d->label.iov_len = 12; + d->context.iov_base = "SmbSign"; + d->context.iov_len = 8; + + d = &triplet.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + d->context.iov_len = 10; + + d = &triplet.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + return generate_smb3signingkey(ses, &triplet); +} + +int +generate_smb311signingkey(struct cifs_ses *ses) + +{ + struct derivation_triplet triplet; + struct derivation *d; + + d = &triplet.signing; + d->label.iov_base = "SMB2AESCMAC"; + d->label.iov_len = 12; + d->context.iov_base = "SmbSign"; + d->context.iov_len = 8; + + d = &triplet.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + d->context.iov_len = 10; + + d = &triplet.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + return generate_smb3signingkey(ses, &triplet); +} + int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index a4232ec4f2ba..699b7868108f 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -23,6 +23,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include <crypto/skcipher.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/fs.h> @@ -70,31 +71,42 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) { int rc; unsigned char key2[8]; - struct crypto_blkcipher *tfm_des; + struct crypto_skcipher *tfm_des; struct scatterlist sgin, sgout; - struct blkcipher_desc desc; + struct skcipher_request *req; str_to_key(key, key2); - tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); + tfm_des = crypto_alloc_skcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_des)) { rc = PTR_ERR(tfm_des); cifs_dbg(VFS, "could not allocate des crypto API\n"); goto smbhash_err; } - desc.tfm = tfm_des; + req = skcipher_request_alloc(tfm_des, GFP_KERNEL); + if (!req) { + rc = -ENOMEM; + cifs_dbg(VFS, "could not allocate des crypto API\n"); + goto smbhash_free_skcipher; + } - crypto_blkcipher_setkey(tfm_des, key2, 8); + crypto_skcipher_setkey(tfm_des, key2, 8); sg_init_one(&sgin, in, 8); sg_init_one(&sgout, out, 8); - rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); + skcipher_request_set_callback(req, 0, NULL, NULL); + skcipher_request_set_crypt(req, &sgin, &sgout, 8, NULL); + + rc = crypto_skcipher_encrypt(req); if (rc) cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc); - crypto_free_blkcipher(tfm_des); + skcipher_request_free(req); + +smbhash_free_skcipher: + crypto_free_skcipher(tfm_des); smbhash_err: return rc; } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 2a24c524fb9a..87abe8ed074c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -576,14 +576,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_in_send_dec(server); cifs_save_when_sent(mid); - if (rc < 0) + if (rc < 0) { server->sequence_number -= 2; + cifs_delete_mid(mid); + } + mutex_unlock(&server->srv_mutex); if (rc == 0) return 0; - cifs_delete_mid(mid); add_credits_and_wake_if(server, credits, optype); return rc; } diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index f829fe963f5b..5104d84c4f64 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -72,8 +72,7 @@ void coda_sysctl_clean(void); } while (0) -#define CODA_FREE(ptr,size) \ - do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) +#define CODA_FREE(ptr, size) kvfree((ptr)) /* inode to cnode access functions */ diff --git a/fs/coda/dir.c b/fs/coda/dir.c index fda9f4311212..42e731b8c80a 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -427,13 +427,13 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) if (host_file->f_op->iterate) { struct inode *host_inode = file_inode(host_file); - mutex_lock(&host_inode->i_mutex); + inode_lock(host_inode); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { ret = host_file->f_op->iterate(host_file, ctx); file_accessed(host_file); } - mutex_unlock(&host_inode->i_mutex); + inode_unlock(host_inode); return ret; } /* Venus: we must read Venus dirents from a file */ diff --git a/fs/coda/file.c b/fs/coda/file.c index 1da3805f3ddc..f47c7483863b 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -71,12 +71,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) host_file = cfi->cfi_container; file_start_write(host_file); - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); file_end_write(host_file); return ret; } @@ -203,7 +203,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end); if (err) return err; - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); @@ -212,7 +212,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) err = vfs_fsync(host_file, datasync); if (!err && !datasync) err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); return err; } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index a5b8eb69a8f4..bd01b92aad98 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1040,28 +1040,6 @@ COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) /* PPPOX */ COMPATIBLE_IOCTL(PPPOEIOCSFWD) COMPATIBLE_IOCTL(PPPOEIOCDFWD) -/* ppdev */ -COMPATIBLE_IOCTL(PPSETMODE) -COMPATIBLE_IOCTL(PPRSTATUS) -COMPATIBLE_IOCTL(PPRCONTROL) -COMPATIBLE_IOCTL(PPWCONTROL) -COMPATIBLE_IOCTL(PPFCONTROL) -COMPATIBLE_IOCTL(PPRDATA) -COMPATIBLE_IOCTL(PPWDATA) -COMPATIBLE_IOCTL(PPCLAIM) -COMPATIBLE_IOCTL(PPRELEASE) -COMPATIBLE_IOCTL(PPYIELD) -COMPATIBLE_IOCTL(PPEXCL) -COMPATIBLE_IOCTL(PPDATADIR) -COMPATIBLE_IOCTL(PPNEGOT) -COMPATIBLE_IOCTL(PPWCTLONIRQ) -COMPATIBLE_IOCTL(PPCLRIRQ) -COMPATIBLE_IOCTL(PPSETPHASE) -COMPATIBLE_IOCTL(PPGETMODES) -COMPATIBLE_IOCTL(PPGETMODE) -COMPATIBLE_IOCTL(PPGETPHASE) -COMPATIBLE_IOCTL(PPGETFLAGS) -COMPATIBLE_IOCTL(PPSETFLAGS) /* Big A */ /* sparc only */ /* Big Q for sound/OSS */ @@ -1261,6 +1239,9 @@ COMPATIBLE_IOCTL(HCIUNBLOCKADDR) COMPATIBLE_IOCTL(HCIINQUIRY) COMPATIBLE_IOCTL(HCIUARTSETPROTO) COMPATIBLE_IOCTL(HCIUARTGETPROTO) +COMPATIBLE_IOCTL(HCIUARTGETDEVICE) +COMPATIBLE_IOCTL(HCIUARTSETFLAGS) +COMPATIBLE_IOCTL(HCIUARTGETFLAGS) COMPATIBLE_IOCTL(RFCOMMCREATEDEV) COMPATIBLE_IOCTL(RFCOMMRELEASEDEV) COMPATIBLE_IOCTL(RFCOMMGETDEVLIST) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index cab612b2ae76..ea59c891fc53 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -432,14 +432,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ? configfs_init_bin_file : configfs_init_file); - if (error) { + if (error) configfs_put(sd); - return error; - } - - d_rehash(dentry); - - return 0; + return error; } static struct dentry * configfs_lookup(struct inode *dir, @@ -640,13 +635,13 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; - mutex_lock(&d_inode(child)->i_mutex); + inode_lock(d_inode(child)); configfs_detach_group(sd->s_element); d_inode(child)->i_flags |= S_DEAD; dont_mount(child); - mutex_unlock(&d_inode(child)->i_mutex); + inode_unlock(d_inode(child)); d_delete(child); dput(child); @@ -701,23 +696,29 @@ static int populate_groups(struct config_group *group) { struct config_group *new_group; int ret = 0; - int i; - - if (group->default_groups) { - for (i = 0; group->default_groups[i]; i++) { - new_group = group->default_groups[i]; - ret = create_default_group(group, new_group); - if (ret) { - detach_groups(group); - break; - } + list_for_each_entry(new_group, &group->default_groups, group_entry) { + ret = create_default_group(group, new_group); + if (ret) { + detach_groups(group); + break; } } return ret; } +void configfs_remove_default_groups(struct config_group *group) +{ + struct config_group *g, *n; + + list_for_each_entry_safe(g, n, &group->default_groups, group_entry) { + list_del(&g->group_entry); + config_item_put(&g->cg_item); + } +} +EXPORT_SYMBOL(configfs_remove_default_groups); + /* * All of link_obj/unlink_obj/link_group/unlink_group require that * subsys->su_mutex is held. @@ -766,15 +767,10 @@ static void link_obj(struct config_item *parent_item, struct config_item *item) static void unlink_group(struct config_group *group) { - int i; struct config_group *new_group; - if (group->default_groups) { - for (i = 0; group->default_groups[i]; i++) { - new_group = group->default_groups[i]; - unlink_group(new_group); - } - } + list_for_each_entry(new_group, &group->default_groups, group_entry) + unlink_group(new_group); group->cg_subsys = NULL; unlink_obj(&group->cg_item); @@ -782,7 +778,6 @@ static void unlink_group(struct config_group *group) static void link_group(struct config_group *parent_group, struct config_group *group) { - int i; struct config_group *new_group; struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ @@ -796,12 +791,8 @@ static void link_group(struct config_group *parent_group, struct config_group *g BUG(); group->cg_subsys = subsys; - if (group->default_groups) { - for (i = 0; group->default_groups[i]; i++) { - new_group = group->default_groups[i]; - link_group(group, new_group); - } - } + list_for_each_entry(new_group, &group->default_groups, group_entry) + link_group(group, new_group); } /* @@ -834,11 +825,11 @@ static int configfs_attach_item(struct config_item *parent_item, * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. */ - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); configfs_remove_dir(item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); } } @@ -874,7 +865,7 @@ static int configfs_attach_group(struct config_item *parent_item, * We must also lock the inode to remove it safely in case of * error, as rmdir() would. */ - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); ret = populate_groups(to_config_group(item)); if (ret) { @@ -883,7 +874,7 @@ static int configfs_attach_group(struct config_item *parent_item, dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (ret) d_delete(dentry); } @@ -1135,7 +1126,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, * subsystem is really registered, and so we need to lock out * configfs_[un]register_subsystem(). */ - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item); if (!subsys_sd) { @@ -1147,7 +1138,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, ret = configfs_do_depend_item(subsys_sd->s_dentry, target); out_unlock_fs: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); /* * If we succeeded, the fs is pinned via other methods. If not, @@ -1230,7 +1221,7 @@ int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys, * additional locking to prevent other subsystem from being * unregistered */ - mutex_lock(&d_inode(root->cg_item.ci_dentry)->i_mutex); + inode_lock(d_inode(root->cg_item.ci_dentry)); /* * As we are trying to depend item from other subsystem @@ -1254,7 +1245,7 @@ out_root_unlock: * We were called from subsystem other than our target so we * took some locks so now it's time to release them */ - mutex_unlock(&d_inode(root->cg_item.ci_dentry)->i_mutex); + inode_unlock(d_inode(root->cg_item.ci_dentry)); return ret; } @@ -1561,7 +1552,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) down_write(&configfs_rename_sem); parent = item->parent->dentry; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { @@ -1577,7 +1568,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); up_write(&configfs_rename_sem); return error; @@ -1590,7 +1581,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) struct configfs_dirent * parent_sd = dentry->d_fsdata; int err; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached @@ -1603,7 +1594,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) else err = 0; } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return err; } @@ -1613,11 +1604,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); spin_unlock(&configfs_dirent_lock); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); release_configfs_dirent(cursor); @@ -1698,7 +1689,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -1706,7 +1697,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -1732,7 +1723,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&configfs_dirent_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } @@ -1767,14 +1758,14 @@ int configfs_register_group(struct config_group *parent_group, parent = parent_group->cg_item.ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); ret = create_default_group(parent_group, group); if (!ret) { spin_lock(&configfs_dirent_lock); configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return ret; } EXPORT_SYMBOL(configfs_register_group); @@ -1791,7 +1782,7 @@ void configfs_unregister_group(struct config_group *group) struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); configfs_detach_prep(dentry, NULL); spin_unlock(&configfs_dirent_lock); @@ -1800,7 +1791,7 @@ void configfs_unregister_group(struct config_group *group) d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); d_delete(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); dput(dentry); @@ -1872,7 +1863,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = root->d_fsdata; link_group(to_config_group(sd->s_element), group); - mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); err = -ENOMEM; dentry = d_alloc_name(root, group->cg_item.ci_name); @@ -1892,7 +1883,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) } } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); if (err) { unlink_group(group); @@ -1913,9 +1904,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) return; } - mutex_lock_nested(&d_inode(root)->i_mutex, + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { @@ -1926,11 +1917,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(dentry); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 3687187c8ea5..33b7ee34eda5 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -540,10 +540,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, CONFIGFS_ITEM_ATTR); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return error; } @@ -562,10 +562,10 @@ int configfs_create_bin_file(struct config_item *item, umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, CONFIGFS_ITEM_BIN_ATTR); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 0cc810e9dccc..03d124ae27d7 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -75,7 +75,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) sd_iattr->ia_mode = sd->s_mode; sd_iattr->ia_uid = GLOBAL_ROOT_UID; sd_iattr->ia_gid = GLOBAL_ROOT_GID; - sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; + sd_iattr->ia_atime = sd_iattr->ia_mtime = + sd_iattr->ia_ctime = current_fs_time(inode->i_sb); sd->s_iattr = sd_iattr; } /* attributes were changed atleast once in past */ @@ -111,7 +112,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = + inode->i_ctime = current_fs_time(inode->i_sb); } static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) @@ -195,13 +197,21 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in return -ENOMEM; p_inode = d_inode(dentry->d_parent); - p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; + p_inode->i_mtime = p_inode->i_ctime = current_fs_time(p_inode->i_sb); configfs_set_inode_lock_class(sd, inode); init(inode); - d_instantiate(dentry, inode); - if (S_ISDIR(mode) || S_ISLNK(mode)) + if (S_ISDIR(mode) || S_ISLNK(mode)) { + /* + * ->symlink(), ->mkdir(), configfs_register_subsystem() or + * create_default_group() - already hashed. + */ + d_instantiate(dentry, inode); dget(dentry); /* pin link and directory dentries in core */ + } else { + /* ->lookup() */ + d_add(dentry, inode); + } return error; } @@ -255,7 +265,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) /* no inode means this hasn't been made visible yet */ return; - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -268,5 +278,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) break; } } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); } diff --git a/fs/configfs/item.c b/fs/configfs/item.c index b863a09cd2f1..8b2a994042dd 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -182,6 +182,7 @@ void config_group_init(struct config_group *group) { config_item_init(&group->cg_item); INIT_LIST_HEAD(&group->cg_children); + INIT_LIST_HEAD(&group->default_groups); } EXPORT_SYMBOL(config_group_init); @@ -24,6 +24,7 @@ #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/pagevec.h> #include <linux/pmem.h> #include <linux/sched.h> #include <linux/uio.h> @@ -57,16 +58,35 @@ static void dax_unmap_atomic(struct block_device *bdev, blk_queue_exit(bdev->bd_queue); } +struct page *read_dax_sector(struct block_device *bdev, sector_t n) +{ + struct page *page = alloc_pages(GFP_KERNEL, 0); + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), + }; + long rc; + + if (!page) + return ERR_PTR(-ENOMEM); + + rc = dax_map_atomic(bdev, &dax); + if (rc < 0) + return ERR_PTR(rc); + memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); + dax_unmap_atomic(bdev, &dax); + return page; +} + /* - * dax_clear_blocks() is called from within transaction context from XFS, + * dax_clear_sectors() is called from within transaction context from XFS, * and hence this means the stack from this point must follow GFP_NOFS * semantics for all operations. */ -int dax_clear_blocks(struct inode *inode, sector_t block, long _size) +int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) { - struct block_device *bdev = inode->i_sb->s_bdev; struct blk_dax_ctl dax = { - .sector = block << (inode->i_blkbits - 9), + .sector = _sector, .size = _size, }; @@ -88,7 +108,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long _size) wmb_pmem(); return 0; } -EXPORT_SYMBOL_GPL(dax_clear_blocks); +EXPORT_SYMBOL_GPL(dax_clear_sectors); /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, @@ -245,13 +265,14 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, loff_t end = pos + iov_iter_count(iter); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, pos, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -263,7 +284,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, retval = dax_io(inode, iter, pos, end, get_block, &bh); if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); @@ -324,6 +345,202 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } +#define NO_SECTOR -1 +#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) + +static int dax_radix_entry(struct address_space *mapping, pgoff_t index, + sector_t sector, bool pmd_entry, bool dirty) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + pgoff_t pmd_index = DAX_PMD_INDEX(index); + int type, error = 0; + void *entry; + + WARN_ON_ONCE(pmd_entry && !dirty); + if (dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + spin_lock_irq(&mapping->tree_lock); + + entry = radix_tree_lookup(page_tree, pmd_index); + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { + index = pmd_index; + goto dirty; + } + + entry = radix_tree_lookup(page_tree, index); + if (entry) { + type = RADIX_DAX_TYPE(entry); + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && + type != RADIX_DAX_PMD)) { + error = -EIO; + goto unlock; + } + + if (!pmd_entry || type == RADIX_DAX_PMD) + goto dirty; + + /* + * We only insert dirty PMD entries into the radix tree. This + * means we don't need to worry about removing a dirty PTE + * entry and inserting a clean PMD entry, thus reducing the + * range we would flush with a follow-up fsync/msync call. + */ + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + } + + if (sector == NO_SECTOR) { + /* + * This can happen during correct operation if our pfn_mkwrite + * fault raced against a hole punch operation. If this + * happens the pte that was hole punched will have been + * unmapped and the radix tree entry will have been removed by + * the time we are called, but the call will still happen. We + * will return all the way up to wp_pfn_shared(), where the + * pte_same() check will fail, eventually causing page fault + * to be retried by the CPU. + */ + goto unlock; + } + + error = radix_tree_insert(page_tree, index, + RADIX_DAX_ENTRY(sector, pmd_entry)); + if (error) + goto unlock; + + mapping->nrexceptional++; + dirty: + if (dirty) + radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + unlock: + spin_unlock_irq(&mapping->tree_lock); + return error; +} + +static int dax_writeback_one(struct block_device *bdev, + struct address_space *mapping, pgoff_t index, void *entry) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + int type = RADIX_DAX_TYPE(entry); + struct radix_tree_node *node; + struct blk_dax_ctl dax; + void **slot; + int ret = 0; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + + /* another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto unlock; + + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + ret = -EIO; + goto unlock; + } + + dax.sector = RADIX_DAX_SECTOR(entry); + dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); + spin_unlock_irq(&mapping->tree_lock); + + /* + * We cannot hold tree_lock while calling dax_map_atomic() because it + * eventually calls cond_resched(). + */ + ret = dax_map_atomic(bdev, &dax); + if (ret < 0) + return ret; + + if (WARN_ON_ONCE(ret < dax.size)) { + ret = -EIO; + goto unmap; + } + + wb_cache_pmem(dax.addr, dax.size); + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + unmap: + dax_unmap_atomic(bdev, &dax); + return ret; + + unlock: + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Flush the mapping to the persistent domain within the byte range of [start, + * end]. This is required by data integrity operations to ensure file data is + * on persistent storage prior to completion of the operation. + */ +int dax_writeback_mapping_range(struct address_space *mapping, + struct block_device *bdev, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + pgoff_t start_index, end_index, pmd_index; + pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + bool done = false; + int i, ret = 0; + void *entry; + + if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) + return -EIO; + + if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) + return 0; + + start_index = wbc->range_start >> PAGE_CACHE_SHIFT; + end_index = wbc->range_end >> PAGE_CACHE_SHIFT; + pmd_index = DAX_PMD_INDEX(start_index); + + rcu_read_lock(); + entry = radix_tree_lookup(&mapping->page_tree, pmd_index); + rcu_read_unlock(); + + /* see if the start of our range is covered by a PMD entry */ + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) + start_index = pmd_index; + + tag_pages_for_writeback(mapping, start_index, end_index); + + pagevec_init(&pvec, 0); + while (!done) { + pvec.nr = find_get_entries_tag(mapping, start_index, + PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, + pvec.pages, indices); + + if (pvec.nr == 0) + break; + + for (i = 0; i < pvec.nr; i++) { + if (indices[i] > end_index) { + done = true; + break; + } + + ret = dax_writeback_one(bdev, mapping, indices[i], + pvec.pages[i]); + if (ret < 0) + return ret; + } + } + wmb_pmem(); + return 0; +} +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); + static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -363,6 +580,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, } dax_unmap_atomic(bdev, &dax); + error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, + vmf->flags & FAULT_FLAG_WRITE); + if (error) + goto out; + error = vm_insert_mixed(vma, vaddr, dax.pfn); out: @@ -408,6 +630,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: @@ -487,6 +710,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, delete_from_page_cache(page); unlock_page(page); page_cache_release(page); + page = NULL; } /* @@ -590,7 +814,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, struct block_device *bdev; pgoff_t size, pgoff; sector_t block; - int result = 0; + int error, result = 0; + bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) @@ -624,13 +849,21 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - if (get_block(inode, block, &bh, write) != 0) + + if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; + + if (!buffer_mapped(&bh) && write) { + if (get_block(inode, block, &bh, 1) != 0) + return VM_FAULT_SIGBUS; + alloc = true; + } + bdev = bh.b_bdev; - i_mmap_lock_read(mapping); /* * If the filesystem isn't willing to tell us the length of a hole, @@ -639,19 +872,22 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, */ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { dax_pmd_dbg(&bh, address, "allocated block too small"); - goto fallback; + return VM_FAULT_FALLBACK; } /* * If we allocated new storage, make sure no process has any * zero pages covering this hole */ - if (buffer_new(&bh)) { - i_mmap_unlock_read(mapping); - unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); - i_mmap_lock_read(mapping); + if (alloc) { + loff_t lstart = pgoff << PAGE_SHIFT; + loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ + + truncate_pagecache_range(inode, lstart, lend); } + i_mmap_lock_read(mapping); + /* * If a truncate happened while we were allocating blocks, we may * leave blocks allocated to the file that are beyond EOF. We can't @@ -664,7 +900,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, goto out; } if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(&bh, address, "pgoff unaligned"); + dax_pmd_dbg(&bh, address, + "offset + huge page size > file size"); goto fallback; } @@ -732,6 +969,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } dax_unmap_atomic(bdev, &dax); + /* + * For PTE faults we insert a radix tree entry for reads, and + * leave it clean. Then on the first write we dirty the radix + * tree entry via the dax_pfn_mkwrite() path. This sequence + * allows the dax_pfn_mkwrite() call to be simpler and avoid a + * call into get_block() to translate the pgoff to a sector in + * order to be able to create a new radix tree entry. + * + * The PMD path doesn't have an equivalent to + * dax_pfn_mkwrite(), though, so for a read followed by a + * write we traverse all the way through __dax_pmd_fault() + * twice. This means we can just skip inserting a radix tree + * entry completely on the initial read and just wait until + * the write to insert a dirty entry. + */ + if (write) { + error = dax_radix_entry(mapping, pgoff, dax.sector, + true, true); + if (error) { + dax_pmd_dbg(&bh, address, + "PMD radix insertion failed"); + goto fallback; + } + } + dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: %lx sect: %llx\n", __func__, current->comm, address, @@ -790,15 +1052,27 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault - * */ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct file *file = vma->vm_file; + int error; - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - sb_end_pagefault(sb); + /* + * We pass NO_SECTOR to dax_radix_entry() because we expect that a + * RADIX_DAX_PTE entry already exists in the radix tree from a + * previous call to __dax_fault(). We just want to look up that PTE + * entry using vmf->pgoff and make sure the dirty tag is set. This + * saves us from having to make a call to get_block() here to look + * up the sector. + */ + error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, + true); + + if (error == -ENOMEM) + return VM_FAULT_OOM; + if (error) + return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -835,6 +1109,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, BUG_ON((offset + length) > PAGE_CACHE_SIZE); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_CACHE_SIZE; err = get_block(inode, index, &bh, 0); if (err < 0) diff --git a/fs/dcache.c b/fs/dcache.c index b4539e84e577..32ceae3e6112 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -269,9 +269,6 @@ static inline int dname_external(const struct dentry *dentry) return dentry->d_name.name != dentry->d_iname; } -/* - * Make sure other CPUs see the inode attached before the type is set. - */ static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) @@ -279,28 +276,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry, unsigned flags; dentry->d_inode = inode; - smp_wmb(); flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); flags |= type_flags; WRITE_ONCE(dentry->d_flags, flags); } -/* - * Ideally, we want to make sure that other CPUs see the flags cleared before - * the inode is detached, but this is really a violation of RCU principles - * since the ordering suggests we should always set inode before flags. - * - * We should instead replace or discard the entire dentry - but that sucks - * performancewise on mass deletion/rename. - */ static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); WRITE_ONCE(dentry->d_flags, flags); - smp_wmb(); dentry->d_inode = NULL; } @@ -370,9 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; + + raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); - dentry_rcuwalk_invalidate(dentry); + raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -1756,12 +1745,12 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - if (inode) - hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); - dentry_rcuwalk_invalidate(dentry); + raw_write_seqcount_end(&dentry->d_seq); + __fsnotify_d_instantiate(dentry); spin_unlock(&dentry->d_lock); - fsnotify_d_instantiate(dentry, inode); } /** @@ -1782,91 +1771,16 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); - if (inode) + if (inode) { spin_lock(&inode->i_lock); - __d_instantiate(entry, inode); - if (inode) + __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); + } security_d_instantiate(entry, inode); } EXPORT_SYMBOL(d_instantiate); /** - * d_instantiate_unique - instantiate a non-aliased dentry - * @entry: dentry to instantiate - * @inode: inode to attach to this dentry - * - * Fill in inode information in the entry. On success, it returns NULL. - * If an unhashed alias of "entry" already exists, then we return the - * aliased dentry instead and drop one reference to inode. - * - * Note that in order to avoid conflicts with rename() etc, the caller - * had better be holding the parent directory semaphore. - * - * This also assumes that the inode count has been incremented - * (or otherwise set) by the caller to indicate that it is now - * in use by the dcache. - */ -static struct dentry *__d_instantiate_unique(struct dentry *entry, - struct inode *inode) -{ - struct dentry *alias; - int len = entry->d_name.len; - const char *name = entry->d_name.name; - unsigned int hash = entry->d_name.hash; - - if (!inode) { - __d_instantiate(entry, NULL); - return NULL; - } - - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { - /* - * Don't need alias->d_lock here, because aliases with - * d_parent == entry->d_parent are not subject to name or - * parent changes, because the parent inode i_mutex is held. - */ - if (alias->d_name.hash != hash) - continue; - if (alias->d_parent != entry->d_parent) - continue; - if (alias->d_name.len != len) - continue; - if (dentry_cmp(alias, name, len)) - continue; - __dget(alias); - return alias; - } - - __d_instantiate(entry, inode); - return NULL; -} - -struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) -{ - struct dentry *result; - - BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); - - if (inode) - spin_lock(&inode->i_lock); - result = __d_instantiate_unique(entry, inode); - if (inode) - spin_unlock(&inode->i_lock); - - if (!result) { - security_d_instantiate(entry, inode); - return NULL; - } - - BUG_ON(!d_unhashed(result)); - iput(inode); - return result; -} - -EXPORT_SYMBOL(d_instantiate_unique); - -/** * d_instantiate_no_diralias - instantiate a non-aliased dentry * @entry: dentry to complete * @inode: inode to attach to this dentry @@ -2446,6 +2360,86 @@ void d_rehash(struct dentry * entry) } EXPORT_SYMBOL(d_rehash); + +/* inode->i_lock held if inode is non-NULL */ + +static inline void __d_add(struct dentry *dentry, struct inode *inode) +{ + if (inode) { + __d_instantiate(dentry, inode); + spin_unlock(&inode->i_lock); + } + security_d_instantiate(dentry, inode); + d_rehash(dentry); +} + +/** + * d_add - add dentry to hash queues + * @entry: dentry to add + * @inode: The inode to attach to this dentry + * + * This adds the entry to the hash queues and initializes @inode. + * The entry was actually filled in earlier during d_alloc(). + */ + +void d_add(struct dentry *entry, struct inode *inode) +{ + if (inode) + spin_lock(&inode->i_lock); + __d_add(entry, inode); +} +EXPORT_SYMBOL(d_add); + +/** + * d_exact_alias - find and hash an exact unhashed alias + * @entry: dentry to add + * @inode: The inode to go with this dentry + * + * If an unhashed dentry with the same name/parent and desired + * inode already exists, hash and return it. Otherwise, return + * NULL. + * + * Parent directory should be locked. + */ +struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) +{ + struct dentry *alias; + int len = entry->d_name.len; + const char *name = entry->d_name.name; + unsigned int hash = entry->d_name.hash; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + /* + * Don't need alias->d_lock here, because aliases with + * d_parent == entry->d_parent are not subject to name or + * parent changes, because the parent inode i_mutex is held. + */ + if (alias->d_name.hash != hash) + continue; + if (alias->d_parent != entry->d_parent) + continue; + if (alias->d_name.len != len) + continue; + if (dentry_cmp(alias, name, len)) + continue; + spin_lock(&alias->d_lock); + if (!d_unhashed(alias)) { + spin_unlock(&alias->d_lock); + alias = NULL; + } else { + __dget_dlock(alias); + _d_rehash(alias); + spin_unlock(&alias->d_lock); + } + spin_unlock(&inode->i_lock); + return alias; + } + spin_unlock(&inode->i_lock); + return NULL; +} +EXPORT_SYMBOL(d_exact_alias); + /** * dentry_update_name_case - update case insensitive dentry with a new name * @dentry: dentry to be updated @@ -2462,7 +2456,7 @@ EXPORT_SYMBOL(d_rehash); */ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) { - BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); + BUG_ON(!inode_is_locked(dentry->d_parent->d_inode)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ spin_lock(&dentry->d_lock); @@ -2738,7 +2732,7 @@ static int __d_unalias(struct inode *inode, if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; - if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + if (!inode_trylock(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: @@ -2782,10 +2776,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) BUG_ON(!d_unhashed(dentry)); - if (!inode) { - __d_instantiate(dentry, NULL); + if (!inode) goto out; - } + spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); @@ -2819,12 +2812,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) return new; } } - /* already taking inode->i_lock, so d_add() by hand */ - __d_instantiate(dentry, inode); - spin_unlock(&inode->i_lock); out: - security_d_instantiate(dentry, inode); - d_rehash(dentry); + __d_add(dentry, inode); return NULL; } EXPORT_SYMBOL(d_splice_alias); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b7fcc0de0b2f..bece948b363d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -265,7 +265,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { dput(dentry); @@ -273,7 +273,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -282,7 +282,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return NULL; @@ -290,7 +290,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); return dentry; } @@ -560,9 +560,9 @@ void debugfs_remove(struct dentry *dentry) if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); ret = __debugfs_remove(dentry, parent); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -594,7 +594,7 @@ void debugfs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); loop: /* * The parent->d_subdirs is protected by the d_lock. Outside that @@ -609,7 +609,7 @@ void debugfs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); parent = child; goto down; } @@ -630,10 +630,10 @@ void debugfs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); child = parent; parent = parent->d_parent; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (child != dentry) /* go up */ @@ -641,7 +641,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c35ffdc12bba..655f21f99160 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -255,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb) if (!uid_valid(root_uid) || !gid_valid(root_gid)) return -EINVAL; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); /* If we have already created ptmx node, return */ if (fsi->ptmx_dentry) { @@ -292,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return rc; } @@ -575,6 +575,26 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) mutex_unlock(&allocated_ptys_lock); } +/* + * pty code needs to hold extra references in case of last /dev/tty close + */ + +void devpts_add_ref(struct inode *ptmx_inode) +{ + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + + atomic_inc(&sb->s_active); + ihold(ptmx_inode); +} + +void devpts_del_ref(struct inode *ptmx_inode) +{ + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + + iput(ptmx_inode); + deactivate_super(sb); +} + /** * devpts_pty_new -- create a new inode in /dev/pts/ * @ptmx_inode: inode of the master @@ -615,7 +635,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, sprintf(s, "%d", index); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, s); if (dentry) { @@ -626,7 +646,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, inode = ERR_PTR(-ENOMEM); } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return inode; } @@ -671,7 +691,7 @@ void devpts_pty_kill(struct inode *inode) BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_find_alias(inode); @@ -680,7 +700,7 @@ void devpts_pty_kill(struct inode *inode) dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 602e8441bc0f..0a8d937c6775 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -445,7 +445,8 @@ static struct bio *dio_await_one(struct dio *dio) __set_current_state(TASK_UNINTERRUPTIBLE); dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + if (!(dio->iocb->ki_flags & IOCB_HIPRI) || + !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); @@ -472,8 +473,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) dio->io_error = -EIO; if (dio->is_async && dio->rw == READ && dio->should_dirty) { - bio_check_pages_dirty(bio); /* transfers ownership */ err = bio->bi_error; + bio_check_pages_dirty(bio); /* transfers ownership */ } else { bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; @@ -1157,12 +1158,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, iocb->ki_filp->f_mapping; /* will be released by direct_io_worker */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, offset, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); goto out; } @@ -1173,7 +1174,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->i_size = i_size_read(inode); if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { if (dio->flags & DIO_LOCKING) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); retval = 0; goto out; @@ -1295,7 +1296,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * of protecting us from looking up uninitialized blocks. */ if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); + inode_unlock(dio->inode); /* * The only time we want to leave bios in flight is when a successful diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 8e294fbbac39..519112168a9e 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -346,7 +346,6 @@ static struct config_group *make_cluster(struct config_group *g, void *gps = NULL; cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); - gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS); sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); @@ -357,10 +356,8 @@ static struct config_group *make_cluster(struct config_group *g, config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); config_group_init_type_name(&cms->cs_group, "comms", &comms_type); - cl->group.default_groups = gps; - cl->group.default_groups[0] = &sps->ss_group; - cl->group.default_groups[1] = &cms->cs_group; - cl->group.default_groups[2] = NULL; + configfs_add_default_group(&sps->ss_group, &cl->group); + configfs_add_default_group(&cms->cs_group, &cl->group); cl->cl_tcp_port = dlm_config.ci_tcp_port; cl->cl_buffer_size = dlm_config.ci_buffer_size; @@ -383,7 +380,6 @@ static struct config_group *make_cluster(struct config_group *g, fail: kfree(cl); - kfree(gps); kfree(sps); kfree(cms); return ERR_PTR(-ENOMEM); @@ -392,14 +388,8 @@ static struct config_group *make_cluster(struct config_group *g, static void drop_cluster(struct config_group *g, struct config_item *i) { struct dlm_cluster *cl = config_item_to_cluster(i); - struct config_item *tmp; - int j; - for (j = 0; cl->group.default_groups[j]; j++) { - tmp = &cl->group.default_groups[j]->cg_item; - cl->group.default_groups[j] = NULL; - config_item_put(tmp); - } + configfs_remove_default_groups(&cl->group); space_list = NULL; comm_list = NULL; @@ -410,7 +400,6 @@ static void drop_cluster(struct config_group *g, struct config_item *i) static void release_cluster(struct config_item *i) { struct dlm_cluster *cl = config_item_to_cluster(i); - kfree(cl->group.default_groups); kfree(cl); } @@ -418,21 +407,17 @@ static struct config_group *make_space(struct config_group *g, const char *name) { struct dlm_space *sp = NULL; struct dlm_nodes *nds = NULL; - void *gps = NULL; sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); - gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS); nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); - if (!sp || !gps || !nds) + if (!sp || !nds) goto fail; config_group_init_type_name(&sp->group, name, &space_type); - config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type); - sp->group.default_groups = gps; - sp->group.default_groups[0] = &nds->ns_group; - sp->group.default_groups[1] = NULL; + config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type); + configfs_add_default_group(&nds->ns_group, &sp->group); INIT_LIST_HEAD(&sp->members); mutex_init(&sp->members_lock); @@ -441,7 +426,6 @@ static struct config_group *make_space(struct config_group *g, const char *name) fail: kfree(sp); - kfree(gps); kfree(nds); return ERR_PTR(-ENOMEM); } @@ -449,24 +433,16 @@ static struct config_group *make_space(struct config_group *g, const char *name) static void drop_space(struct config_group *g, struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); - struct config_item *tmp; - int j; /* assert list_empty(&sp->members) */ - for (j = 0; sp->group.default_groups[j]; j++) { - tmp = &sp->group.default_groups[j]->cg_item; - sp->group.default_groups[j] = NULL; - config_item_put(tmp); - } - + configfs_remove_default_groups(&sp->group); config_item_put(i); } static void release_space(struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); - kfree(sp->group.default_groups); kfree(sp); } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3a37bd3f9637..00640e70ed7a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -124,7 +124,10 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ - void (*orig_error_report)(struct sock *sk); + void (*orig_error_report)(struct sock *); + void (*orig_data_ready)(struct sock *); + void (*orig_state_change)(struct sock *); + void (*orig_write_space)(struct sock *); }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -467,16 +470,24 @@ int dlm_lowcomms_connect_node(int nodeid) static void lowcomms_error_report(struct sock *sk) { - struct connection *con = sock2con(sk); + struct connection *con; struct sockaddr_storage saddr; + int buflen; + void (*orig_report)(struct sock *) = NULL; - if (nodeid_to_addr(con->nodeid, &saddr, NULL, false)) { + read_lock_bh(&sk->sk_callback_lock); + con = sock2con(sk); + if (con == NULL) + goto out; + + orig_report = con->orig_error_report; + if (con->sock == NULL || + kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) { printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d, port %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), con->nodeid, dlm_config.ci_tcp_port, sk->sk_err, sk->sk_err_soft); - return; } else if (saddr.ss_family == AF_INET) { struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr; @@ -499,22 +510,54 @@ static void lowcomms_error_report(struct sock *sk) dlm_config.ci_tcp_port, sk->sk_err, sk->sk_err_soft); } - con->orig_error_report(sk); +out: + read_unlock_bh(&sk->sk_callback_lock); + if (orig_report) + orig_report(sk); +} + +/* Note: sk_callback_lock must be locked before calling this function. */ +static void save_callbacks(struct connection *con, struct sock *sk) +{ + lock_sock(sk); + con->orig_data_ready = sk->sk_data_ready; + con->orig_state_change = sk->sk_state_change; + con->orig_write_space = sk->sk_write_space; + con->orig_error_report = sk->sk_error_report; + release_sock(sk); +} + +static void restore_callbacks(struct connection *con, struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + lock_sock(sk); + sk->sk_user_data = NULL; + sk->sk_data_ready = con->orig_data_ready; + sk->sk_state_change = con->orig_state_change; + sk->sk_write_space = con->orig_write_space; + sk->sk_error_report = con->orig_error_report; + release_sock(sk); + write_unlock_bh(&sk->sk_callback_lock); } /* Make a socket active */ static void add_sock(struct socket *sock, struct connection *con) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); con->sock = sock; + sk->sk_user_data = con; + if (!test_bit(CF_IS_OTHERCON, &con->flags)) + save_callbacks(con, sk); /* Install a data_ready callback */ - con->sock->sk->sk_data_ready = lowcomms_data_ready; - con->sock->sk->sk_write_space = lowcomms_write_space; - con->sock->sk->sk_state_change = lowcomms_state_change; - con->sock->sk->sk_user_data = con; - con->sock->sk->sk_allocation = GFP_NOFS; - con->orig_error_report = con->sock->sk->sk_error_report; - con->sock->sk->sk_error_report = lowcomms_error_report; + sk->sk_data_ready = lowcomms_data_ready; + sk->sk_write_space = lowcomms_write_space; + sk->sk_state_change = lowcomms_state_change; + sk->sk_allocation = GFP_NOFS; + sk->sk_error_report = lowcomms_error_report; + write_unlock_bh(&sk->sk_callback_lock); } /* Add the port number to an IPv6 or 4 sockaddr and return the address @@ -549,6 +592,8 @@ static void close_connection(struct connection *con, bool and_other, mutex_lock(&con->sock_mutex); if (con->sock) { + if (!test_bit(CF_IS_OTHERCON, &con->flags)) + restore_callbacks(con, con->sock->sk); sock_release(con->sock); con->sock = NULL; } @@ -1190,6 +1235,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, if (result < 0) { log_print("Failed to set SO_REUSEADDR on socket: %d", result); } + sock->sk->sk_user_data = con; + con->rx_action = tcp_accept_from_sock; con->connect_action = tcp_connect_to_sock; @@ -1271,6 +1318,7 @@ static int sctp_listen_for_all(void) if (result < 0) log_print("Could not set SCTP NODELAY error %d\n", result); + write_lock_bh(&sock->sk->sk_callback_lock); /* Init con struct */ sock->sk->sk_user_data = con; con->sock = sock; @@ -1278,6 +1326,8 @@ static int sctp_listen_for_all(void) con->rx_action = sctp_accept_from_sock; con->connect_action = sctp_connect_to_sock; + write_unlock_bh(&sock->sk->sk_callback_lock); + /* Bind to all addresses. */ if (sctp_bind_addrs(con, dlm_config.ci_tcp_port)) goto create_delsock; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1925d6d222b8..58c2f4a21b7f 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -516,7 +516,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, return -EINVAL; kbuf = memdup_user_nul(buf, count); - if (!IS_ERR(kbuf)) + if (IS_ERR(kbuf)) return PTR_ERR(kbuf); if (check_version(kbuf)) { diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 80d6901493cf..64026e53722a 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -23,6 +23,8 @@ * 02111-1307, USA. */ +#include <crypto/hash.h> +#include <crypto/skcipher.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/pagemap.h> @@ -30,7 +32,6 @@ #include <linux/compiler.h> #include <linux/key.h> #include <linux/namei.h> -#include <linux/crypto.h> #include <linux/file.h> #include <linux/scatterlist.h> #include <linux/slab.h> @@ -74,6 +75,19 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size) } } +static int ecryptfs_hash_digest(struct crypto_shash *tfm, + char *src, int len, char *dst) +{ + SHASH_DESC_ON_STACK(desc, tfm); + int err; + + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + err = crypto_shash_digest(desc, src, len, dst); + shash_desc_zero(desc); + return err; +} + /** * ecryptfs_calculate_md5 - calculates the md5 of @src * @dst: Pointer to 16 bytes of allocated memory @@ -88,45 +102,26 @@ static int ecryptfs_calculate_md5(char *dst, struct ecryptfs_crypt_stat *crypt_stat, char *src, int len) { - struct scatterlist sg; - struct hash_desc desc = { - .tfm = crypt_stat->hash_tfm, - .flags = CRYPTO_TFM_REQ_MAY_SLEEP - }; + struct crypto_shash *tfm; int rc = 0; mutex_lock(&crypt_stat->cs_hash_tfm_mutex); - sg_init_one(&sg, (u8 *)src, len); - if (!desc.tfm) { - desc.tfm = crypto_alloc_hash(ECRYPTFS_DEFAULT_HASH, 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(desc.tfm)) { - rc = PTR_ERR(desc.tfm); + tfm = crypt_stat->hash_tfm; + if (!tfm) { + tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0); + if (IS_ERR(tfm)) { + rc = PTR_ERR(tfm); ecryptfs_printk(KERN_ERR, "Error attempting to " "allocate crypto context; rc = [%d]\n", rc); goto out; } - crypt_stat->hash_tfm = desc.tfm; - } - rc = crypto_hash_init(&desc); - if (rc) { - printk(KERN_ERR - "%s: Error initializing crypto hash; rc = [%d]\n", - __func__, rc); - goto out; + crypt_stat->hash_tfm = tfm; } - rc = crypto_hash_update(&desc, &sg, len); + rc = ecryptfs_hash_digest(tfm, src, len, dst); if (rc) { printk(KERN_ERR - "%s: Error updating crypto hash; rc = [%d]\n", - __func__, rc); - goto out; - } - rc = crypto_hash_final(&desc, dst); - if (rc) { - printk(KERN_ERR - "%s: Error finalizing crypto hash; rc = [%d]\n", + "%s: Error computing crypto hash; rc = [%d]\n", __func__, rc); goto out; } @@ -234,10 +229,8 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) { struct ecryptfs_key_sig *key_sig, *key_sig_tmp; - if (crypt_stat->tfm) - crypto_free_ablkcipher(crypt_stat->tfm); - if (crypt_stat->hash_tfm) - crypto_free_hash(crypt_stat->hash_tfm); + crypto_free_skcipher(crypt_stat->tfm); + crypto_free_shash(crypt_stat->hash_tfm); list_for_each_entry_safe(key_sig, key_sig_tmp, &crypt_stat->keysig_list, crypt_stat_list) { list_del(&key_sig->crypt_stat_list); @@ -342,7 +335,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, struct scatterlist *src_sg, int size, unsigned char *iv, int op) { - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; struct extent_crypt_result ecr; int rc = 0; @@ -358,20 +351,20 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, init_completion(&ecr.completion); mutex_lock(&crypt_stat->cs_tfm_mutex); - req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS); + req = skcipher_request_alloc(crypt_stat->tfm, GFP_NOFS); if (!req) { mutex_unlock(&crypt_stat->cs_tfm_mutex); rc = -ENOMEM; goto out; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, extent_crypt_complete, &ecr); /* Consider doing this once, when the file is opened */ if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { - rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key, - crypt_stat->key_size); + rc = crypto_skcipher_setkey(crypt_stat->tfm, crypt_stat->key, + crypt_stat->key_size); if (rc) { ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n", @@ -383,9 +376,9 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, crypt_stat->flags |= ECRYPTFS_KEY_SET; } mutex_unlock(&crypt_stat->cs_tfm_mutex); - ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv); - rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) : - crypto_ablkcipher_decrypt(req); + skcipher_request_set_crypt(req, src_sg, dst_sg, size, iv); + rc = op == ENCRYPT ? crypto_skcipher_encrypt(req) : + crypto_skcipher_decrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) { struct extent_crypt_result *ecr = req->base.data; @@ -394,7 +387,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, reinit_completion(&ecr->completion); } out: - ablkcipher_request_free(req); + skcipher_request_free(req); return rc; } @@ -622,7 +615,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) crypt_stat->cipher, "cbc"); if (rc) goto out_unlock; - crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0); + crypt_stat->tfm = crypto_alloc_skcipher(full_alg_name, 0, 0); if (IS_ERR(crypt_stat->tfm)) { rc = PTR_ERR(crypt_stat->tfm); crypt_stat->tfm = NULL; @@ -631,7 +624,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) full_alg_name); goto out_free; } - crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); + crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); rc = 0; out_free: kfree(full_alg_name); @@ -1499,16 +1492,14 @@ out: */ static int ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, - struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { int rc = 0; filename->encrypted_filename = NULL; filename->encrypted_filename_size = 0; - if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) - || (mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + if (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) { size_t packet_size; size_t remaining_bytes; @@ -1591,7 +1582,7 @@ out: * event, regardless of whether this function succeeds for fails. */ static int -ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, +ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm, char *cipher_name, size_t *key_size) { char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; @@ -1609,21 +1600,18 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, "ecb"); if (rc) goto out; - *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); + *key_tfm = crypto_alloc_skcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(*key_tfm)) { rc = PTR_ERR(*key_tfm); printk(KERN_ERR "Unable to allocate crypto cipher with name " "[%s]; rc = [%d]\n", full_alg_name, rc); goto out; } - crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); - if (*key_size == 0) { - struct blkcipher_alg *alg = crypto_blkcipher_alg(*key_tfm); - - *key_size = alg->max_keysize; - } + crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); + if (*key_size == 0) + *key_size = crypto_skcipher_default_keysize(*key_tfm); get_random_bytes(dummy_key, *key_size); - rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); + rc = crypto_skcipher_setkey(*key_tfm, dummy_key, *key_size); if (rc) { printk(KERN_ERR "Error attempting to set key of size [%zd] for " "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name, @@ -1660,8 +1648,7 @@ int ecryptfs_destroy_crypto(void) list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list, key_tfm_list) { list_del(&key_tfm->key_tfm_list); - if (key_tfm->key_tfm) - crypto_free_blkcipher(key_tfm->key_tfm); + crypto_free_skcipher(key_tfm->key_tfm); kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm); } mutex_unlock(&key_tfm_list_mutex); @@ -1747,7 +1734,7 @@ int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm) * Searches for cached item first, and creates new if not found. * Returns 0 on success, non-zero if adding new cipher failed */ -int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, +int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm, struct mutex **tfm_mutex, char *cipher_name) { @@ -1944,7 +1931,6 @@ out: int ecryptfs_encrypt_and_encode_filename( char **encoded_name, size_t *encoded_name_size, - struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, const char *name, size_t name_size) { @@ -1953,9 +1939,8 @@ int ecryptfs_encrypt_and_encode_filename( (*encoded_name) = NULL; (*encoded_name_size) = 0; - if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) - || (mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) { + if (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) { struct ecryptfs_filename *filename; filename = kzalloc(sizeof(*filename), GFP_KERNEL); @@ -1968,8 +1953,7 @@ int ecryptfs_encrypt_and_encode_filename( } filename->filename = (char *)name; filename->filename_size = name_size; - rc = ecryptfs_encrypt_filename(filename, crypt_stat, - mount_crypt_stat); + rc = ecryptfs_encrypt_filename(filename, mount_crypt_stat); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt " "filename; rc = [%d]\n", __func__, rc); @@ -1980,11 +1964,9 @@ int ecryptfs_encrypt_and_encode_filename( NULL, &encoded_name_no_prefix_size, filename->encrypted_filename, filename->encrypted_filename_size); - if ((crypt_stat && (crypt_stat->flags - & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) - || (mount_crypt_stat + if (mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) (*encoded_name_size) = (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + encoded_name_no_prefix_size); @@ -2002,11 +1984,9 @@ int ecryptfs_encrypt_and_encode_filename( kfree(filename); goto out; } - if ((crypt_stat && (crypt_stat->flags - & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) - || (mount_crypt_stat + if (mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) { memcpy((*encoded_name), ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); @@ -2120,7 +2100,7 @@ out: int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { - struct blkcipher_desc desc; + struct crypto_skcipher *tfm; struct mutex *tfm_mutex; size_t cipher_blocksize; int rc; @@ -2130,7 +2110,7 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, return 0; } - rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); if (unlikely(rc)) { (*namelen) = 0; @@ -2138,7 +2118,7 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, } mutex_lock(tfm_mutex); - cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm); + cipher_blocksize = crypto_skcipher_blocksize(tfm); mutex_unlock(tfm_mutex); /* Return an exact amount for the common cases */ diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 7b39260c7bba..d123fbaa28e0 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -28,6 +28,7 @@ #ifndef ECRYPTFS_KERNEL_H #define ECRYPTFS_KERNEL_H +#include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> #include <linux/fs.h> @@ -38,7 +39,6 @@ #include <linux/nsproxy.h> #include <linux/backing-dev.h> #include <linux/ecryptfs.h> -#include <linux/crypto.h> #define ECRYPTFS_DEFAULT_IV_BYTES 16 #define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 @@ -233,9 +233,9 @@ struct ecryptfs_crypt_stat { size_t extent_shift; unsigned int extent_mask; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; - struct crypto_ablkcipher *tfm; - struct crypto_hash *hash_tfm; /* Crypto context for generating - * the initialization vectors */ + struct crypto_skcipher *tfm; + struct crypto_shash *hash_tfm; /* Crypto context for generating + * the initialization vectors */ unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; @@ -309,7 +309,7 @@ struct ecryptfs_global_auth_tok { * keeps a list of crypto API contexts around to use when needed. */ struct ecryptfs_key_tfm { - struct crypto_blkcipher *key_tfm; + struct crypto_skcipher *key_tfm; size_t key_size; struct mutex key_tfm_mutex; struct list_head key_tfm_list; @@ -569,7 +569,6 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length); int ecryptfs_encrypt_and_encode_filename( char **encoded_name, size_t *encoded_name_size, - struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, const char *name, size_t name_size); struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); @@ -659,7 +658,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, int ecryptfs_init_crypto(void); int ecryptfs_destroy_crypto(void); int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm); -int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm, +int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm, struct mutex **tfm_mutex, char *cipher_name); int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 040aa879d634..121114e9a464 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -29,7 +29,6 @@ #include <linux/dcache.h> #include <linux/namei.h> #include <linux/mount.h> -#include <linux/crypto.h> #include <linux/fs_stack.h> #include <linux/slab.h> #include <linux/xattr.h> @@ -41,13 +40,13 @@ static struct dentry *lock_parent(struct dentry *dentry) struct dentry *dir; dir = dget_parent(dentry); - mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); return dir; } static void unlock_dir(struct dentry *dir) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); } @@ -397,11 +396,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); - lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, + lower_dentry = lookup_one_len_unlocked(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -419,18 +416,16 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, dput(lower_dentry); rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, - NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, + mount_crypt_stat, ecryptfs_dentry->d_name.name, ecryptfs_dentry->d_name.len); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " "filename; rc = [%d]\n", __func__, rc); goto out; } - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); - lower_dentry = lookup_one_len(encrypted_and_encoded_name, + lower_dentry = lookup_one_len_unlocked(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -502,7 +497,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, dir->i_sb)->mount_crypt_stat; rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, &encoded_symlen, - NULL, mount_crypt_stat, symname, strlen(symname)); if (rc) @@ -869,9 +863,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); } return rc; } @@ -970,9 +964,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) lower_ia.ia_valid &= ~ATTR_MODE; - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; @@ -1048,10 +1042,10 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1075,9 +1069,9 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1092,9 +1086,9 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 6bd67e2011f0..c5c84dfb5b3e 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -25,11 +25,12 @@ * 02111-1307, USA. */ +#include <crypto/hash.h> +#include <crypto/skcipher.h> #include <linux/string.h> #include <linux/pagemap.h> #include <linux/key.h> #include <linux/random.h> -#include <linux/crypto.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include "ecryptfs_kernel.h" @@ -601,12 +602,13 @@ struct ecryptfs_write_tag_70_packet_silly_stack { struct ecryptfs_auth_tok *auth_tok; struct scatterlist src_sg[2]; struct scatterlist dst_sg[2]; - struct blkcipher_desc desc; + struct crypto_skcipher *skcipher_tfm; + struct skcipher_request *skcipher_req; char iv[ECRYPTFS_MAX_IV_BYTES]; char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; - struct hash_desc hash_desc; - struct scatterlist hash_sg; + struct crypto_shash *hash_tfm; + struct shash_desc *hash_desc; }; /** @@ -629,14 +631,13 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, struct key *auth_tok_key = NULL; int rc = 0; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) { printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); rc = -ENOMEM; goto out; } - s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; (*packet_size) = 0; rc = ecryptfs_find_auth_tok_for_sig( &auth_tok_key, @@ -649,7 +650,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, goto out; } rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( - &s->desc.tfm, + &s->skcipher_tfm, &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); if (unlikely(rc)) { printk(KERN_ERR "Internal error whilst attempting to get " @@ -658,7 +659,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, goto out; } mutex_lock(s->tfm_mutex); - s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); + s->block_size = crypto_skcipher_blocksize(s->skcipher_tfm); /* Plus one for the \0 separator between the random prefix * and the plaintext filename */ s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); @@ -691,6 +692,19 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, rc = -EINVAL; goto out_unlock; } + + s->skcipher_req = skcipher_request_alloc(s->skcipher_tfm, GFP_KERNEL); + if (!s->skcipher_req) { + printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " + "skcipher_request_alloc for %s\n", __func__, + crypto_skcipher_driver_name(s->skcipher_tfm)); + rc = -ENOMEM; + goto out_unlock; + } + + skcipher_request_set_callback(s->skcipher_req, + CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, GFP_KERNEL); if (!s->block_aligned_filename) { @@ -700,7 +714,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, rc = -ENOMEM; goto out_unlock; } - s->i = 0; dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; rc = ecryptfs_write_packet_length(&dest[s->i], (ECRYPTFS_SIG_SIZE @@ -738,40 +751,36 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "password tokens\n", __func__); goto out_free_unlock; } - sg_init_one( - &s->hash_sg, - (u8 *)s->auth_tok->token.password.session_key_encryption_key, - s->auth_tok->token.password.session_key_encryption_key_bytes); - s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(s->hash_desc.tfm)) { - rc = PTR_ERR(s->hash_desc.tfm); + s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0); + if (IS_ERR(s->hash_tfm)) { + rc = PTR_ERR(s->hash_tfm); printk(KERN_ERR "%s: Error attempting to " "allocate hash crypto context; rc = [%d]\n", __func__, rc); goto out_free_unlock; } - rc = crypto_hash_init(&s->hash_desc); - if (rc) { - printk(KERN_ERR - "%s: Error initializing crypto hash; rc = [%d]\n", - __func__, rc); - goto out_release_free_unlock; - } - rc = crypto_hash_update( - &s->hash_desc, &s->hash_sg, - s->auth_tok->token.password.session_key_encryption_key_bytes); - if (rc) { - printk(KERN_ERR - "%s: Error updating crypto hash; rc = [%d]\n", - __func__, rc); + + s->hash_desc = kmalloc(sizeof(*s->hash_desc) + + crypto_shash_descsize(s->hash_tfm), GFP_KERNEL); + if (!s->hash_desc) { + printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + sizeof(*s->hash_desc) + + crypto_shash_descsize(s->hash_tfm)); + rc = -ENOMEM; goto out_release_free_unlock; } - rc = crypto_hash_final(&s->hash_desc, s->hash); + + s->hash_desc->tfm = s->hash_tfm; + s->hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + rc = crypto_shash_digest(s->hash_desc, + (u8 *)s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes, + s->hash); if (rc) { printk(KERN_ERR - "%s: Error finalizing crypto hash; rc = [%d]\n", + "%s: Error computing crypto hash; rc = [%d]\n", __func__, rc); goto out_release_free_unlock; } @@ -780,27 +789,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { - sg_init_one(&s->hash_sg, (u8 *)s->hash, - ECRYPTFS_TAG_70_DIGEST_SIZE); - rc = crypto_hash_init(&s->hash_desc); - if (rc) { - printk(KERN_ERR - "%s: Error initializing crypto hash; " - "rc = [%d]\n", __func__, rc); - goto out_release_free_unlock; - } - rc = crypto_hash_update(&s->hash_desc, &s->hash_sg, - ECRYPTFS_TAG_70_DIGEST_SIZE); + rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash, + ECRYPTFS_TAG_70_DIGEST_SIZE, + s->tmp_hash); if (rc) { printk(KERN_ERR - "%s: Error updating crypto hash; " - "rc = [%d]\n", __func__, rc); - goto out_release_free_unlock; - } - rc = crypto_hash_final(&s->hash_desc, s->tmp_hash); - if (rc) { - printk(KERN_ERR - "%s: Error finalizing crypto hash; " + "%s: Error computing crypto hash; " "rc = [%d]\n", __func__, rc); goto out_release_free_unlock; } @@ -834,10 +828,8 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, * of the IV here, so we just use 0's for the IV. Note the * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES * >= ECRYPTFS_MAX_IV_BYTES. */ - memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); - s->desc.info = s->iv; - rc = crypto_blkcipher_setkey( - s->desc.tfm, + rc = crypto_skcipher_setkey( + s->skcipher_tfm, s->auth_tok->token.password.session_key_encryption_key, mount_crypt_stat->global_default_fn_cipher_key_bytes); if (rc < 0) { @@ -850,8 +842,9 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, mount_crypt_stat->global_default_fn_cipher_key_bytes); goto out_release_free_unlock; } - rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg, - s->block_aligned_filename_size); + skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg, + s->block_aligned_filename_size, s->iv); + rc = crypto_skcipher_encrypt(s->skcipher_req); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt filename; " "rc = [%d]\n", __func__, rc); @@ -861,7 +854,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, (*packet_size) = s->i; (*remaining_bytes) -= (*packet_size); out_release_free_unlock: - crypto_free_hash(s->hash_desc.tfm); + crypto_free_shash(s->hash_tfm); out_free_unlock: kzfree(s->block_aligned_filename); out_unlock: @@ -871,6 +864,8 @@ out: up_write(&(auth_tok_key->sem)); key_put(auth_tok_key); } + skcipher_request_free(s->skcipher_req); + kzfree(s->hash_desc); kfree(s); return rc; } @@ -888,7 +883,8 @@ struct ecryptfs_parse_tag_70_packet_silly_stack { struct ecryptfs_auth_tok *auth_tok; struct scatterlist src_sg[2]; struct scatterlist dst_sg[2]; - struct blkcipher_desc desc; + struct crypto_skcipher *skcipher_tfm; + struct skcipher_request *skcipher_req; char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; char iv[ECRYPTFS_MAX_IV_BYTES]; char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; @@ -922,14 +918,13 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, (*packet_size) = 0; (*filename_size) = 0; (*filename) = NULL; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) { printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); rc = -ENOMEM; goto out; } - s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) { printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " "at least [%d]\n", __func__, max_packet_size, @@ -992,7 +987,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, rc); goto out; } - rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->skcipher_tfm, &s->tfm_mutex, s->cipher_string); if (unlikely(rc)) { @@ -1030,12 +1025,23 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, __func__, rc, s->block_aligned_filename_size); goto out_free_unlock; } + + s->skcipher_req = skcipher_request_alloc(s->skcipher_tfm, GFP_KERNEL); + if (!s->skcipher_req) { + printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " + "skcipher_request_alloc for %s\n", __func__, + crypto_skcipher_driver_name(s->skcipher_tfm)); + rc = -ENOMEM; + goto out_free_unlock; + } + + skcipher_request_set_callback(s->skcipher_req, + CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + /* The characters in the first block effectively do the job of * the IV here, so we just use 0's for the IV. Note the * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES * >= ECRYPTFS_MAX_IV_BYTES. */ - memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); - s->desc.info = s->iv; /* TODO: Support other key modules than passphrase for * filename encryption */ if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { @@ -1044,8 +1050,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, "password tokens\n", __func__); goto out_free_unlock; } - rc = crypto_blkcipher_setkey( - s->desc.tfm, + rc = crypto_skcipher_setkey( + s->skcipher_tfm, s->auth_tok->token.password.session_key_encryption_key, mount_crypt_stat->global_default_fn_cipher_key_bytes); if (rc < 0) { @@ -1058,14 +1064,14 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, mount_crypt_stat->global_default_fn_cipher_key_bytes); goto out_free_unlock; } - rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg, - s->block_aligned_filename_size); + skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg, + s->block_aligned_filename_size, s->iv); + rc = crypto_skcipher_decrypt(s->skcipher_req); if (rc) { printk(KERN_ERR "%s: Error attempting to decrypt filename; " "rc = [%d]\n", __func__, rc); goto out_free_unlock; } - s->i = 0; while (s->decrypted_filename[s->i] != '\0' && s->i < s->block_aligned_filename_size) s->i++; @@ -1108,6 +1114,7 @@ out: up_write(&(auth_tok_key->sem)); key_put(auth_tok_key); } + skcipher_request_free(s->skcipher_req); kfree(s); return rc; } @@ -1667,9 +1674,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, struct scatterlist dst_sg[2]; struct scatterlist src_sg[2]; struct mutex *tfm_mutex; - struct blkcipher_desc desc = { - .flags = CRYPTO_TFM_REQ_MAY_SLEEP - }; + struct crypto_skcipher *tfm; + struct skcipher_request *req = NULL; int rc = 0; if (unlikely(ecryptfs_verbosity > 0)) { @@ -1680,7 +1686,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, auth_tok->token.password.session_key_encryption_key, auth_tok->token.password.session_key_encryption_key_bytes); } - rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex, crypt_stat->cipher); if (unlikely(rc)) { printk(KERN_ERR "Internal error whilst attempting to get " @@ -1711,8 +1717,20 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, goto out; } mutex_lock(tfm_mutex); - rc = crypto_blkcipher_setkey( - desc.tfm, auth_tok->token.password.session_key_encryption_key, + req = skcipher_request_alloc(tfm, GFP_KERNEL); + if (!req) { + mutex_unlock(tfm_mutex); + printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " + "skcipher_request_alloc for %s\n", __func__, + crypto_skcipher_driver_name(tfm)); + rc = -ENOMEM; + goto out; + } + + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + rc = crypto_skcipher_setkey( + tfm, auth_tok->token.password.session_key_encryption_key, crypt_stat->key_size); if (unlikely(rc < 0)) { mutex_unlock(tfm_mutex); @@ -1720,8 +1738,10 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, rc = -EINVAL; goto out; } - rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg, - auth_tok->session_key.encrypted_key_size); + skcipher_request_set_crypt(req, src_sg, dst_sg, + auth_tok->session_key.encrypted_key_size, + NULL); + rc = crypto_skcipher_decrypt(req); mutex_unlock(tfm_mutex); if (unlikely(rc)) { printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc); @@ -1738,6 +1758,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, crypt_stat->key_size); } out: + skcipher_request_free(req); return rc; } @@ -2191,16 +2212,14 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, size_t max_packet_size; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = crypt_stat->mount_crypt_stat; - struct blkcipher_desc desc = { - .tfm = NULL, - .flags = CRYPTO_TFM_REQ_MAY_SLEEP - }; + struct crypto_skcipher *tfm; + struct skcipher_request *req; int rc = 0; (*packet_size) = 0; ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature, ECRYPTFS_SIG_SIZE); - rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex, crypt_stat->cipher); if (unlikely(rc)) { printk(KERN_ERR "Internal error whilst attempting to get " @@ -2209,12 +2228,11 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, goto out; } if (mount_crypt_stat->global_default_cipher_key_size == 0) { - struct blkcipher_alg *alg = crypto_blkcipher_alg(desc.tfm); - printk(KERN_WARNING "No key size specified at mount; " - "defaulting to [%d]\n", alg->max_keysize); + "defaulting to [%d]\n", + crypto_skcipher_default_keysize(tfm)); mount_crypt_stat->global_default_cipher_key_size = - alg->max_keysize; + crypto_skcipher_default_keysize(tfm); } if (crypt_stat->key_size == 0) crypt_stat->key_size = @@ -2284,20 +2302,36 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes, goto out; } mutex_lock(tfm_mutex); - rc = crypto_blkcipher_setkey(desc.tfm, session_key_encryption_key, - crypt_stat->key_size); + rc = crypto_skcipher_setkey(tfm, session_key_encryption_key, + crypt_stat->key_size); if (rc < 0) { mutex_unlock(tfm_mutex); ecryptfs_printk(KERN_ERR, "Error setting key for crypto " "context; rc = [%d]\n", rc); goto out; } + + req = skcipher_request_alloc(tfm, GFP_KERNEL); + if (!req) { + mutex_unlock(tfm_mutex); + ecryptfs_printk(KERN_ERR, "Out of kernel memory whilst " + "attempting to skcipher_request_alloc for " + "%s\n", crypto_skcipher_driver_name(tfm)); + rc = -ENOMEM; + goto out; + } + + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + rc = 0; ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n", crypt_stat->key_size); - rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg, - (*key_rec).enc_key_size); + skcipher_request_set_crypt(req, src_sg, dst_sg, + (*key_rec).enc_key_size, NULL); + rc = crypto_skcipher_encrypt(req); mutex_unlock(tfm_mutex); + skcipher_request_free(req); if (rc) { printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc); goto out; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index e25b6b06bacf..8b0b4a73116d 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -29,7 +29,6 @@ #include <linux/module.h> #include <linux/namei.h> #include <linux/skbuff.h> -#include <linux/crypto.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/key.h> diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index caba848ac763..1f5865263b3e 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -30,7 +30,6 @@ #include <linux/page-flags.h> #include <linux/mount.h> #include <linux/file.h> -#include <linux/crypto.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <asm/unaligned.h> @@ -436,7 +435,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) rc = -ENOMEM; goto out; } - mutex_lock(&lower_inode->i_mutex); + inode_lock(lower_inode); size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, PAGE_CACHE_SIZE); if (size < 0) @@ -444,7 +443,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); - mutex_unlock(&lower_inode->i_mutex); + inode_unlock(lower_inode); if (rc) printk(KERN_ERR "Error whilst attempting to write inode size " "to lower file xattr; rc = [%d]\n", rc); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index afa1b81c3418..77a486d3a51b 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -29,7 +29,6 @@ #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/file.h> -#include <linux/crypto.h> #include <linux/statfs.h> #include <linux/magic.h> #include "ecryptfs_kernel.h" diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 90001da9abfd..d48e0d261d78 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -10,6 +10,7 @@ #include <linux/efi.h> #include <linux/fs.h> #include <linux/slab.h> +#include <linux/mount.h> #include "internal.h" @@ -50,9 +51,9 @@ static ssize_t efivarfs_file_write(struct file *file, d_delete(file->f_path.dentry); dput(file->f_path.dentry); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } bytes = count; @@ -103,9 +104,78 @@ out_free: return size; } +static int +efivarfs_ioc_getxflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_mapping->host; + unsigned int i_flags; + unsigned int flags = 0; + + i_flags = inode->i_flags; + if (i_flags & S_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +static int +efivarfs_ioc_setxflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_mapping->host; + unsigned int flags; + unsigned int i_flags = 0; + int error; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~FS_IMMUTABLE_FL) + return -EOPNOTSUPP; + + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + if (flags & FS_IMMUTABLE_FL) + i_flags |= S_IMMUTABLE; + + + error = mnt_want_write_file(file); + if (error) + return error; + + inode_lock(inode); + inode_set_flags(inode, i_flags, S_IMMUTABLE); + inode_unlock(inode); + + mnt_drop_write_file(file); + + return 0; +} + +long +efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p) +{ + void __user *arg = (void __user *)p; + + switch (cmd) { + case FS_IOC_GETFLAGS: + return efivarfs_ioc_getxflags(file, arg); + case FS_IOC_SETFLAGS: + return efivarfs_ioc_setxflags(file, arg); + } + + return -ENOTTY; +} + const struct file_operations efivarfs_file_operations = { .open = simple_open, .read = efivarfs_file_read, .write = efivarfs_file_write, .llseek = no_llseek, + .unlocked_ioctl = efivarfs_file_ioctl, }; diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 3381b9da9ee6..e2ab6d0497f2 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -15,7 +15,8 @@ #include "internal.h" struct inode *efivarfs_get_inode(struct super_block *sb, - const struct inode *dir, int mode, dev_t dev) + const struct inode *dir, int mode, + dev_t dev, bool is_removable) { struct inode *inode = new_inode(sb); @@ -23,6 +24,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_flags = is_removable ? 0 : S_IMMUTABLE; switch (mode & S_IFMT) { case S_IFREG: inode->i_fop = &efivarfs_file_operations; @@ -102,22 +104,17 @@ static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid) static int efivarfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct inode *inode; + struct inode *inode = NULL; struct efivar_entry *var; int namelen, i = 0, err = 0; + bool is_removable = false; if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len)) return -EINVAL; - inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0); - if (!inode) - return -ENOMEM; - var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL); - if (!var) { - err = -ENOMEM; - goto out; - } + if (!var) + return -ENOMEM; /* length of the variable name itself: remove GUID and separator */ namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; @@ -125,6 +122,16 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1, &var->var.VendorGuid); + if (efivar_variable_is_removable(var->var.VendorGuid, + dentry->d_name.name, namelen)) + is_removable = true; + + inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0, is_removable); + if (!inode) { + err = -ENOMEM; + goto out; + } + for (i = 0; i < namelen; i++) var->var.VariableName[i] = dentry->d_name.name[i]; @@ -138,7 +145,8 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, out: if (err) { kfree(var); - iput(inode); + if (inode) + iput(inode); } return err; } diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index b5ff16addb7c..b4505188e799 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -15,7 +15,8 @@ extern const struct file_operations efivarfs_file_operations; extern const struct inode_operations efivarfs_dir_inode_operations; extern bool efivarfs_valid_name(const char *str, int len); extern struct inode *efivarfs_get_inode(struct super_block *sb, - const struct inode *dir, int mode, dev_t dev); + const struct inode *dir, int mode, dev_t dev, + bool is_removable); extern struct list_head efivarfs_list; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 86a2121828c3..dd029d13ea61 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -118,8 +118,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, struct dentry *dentry, *root = sb->s_root; unsigned long size = 0; char *name; - int len, i; + int len; int err = -ENOMEM; + bool is_removable = false; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) @@ -128,15 +129,17 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, memcpy(entry->var.VariableName, name16, name_size); memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t)); - len = ucs2_strlen(entry->var.VariableName); + len = ucs2_utf8size(entry->var.VariableName); /* name, plus '-', plus GUID, plus NUL*/ name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL); if (!name) goto fail; - for (i = 0; i < len; i++) - name[i] = entry->var.VariableName[i] & 0xFF; + ucs2_as_utf8(name, entry->var.VariableName, len); + + if (efivar_variable_is_removable(entry->var.VendorGuid, name, len)) + is_removable = true; name[len] = '-'; @@ -144,7 +147,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; - inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0); + inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0, + is_removable); if (!inode) goto fail_name; @@ -160,10 +164,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, efivar_entry_size(entry, &size); efivar_entry_add(entry, &efivarfs_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); d_add(dentry, inode); return 0; @@ -200,7 +204,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &efivarfs_d_ops; sb->s_time_gran = 1; - inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0); + inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true); if (!inode) return -ENOMEM; inode->i_op = &efivarfs_dir_inode_operations; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ae1dbcf47e97..8a74a2a52e0f 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -94,6 +94,11 @@ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) +#define EPOLLINOUT_BITS (POLLIN | POLLOUT) + +#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \ + EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) + /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 @@ -1068,7 +1073,22 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * wait list. */ if (waitqueue_active(&ep->wq)) { - ewake = 1; + if ((epi->event.events & EPOLLEXCLUSIVE) && + !((unsigned long)key & POLLFREE)) { + switch ((unsigned long)key & EPOLLINOUT_BITS) { + case POLLIN: + if (epi->event.events & POLLIN) + ewake = 1; + break; + case POLLOUT: + if (epi->event.events & POLLOUT) + ewake = 1; + break; + case 0: + ewake = 1; + break; + } + } wake_up_locked(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) @@ -1596,7 +1616,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; unsigned long flags; - long slack = 0; + u64 slack = 0; wait_queue_t wait; ktime_t expires, *to = NULL; @@ -1875,9 +1895,13 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ - if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD || - (op == EPOLL_CTL_ADD && is_file_epoll(tf.file)))) - goto error_tgt_fput; + if (epds.events & EPOLLEXCLUSIVE) { + if (op == EPOLL_CTL_MOD) + goto error_tgt_fput; + if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || + (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) + goto error_tgt_fput; + } /* * At this point it is safe to assume that the "private_data" contains @@ -1950,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP; - error = ep_modify(ep, epi, &epds); + if (!(epi->event.events & EPOLLEXCLUSIVE)) { + epds.events |= POLLERR | POLLHUP; + error = ep_modify(ep, epi, &epds); + } } else error = -ENOENT; break; diff --git a/fs/exec.c b/fs/exec.c index 828ec5f07de0..c4010b8207a1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -56,6 +56,7 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -198,8 +199,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif - ret = get_user_pages(current, bprm->mm, pos, - 1, write, 1, &page, NULL); + /* + * We are doing an exec(). 'current' is the process + * doing the exec and bprm->mm is the new process's mm. + */ + ret = get_user_pages_remote(current, bprm->mm, pos, 1, write, + 1, &page, NULL); if (ret <= 0) return NULL; @@ -831,6 +836,97 @@ int kernel_read(struct file *file, loff_t offset, EXPORT_SYMBOL(kernel_read); +int kernel_read_file(struct file *file, void **buf, loff_t *size, + loff_t max_size, enum kernel_read_file_id id) +{ + loff_t i_size, pos; + ssize_t bytes = 0; + int ret; + + if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0) + return -EINVAL; + + ret = security_kernel_read_file(file, id); + if (ret) + return ret; + + i_size = i_size_read(file_inode(file)); + if (max_size > 0 && i_size > max_size) + return -EFBIG; + if (i_size <= 0) + return -EINVAL; + + *buf = vmalloc(i_size); + if (!*buf) + return -ENOMEM; + + pos = 0; + while (pos < i_size) { + bytes = kernel_read(file, pos, (char *)(*buf) + pos, + i_size - pos); + if (bytes < 0) { + ret = bytes; + goto out; + } + + if (bytes == 0) + break; + pos += bytes; + } + + if (pos != i_size) { + ret = -EIO; + goto out; + } + + ret = security_kernel_post_read_file(file, *buf, i_size, id); + if (!ret) + *size = pos; + +out: + if (ret < 0) { + vfree(*buf); + *buf = NULL; + } + return ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file); + +int kernel_read_file_from_path(char *path, void **buf, loff_t *size, + loff_t max_size, enum kernel_read_file_id id) +{ + struct file *file; + int ret; + + if (!path || !*path) + return -EINVAL; + + file = filp_open(path, O_RDONLY, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + ret = kernel_read_file(file, buf, size, max_size, id); + fput(file); + return ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file_from_path); + +int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size, + enum kernel_read_file_id id) +{ + struct fd f = fdget(fd); + int ret = -EBADF; + + if (!f.file) + goto out; + + ret = kernel_read_file(f.file, buf, size, max_size, id); +out: + fdput(f); + return ret; +} +EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); + ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); @@ -1307,13 +1403,13 @@ static void bprm_fill_uid(struct linux_binprm *bprm) return; /* Be careful if suid/sgid is set */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; uid = inode->i_uid; gid = inode->i_gid; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!kuid_has_mapping(bprm->cred->user_ns, uid) || diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 906de66e8e7e..28645f0640f7 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -52,9 +52,9 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_inode_metadata(filp->f_mapping->host, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 714cd37a6ba3..c46f1a190b8d 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -124,10 +124,10 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, int err; parent = ERR_PTR(-EACCES); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("%s: get_parent of %ld failed, err %d\n", @@ -143,9 +143,9 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); tmp = lookup_one_len(nbuf, parent, strlen(nbuf)); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); goto out_err; @@ -503,10 +503,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, */ err = exportfs_get_name(mnt, target_dir, nbuf, result); if (!err) { - mutex_lock(&target_dir->d_inode->i_mutex); + inode_lock(target_dir->d_inode); nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); - mutex_unlock(&target_dir->d_inode->i_mutex); + inode_unlock(target_dir->d_inode); if (!IS_ERR(nresult)) { if (nresult->d_inode) { dput(result); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 4c69c94cafd8..170939f379d7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -61,6 +61,8 @@ struct ext2_block_alloc_info { #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end +struct mb_cache; + /* * second extended-fs super-block data in memory */ @@ -111,6 +113,7 @@ struct ext2_sb_info { * of the mount options. */ spinlock_t s_lock; + struct mb_cache *s_mb_cache; }; static inline spinlock_t * diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 11a42c5a09ae..c1400b109805 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -80,30 +80,13 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return ret; } -static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - int ret; - - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - down_read(&ei->dax_sem); - - ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL); - - up_read(&ei->dax_sem); - sb_end_pagefault(inode->i_sb); - return ret; -} - static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); - int ret = VM_FAULT_NOPAGE; loff_t size; + int ret; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); @@ -113,6 +96,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); @@ -122,7 +107,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, .pmd_fault = ext2_dax_pmd_fault, - .page_mkwrite = ext2_dax_mkwrite, + .page_mkwrite = ext2_dax_fault, .pfn_mkwrite = ext2_dax_pfn_mkwrite, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 338eefda70c6..6bd58e6ff038 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -737,8 +737,10 @@ static int ext2_get_blocks(struct inode *inode, * so that it's not found by another thread before it's * initialised */ - err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), - 1 << inode->i_blkbits); + err = dax_clear_sectors(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key) << + (inode->i_blkbits - 9), + 1 << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; @@ -874,6 +876,14 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) static int ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) { +#ifdef CONFIG_FS_DAX + if (dax_mapping(mapping)) { + return dax_writeback_mapping_range(mapping, + mapping->host->i_sb->s_bdev, + wbc); + } +#endif + return mpage_writepages(mapping, wbc, ext2_get_block); } @@ -1296,7 +1306,7 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) inode->i_flags |= S_DAX; } diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 5d46c09863f0..b386af2e45f4 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -51,10 +51,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext2_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -68,7 +68,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) */ if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -80,7 +80,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setflags_out: @@ -102,10 +102,10 @@ setflags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_ctime = CURRENT_TIME_SEC; inode->i_generation = generation; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setversion_out: diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 2a188413a2b0..b78caf25f746 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb) dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - ext2_xattr_put_super(sb); + if (sbi->s_mb_cache) { + ext2_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; @@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } + +#ifdef CONFIG_EXT2_FS_XATTR + sbi->s_mb_cache = ext2_xattr_create_cache(); + if (!sbi->s_mb_cache) { + ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount3; + } +#endif /* * set up enough so that it can read an inode */ @@ -1149,6 +1160,8 @@ cantfind_ext2: sb->s_id); goto failed_mount; failed_mount3: + if (sbi->s_mb_cache) + ext2_xattr_destroy_cache(sbi->s_mb_cache); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2"); static int __init init_ext2_fs(void) { - int err = init_ext2_xattr(); - if (err) - return err; + int err; + err = init_inodecache(); if (err) - goto out1; + return err; err = register_filesystem(&ext2_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); -out1: - exit_ext2_xattr(); return err; } @@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void) { unregister_filesystem(&ext2_fs_type); destroy_inodecache(); - exit_ext2_xattr(); } MODULE_AUTHOR("Remy Card and others"); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index f57a7aba32eb..1a5e3bff0b63 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -90,14 +90,12 @@ static int ext2_xattr_set2(struct inode *, struct buffer_head *, struct ext2_xattr_header *); -static int ext2_xattr_cache_insert(struct buffer_head *); +static int ext2_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext2_xattr_cache_find(struct inode *, struct ext2_xattr_header *); static void ext2_xattr_rehash(struct ext2_xattr_header *, struct ext2_xattr_entry *); -static struct mb_cache *ext2_xattr_cache; - static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL @@ -152,6 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, size_t name_len, size; char *end; int error; + struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -196,7 +195,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", goto found; entry = next; } - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); error = -ENODATA; goto cleanup; @@ -209,7 +208,7 @@ found: le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) goto bad_block; - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); if (buffer) { error = -ERANGE; @@ -247,6 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) char *end; size_t rest = buffer_size; int error; + struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -281,7 +281,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", goto bad_block; entry = next; } - if (ext2_xattr_cache_insert(bh)) + if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) ea_idebug(inode, "cache insert failed"); /* list the attribute names */ @@ -483,22 +483,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set", /* Here we know that we can set the new attribute. */ if (header) { - struct mb_cache_entry *ce; - /* assert(header == HDR(bh)); */ - ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, - bh->b_blocknr); lock_buffer(bh); if (header->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(header->h_hash); + ea_bdebug(bh, "modifying in-place"); - if (ce) - mb_cache_entry_free(ce); + /* + * This must happen under buffer lock for + * ext2_xattr_set2() to reliably detect modified block + */ + mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, + hash, bh->b_blocknr); + /* keep the buffer locked while modifying it. */ } else { int offset; - if (ce) - mb_cache_entry_release(ce); unlock_buffer(bh); ea_bdebug(bh, "cloning"); header = kmalloc(bh->b_size, GFP_KERNEL); @@ -626,6 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; int error; + struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; if (header) { new_bh = ext2_xattr_cache_find(inode, header); @@ -653,7 +655,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, don't need to change the reference count. */ new_bh = old_bh; get_bh(new_bh); - ext2_xattr_cache_insert(new_bh); + ext2_xattr_cache_insert(ext2_mb_cache, new_bh); } else { /* We need to allocate a new block */ ext2_fsblk_t goal = ext2_group_first_block_no(sb, @@ -674,7 +676,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, memcpy(new_bh->b_data, header, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext2_xattr_cache_insert(new_bh); + ext2_xattr_cache_insert(ext2_mb_cache, new_bh); ext2_xattr_update_super_block(sb); } @@ -707,19 +709,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, error = 0; if (old_bh && old_bh != new_bh) { - struct mb_cache_entry *ce; - /* * If there was an old block and we are no longer using it, * release the old block. */ - ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev, - old_bh->b_blocknr); lock_buffer(old_bh); if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { + __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash); + + /* + * This must happen under buffer lock for + * ext2_xattr_set2() to reliably detect freed block + */ + mb_cache_entry_delete_block(ext2_mb_cache, + hash, old_bh->b_blocknr); /* Free the old block. */ - if (ce) - mb_cache_entry_free(ce); ea_bdebug(old_bh, "freeing"); ext2_free_blocks(inode, old_bh->b_blocknr, 1); mark_inode_dirty(inode); @@ -730,8 +734,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, } else { /* Decrement the refcount only. */ le32_add_cpu(&HDR(old_bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); dquot_free_block_nodirty(inode, 1); mark_inode_dirty(inode); mark_buffer_dirty(old_bh); @@ -757,7 +759,6 @@ void ext2_xattr_delete_inode(struct inode *inode) { struct buffer_head *bh = NULL; - struct mb_cache_entry *ce; down_write(&EXT2_I(inode)->xattr_sem); if (!EXT2_I(inode)->i_file_acl) @@ -777,19 +778,22 @@ ext2_xattr_delete_inode(struct inode *inode) EXT2_I(inode)->i_file_acl); goto cleanup; } - ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr); lock_buffer(bh); if (HDR(bh)->h_refcount == cpu_to_le32(1)) { - if (ce) - mb_cache_entry_free(ce); + __u32 hash = le32_to_cpu(HDR(bh)->h_hash); + + /* + * This must happen under buffer lock for ext2_xattr_set2() to + * reliably detect freed block + */ + mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, + hash, bh->b_blocknr); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); unlock_buffer(bh); } else { le32_add_cpu(&HDR(bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount)); unlock_buffer(bh); @@ -806,18 +810,6 @@ cleanup: } /* - * ext2_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext2_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - - -/* * ext2_xattr_cache_insert() * * Create a new entry in the extended attribute cache, and insert @@ -826,28 +818,20 @@ ext2_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. */ static int -ext2_xattr_cache_insert(struct buffer_head *bh) +ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(HDR(bh)->h_hash); - struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); - if (!ce) - return -ENOMEM; - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1); if (error) { - mb_cache_entry_free(ce); if (error == -EBUSY) { ea_bdebug(bh, "already in cache (%d cache entries)", atomic_read(&ext2_xattr_cache->c_entry_count)); error = 0; } - } else { - ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, - atomic_read(&ext2_xattr_cache->c_entry_count)); - mb_cache_entry_release(ce); - } + } else + ea_bdebug(bh, "inserting [%x]", (int)hash); return error; } @@ -904,22 +888,16 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; + struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev, - hash); + ce = mb_cache_entry_find_first(ext2_mb_cache, hash); while (ce) { struct buffer_head *bh; - if (IS_ERR(ce)) { - if (PTR_ERR(ce) == -EAGAIN) - goto again; - break; - } - bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_cache_find", @@ -927,7 +905,21 @@ again: inode->i_ino, (unsigned long) ce->e_block); } else { lock_buffer(bh); - if (le32_to_cpu(HDR(bh)->h_refcount) > + /* + * We have to be careful about races with freeing or + * rehashing of xattr block. Once we hold buffer lock + * xattr block's state is stable so we can check + * whether the block got freed / rehashed or not. + * Since we unhash mbcache entry under buffer lock when + * freeing / rehashing xattr block, checking whether + * entry is still hashed is reliable. + */ + if (hlist_bl_unhashed(&ce->e_hash_list)) { + mb_cache_entry_put(ext2_mb_cache, ce); + unlock_buffer(bh); + brelse(bh); + goto again; + } else if (le32_to_cpu(HDR(bh)->h_refcount) > EXT2_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %ld refcount %d>%d", (unsigned long) ce->e_block, @@ -936,13 +928,14 @@ again: } else if (!ext2_xattr_cmp(header, HDR(bh))) { ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); - mb_cache_entry_release(ce); + mb_cache_entry_touch(ext2_mb_cache, ce); + mb_cache_entry_put(ext2_mb_cache, ce); return bh; } unlock_buffer(bh); brelse(bh); } - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + ce = mb_cache_entry_find_next(ext2_mb_cache, ce); } return NULL; } @@ -1015,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header, #undef BLOCK_HASH_SHIFT -int __init -init_ext2_xattr(void) +#define HASH_BUCKET_BITS 10 + +struct mb_cache *ext2_xattr_create_cache(void) { - ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); - if (!ext2_xattr_cache) - return -ENOMEM; - return 0; + return mb_cache_create(HASH_BUCKET_BITS); } -void -exit_ext2_xattr(void) +void ext2_xattr_destroy_cache(struct mb_cache *cache) { - mb_cache_destroy(ext2_xattr_cache); + if (cache) + mb_cache_destroy(cache); } diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 60edf298644e..6f82ab1b00ca 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -53,6 +53,8 @@ struct ext2_xattr_entry { #define EXT2_XATTR_SIZE(size) \ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) +struct mb_cache; + # ifdef CONFIG_EXT2_FS_XATTR extern const struct xattr_handler ext2_xattr_user_handler; @@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern void ext2_xattr_delete_inode(struct inode *); -extern void ext2_xattr_put_super(struct super_block *); -extern int init_ext2_xattr(void); -extern void exit_ext2_xattr(void); +extern struct mb_cache *ext2_xattr_create_cache(void); +extern void ext2_xattr_destroy_cache(struct mb_cache *cache); extern const struct xattr_handler *ext2_xattr_handlers[]; @@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode) { } -static inline void -ext2_xattr_put_super(struct super_block *sb) -{ -} - -static inline int -init_ext2_xattr(void) -{ - return 0; -} - -static inline void -exit_ext2_xattr(void) +static inline void ext2_xattr_destroy_cache(struct mb_cache *cache) { } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index ec0668a60678..fe1f50fe764f 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -191,7 +191,6 @@ static int ext4_init_block_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -442,14 +441,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); - if (err) + if (err) { + ext4_error(sb, "Failed to init block bitmap for group " + "%u: %d", block_group, err); goto out; + } goto verify; } ext4_unlock_group(sb, block_group); diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1a0835073663..edc053a81914 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -18,11 +18,9 @@ * Special Publication 800-38E and IEEE P1619/D16. */ -#include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> -#include <linux/crypto.h> #include <linux/ecryptfs.h> #include <linux/gfp.h> #include <linux/kernel.h> @@ -261,21 +259,21 @@ static int ext4_page_crypto(struct inode *inode, { u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist dst, src; struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback( + skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_crypt_complete, &ecr); @@ -288,21 +286,21 @@ static int ext4_page_crypto(struct inode *inode, sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); - ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + xts_tweak); if (rw == EXT4_DECRYPT) - res = crypto_ablkcipher_decrypt(req); + res = crypto_skcipher_decrypt(req); else - res = crypto_ablkcipher_encrypt(req); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res) { printk_ratelimited( KERN_ERR - "%s: crypto_ablkcipher_encrypt() returned %d\n", + "%s: crypto_skcipher_encrypt() returned %d\n", __func__, res); return res; } @@ -384,14 +382,12 @@ int ext4_decrypt(struct page *page) EXT4_DECRYPT, page->index, page, page); } -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len) { struct ext4_crypto_ctx *ctx; struct page *ciphertext_page = NULL; struct bio *bio; - ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); - ext4_fsblk_t pblk = ext4_ext_pblock(ex); - unsigned int len = ext4_ext_get_actual_len(ex); int ret, err = 0; #if 0 @@ -469,3 +465,59 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) return size; return 0; } + +/* + * Validate dentries for encrypted directories to make sure we aren't + * potentially caching stale data after a key has been added or + * removed. + */ +static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct inode *dir = d_inode(dentry->d_parent); + struct ext4_crypt_info *ci = EXT4_I(dir)->i_crypt_info; + int dir_has_key, cached_with_key; + + if (!ext4_encrypted_inode(dir)) + return 0; + + if (ci && ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD)))) + ci = NULL; + + /* this should eventually be an flag in d_flags */ + cached_with_key = dentry->d_fsdata != NULL; + dir_has_key = (ci != NULL); + + /* + * If the dentry was cached without the key, and it is a + * negative dentry, it might be a valid name. We can't check + * if the key has since been made available due to locking + * reasons, so we fail the validation so ext4_lookup() can do + * this check. + * + * We also fail the validation if the dentry was created with + * the key present, but we no longer have the key, or vice versa. + */ + if ((!cached_with_key && d_is_negative(dentry)) || + (!cached_with_key && dir_has_key) || + (cached_with_key && !dir_has_key)) { +#if 0 /* Revalidation debug */ + char buf[80]; + char *cp = simple_dname(dentry, buf, sizeof(buf)); + + if (IS_ERR(cp)) + cp = (char *) "???"; + pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata, + cached_with_key, d_is_negative(dentry), + dir_has_key); +#endif + return 0; + } + return 1; +} + +const struct dentry_operations ext4_encrypted_d_ops = { + .d_revalidate = ext4_d_revalidate, +}; diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 2fbef8a14760..1a2f360405db 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -11,11 +11,9 @@ * */ -#include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/skcipher.h> #include <keys/encrypted-type.h> #include <keys/user-type.h> -#include <linux/crypto.h> #include <linux/gfp.h> #include <linux/kernel.h> #include <linux/key.h> @@ -65,10 +63,10 @@ static int ext4_fname_encrypt(struct inode *inode, struct ext4_str *oname) { u32 ciphertext_len; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; struct scatterlist src_sg, dst_sg; @@ -95,14 +93,14 @@ static int ext4_fname_encrypt(struct inode *inode, } /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited( KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); kfree(alloc_buf); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); @@ -117,14 +115,14 @@ static int ext4_fname_encrypt(struct inode *inode, /* Create encryption request */ sg_init_one(&src_sg, workbuf, ciphertext_len); sg_init_one(&dst_sg, oname->name, ciphertext_len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); - res = crypto_ablkcipher_encrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); res = ecr.res; } kfree(alloc_buf); - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited( KERN_ERR "%s: Error (error code %d)\n", __func__, res); @@ -145,11 +143,11 @@ static int ext4_fname_decrypt(struct inode *inode, struct ext4_str *oname) { struct ext4_str tmp_in[2], tmp_out[1]; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; unsigned lim = max_name_len(inode); @@ -162,13 +160,13 @@ static int ext4_fname_decrypt(struct inode *inode, tmp_out[0].name = oname->name; /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited( KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); @@ -178,13 +176,13 @@ static int ext4_fname_decrypt(struct inode *inode, /* Create encryption request */ sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_ablkcipher_decrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_skcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited( KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n", diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index c5882b36e558..0129d688d1f7 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -8,6 +8,7 @@ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. */ +#include <crypto/skcipher.h> #include <keys/encrypted-type.h> #include <keys/user-type.h> #include <linux/random.h> @@ -41,45 +42,42 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) { int res = 0; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, - 0); + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); if (IS_ERR(tfm)) { res = PTR_ERR(tfm); tfm = NULL; goto out; } - crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { res = -ENOMEM; goto out; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, derive_crypt_complete, &ecr); - res = crypto_ablkcipher_setkey(tfm, deriving_key, - EXT4_AES_128_ECB_KEY_SIZE); + res = crypto_skcipher_setkey(tfm, deriving_key, + EXT4_AES_128_ECB_KEY_SIZE); if (res < 0) goto out; sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE); sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, - EXT4_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_ablkcipher_encrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, + EXT4_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); res = ecr.res; } out: - if (req) - ablkcipher_request_free(req); - if (tfm) - crypto_free_ablkcipher(tfm); + skcipher_request_free(req); + crypto_free_skcipher(tfm); return res; } @@ -90,7 +88,7 @@ void ext4_free_crypt_info(struct ext4_crypt_info *ci) if (ci->ci_keyring_key) key_put(ci->ci_keyring_key); - crypto_free_ablkcipher(ci->ci_ctfm); + crypto_free_skcipher(ci->ci_ctfm); kmem_cache_free(ext4_crypt_info_cachep, ci); } @@ -122,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode) struct ext4_encryption_context ctx; const struct user_key_payload *ukp; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct crypto_ablkcipher *ctfm; + struct crypto_skcipher *ctfm; const char *cipher_str; char raw_key[EXT4_MAX_KEY_SIZE]; char mode; @@ -213,9 +211,11 @@ retry: res = -ENOKEY; goto out; } + down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { res = -EINVAL; + up_read(&keyring_key->sem); goto out; } master_key = (struct ext4_encryption_key *)ukp->data; @@ -226,14 +226,16 @@ retry: "ext4: key size incorrect: %d\n", master_key->size); res = -ENOKEY; + up_read(&keyring_key->sem); goto out; } res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); if (res) goto out; got_key: - ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); + ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; printk(KERN_DEBUG @@ -242,11 +244,11 @@ got_key: goto out; } crypt_info->ci_ctfm = ctfm; - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + crypto_skcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_skcipher_tfm(ctfm), CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(ctfm, raw_key, - ext4_encryption_key_size(mode)); + res = crypto_skcipher_setkey(ctfm, raw_key, + ext4_encryption_key_size(mode)); if (res) goto out; memzero_explicit(raw_key, sizeof(raw_key)); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 1d1bca74f844..33f5e2a50cf8 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -111,6 +111,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int dir_has_error = 0; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + if (ext4_encrypted_inode(inode)) { + err = ext4_get_encryption_info(inode); + if (err && err != -ENOKEY) + return err; + } + if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) { @@ -157,8 +163,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; + goto errout; + } } if (!bh) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc7ca4e87144..393689dfa1af 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -42,6 +42,18 @@ */ /* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* * Define EXT4FS_DEBUG to produce debug messages */ #undef EXT4FS_DEBUG @@ -182,9 +194,9 @@ typedef struct ext4_io_end { struct bio *bio; /* Linked list of completed * bios covering the extent */ unsigned int flag; /* unwritten or not */ + atomic_t count; /* reference counter */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ - atomic_t count; /* reference counter */ } ext4_io_end_t; struct ext4_io_submit { @@ -378,14 +390,22 @@ struct flex_groups { #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ + +#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_PROJINHERIT_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ - EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) @@ -555,10 +575,12 @@ enum { #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (user for fallocate) */ #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 - /* Do not take i_data_sem locking in ext4_map_blocks */ -#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 /* Convert written extents to unwritten */ -#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) /* * The bit position of these flags must not overlap with any of the @@ -616,6 +638,46 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) +#ifndef FS_IOC_FSGETXATTR +/* Until the uapi changes get merged for project quota... */ + +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) + +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. + */ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ +#endif /* !defined(FS_IOC_FSGETXATTR) */ + +#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -910,6 +972,15 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. + */ + struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -965,13 +1036,8 @@ struct ext4_inode_info { * transaction reserved */ struct list_head i_rsv_conversion_list; - /* - * Completed IOs that need unwritten extents handling and don't have - * transaction reserved - */ - atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - atomic_t i_unwritten; /* Nr. of inflight conversions pending */ struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -993,6 +1059,7 @@ struct ext4_inode_info { /* Encryption params */ struct ext4_crypt_info *i_crypt_info; #endif + kprojid_t i_projid; }; /* @@ -1248,7 +1315,7 @@ struct ext4_super_block { #endif /* Number of quota types we support */ -#define EXT4_MAXQUOTAS 2 +#define EXT4_MAXQUOTAS 3 /* * fourth extended-fs super-block data in memory @@ -1453,16 +1520,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, } } -static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) -{ - return inode->i_private; -} - -static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) -{ - inode->i_private = io; -} - /* * Inode dynamic state flags */ @@ -1754,7 +1811,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ - EXT4_FEATURE_RO_COMPAT_QUOTA) + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -1796,6 +1854,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* @@ -2234,7 +2297,9 @@ void ext4_restore_control_page(struct page *data_page); struct page *ext4_encrypt(struct inode *inode, struct page *plaintext_page); int ext4_decrypt(struct page *page); -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); +extern const struct dentry_operations ext4_encrypted_d_ops; #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_init_crypto(void); @@ -2438,12 +2503,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); int ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); -int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); -int ext4_get_block_dax(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); + struct buffer_head *bh_result, int create); +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, @@ -2484,9 +2551,16 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); +extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, + struct extent_status *result); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -2825,7 +2899,7 @@ do { \ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = newsize; @@ -2848,6 +2922,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) return changed; } +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; @@ -2986,8 +3063,7 @@ extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, - struct inode *inode); + struct inode *dir, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); @@ -3211,10 +3287,7 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) #define EXT4_WQ_HASH_SZ 37 #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ EXT4_WQ_HASH_SZ]) -#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ - EXT4_WQ_HASH_SZ]) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; -extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; #define EXT4_RESIZING 0 extern int ext4_resize_begin(struct super_block *sb); diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index ac7d4e813796..1f73c29717e1 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -77,7 +77,7 @@ struct ext4_crypt_info { char ci_data_mode; char ci_filename_mode; char ci_flags; - struct crypto_ablkcipher *ci_ctfm; + struct crypto_skcipher *ci_ctfm; struct key *ci_keyring_key; char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 3c9381547094..8ecf84b8f5a1 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -11,7 +11,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public Licens + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 551353b1b17a..95bf4679ac54 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -15,7 +15,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public Licens + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ @@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; + /* + * The check for IO to unwritten extent is somewhat racy as we + * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after + * dropping i_data_sem. But reserved blocks should save us in that + * case. + */ if (ext4_ext_is_unwritten(ex1) && (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || atomic_read(&EXT4_I(inode)->i_unwritten) || @@ -2293,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode, } /* - * ext4_ext_put_gap_in_cache: - * calculate boundaries of the gap that the requested block fits into - * and cache this gap + * ext4_ext_determine_hole - determine hole around given block + * @inode: inode we lookup in + * @path: path in extent tree to @lblk + * @lblk: pointer to logical block around which we want to determine hole + * + * Determine hole length (and start if easily possible) around given logical + * block. We don't try too hard to find the beginning of the hole but @path + * actually points to extent before @lblk, we provide it. + * + * The function returns the length of a hole starting at @lblk. We update @lblk + * to the beginning of the hole if we managed to find it. */ -static void -ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t block) +static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *lblk) { int depth = ext_depth(inode); - ext4_lblk_t len; - ext4_lblk_t lblock; struct ext4_extent *ex; - struct extent_status es; + ext4_lblk_t len; ex = path[depth].p_ext; if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ - lblock = 0; + *lblk = 0; len = EXT_MAX_BLOCKS; - ext_debug("cache gap(whole file):"); - } else if (block < le32_to_cpu(ex->ee_block)) { - lblock = block; - len = le32_to_cpu(ex->ee_block) - block; - ext_debug("cache gap(before): %u [%u:%u]", - block, - le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex)); - } else if (block >= le32_to_cpu(ex->ee_block) + } else if (*lblk < le32_to_cpu(ex->ee_block)) { + len = le32_to_cpu(ex->ee_block) - *lblk; + } else if (*lblk >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; - lblock = le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex); + *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); next = ext4_ext_next_allocated_block(path); - ext_debug("cache gap(after): [%u:%u] %u", - le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex), - block); - BUG_ON(next == lblock); - len = next - lblock; + BUG_ON(next == *lblk); + len = next - *lblk; } else { BUG(); } + return len; +} + +/* + * ext4_ext_put_gap_in_cache: + * calculate boundaries of the gap that the requested block fits into + * and cache this gap + */ +static void +ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, + ext4_lblk_t hole_len) +{ + struct extent_status es; - ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); + ext4_es_find_delayed_extent_range(inode, hole_start, + hole_start + hole_len - 1, &es); if (es.es_len) { /* There's delayed extent containing lblock? */ - if (es.es_lblk <= lblock) + if (es.es_lblk <= hole_start) return; - len = min(es.es_lblk - lblock, len); + hole_len = min(es.es_lblk - hole_start, hole_len); } - ext_debug(" -> %u:%u\n", lblock, len); - ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); + ext_debug(" -> %u:%u\n", hole_start, hole_len); + ext4_es_insert_extent(inode, hole_start, hole_len, ~0, + EXTENT_STATUS_HOLE); } /* @@ -3119,19 +3135,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; - int ret; ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); - - if (ext4_encrypted_inode(inode)) - return ext4_encrypted_zeroout(inode, ex); - - ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); - if (ret > 0) - ret = 0; - - return ret; + return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, + ee_len); } /* @@ -3935,8 +3943,8 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, int flags, - unsigned int allocated, ext4_fsblk_t newblock) + struct ext4_ext_path **ppath, + unsigned int allocated) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; @@ -4015,7 +4023,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path *path = *ppath; int ret = 0; int err = 0; - ext4_io_end_t *io = ext4_inode_aio(inode); ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", @@ -4038,20 +4045,19 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; - /* - * Flag the inode(non aio case) or end_io struct (aio case) - * that this IO needs to conversion to written when IO is - * completed - */ - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { + if (flags & EXT4_GET_BLOCKS_ZERO) { + if (allocated > map->m_len) + allocated = map->m_len; + err = ext4_issue_zeroout(inode, map->m_lblk, newblock, + allocated); + if (err < 0) + goto out2; + } ret = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (ret >= 0) { @@ -4283,9 +4289,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; - ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; - int set_unwritten = 0; bool map_from_cluster = false; ext_debug("blocks %u/%u requested for inode %lu\n", @@ -4347,7 +4351,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { allocated = convert_initialized_extent( handle, inode, map, &path, - flags, allocated, newblock); + allocated); goto out2; } else if (!ext4_ext_is_unwritten(ex)) goto out; @@ -4368,11 +4372,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * we couldn't try to create block if create flag is zero */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + ext4_lblk_t hole_start, hole_len; + + hole_start = map->m_lblk; + hole_len = ext4_ext_determine_hole(inode, path, &hole_start); /* * put just found gap into cache to speed up * subsequent requests */ - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); + + /* Update hole_len to reflect hole size after map->m_lblk */ + if (hole_start != map->m_lblk) + hole_len -= map->m_lblk - hole_start; + map->m_pblk = 0; + map->m_len = min_t(unsigned int, map->m_len, hole_len); + goto out2; } @@ -4482,15 +4497,6 @@ got_allocated_blocks: if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; - /* - * io_end structure was created for every IO write to an - * unwritten extent. To avoid unnecessary conversion, - * here we flag the IO that really needs the conversion. - * For non asycn direct IO case, flag the inode state - * that we need to perform conversion when IO is done. - */ - if (flags & EXT4_GET_BLOCKS_PRE_IO) - set_unwritten = 1; } err = 0; @@ -4501,14 +4507,6 @@ got_allocated_blocks: err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); - if (!err && set_unwritten) { - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN); - } - if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; @@ -4685,10 +4683,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - /* * credits to insert 1 extent into extent tree */ @@ -4752,8 +4746,6 @@ retry: goto retry; } - ext4_inode_resume_unlocked_dio(inode); - return ret > 0 ? ret2 : ret; } @@ -4770,7 +4762,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; - struct address_space *mapping = inode->i_mapping; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); @@ -4786,17 +4777,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + len - 1); - if (ret) - return ret; - } - - /* * Round up offset. This is not fallocate, we neet to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the @@ -4817,7 +4797,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, else max_blocks -= lblk; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Indirect files do not support unwritten extnets @@ -4839,6 +4819,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4847,7 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags, mode); if (ret) - goto out_mutex; + goto out_dio; } @@ -4856,16 +4840,23 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); - /* Now release the pages and zero block aligned part of pages*/ + /* + * Prevent page faults from reinstantiating pages we have + * released from page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) { + up_write(&EXT4_I(inode)->i_mmap_sem); + goto out_dio; + } + /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; } @@ -4909,7 +4900,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, out_dio: ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -4980,7 +4971,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * We only support preallocation for extent-based files only @@ -4998,8 +4989,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5008,7 +5004,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) EXT4_I(inode)->i_sync_tid); } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } @@ -5494,21 +5490,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down offset to be aligned with page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ - mutex_lock(&inode->i_mutex); - + inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation @@ -5524,17 +5506,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* + * Write tail of the last page before removed range since it will get + * removed from the page cache below. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + if (ret) + goto out_mmap; + /* + * Write data that will be shifted to preserve them when discarding + * page cache below. We are also protected from pages becoming dirty + * by i_mmap_sem. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5573,10 +5581,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5627,21 +5636,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ - mutex_lock(&inode->i_mutex); - + inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; @@ -5660,17 +5655,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } /* Expand file to avoid data loss if there is error while shifting */ @@ -5741,10 +5751,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5779,8 +5790,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); - BUG_ON(!mutex_is_locked(&inode1->i_mutex)); - BUG_ON(!mutex_is_locked(&inode2->i_mutex)); + BUG_ON(!inode_is_locked(inode1)); + BUG_ON(!inode_is_locked(inode2)); *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index ac748b3af1c1..e38b987ac7f5 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -823,8 +823,8 @@ out: es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; - if (!ext4_es_is_referenced(es)) - ext4_es_set_referenced(es); + if (!ext4_es_is_referenced(es1)) + ext4_es_set_referenced(es1); stats->es_stats_cache_hits++; } else { stats->es_stats_cache_misses++; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 113837e7ba98..6659e216385e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(iocb->ki_filp); - struct mutex *aio_mutex = NULL; struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; + int unaligned_aio = 0; int overwrite = 0; ssize_t ret; + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + /* - * Unaligned direct AIO must be serialized; see comment above - * In the case of O_APPEND, assume that we must always serialize + * Unaligned direct AIO must be serialized among each other as zeroing + * of partial blocks of two competing unaligned AIOs can result in data + * corruption. */ - if (o_direct && - ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && !is_sync_kiocb(iocb) && - (iocb->ki_flags & IOCB_APPEND || - ext4_unaligned_aio(inode, from, iocb->ki_pos))) { - aio_mutex = ext4_aio_mutex(inode); - mutex_lock(aio_mutex); + ext4_unaligned_aio(inode, from, iocb->ki_pos)) { + unaligned_aio = 1; ext4_unwritten_wait(inode); } - mutex_lock(&inode->i_mutex); - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto out; - /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. @@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !aio_mutex && + if (ext4_should_dioread_nolock(inode) && !unaligned_aio && !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; @@ -169,7 +167,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; @@ -181,55 +179,43 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (o_direct) blk_finish_plug(&plug); - if (aio_mutex) - mutex_unlock(aio_mutex); return ret; out: - mutex_unlock(&inode->i_mutex); - if (aio_mutex) - mutex_unlock(aio_mutex); + inode_unlock(inode); return ret; } #ifdef CONFIG_FS_DAX -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16TB. Is that even supported? */ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; handle_t *handle = NULL; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -246,44 +232,73 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, ext4_chunk_trans_blocks(inode, PMD_SIZE / PAGE_SIZE)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_get_block_dax, ext4_end_io_unwritten); + ext4_dax_mmap_get_block, NULL); if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } -static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +/* + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() + * handler we check for races agaist truncate. Note that since we cycle through + * i_mmap_sem, we are sure that also any hole punching that began before we + * were called is finished by now and so if it included part of the file we + * are working on, our pte will get unmapped and the check for pte_same() in + * wp_pfn_shared() fails. Thus fault gets retried and things work out as + * desired. + */ +static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; + loff_t size; + int ret; + + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + + return ret; } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .pmd_fault = ext4_dax_pmd_fault, - .page_mkwrite = ext4_dax_mkwrite, - .pfn_mkwrite = dax_pfn_mkwrite, + .page_mkwrite = ext4_dax_fault, + .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = filemap_fault, + .fault = ext4_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; @@ -314,6 +329,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct vfsmount *mnt = filp->f_path.mnt; + struct inode *dir = filp->f_path.dentry->d_parent->d_inode; struct path path; char buf[64], *cp; int ret; @@ -357,6 +373,14 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ext4_encryption_info(inode) == NULL) return -ENOKEY; } + if (ext4_encrypted_inode(dir) && + !ext4_is_child_context_consistent_with_parent(dir, inode)) { + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu\n", + (unsigned long) dir->i_ino, + (unsigned long) inode->i_ino); + return -EPERM; + } /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present @@ -387,7 +411,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) */ static int ext4_find_unwritten_pgoff(struct inode *inode, int whence, - struct ext4_map_blocks *map, + ext4_lblk_t end_blk, loff_t *offset) { struct pagevec pvec; @@ -402,7 +426,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, blkbits = inode->i_sb->s_blocksize_bits; startoff = *offset; lastoff = startoff; - endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + endoff = (loff_t)end_blk << blkbits; index = startoff >> PAGE_CACHE_SHIFT; end = endoff >> PAGE_CACHE_SHIFT; @@ -520,18 +544,17 @@ out: static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; struct extent_status es; ext4_lblk_t start, last, end; loff_t dataoff, isize; int blkbits; - int ret = 0; + int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -542,44 +565,35 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) dataoff = offset; do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - if (last != start) - dataoff = (loff_t)last << blkbits; - break; + ret = ext4_get_next_extent(inode, last, end - last + 1, &es); + if (ret <= 0) { + /* No extent found -> no data */ + if (ret == 0) + ret = -ENXIO; + inode_unlock(inode); + return ret; } - /* - * If there is a delay extent at this offset, - * it will be as a data. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - if (last != start) - dataoff = (loff_t)last << blkbits; + last = es.es_lblk; + if (last != start) + dataoff = (loff_t)last << blkbits; + if (!ext4_es_is_unwritten(&es)) break; - } /* * If there is a unwritten extent at this offset, * it will be as a data or a hole according to page * cache that has data or not. */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, - &map, &dataoff); - if (unwritten) - break; - } - - last++; + if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, + es.es_lblk + es.es_len, &dataoff)) + break; + last += es.es_len; dataoff = (loff_t)last << blkbits; + cond_resched(); } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (dataoff > isize) return -ENXIO; @@ -593,18 +607,17 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; struct extent_status es; ext4_lblk_t start, last, end; loff_t holeoff, isize; int blkbits; - int ret = 0; + int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -615,47 +628,33 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) holeoff = offset; do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; + ret = ext4_get_next_extent(inode, last, end - last + 1, &es); + if (ret < 0) { + inode_unlock(inode); + return ret; } - - /* - * If there is a delay extent at this offset, - * we will skip this extent. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - last = es.es_lblk + es.es_len; - holeoff = (loff_t)last << blkbits; - continue; + /* Found a hole? */ + if (ret == 0 || es.es_lblk > last) { + if (last != start) + holeoff = (loff_t)last << blkbits; + break; } - /* * If there is a unwritten extent at this offset, * it will be as a data or a hole according to page * cache that has data or not. */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - &map, &holeoff); - if (!unwritten) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; - } - } + if (ext4_es_is_unwritten(&es) && + ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + last + es.es_len, &holeoff)) + break; - /* find a hole */ - break; + last += es.es_len; + holeoff = (loff_t)last << blkbits; + cond_resched(); } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (holeoff > isize) holeoff = isize; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1b8024d26f65..237b877d316d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -76,7 +76,6 @@ static int ext4_init_inode_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -191,8 +190,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); - if (err) + if (err) { + ext4_error(sb, "Failed to init inode bitmap for group " + "%u: %d", block_group, err); goto out; + } return bh; } ext4_unlock_group(sb, block_group); @@ -785,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, sbi = EXT4_SB(sb); /* - * Initalize owners and quota early so that we don't have to account + * Initialize owners and quota early so that we don't have to account * for quota initialization worst case in standard inode creating * transaction */ @@ -799,6 +801,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, inode->i_gid = dir->i_gid; } else inode_init_owner(inode, dir, mode); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) + ei->i_projid = EXT4_I(dir)->i_projid; + else + ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 355ef9c36c87..3027fa681de5 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, goto got_it; } - /* Next simple case - plain lookup or failed read of indirect block */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + /* Next simple case - plain lookup failed */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); + int i; + + /* Count number blocks in a subtree under 'partial' */ + count = 1; + for (i = 0; partial + i != chain + depth - 1; i++) + count *= epb; + /* Fill in size of a hole we found */ + map->m_pblk = 0; + map->m_len = min_t(unsigned int, map->m_len, count); + goto cleanup; + } + + /* Failed read of indirect block */ + if (err == -EIO) goto cleanup; /* @@ -693,21 +708,21 @@ retry: } if (IS_DAX(inode)) ret = dax_do_io(iocb, inode, iter, offset, - ext4_get_block, NULL, 0); + ext4_dio_get_block, NULL, 0); else ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - offset, ext4_get_block, NULL, - NULL, 0); + offset, ext4_dio_get_block, + NULL, NULL, 0); inode_dio_end(inode); } else { locked: if (IS_DAX(inode)) ret = dax_do_io(iocb, inode, iter, offset, - ext4_get_block, NULL, DIO_LOCKING); + ext4_dio_get_block, NULL, DIO_LOCKING); else ret = blockdev_direct_IO(iocb, inode, iter, offset, - ext4_get_block); + ext4_dio_get_block); if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d884989cc83d..7cbdd3752ba5 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -581,9 +581,10 @@ retry: if (ret) goto out; - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, from, to, ext4_get_block_write); - else + if (ext4_should_dioread_nolock(inode)) { + ret = __block_write_begin(page, from, to, + ext4_get_block_unwritten); + } else ret = __block_write_begin(page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { @@ -995,12 +996,11 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, */ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, + struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { - struct inode *dir = d_inode(dentry->d_parent); int err; struct ext4_dir_entry_2 *de; @@ -1245,12 +1245,11 @@ out: * the new created block. */ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode) + struct inode *dir, struct inode *inode) { int ret, inline_size; void *inline_start; struct ext4_iloc iloc; - struct inode *dir = d_inode(dentry->d_parent); ret = ext4_get_inode_loc(dir, &iloc); if (ret) @@ -1264,7 +1263,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; @@ -1285,7 +1284,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - ret = ext4_add_dirent_to_inline(handle, fname, dentry, + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); @@ -1698,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle, if (err) goto out; - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_mark_inode_dirty(handle, dir); if (unlikely(err)) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3bd912df6bf..b2e9576450eb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode) } truncate_inode_pages_final(&inode->i_data); - WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; } @@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); - WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); - /* * Protect us against freezing - iput() caller didn't have to have any * protection against it @@ -383,6 +380,21 @@ static int __check_block_validity(struct inode *inode, const char *func, return 0; } +int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, + ext4_lblk_t len) +{ + int ret; + + if (ext4_encrypted_inode(inode)) + return ext4_encrypted_zeroout(inode, lblk, pblk, len); + + ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -403,8 +415,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -412,8 +423,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, retval = ext4_ind_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); } - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - up_read((&EXT4_I(inode)->i_data_sem)); + up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collpased in status @@ -445,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocated. - * if create==0 and the blocks are pre-allocated and unwritten block, - * the result buffer head is unmapped. If the create ==1, it will make sure - * the buffer head is mapped. + * On success, it returns the number of blocks being mapped or allocated. if + * create==0 and the blocks are pre-allocated and unwritten, the resulting @map + * is marked as unwritten. If the create == 1, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in - * that case, buffer head is unmapped + * that case, @map is returned as unmapped but we still do fill map->m_len to + * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. */ @@ -494,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + map->m_pblk = 0; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; retval = 0; } else { BUG_ON(1); @@ -509,8 +524,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * Try to see if we can get the block without requesting a new * file system block. */ - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -541,8 +555,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (ret < 0) retval = ret; } - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - up_read((&EXT4_I(inode)->i_data_sem)); + up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { @@ -626,13 +639,29 @@ found: } /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed. + */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && + map->m_flags & EXT4_MAP_NEW) { + ret = ext4_issue_zeroout(inode, map->m_lblk, + map->m_pblk, map->m_len); + if (ret) { + retval = ret; + goto out_sem; + } + } + + /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && ext4_es_lookup_extent(inode, map->m_lblk, &es)) { if (ext4_es_is_written(&es)) - goto has_zeroout; + goto out_sem; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -643,11 +672,13 @@ found: status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); - if (ret < 0) + if (ret < 0) { retval = ret; + goto out_sem; + } } -has_zeroout: +out_sem: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); @@ -657,16 +688,39 @@ has_zeroout: return retval; } -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 +/* + * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages + * we have to be careful as someone else may be manipulating b_state as well. + */ +static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) +{ + unsigned long old_state; + unsigned long new_state; + + flags &= EXT4_MAP_FLAGS; + + /* Dummy buffer_head? Set non-atomically. */ + if (!bh->b_page) { + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; + return; + } + /* + * Someone else may be modifying b_state. Be careful! This is ugly but + * once we get rid of using bh as a container for mapping information + * to pass to / from get_block functions, this can go away. + */ + do { + old_state = READ_ONCE(bh->b_state); + new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; + } while (unlikely( + cmpxchg(&bh->b_state, old_state, new_state) != old_state)); +} static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { - handle_t *handle = ext4_journal_current_handle(); struct ext4_map_blocks map; - int ret = 0, started = 0; - int dio_credits; + int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; @@ -674,43 +728,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { - /* Direct IO write... */ - if (map.m_len > DIO_MAX_BLOCKS) - map.m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; - } - started = 1; - } - - ret = ext4_map_blocks(handle, inode, &map, flags); + ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, + flags); if (ret > 0) { - ext4_io_end_t *io_end = ext4_inode_aio(inode); - map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (IS_DAX(inode) && buffer_unwritten(bh)) { - /* - * dgc: I suspect unwritten conversion on ext4+DAX is - * fundamentally broken here when there are concurrent - * read/write in progress on this inode. - */ - WARN_ON_ONCE(io_end); - bh->b_assoc_map = inode->i_mapping; - bh->b_private = (void *)(unsigned long)iblock; - } - if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) - set_buffer_defer_completion(bh); + ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } - if (started) - ext4_journal_stop(handle); return ret; } @@ -722,6 +747,155 @@ int ext4_get_block(struct inode *inode, sector_t iblock, } /* + * Get block function used when preparing for buffered write if we require + * creating an unwritten extent if blocks haven't been allocated. The extent + * will be converted to written after the IO is complete. + */ +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); +} + +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + +static handle_t *start_dio_trans(struct inode *inode, + struct buffer_head *bh_result) +{ + int dio_credits; + + /* Trim mapping request to maximum we can map at once for DIO */ + if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) + bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; + dio_credits = ext4_chunk_trans_blocks(inode, + bh_result->b_size >> inode->i_blkbits); + return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); +} + +/* Get block function for DIO reads and writes to inodes without extents */ +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + handle_t *handle; + int ret; + + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + if (create) { + handle = start_dio_trans(inode, bh); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + ret = _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); + if (create) + ext4_journal_stop(handle); + return ret; +} + +/* + * Get block function for AIO DIO writes when we create unwritten extent if + * blocks are not allocated yet. The extent will be converted to written + * after IO is complete. + */ +static int ext4_dio_get_block_unwritten_async(struct inode *inode, + sector_t iblock, struct buffer_head *bh_result, int create) +{ + handle_t *handle; + int ret; + + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + handle = start_dio_trans(inode, bh_result); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); + ext4_journal_stop(handle); + + /* + * When doing DIO using unwritten extents, we need io_end to convert + * unwritten extents to written on IO completion. We allocate io_end + * once we spot unwritten extent and store it in b_private. Generic + * DIO code keeps b_private set and furthermore passes the value to + * our completion callback in 'private' argument. + */ + if (!ret && buffer_unwritten(bh_result)) { + if (!bh_result->b_private) { + ext4_io_end_t *io_end; + + io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!io_end) + return -ENOMEM; + bh_result->b_private = io_end; + ext4_set_io_unwritten_flag(inode, io_end); + } + set_buffer_defer_completion(bh_result); + } + + return ret; +} + +/* + * Get block function for non-AIO DIO writes when we create unwritten extent if + * blocks are not allocated yet. The extent will be converted to written + * after IO is complete from ext4_ext_direct_IO() function. + */ +static int ext4_dio_get_block_unwritten_sync(struct inode *inode, + sector_t iblock, struct buffer_head *bh_result, int create) +{ + handle_t *handle; + int ret; + + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + handle = start_dio_trans(inode, bh_result); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); + ext4_journal_stop(handle); + + /* + * Mark inode as having pending DIO writes to unwritten extents. + * ext4_ext_direct_IO() checks this flag and converts extents to + * written. + */ + if (!ret && buffer_unwritten(bh_result)) + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + + return ret; +} + +static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + + ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", + inode->i_ino, create); + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + ret = _ext4_get_block(inode, iblock, bh_result, 0); + /* + * Blocks should have been preallocated! ext4_file_write_iter() checks + * that. + */ + WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); + + return ret; +} + + +/* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, @@ -879,9 +1053,6 @@ int do_journal_get_write_access(handle_t *handle, return ret; } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); - #ifdef CONFIG_EXT4_FS_ENCRYPTION static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) @@ -1035,13 +1206,14 @@ retry_journal: #ifdef CONFIG_EXT4_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(page, pos, len, - ext4_get_block_write); + ext4_get_block_unwritten); else ret = ext4_block_write_begin(page, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, pos, len, ext4_get_block_write); + ret = __block_write_begin(page, pos, len, + ext4_get_block_unwritten); else ret = __block_write_begin(page, pos, len, ext4_get_block); #endif @@ -1669,7 +1841,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked @@ -2434,6 +2606,10 @@ static int ext4_writepages(struct address_space *mapping, trace_ext4_writepages(inode, wbc); + if (dax_mapping(mapping)) + return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, + wbc); + /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() @@ -3040,44 +3216,83 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } -/* - * ext4_get_block used when preparing for a DIO write or buffer write. - * We allocate an uinitialized extent if blocks haven't been allocated. - * The extent will be converted to initialized after the IO is complete. - */ -int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +#ifdef CONFIG_FS_DAX +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { - ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", - inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); -} + int ret, err; + int credits; + struct ext4_map_blocks map; + handle_t *handle = NULL; + int flags = 0; -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", + ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_NO_LOCK); -} + map.m_lblk = iblock; + map.m_len = bh_result->b_size >> inode->i_blkbits; + credits = ext4_chunk_trans_blocks(inode, map.m_len); + if (create) { + flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + } -int ext4_get_block_dax(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; - if (create) - flags |= EXT4_GET_BLOCKS_CREATE; - ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", - inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, flags); + ret = ext4_map_blocks(handle, inode, &map, flags); + if (create) { + err = ext4_journal_stop(handle); + if (ret >= 0 && err < 0) + ret = err; + } + if (ret <= 0) + goto out; + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int err2; + + /* + * We are protected by i_mmap_sem so we know block cannot go + * away from under us even though we dropped i_data_sem. + * Convert extent to written and write zeros there. + * + * Note: We may get here even when create == 0. + */ + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + err = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); + if (err < 0) + ret = err; + err2 = ext4_journal_stop(handle); + if (err2 < 0 && ret > 0) + ret = err2; + } +out: + WARN_ON_ONCE(ret == 0 && create); + if (ret > 0) { + map_bh(bh_result, inode->i_sb, map.m_pblk); + /* + * At least for now we have to clear BH_New so that DAX code + * doesn't attempt to zero blocks again in a racy way. + */ + map.m_flags &= ~EXT4_MAP_NEW; + ext4_update_bh_state(bh_result, map.m_flags); + bh_result->b_size = map.m_len << inode->i_blkbits; + ret = 0; + } + return ret; } +#endif static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { - ext4_io_end_t *io_end = iocb->private; + ext4_io_end_t *io_end = private; /* if not async direct IO just return */ if (!io_end) @@ -3085,10 +3300,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", - iocb->private, io_end->inode->i_ino, iocb, offset, - size); + io_end, io_end->inode->i_ino, iocb, offset, size); - iocb->private = NULL; io_end->offset = offset; io_end->size = size; ext4_put_io_end(io_end); @@ -3124,7 +3337,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; - ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) @@ -3143,24 +3355,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); - if (overwrite) { - down_read(&EXT4_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); - } + if (overwrite) + inode_unlock(inode); /* * We could direct write to holes and fallocate. * - * Allocated blocks to fill the hole are marked as - * unwritten to prevent parallel buffered read to expose - * the stale data before DIO complete the data IO. + * Allocated blocks to fill the hole are marked as unwritten to prevent + * parallel buffered read to expose the stale data before DIO complete + * the data IO. * - * As to previously fallocated extents, ext4 get_block will - * just simply mark the buffer mapped but still keep the - * extents unwritten. + * As to previously fallocated extents, ext4 get_block will just simply + * mark the buffer mapped but still keep the extents unwritten. * - * For non AIO case, we will convert those unwritten extents - * to written after return back from blockdev_direct_IO. + * For non AIO case, we will convert those unwritten extents to written + * after return back from blockdev_direct_IO. That way we save us from + * allocating io_end structure and also the overhead of offloading + * the extent convertion to a workqueue. * * For async DIO, the conversion needs to be deferred when the * IO is completed. The ext4 end_io callback function will be @@ -3168,30 +3379,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * case, we allocate an io_end structure to hook to the iocb. */ iocb->private = NULL; - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; - } - /* - * Grab reference for DIO. Will be dropped in ext4_end_io_dio() - */ - iocb->private = ext4_get_io_end(io_end); - /* - * we save the io structure for current async direct - * IO, so that later ext4_map_blocks() could flag the - * io structure whether there is a unwritten extents - * needs to be converted when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } - - if (overwrite) { - get_block_func = ext4_get_block_write_nolock; + if (overwrite) + get_block_func = ext4_dio_get_block_overwrite; + else if (is_sync_kiocb(iocb)) { + get_block_func = ext4_dio_get_block_unwritten_sync; + dio_flags = DIO_LOCKING; } else { - get_block_func = ext4_get_block_write; + get_block_func = ext4_dio_get_block_unwritten_async; dio_flags = DIO_LOCKING; } #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -3206,27 +3400,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block_func, ext4_end_io_dio, NULL, dio_flags); - /* - * Put our reference to io_end. This can free the io_end structure e.g. - * in sync IO case or in case of error. It can even perform extent - * conversion if all bios we submitted finished before we got here. - * Note that in that case iocb->private can be already set to NULL - * here. - */ - if (io_end) { - ext4_inode_aio_set(inode, NULL); - ext4_put_io_end(io_end); - /* - * When no IO was submitted ext4_end_io_dio() was not - * called so we have to put iocb's reference. - */ - if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { - WARN_ON(iocb->private != io_end); - WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); - ext4_put_io_end(io_end); - iocb->private = NULL; - } - } if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { int err; @@ -3241,14 +3414,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } -retake_lock: if (iov_iter_rw(iter) == WRITE) inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) { - up_read(&EXT4_I(inode)->i_data_sem); - mutex_lock(&inode->i_mutex); - } + if (overwrite) + inode_lock(inode); return ret; } @@ -3559,6 +3729,35 @@ int ext4_can_truncate(struct inode *inode) } /* + * We have to make sure i_disksize gets properly updated before we truncate + * page cache due to hole punching or zero range. Otherwise i_disksize update + * can get lost as it may have been postponed to submission of writeback but + * that will never happen after we truncate page cache. + */ +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + loff_t size = i_size_read(inode); + + WARN_ON(!inode_is_locked(inode)); + if (offset > size || offset + len < size) + return 0; + + if (EXT4_I(inode)->i_disksize >= size) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_update_i_disksize(inode, size); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + return 0; +} + +/* * ext4_punch_hole: punches a hole in a file by releaseing the blocks * associated with the given offset and length * @@ -3595,7 +3794,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3623,17 +3822,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) + if (last_block_offset > first_block_offset) { + ret = ext4_update_disksize_before_punch(inode, offset, length); + if (ret) + goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); + } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -3680,19 +3888,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - /* Now release the pages again to reduce race window */ - if (last_block_offset > first_block_offset) - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); out_dio: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -3762,7 +3966,7 @@ void ext4_truncate(struct inode *inode) * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) - WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -4010,7 +4214,7 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) new_fl |= S_DAX; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); @@ -4076,6 +4280,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode, EXT4_I(inode)->i_inline_off = 0; } +int ext4_get_projid(struct inode *inode, kprojid_t *projid) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) + return -EOPNOTSUPP; + *projid = EXT4_I(inode)->i_projid; + return 0; +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -4087,6 +4299,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) int block; uid_t i_uid; gid_t i_gid; + projid_t i_projid; inode = iget_locked(sb, ino); if (!inode) @@ -4136,12 +4349,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); + else + i_projid = EXT4_DEF_PROJID; + if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); + ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -4440,6 +4661,7 @@ static int ext4_do_update_inode(handle_t *handle, int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; + projid_t i_projid; spin_lock(&ei->i_raw_lock); @@ -4452,6 +4674,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); + i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); @@ -4529,6 +4752,15 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le16(ei->i_extra_isize); } } + + BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT) && + i_projid != EXT4_DEF_PROJID); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + raw_inode->i_projid = cpu_to_le32(i_projid); + ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); if (inode->i_sb->s_flags & MS_LAZYTIME) @@ -4824,6 +5056,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } else ext4_wait_for_tail_page_commit(inode); } + down_write(&EXT4_I(inode)->i_mmap_sem); /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. @@ -4831,6 +5064,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) truncate_pagecache(inode, inode->i_size); if (shrink) ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } if (!rc) { @@ -5082,6 +5316,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + return err; if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { @@ -5112,9 +5348,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) } } } - if (!err) - err = ext4_mark_iloc_dirty(handle, inode, &iloc); - return err; + return ext4_mark_iloc_dirty(handle, inode, &iloc); } /* @@ -5279,6 +5513,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); + + down_read(&EXT4_I(inode)->i_mmap_sem); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -5321,7 +5557,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unlock_page(page); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) - get_block = ext4_get_block_write; + get_block = ext4_get_block_unwritten; else get_block = ext4_get_block; retry_alloc: @@ -5348,6 +5584,86 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } + +int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&EXT4_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + + return err; +} + +/* + * Find the first extent at or after @lblk in an inode that is not a hole. + * Search for @map_len blocks at most. The extent is returned in @result. + * + * The function returns 1 if we found an extent. The function returns 0 in + * case there is no extent at or after @lblk and in that case also sets + * @result->es_len to 0. In case of error, the error code is returned. + */ +int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, struct extent_status *result) +{ + struct ext4_map_blocks map; + struct extent_status es = {}; + int ret; + + map.m_lblk = lblk; + map.m_len = map_len; + + /* + * For non-extent based files this loop may iterate several times since + * we do not determine full hole size. + */ + while (map.m_len > 0) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + /* There's extent covering m_lblk? Just return it. */ + if (ret > 0) { + int status; + + ext4_es_store_pblock(result, map.m_pblk); + result->es_lblk = map.m_lblk; + result->es_len = map.m_len; + if (map.m_flags & EXT4_MAP_UNWRITTEN) + status = EXTENT_STATUS_UNWRITTEN; + else + status = EXTENT_STATUS_WRITTEN; + ext4_es_store_status(result, status); + return 1; + } + ext4_es_find_delayed_extent_range(inode, map.m_lblk, + map.m_lblk + map.m_len - 1, + &es); + /* Is delalloc data before next block in extent tree? */ + if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) { + ext4_lblk_t offset = 0; + + if (es.es_lblk < lblk) + offset = lblk - es.es_lblk; + result->es_lblk = es.es_lblk + offset; + ext4_es_store_pblock(result, + ext4_es_pblock(&es) + offset); + result->es_len = es.es_len - offset; + ext4_es_store_status(result, ext4_es_status(&es)); + + return 1; + } + /* There's a hole at m_lblk, advance us after it */ + map.m_lblk += map.m_len; + map_len -= map.m_len; + map.m_len = map_len; + cond_resched(); + } + result->es_len = 0; + return 0; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5e872fd40e5e..eae5917c534e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -14,6 +14,7 @@ #include <linux/mount.h> #include <linux/file.h> #include <linux/random.h> +#include <linux/quotaops.h> #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -202,6 +203,238 @@ static int uuid_is_zero(__u8 u[16]) return 1; } +static int ext4_ioctl_setflags(struct inode *inode, + unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle = NULL; + int err = -EPERM, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + unsigned int jflag; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT4_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) + migrate = 1; + + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode); + inode->i_ctime = ext4_current_time(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + +flags_out: + return err; +} + +#ifdef CONFIG_QUOTA +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, rc; + handle_t *handle; + kprojid_t kprojid; + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) + return 0; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = -EPERM; + inode_lock(inode); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto out_unlock; + + err = ext4_get_inode_loc(inode, &iloc); + if (err) + goto out_unlock; + + raw_inode = ext4_raw_inode(&iloc); + if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { + err = -EOVERFLOW; + brelse(iloc.bh); + goto out_unlock; + } + brelse(iloc.bh); + + dquot_initialize(inode); + + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + EXT4_QUOTA_INIT_BLOCKS(sb) + + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_unlock; + } + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + + if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + struct dquot *transfer_to[MAXQUOTAS] = { }; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (transfer_to[PRJQUOTA]) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + } + EXT4_I(inode)->i_projid = kprojid; + inode->i_ctime = ext4_current_time(inode); +out_dirty: + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; +out_stop: + ext4_journal_stop(handle); +out_unlock: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; +} +#else +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +/* Transfer internal flags to xflags */ +static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) +{ + __u32 xflags = 0; + + if (iflags & EXT4_SYNC_FL) + xflags |= FS_XFLAG_SYNC; + if (iflags & EXT4_IMMUTABLE_FL) + xflags |= FS_XFLAG_IMMUTABLE; + if (iflags & EXT4_APPEND_FL) + xflags |= FS_XFLAG_APPEND; + if (iflags & EXT4_NODUMP_FL) + xflags |= FS_XFLAG_NODUMP; + if (iflags & EXT4_NOATIME_FL) + xflags |= FS_XFLAG_NOATIME; + if (iflags & EXT4_PROJINHERIT_FL) + xflags |= FS_XFLAG_PROJINHERIT; + return xflags; +} + +/* Transfer xflags flags to internal */ +static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) +{ + unsigned long iflags = 0; + + if (xflags & FS_XFLAG_SYNC) + iflags |= EXT4_SYNC_FL; + if (xflags & FS_XFLAG_IMMUTABLE) + iflags |= EXT4_IMMUTABLE_FL; + if (xflags & FS_XFLAG_APPEND) + iflags |= EXT4_APPEND_FL; + if (xflags & FS_XFLAG_NODUMP) + iflags |= EXT4_NODUMP_FL; + if (xflags & FS_XFLAG_NOATIME) + iflags |= EXT4_NOATIME_FL; + if (xflags & FS_XFLAG_PROJINHERIT) + iflags |= EXT4_PROJINHERIT_FL; + + return iflags; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -217,11 +450,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { - handle_t *handle = NULL; - int err, migrate = 0; - struct ext4_iloc iloc; - unsigned int oldflags, mask, i; - unsigned int jflag; + int err; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -235,90 +464,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext4_mask_flags(inode->i_mode, flags); - err = -EPERM; - mutex_lock(&inode->i_mutex); - /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) - goto flags_out; - - oldflags = ei->i_flags; - - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT4_JOURNAL_DATA_FL; - - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - * - * This test looks nicer. Thanks to Pauline Middelink - */ - if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) - goto flags_out; - } - - /* - * The JOURNAL_DATA flag can only be changed by - * the relevant capability. - */ - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) - goto flags_out; - } - if ((flags ^ oldflags) & EXT4_EXTENTS_FL) - migrate = 1; - - if (flags & EXT4_EOFBLOCKS_FL) { - /* we don't support adding EOFBLOCKS flag */ - if (!(oldflags & EXT4_EOFBLOCKS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto flags_out; - } - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto flags_err; - - for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { - if (!(mask & EXT4_FL_USER_MODIFIABLE)) - continue; - if (mask & flags) - ext4_set_inode_flag(inode, i); - else - ext4_clear_inode_flag(inode, i); - } - - ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); - - err = ext4_mark_iloc_dirty(handle, inode, &iloc); -flags_err: - ext4_journal_stop(handle); - if (err) - goto flags_out; - - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) - err = ext4_change_inode_journal_flag(inode, jflag); - if (err) - goto flags_out; - if (migrate) { - if (flags & EXT4_EXTENTS_FL) - err = ext4_ext_migrate(inode); - else - err = ext4_ind_migrate(inode); - } - -flags_out: - mutex_unlock(&inode->i_mutex); + inode_lock(inode); + err = ext4_ioctl_setflags(inode, flags); + inode_unlock(inode); mnt_drop_write_file(filp); return err; } @@ -349,7 +497,7 @@ flags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -364,7 +512,7 @@ flags_out: ext4_journal_stop(handle); unlock_out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); setversion_out: mnt_drop_write_file(filp); return err; @@ -435,6 +583,11 @@ group_extend_out: "Online defrag not supported with bigalloc"); err = -EOPNOTSUPP; goto mext_out; + } else if (IS_DAX(inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with DAX"); + err = -EOPNOTSUPP; + goto mext_out; } err = mnt_want_write_file(filp); @@ -510,9 +663,9 @@ group_add_out: * ext4_ext_swap_inode_data before we switch the * inode format to prevent read. */ - mutex_lock(&(inode->i_mutex)); + inode_lock((inode)); err = ext4_ext_migrate(inode); - mutex_unlock(&(inode->i_mutex)); + inode_unlock((inode)); mnt_drop_write_file(filp); return err; } @@ -689,6 +842,60 @@ encryption_policy_out: return -EOPNOTSUPP; #endif } + case EXT4_IOC_FSGETXATTR: + { + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + ext4_get_inode_flags(ei); + fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT)) { + fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, + EXT4_I(inode)->i_projid); + } + + if (copy_to_user((struct fsxattr __user *)arg, + &fa, sizeof(fa))) + return -EFAULT; + return 0; + } + case EXT4_IOC_FSSETXATTR: + { + struct fsxattr fa; + int err; + + if (copy_from_user(&fa, (struct fsxattr __user *)arg, + sizeof(fa))) + return -EFAULT; + + /* Make sure caller has proper permission */ + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + + flags = ext4_xflags_to_iflags(fa.fsx_xflags); + flags = ext4_mask_flags(inode->i_mode, flags); + + inode_lock(inode); + flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | + (flags & EXT4_FL_XFLAG_VISIBLE); + err = ext4_ioctl_setflags(inode, flags); + inode_unlock(inode); + mnt_drop_write_file(filp); + if (err) + return err; + + err = ext4_ioctl_setproject(filp, fa.fsx_projid); + if (err) + return err; + + return 0; + } default: return -ENOTTY; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 61eaf74dca37..50e05df28f66 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -11,7 +11,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public Licens + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ @@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) * for this page; do not hold this lock when calling this routine! */ -static int ext4_mb_init_cache(struct page *page, char *incore) +static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; int blocksize; @@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; - bh = kzalloc(i, GFP_NOFS); + bh = kzalloc(i, gfp); if (bh == NULL) { err = -ENOMEM; goto out; @@ -983,7 +983,7 @@ out: * are on the same page e4b->bd_buddy_page is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, - ext4_group_t group, struct ext4_buddy *e4b) + ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; @@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); @@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block++; pnum = block / blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); @@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) * calling this routine! */ static noinline_for_stack -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; @@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) * The call to ext4_mb_get_buddy_page_lock will mark the * page accessed. */ - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group @@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) } page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) goto err; if (!PageUptodate(page)) { @@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) } /* init buddy cache */ page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); if (ret) goto err; if (!PageUptodate(page)) { @@ -1109,8 +1109,8 @@ err: * calling this routine! */ static noinline_for_stack int -ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, - struct ext4_buddy *e4b) +ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b, gfp_t gfp) { int blocks_per_page; int block; @@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * we need full data about the group * to make a good selection */ - ret = ext4_mb_init_group(sb, group); + ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } @@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * wait for it to initialize. */ page_cache_release(page); - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) { unlock_page(page); goto err; @@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + ret = ext4_mb_init_cache(page, e4b->bd_bitmap, + gfp); if (ret) { unlock_page(page); goto err; @@ -1247,6 +1248,12 @@ err: return ret; } +static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); +} + static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) @@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - int ret = ext4_mb_init_group(ac->ac_sb, group); + int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); if (ret) return ret; } @@ -2285,7 +2292,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " - " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]"); + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); @@ -4695,16 +4702,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } /* - * We need to make sure we don't reuse the freed block until - * after the transaction is committed, which we can do by - * treating the block as metadata, below. We make an - * exception if the inode is to be written in writeback mode - * since writeback mode has weak data consistency guarantees. - */ - if (!ext4_should_writeback_data(inode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - /* * If the extent to be freed does not begin on a cluster * boundary, we need to deal with partial clusters at the * beginning and end of the extent. Normally we will free @@ -4738,14 +4735,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { int i; + int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; for (i = 0; i < count; i++) { cond_resched(); - bh = sb_find_get_block(inode->i_sb, block + i); - if (!bh) - continue; - ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, - inode, bh, block + i); + if (is_metadata) + bh = sb_find_get_block(inode->i_sb, block + i); + ext4_forget(handle, is_metadata, inode, bh, block + i); } } @@ -4815,16 +4811,23 @@ do_more: #endif trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); - err = ext4_mb_load_buddy(sb, block_group, &e4b); + /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); if (err) goto error_return; - if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { + /* + * We need to make sure we don't reuse the freed block until after the + * transaction is committed. We make an exception if the inode is to be + * written in writeback mode since writeback mode has weak data + * consistency guarantees. + */ + if (ext4_handle_valid(handle) && + ((flags & EXT4_FREE_BLOCKS_METADATA) || + !ext4_should_writeback_data(inode))) { struct ext4_free_data *new_entry; /* - * blocks being freed are metadata. these blocks shouldn't - * be used until this transaction is committed - * * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed * to fail. */ @@ -5217,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) grp = ext4_get_group_info(sb, group); /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - ret = ext4_mb_init_group(sb, group); + ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index d634e183b4d4..3ef1df6ae9ec 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -23,18 +23,6 @@ #include "ext4.h" /* - * with AGGRESSIVE_CHECK allocator runs consistency checks over - * structures. these checks slow things down a lot - */ -#define AGGRESSIVE_CHECK__ - -/* - * with DOUBLE_CHECK defined mballoc creates persistent in-core - * bitmaps, maintains and uses them to check for double allocations - */ -#define DOUBLE_CHECK__ - -/* */ #ifdef CONFIG_EXT4_DEBUG extern ushort ext4_mballoc_debug; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a4651894cc33..364ea4d4a943 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * blocks. * * While converting to extents we need not - * update the orignal inode i_blocks for extent blocks + * update the original inode i_blocks for extent blocks * via quota APIs. The quota update happened via tmp_inode already. */ spin_lock(&inode->i_lock); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0a512aa81bf7..24445275d330 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { - brelse(*bh); - *bh = NULL; ret = -EIO; goto warn_exit; } - mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) { ret = -EFSCORRUPTED; - else if (!ext4_mmp_csum_verify(sb, mmp)) + goto warn_exit; + } + if (!ext4_mmp_csum_verify(sb, mmp)) { ret = -EFSBADCRC; - else - return 0; - + goto warn_exit; + } + return 0; warn_exit: + brelse(*bh); + *bh = NULL; ext4_warning(sb, "Error %d while reading MMP block %llu", ret, mmp_block); return ret; @@ -181,15 +182,13 @@ static int kmmpd(void *data) EXT4_FEATURE_INCOMPAT_MMP)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; + goto exit_thread; } if (sb->s_flags & MS_RDONLY) { ext4_warning(sb, "kmmpd being stopped since filesystem " "has been remounted as readonly."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; + goto exit_thread; } diff = jiffies - last_update_time; @@ -211,9 +210,7 @@ static int kmmpd(void *data) if (retval) { ext4_error(sb, "error reading MMP data: %d", retval); - - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; + goto exit_thread; } mmp_check = (struct mmp_struct *)(bh_check->b_data); @@ -225,7 +222,9 @@ static int kmmpd(void *data) "The filesystem seems to have been" " multiply mounted."); ext4_error(sb, "abort"); - goto failed; + put_bh(bh_check); + retval = -EBUSY; + goto exit_thread; } put_bh(bh_check); } @@ -248,7 +247,8 @@ static int kmmpd(void *data) retval = write_mmp_block(sb, bh); -failed: +exit_thread: + EXT4_SB(sb)->s_mmp_tsk = NULL; kfree(data); brelse(bh); return retval; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index fb6f11709ae6..4098acc701c3 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -265,11 +265,12 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int tmp_data_size, data_size, replaced_size; - int err2, jblocks, retries = 0; + int i, err2, jblocks, retries = 0; int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; + struct buffer_head *bh = NULL; /* * It needs twice the amount of ordinary journal buffers because @@ -380,8 +381,17 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - *err = __block_write_begin(pagep[0], from, replaced_size, - ext4_get_block); + if (!page_has_buffers(pagep[0])) + create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0); + bh = page_buffers(pagep[0]); + for (i = 0; i < data_offset_in_page; i++) + bh = bh->b_this_page; + for (i = 0; i < block_len_in_page; i++) { + *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); + if (*err < 0) + break; + bh = bh->b_this_page; + } if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f27e0c2598c5..48e4b8907826 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -273,7 +273,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode); + struct inode *dir, struct inode *inode); /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -1558,6 +1558,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct ext4_dir_entry_2 *de; struct buffer_head *bh; + if (ext4_encrypted_inode(dir)) { + int res = ext4_get_encryption_info(dir); + + /* + * This should be a properly defined flag for + * dentry->d_flags when we uplift this to the VFS. + * d_fsdata is set to (void *) 1 if if the dentry is + * created while the directory was encrypted and we + * don't have access to the key. + */ + dentry->d_fsdata = NULL; + if (ext4_encryption_info(dir)) + dentry->d_fsdata = (void *) 1; + d_set_d_op(dentry, &ext4_encrypted_d_ops); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } + if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -1585,11 +1603,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && - (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !ext4_is_child_context_consistent_with_parent(dir, inode)) { + int nokey = ext4_encrypted_inode(inode) && + !ext4_encryption_info(inode); + iput(inode); + if (nokey) + return ERR_PTR(-ENOKEY); ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu\n", (unsigned long) dir->i_ino, @@ -1928,10 +1950,9 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * directory, and adds the dentry to the indexed directory. */ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, + struct inode *dir, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -2086,8 +2107,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, &fname, - dentry, inode); + retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); if (retval < 0) goto out; if (retval == 1) { @@ -2097,7 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, &fname, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); @@ -2119,7 +2139,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (blocks == 1 && !dx_fallback && ext4_has_feature_dir_index(sb)) { - retval = make_indexed_dir(handle, &fname, dentry, + retval = make_indexed_dir(handle, &fname, dir, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; @@ -2154,12 +2174,11 @@ out: * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode) + struct inode *dir, struct inode *inode) { struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; - struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; @@ -2756,7 +2775,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* * Exit early if inode already is on orphan list. This is a big speedup * since we don't have to contend on the global s_orphan_lock. @@ -2838,7 +2857,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; @@ -3212,6 +3231,12 @@ static int ext4_link(struct dentry *old_dentry, if (ext4_encrypted_inode(dir) && !ext4_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; + + if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -3492,6 +3517,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, int credits; u8 old_file_type; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3701,6 +3731,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.inode))) return -EPERM; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid)) || + (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(old_dir)->i_projid, + EXT4_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 090b3498638e..349d7aa04fe7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); WARN_ON(io_end->handle); - if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io_end->inode)); - for (bio = io_end->bio; bio; bio = next_bio) { next_bio = bio->bi_private; ext4_finish_bio(bio); @@ -265,7 +262,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); if (io) { - atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); atomic_set(&io->count, 1); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index ad62d7acc315..34038e3598d5 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -198,7 +198,7 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) if (flex_gd == NULL) goto out3; - if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data)) + if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data)) goto out2; flex_gd->count = flexbg_size; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f1b56ff01208..99996e9a8f57 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -55,7 +55,6 @@ static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; -static int ext4_mballoc_ready; static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, @@ -80,6 +79,36 @@ static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +/* + * Lock ordering + * + * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and + * i_mmap_rwsem (inode->i_mmap_rwsem)! + * + * page fault path: + * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> + * page lock -> i_data_sem (rw) + * + * buffered write path: + * sb_start_write -> i_mutex -> mmap_sem + * sb_start_write -> i_mutex -> transaction start -> page lock -> + * i_data_sem (rw) + * + * truncate: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + * i_mmap_rwsem (w) -> page lock + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + * transaction start -> i_data_sem (rw) + * + * direct IO: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> + * transaction start -> i_data_sem (rw) + * + * writepages: + * transaction start -> page lock(s) -> i_data_sem (rw) + */ + #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, @@ -814,7 +843,6 @@ static void ext4_put_super(struct super_block *sb) ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); - ext4_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { ext4_clear_feature_journal_needs_recovery(sb); @@ -914,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; - atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -958,6 +985,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); + init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); } @@ -1066,8 +1094,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, } #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") -#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +static char *quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); @@ -1100,6 +1128,7 @@ static const struct dquot_operations ext4_quota_operations = { .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, + .get_projid = ext4_get_projid, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -1393,9 +1422,9 @@ static const struct mount_opts { {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, - MOPT_NO_EXT2 | MOPT_SET}, + MOPT_NO_EXT2}, {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, - MOPT_NO_EXT2 | MOPT_CLEAR}, + MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, @@ -1673,6 +1702,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, ext4_msg(sb, KERN_INFO, "dax option not supported"); return -1; #endif + } else if (token == Opt_data_err_abort) { + sbi->s_mount_opt |= m->mount_opt; + } else if (token == Opt_data_err_ignore) { + sbi->s_mount_opt &= ~m->mount_opt; } else { if (!args->from) arg = 1; @@ -1882,6 +1915,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); + if (test_opt(sb, DATA_ERR_ABORT)) + SEQ_OPTS_PUTS("data_err=abort"); ext4_show_quota_options(seq, sb); return 0; @@ -2254,10 +2289,10 @@ static void ext4_orphan_cleanup(struct super_block *sb, __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); - mutex_lock(&inode->i_mutex); + inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); nr_truncates++; } else { if (test_opt(sb, DEBUG)) @@ -2526,6 +2561,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) "without CONFIG_QUOTA"); return 0; } + if (ext4_has_feature_project(sb) && !readonly) { + ext4_msg(sb, KERN_ERR, + "Filesystem with project quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return 0; + } #endif /* CONFIG_QUOTA */ return 1; } @@ -3654,7 +3695,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -3758,12 +3799,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; no_journal: - if (ext4_mballoc_ready) { - sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); - if (!sbi->s_mb_cache) { - ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); - goto failed_mount_wq; - } + sbi->s_mb_cache = ext4_xattr_create_cache(); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; } if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && @@ -3989,6 +4028,10 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -4790,6 +4833,48 @@ restore_opts: return err; } +#ifdef CONFIG_QUOTA +static int ext4_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -4822,6 +4907,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; +#ifdef CONFIG_QUOTA + if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) + ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); +#endif return 0; } @@ -4986,7 +5076,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); @@ -5014,7 +5105,8 @@ static int ext4_enable_quotas(struct super_block *sb) int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; @@ -5234,7 +5326,6 @@ MODULE_ALIAS_FS("ext4"); /* Shared across all ext4 file systems */ wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; -struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; static int __init ext4_init_fs(void) { @@ -5247,10 +5338,8 @@ static int __init ext4_init_fs(void) /* Build-time check for flags consistency */ ext4_check_flag_values(); - for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { - mutex_init(&ext4__aio_mutex[i]); + for (i = 0; i < EXT4_WQ_HASH_SZ; i++) init_waitqueue_head(&ext4__ioend_wq[i]); - } err = ext4_init_es(); if (err) @@ -5271,8 +5360,6 @@ static int __init ext4_init_fs(void) err = ext4_init_mballoc(); if (err) goto out2; - else - ext4_mballoc_ready = 1; err = init_inodecache(); if (err) goto out1; @@ -5288,7 +5375,6 @@ out: unregister_as_ext3(); destroy_inodecache(); out1: - ext4_mballoc_ready = 0; ext4_exit_mballoc(); out2: ext4_exit_sysfs(); diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index 011ba6670d99..c70d06a383e2 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -10,8 +10,10 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + down_write(&EXT4_I(inode)->i_mmap_sem); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a95151e875bd..0441e055c8e8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -545,30 +545,44 @@ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh) { - struct mb_cache_entry *ce = NULL; - int error = 0; struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + u32 hash, ref; + int error = 0; - ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bh); if (error) goto out; lock_buffer(bh); - if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + hash = le32_to_cpu(BHDR(bh)->h_hash); + ref = le32_to_cpu(BHDR(bh)->h_refcount); + if (ref == 1) { ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); + /* + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect freed block + */ + mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { - le32_add_cpu(&BHDR(bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); + ref--; + BHDR(bh)->h_refcount = cpu_to_le32(ref); + if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { + struct mb_cache_entry *ce; + + ce = mb_cache_entry_get(ext4_mb_cache, hash, + bh->b_blocknr); + if (ce) { + ce->e_reusable = 1; + mb_cache_entry_put(ext4_mb_cache, ce); + } + } + /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect @@ -790,8 +804,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, - bs->bh->b_blocknr); BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); if (error) @@ -799,10 +811,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { - if (ce) { - mb_cache_entry_free(ce); - ce = NULL; - } + __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); + + /* + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect modified + * block + */ + mb_cache_entry_delete_block(ext4_mb_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s); if (!error) { @@ -826,10 +843,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - if (ce) { - mb_cache_entry_release(ce); - ce = NULL; - } ea_bdebug(bs->bh, "cloning"); s->base = kmalloc(bs->bh->b_size, GFP_NOFS); error = -ENOMEM; @@ -872,6 +885,8 @@ inserted: if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { + u32 ref; + /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, @@ -884,9 +899,40 @@ inserted: if (error) goto cleanup_dquot; lock_buffer(new_bh); - le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + /* + * We have to be careful about races with + * freeing, rehashing or adding references to + * xattr block. Once we hold buffer lock xattr + * block's state is stable so we can check + * whether the block got freed / rehashed or + * not. Since we unhash mbcache entry under + * buffer lock when freeing / rehashing xattr + * block, checking whether entry is still + * hashed is reliable. Same rules hold for + * e_reusable handling. + */ + if (hlist_bl_unhashed(&ce->e_hash_list) || + !ce->e_reusable) { + /* + * Undo everything and check mbcache + * again. + */ + unlock_buffer(new_bh); + dquot_free_block(inode, + EXT4_C2B(EXT4_SB(sb), + 1)); + brelse(new_bh); + mb_cache_entry_put(ext4_mb_cache, ce); + ce = NULL; + new_bh = NULL; + goto inserted; + } + ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; + BHDR(new_bh)->h_refcount = cpu_to_le32(ref); + if (ref >= EXT4_XATTR_REFCOUNT_MAX) + ce->e_reusable = 0; ea_bdebug(new_bh, "reusing; refcount now=%d", - le32_to_cpu(BHDR(new_bh)->h_refcount)); + ref); unlock_buffer(new_bh); error = ext4_handle_dirty_xattr_block(handle, inode, @@ -894,7 +940,8 @@ inserted: if (error) goto cleanup_dquot; } - mb_cache_entry_release(ce); + mb_cache_entry_touch(ext4_mb_cache, ce); + mb_cache_entry_put(ext4_mb_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ @@ -959,7 +1006,7 @@ getblk_failed: cleanup: if (ce) - mb_cache_entry_release(ce); + mb_cache_entry_put(ext4_mb_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -1070,6 +1117,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, return 0; } +static int ext4_xattr_value_same(struct ext4_xattr_search *s, + struct ext4_xattr_info *i) +{ + void *value; + + if (le32_to_cpu(s->here->e_value_size) != i->value_len) + return 0; + value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); + return !memcmp(value, i->value, i->value_len); +} + /* * ext4_xattr_set_handle() * @@ -1146,6 +1204,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { + error = 0; + /* Xattr value did not change? Save us some work and bail out */ + if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) + goto cleanup; + if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) + goto cleanup; + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; @@ -1512,17 +1577,6 @@ cleanup: } /* - * ext4_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext4_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - -/* * ext4_xattr_cache_insert() * * Create a new entry in the extended attribute cache, and insert @@ -1533,26 +1587,19 @@ ext4_xattr_put_super(struct super_block *sb) static void ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { - __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - struct mb_cache_entry *ce; + struct ext4_xattr_header *header = BHDR(bh); + __u32 hash = le32_to_cpu(header->h_hash); + int reusable = le32_to_cpu(header->h_refcount) < + EXT4_XATTR_REFCOUNT_MAX; int error; - ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); - if (!ce) { - ea_bdebug(bh, "out of memory"); - return; - } - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + bh->b_blocknr, reusable); if (error) { - mb_cache_entry_free(ce); - if (error == -EBUSY) { + if (error == -EBUSY) ea_bdebug(bh, "already in cache"); - error = 0; - } - } else { + } else ea_bdebug(bh, "inserting [%x]", (int)hash); - mb_cache_entry_release(ce); - } } /* @@ -1614,33 +1661,20 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -again: - ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, - hash); + ce = mb_cache_entry_find_first(ext4_mb_cache, hash); while (ce) { struct buffer_head *bh; - if (IS_ERR(ce)) { - if (PTR_ERR(ce) == -EAGAIN) - goto again; - break; - } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long) ce->e_block); - } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= - EXT4_XATTR_REFCOUNT_MAX) { - ea_idebug(inode, "block %lu refcount %d>=%d", - (unsigned long) ce->e_block, - le32_to_cpu(BHDR(bh)->h_refcount), - EXT4_XATTR_REFCOUNT_MAX); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + ce = mb_cache_entry_find_next(ext4_mb_cache, ce); } return NULL; } @@ -1716,9 +1750,9 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #define HASH_BUCKET_BITS 10 struct mb_cache * -ext4_xattr_create_cache(char *name) +ext4_xattr_create_cache(void) { - return mb_cache_create(name, HASH_BUCKET_BITS); + return mb_cache_create(HASH_BUCKET_BITS); } void ext4_xattr_destroy_cache(struct mb_cache *cache) diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index ddc0957760ba..69dd3e6566e0 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_ extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); extern void ext4_xattr_delete_inode(handle_t *, struct inode *); -extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); @@ -124,7 +123,7 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); -extern struct mb_cache *ext4_xattr_create_cache(char *name); +extern struct mb_cache *ext4_xattr_create_cache(void); extern void ext4_xattr_destroy_cache(struct mb_cache *); #ifdef CONFIG_EXT4_FS_SECURITY diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c index 4a62ef14e932..95c5cf039711 100644 --- a/fs/f2fs/crypto.c +++ b/fs/f2fs/crypto.c @@ -23,11 +23,9 @@ * The usage of AES-XTS should conform to recommendations in NIST * Special Publication 800-38E and IEEE P1619/D16. */ -#include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> -#include <linux/crypto.h> #include <linux/ecryptfs.h> #include <linux/gfp.h> #include <linux/kernel.h> @@ -328,21 +326,21 @@ static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx, struct page *dest_page) { u8 xts_tweak[F2FS_XTS_TWEAK_SIZE]; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_F2FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback( + skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, f2fs_crypt_complete, &ecr); @@ -355,21 +353,21 @@ static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx, sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); - ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + xts_tweak); if (rw == F2FS_DECRYPT) - res = crypto_ablkcipher_decrypt(req); + res = crypto_skcipher_decrypt(req); else - res = crypto_ablkcipher_encrypt(req); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res) { printk_ratelimited(KERN_ERR - "%s: crypto_ablkcipher_encrypt() returned %d\n", + "%s: crypto_skcipher_encrypt() returned %d\n", __func__, res); return res; } diff --git a/fs/f2fs/crypto_fname.c b/fs/f2fs/crypto_fname.c index ab377d496a39..16aec6653291 100644 --- a/fs/f2fs/crypto_fname.c +++ b/fs/f2fs/crypto_fname.c @@ -15,11 +15,9 @@ * * This has not yet undergone a rigorous security audit. */ -#include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/skcipher.h> #include <keys/encrypted-type.h> #include <keys/user-type.h> -#include <linux/crypto.h> #include <linux/gfp.h> #include <linux/kernel.h> #include <linux/key.h> @@ -70,10 +68,10 @@ static int f2fs_fname_encrypt(struct inode *inode, const struct qstr *iname, struct f2fs_str *oname) { u32 ciphertext_len; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_F2FS_COMPLETION_RESULT(ecr); struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[F2FS_CRYPTO_BLOCK_SIZE]; struct scatterlist src_sg, dst_sg; @@ -99,14 +97,14 @@ static int f2fs_fname_encrypt(struct inode *inode, } /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); kfree(alloc_buf); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, f2fs_dir_crypt_complete, &ecr); @@ -121,15 +119,15 @@ static int f2fs_fname_encrypt(struct inode *inode, /* Create encryption request */ sg_init_one(&src_sg, workbuf, ciphertext_len); sg_init_one(&dst_sg, oname->name, ciphertext_len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); - res = crypto_ablkcipher_encrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } kfree(alloc_buf); - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); @@ -148,11 +146,11 @@ static int f2fs_fname_encrypt(struct inode *inode, static int f2fs_fname_decrypt(struct inode *inode, const struct f2fs_str *iname, struct f2fs_str *oname) { - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_F2FS_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[F2FS_CRYPTO_BLOCK_SIZE]; unsigned lim = max_name_len(inode); @@ -161,13 +159,13 @@ static int f2fs_fname_decrypt(struct inode *inode, return -EIO; /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, f2fs_dir_crypt_complete, &ecr); @@ -177,14 +175,14 @@ static int f2fs_fname_decrypt(struct inode *inode, /* Create decryption request */ sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_ablkcipher_decrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_skcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error in f2fs_fname_decrypt (error code %d)\n", diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 5de2d866a25c..2aeb6273bd8f 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -14,7 +14,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <uapi/linux/keyctl.h> -#include <crypto/hash.h> +#include <crypto/skcipher.h> #include <linux/f2fs_fs.h> #include "f2fs.h" @@ -44,46 +44,43 @@ static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE], char derived_key[F2FS_AES_256_XTS_KEY_SIZE]) { int res = 0; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_F2FS_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, - 0); + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); if (IS_ERR(tfm)) { res = PTR_ERR(tfm); tfm = NULL; goto out; } - crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { res = -ENOMEM; goto out; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, derive_crypt_complete, &ecr); - res = crypto_ablkcipher_setkey(tfm, deriving_key, + res = crypto_skcipher_setkey(tfm, deriving_key, F2FS_AES_128_ECB_KEY_SIZE); if (res < 0) goto out; sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE); sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, + skcipher_request_set_crypt(req, &src_sg, &dst_sg, F2FS_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_ablkcipher_encrypt(req); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } out: - if (req) - ablkcipher_request_free(req); - if (tfm) - crypto_free_ablkcipher(tfm); + skcipher_request_free(req); + crypto_free_skcipher(tfm); return res; } @@ -93,7 +90,7 @@ static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) return; key_put(ci->ci_keyring_key); - crypto_free_ablkcipher(ci->ci_ctfm); + crypto_free_skcipher(ci->ci_ctfm); kmem_cache_free(f2fs_crypt_info_cachep, ci); } @@ -123,7 +120,7 @@ int _f2fs_get_encryption_info(struct inode *inode) struct f2fs_encryption_key *master_key; struct f2fs_encryption_context ctx; const struct user_key_payload *ukp; - struct crypto_ablkcipher *ctfm; + struct crypto_skcipher *ctfm; const char *cipher_str; char raw_key[F2FS_MAX_KEY_SIZE]; char mode; @@ -213,7 +210,7 @@ retry: if (res) goto out; - ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); + ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; printk(KERN_DEBUG @@ -222,11 +219,10 @@ retry: goto out; } crypt_info->ci_ctfm = ctfm; - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(ctfm, raw_key, - f2fs_encryption_key_size(mode)); + crypto_skcipher_clear_flags(ctfm, ~0); + crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_skcipher_setkey(ctfm, raw_key, + f2fs_encryption_key_size(mode)); if (res) goto out; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ac9e7c6aac74..5c06db17e41f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -794,7 +794,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (start >= isize) @@ -860,7 +860,7 @@ out: if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h index c2c1c2b63b25..ea3d1d7c97f3 100644 --- a/fs/f2fs/f2fs_crypto.h +++ b/fs/f2fs/f2fs_crypto.h @@ -78,7 +78,7 @@ struct f2fs_crypt_info { char ci_data_mode; char ci_filename_mode; char ci_flags; - struct crypto_ablkcipher *ci_ctfm; + struct crypto_skcipher *ci_ctfm; struct key *ci_keyring_key; char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 18ddb1e5182a..ea272be62677 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -333,7 +333,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -388,10 +388,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -1219,7 +1219,7 @@ static long f2fs_fallocate(struct file *file, int mode, FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) @@ -1243,7 +1243,7 @@ static long f2fs_fallocate(struct file *file, int mode, } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; @@ -1307,13 +1307,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = f2fs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto out; } @@ -1322,7 +1322,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; @@ -1667,7 +1667,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, f2fs_balance_fs(sbi, true); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* writeback all dirty pages in the range */ err = filemap_write_and_wait_range(inode->i_mapping, range->start, @@ -1778,7 +1778,7 @@ do_map: clear_out: clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_CACHE_SHIFT; return err; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 7def96caec5f..d0b95c95079b 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, buf.dirent = dirent; buf.result = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); buf.ctx.pos = file->f_pos; ret = -ENOENT; if (!IS_DEADDIR(inode)) { @@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, short_only, both ? &buf : NULL); file->f_pos = buf.ctx.pos; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret >= 0) ret = buf.result; return ret; diff --git a/fs/fat/file.c b/fs/fat/file.c index 43d3475da83a..f70185668832 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -24,9 +24,9 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; - mutex_lock(&inode->i_mutex); + inode_lock(inode); attr = fat_make_attrs(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return put_user(attr, user_attr); } @@ -47,7 +47,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) err = mnt_want_write_file(file); if (err) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -109,7 +109,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); out: return err; @@ -246,7 +246,7 @@ static long fat_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_KEEP_SIZE) { ondisksize = inode->i_blocks << 9; if ((offset + len) <= ondisksize) @@ -272,7 +272,7 @@ static long fat_fallocate(struct file *file, int mode, } error: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/filesystems.c b/fs/filesystems.c index 5797d45a78cb..c5618db110be 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -46,9 +46,9 @@ void put_filesystem(struct file_system_type *fs) static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; - for (p=&file_systems; *p; p=&(*p)->next) - if (strlen((*p)->name) == len && - strncmp((*p)->name, name, len) == 0) + for (p = &file_systems; *p; p = &(*p)->next) + if (strncmp((*p)->name, name, len) == 0 && + !(*p)->name[len]) break; return p; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6915c950e6e8..5c46ed9f3e14 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect upto 5 slots */ +static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); +static struct workqueue_struct *isw_wq; + void __inode_attach_wb(struct inode *inode, struct page *page) { struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -424,6 +427,8 @@ skip_switch: iput(inode); kfree(isw); + + atomic_dec(&isw_nr_in_flight); } static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) @@ -433,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) /* needs to grab bh-unsafe locks, bounce to work item */ INIT_WORK(&isw->work, inode_switch_wbs_work_fn); - schedule_work(&isw->work); + queue_work(isw_wq, &isw->work); } /** @@ -469,7 +474,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); - if (inode->i_state & (I_WB_SWITCH | I_FREEING) || + if (!(inode->i_sb->s_flags & MS_ACTIVE) || + inode->i_state & (I_WB_SWITCH | I_FREEING) || inode_to_wb(inode) == isw->new_wb) { spin_unlock(&inode->i_lock); goto out_free; @@ -480,6 +486,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) ihold(inode); isw->inode = inode; + atomic_inc(&isw_nr_in_flight); + /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the mapping's @@ -840,6 +848,33 @@ restart: wb_put(last_wb); } +/** + * cgroup_writeback_umount - flush inode wb switches for umount + * + * This function is called when a super_block is about to be destroyed and + * flushes in-flight inode wb switches. An inode wb switch goes through + * RCU and then workqueue, so the two need to be flushed in order to ensure + * that all previously scheduled switches are finished. As wb switches are + * rare occurrences and synchronize_rcu() can take a while, perform + * flushing iff wb switches are in flight. + */ +void cgroup_writeback_umount(void) +{ + if (atomic_read(&isw_nr_in_flight)) { + synchronize_rcu(); + flush_workqueue(isw_wq); + } +} + +static int __init cgroup_writeback_init(void) +{ + isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); + if (!isw_wq) + return -ENOMEM; + return 0; +} +fs_initcall(cgroup_writeback_init); + #else /* CONFIG_CGROUP_WRITEBACK */ static struct bdi_writeback * diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 712601f299b8..4b855b65d457 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -944,7 +944,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!parent) return -ENOENT; - mutex_lock(&parent->i_mutex); + inode_lock(parent); if (!S_ISDIR(parent->i_mode)) goto unlock; @@ -962,7 +962,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { - mutex_lock(&d_inode(entry)->i_mutex); + inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; @@ -983,7 +983,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, clear_nlink(d_inode(entry)); err = 0; badentry: - mutex_unlock(&d_inode(entry)->i_mutex); + inode_unlock(d_inode(entry)); if (!err) d_delete(entry); } else { @@ -992,7 +992,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, dput(entry); unlock: - mutex_unlock(&parent->i_mutex); + inode_unlock(parent); iput(parent); return err; } @@ -1504,7 +1504,7 @@ void fuse_set_nowrite(struct inode *inode) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!mutex_is_locked(&inode->i_mutex)); + BUG_ON(!inode_is_locked(inode)); spin_lock(&fc->lock); BUG_ON(fi->writectr < 0); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aa03aab6a24f..b03d253ece15 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -207,7 +207,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) return err; if (lock_inode) - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = fuse_do_open(fc, get_node_id(inode), file, isdir); @@ -215,7 +215,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) fuse_finish_open(inode, file); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -413,9 +413,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); @@ -450,7 +450,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Start writeback against all dirty pages of the inode, then @@ -486,7 +486,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, err = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -1160,7 +1160,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return generic_file_write_iter(iocb, from); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1210,7 +1210,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } out: current->backing_dev_info = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return written ? written : err; } @@ -1322,10 +1322,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); if (!write) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } while (count) { @@ -1413,14 +1413,14 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) return -EIO; /* Don't allow parallel writes to the same file */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_invalidate_attr(inode); if (res > 0) fuse_write_update_size(inode, iocb->ki_pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -2287,17 +2287,17 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) retval = generic_file_llseek(file, offset, whence); break; case SEEK_END: - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fuse_update_attributes(inode, NULL, file, NULL); if (!retval) retval = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); break; case SEEK_HOLE: case SEEK_DATA: - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fuse_lseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); break; default: retval = -EINVAL; @@ -2944,7 +2944,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return -EOPNOTSUPP; if (lock_inode) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { loff_t endbyte = offset + length - 1; err = filemap_write_and_wait_range(inode->i_mapping, @@ -2990,7 +2990,7 @@ out: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 93f07465e5a6..aa016e4b8bec 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1082,7 +1082,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * the first place, mapping->nr_pages will always be zero. */ if (mapping->nrpages) { - loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); + loff_t lstart = offset & ~(PAGE_CACHE_SIZE - 1); loff_t len = iov_iter_count(iter); loff_t end = PAGE_ALIGN(offset + len) - 1; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 6a92592304fb..4a01f30e9995 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -798,7 +798,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index, int error; error = get_leaf_nr(dip, index, &leaf_no); - if (!error) + if (!IS_ERR_VALUE(error)) error = get_leaf(dip, leaf_no, bh_out); return error; @@ -1014,7 +1014,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) index = name->hash >> (32 - dip->i_depth); error = get_leaf_nr(dip, index, &leaf_no); - if (error) + if (IS_ERR_VALUE(error)) return error; /* Get the old leaf block */ @@ -1660,7 +1660,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, brelse(bh); if (fail_on_exist) return ERR_PTR(-EEXIST); - inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); + inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino); if (!IS_ERR(inode)) GFS2_I(inode)->i_rahead = rahead; return inode; diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 5d15e9498b48..d5bda8513457 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -137,7 +137,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, struct gfs2_sbd *sdp = sb->s_fs_info; struct inode *inode; - inode = gfs2_ilookup(sb, inum->no_addr, 0); + inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { iput(inode); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7412863cda1e..c9384f932975 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -914,7 +914,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -946,7 +946,7 @@ out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a4ff7b56f5cd..6539131c52a2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -572,17 +572,24 @@ static void delete_work_func(struct work_struct *work) struct inode *inode; u64 no_addr = gl->gl_name.ln_number; + /* If someone's using this glock to create a new dinode, the block must + have been freed by another node, then re-used, in which case our + iopen callback is too late after the fact. Ignore it. */ + if (test_bit(GLF_INODE_CREATING, &gl->gl_flags)) + goto out; + ip = gl->gl_object; /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ if (ip) - inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1); + inode = gfs2_ilookup(sdp->sd_vfs, no_addr); else inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); if (inode && !IS_ERR(inode)) { d_prune_aliases(inode); iput(inode); } +out: gfs2_glock_put(gl); } @@ -1015,6 +1022,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) handle_callback(gl, LM_ST_UNLOCKED, 0, false); list_del_init(&gh->gh_list); + clear_bit(HIF_HOLDER, &gh->gh_iflags); if (find_first_holder(gl) == NULL) { if (glops->go_unlock) { GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags)); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 845fb09cc606..a6a3389a07fc 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -328,6 +328,7 @@ enum { GLF_LRU = 13, GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, + GLF_INODE_CREATING = 16, /* Inode creation occurring */ }; struct gfs2_glock { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3e94400d587c..bb30f9a72c65 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -37,61 +37,9 @@ #include "super.h" #include "glops.h" -struct gfs2_skip_data { - u64 no_addr; - int skipped; - int non_block; -}; - -static int iget_test(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_skip_data *data = opaque; - - if (ip->i_no_addr == data->no_addr) { - if (data->non_block && - inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { - data->skipped = 1; - return 0; - } - return 1; - } - return 0; -} - -static int iget_set(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_skip_data *data = opaque; - - if (data->skipped) - return -ENOENT; - inode->i_ino = (unsigned long)(data->no_addr); - ip->i_no_addr = data->no_addr; - return 0; -} - -struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block) +struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr) { - unsigned long hash = (unsigned long)no_addr; - struct gfs2_skip_data data; - - data.no_addr = no_addr; - data.skipped = 0; - data.non_block = non_block; - return ilookup5(sb, hash, iget_test, &data); -} - -static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr, - int non_block) -{ - struct gfs2_skip_data data; - unsigned long hash = (unsigned long)no_addr; - - data.no_addr = no_addr; - data.skipped = 0; - data.non_block = non_block; - return iget5_locked(sb, hash, iget_test, iget_set, &data); + return ilookup(sb, (unsigned long)no_addr); } /** @@ -132,21 +80,21 @@ static void gfs2_set_iop(struct inode *inode) * @sb: The super block * @no_addr: The inode number * @type: The type of the inode - * non_block: Can we block on inodes that are being freed? * * Returns: A VFS inode, or an error */ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, - u64 no_addr, u64 no_formal_ino, int non_block) + u64 no_addr, u64 no_formal_ino) { struct inode *inode; struct gfs2_inode *ip; struct gfs2_glock *io_gl = NULL; int error; - inode = gfs2_iget(sb, no_addr, non_block); + inode = iget_locked(sb, (unsigned long)no_addr); ip = GFS2_I(inode); + ip->i_no_addr = no_addr; if (!inode) return ERR_PTR(-ENOMEM); @@ -221,7 +169,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, if (error) goto fail; - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0); if (IS_ERR(inode)) goto fail; @@ -592,7 +540,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_glock *io_gl; + struct gfs2_glock *io_gl = NULL; int error, free_vfs_inode = 1; u32 aflags = 0; unsigned blocks = 1; @@ -729,6 +677,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock2; + BUG_ON(test_and_set_bit(GLF_INODE_CREATING, &io_gl->gl_flags)); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (error) goto fail_gunlock2; @@ -771,12 +721,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, } gfs2_glock_dq_uninit(ghs); gfs2_glock_dq_uninit(ghs + 1); + clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); return error; fail_gunlock3: gfs2_glock_dq_uninit(&ip->i_iopen_gh); gfs2_glock_put(io_gl); fail_gunlock2: + if (io_gl) + clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); gfs2_glock_dq_uninit(ghs + 1); fail_free_inode: if (ip->i_gl) @@ -2067,7 +2020,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) @@ -2094,7 +2047,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, gfs2_glock_dq_uninit(&gh); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index ba4d9492d422..e1af0d4aa308 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -94,12 +94,11 @@ err: } extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino, - int non_block); + u64 no_addr, u64 no_formal_ino); extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, u64 *no_formal_ino, unsigned int blktype); -extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock); +extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index dbed9e243ea2..49b0bff18fe3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -454,7 +454,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, struct dentry *dentry; struct inode *inode; - inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); if (IS_ERR(inode)) { fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); return PTR_ERR(inode); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index be6d9c450b22..a39891344259 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -888,7 +888,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); for (qx = 0; qx < num_qd; qx++) { error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); @@ -953,7 +953,7 @@ out_alloc: out: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); kfree(ghs); gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH); return error; @@ -1674,7 +1674,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (error) goto out_put; - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); if (error) goto out_unlockput; @@ -1739,7 +1739,7 @@ out_i: out_q: gfs2_glock_dq_uninit(&q_gh); out_unlockput: - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); out_put: qd_put(qd); return error; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 8f960a51a9a0..f8a0cd821290 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1551,12 +1551,16 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); - error = gfs2_glock_nq(&ip->i_iopen_gh); - if (error) - goto out_truncate; + if (ip->i_iopen_gh.gh_gl && + test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, + &ip->i_iopen_gh); + error = gfs2_glock_nq(&ip->i_iopen_gh); + if (error) + goto out_truncate; + } /* Case 1 starts here */ @@ -1606,11 +1610,13 @@ out_unlock: if (gfs2_rs_active(&ip->i_res)) gfs2_rs_deltree(&ip->i_res); - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); + if (ip->i_iopen_gh.gh_gl) { + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_wait(&ip->i_iopen_gh); + } + gfs2_holder_uninit(&ip->i_iopen_gh); } - gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 70788e03820a..e9f2b855f831 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -173,9 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index b99ebddb10cb..6686bf39a5b5 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -570,13 +570,13 @@ static int hfs_file_release(struct inode *inode, struct file *file) if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfs_file_truncate(inode); //if (inode->i_flags & S_DEAD) { // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); // hfs_delete_inode(inode); //} - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* sync the inode to buffers */ ret = write_inode_now(inode, 0); @@ -668,7 +668,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = sync_blockdev(sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d0f39dcbb58e..a4e867e08947 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -284,9 +284,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file) { struct hfsplus_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 19b33f8151f1..1a6394cdb54e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -229,14 +229,14 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb)->hidden_dir, NULL); hfsplus_delete_inode(inode); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -286,7 +286,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Sync inode metadata into the catalog and extent trees. @@ -327,7 +327,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 0624ce4e0702..32a49e292b6a 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -93,7 +93,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) goto out_drop_write; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || inode->i_flags & (S_IMMUTABLE|S_APPEND)) { @@ -126,7 +126,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_drop_write: mnt_drop_write_file(file); out: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cfaa18c7a337..d1abbee281d1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -378,9 +378,9 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = fsync_file(HOSTFS_I(inode)->fd, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index dc540bfcee1d..e57a53c13d86 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,7 +33,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; - mutex_lock(&i->i_mutex); + inode_lock(i); hpfs_lock(s); /*pr_info("dir lseek\n");*/ @@ -48,12 +48,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) ok: filp->f_pos = new_off; hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return new_off; fail: /*pr_warn("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return -ESPIPE; } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 506765afa1a3..bb8d67e2740a 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -376,12 +376,11 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); dnode_secno dno; int r; - int rep = 0; int err; hpfs_lock(dir->i_sb); hpfs_adjust_length(name, &len); -again: + err = -ENOENT; de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); if (!de) @@ -401,33 +400,9 @@ again: hpfs_error(dir->i_sb, "there was error when removing dirent"); err = -EFSERROR; break; - case 2: /* no space for deleting, try to truncate file */ - + case 2: /* no space for deleting */ err = -ENOSPC; - if (rep++) - break; - - dentry_unhash(dentry); - if (!d_unhashed(dentry)) { - hpfs_unlock(dir->i_sb); - return -ENOSPC; - } - if (generic_permission(inode, MAY_WRITE) || - !S_ISREG(inode->i_mode) || - get_write_access(inode)) { - d_rehash(dentry); - } else { - struct iattr newattrs; - /*pr_info("truncating file before delete.\n");*/ - newattrs.ia_size = 0; - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; - err = notify_change(dentry, &newattrs, NULL); - put_write_access(inode); - if (!err) - goto again; - } - hpfs_unlock(dir->i_sb); - return -ENOSPC; + break; default: drop_nlink(inode); err = 0; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8bbf7f3e2a27..e1f465a389d5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma_len = (loff_t)(vma->vm_end - vma->vm_start); - mutex_lock(&inode->i_mutex); + inode_lock(inode); file_accessed(file); ret = -ENOMEM; @@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -538,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) hole_end >> PAGE_SHIFT); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -572,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, start = offset >> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); @@ -659,7 +659,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, i_size_write(inode, offset + len); inode->i_ctime = CURRENT_TIME; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/fs/inode.c b/fs/inode.c index e491e54d2430..69b8b526c194 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -154,6 +154,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_rdev = 0; inode->dirtied_when = 0; +#ifdef CONFIG_CGROUP_WRITEBACK + inode->i_wb_frn_winner = 0; + inode->i_wb_frn_avg_time = 0; + inode->i_wb_frn_history = 0; +#endif + if (security_inode_alloc(inode)) goto out; spin_lock_init(&inode->i_lock); @@ -495,7 +501,7 @@ void clear_inode(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); - BUG_ON(inode->i_data.nrshadows); + BUG_ON(inode->i_data.nrexceptional); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); @@ -966,9 +972,9 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) swap(inode1, inode2); if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_lock(&inode1->i_mutex); + inode_lock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); + inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); @@ -980,9 +986,9 @@ EXPORT_SYMBOL(lock_two_nondirectories); void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_unlock(&inode1->i_mutex); + inode_unlock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode2); } EXPORT_SYMBOL(unlock_two_nondirectories); diff --git a/fs/ioctl.c b/fs/ioctl.c index 29466c380958..116a333e9c77 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -434,9 +434,9 @@ int generic_block_fiemap(struct inode *inode, u64 len, get_block_t *get_block) { int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(generic_block_fiemap); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 36345fefa3ff..517f2de784cf 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -131,14 +131,12 @@ static int journal_submit_commit_record(journal_t *journal, if (is_journal_aborted(journal)) return 0; - bh = jbd2_journal_get_descriptor_buffer(journal); + bh = jbd2_journal_get_descriptor_buffer(commit_transaction, + JBD2_COMMIT_BLOCK); if (!bh) return 1; tmp = (struct commit_header *)bh->b_data; - tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); - tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); tmp->h_commit_sec = cpu_to_be64(now.tv_sec); tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); @@ -222,7 +220,7 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { mapping = jinode->i_vfs_inode->i_mapping; - set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); /* * submit the inode data buffers. We use writepage @@ -236,8 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); - clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_atomic(); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + smp_mb(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } spin_unlock(&journal->j_list_lock); @@ -258,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { - set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); + jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); if (err) { @@ -274,8 +272,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, ret = err; } spin_lock(&journal->j_list_lock); - clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_atomic(); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + smp_mb(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -319,22 +317,6 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag, tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } -static void jbd2_descr_block_csum_set(journal_t *j, - struct buffer_head *bh) -{ - struct jbd2_journal_block_tail *tail; - __u32 csum; - - if (!jbd2_journal_has_csum_v2or3(j)) - return; - - tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - - sizeof(struct jbd2_journal_block_tail)); - tail->t_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); - tail->t_checksum = cpu_to_be32(csum); -} - static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct buffer_head *bh, __u32 sequence) { @@ -379,7 +361,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) ktime_t start_time; u64 commit_time; char *tagp = NULL; - journal_header_t *header; journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; @@ -554,8 +535,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_abort(journal, err); blk_start_plug(&plug); - jbd2_journal_write_revoke_records(journal, commit_transaction, - &log_bufs, WRITE_SYNC); + jbd2_journal_write_revoke_records(commit_transaction, &log_bufs); jbd_debug(3, "JBD2: commit phase 2b\n"); @@ -616,7 +596,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(4, "JBD2: get descriptor\n"); - descriptor = jbd2_journal_get_descriptor_buffer(journal); + descriptor = jbd2_journal_get_descriptor_buffer( + commit_transaction, + JBD2_DESCRIPTOR_BLOCK); if (!descriptor) { jbd2_journal_abort(journal, -EIO); continue; @@ -625,11 +607,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(4, "JBD2: got buffer %llu (%p)\n", (unsigned long long)descriptor->b_blocknr, descriptor->b_data); - header = (journal_header_t *)descriptor->b_data; - header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); - header->h_sequence = cpu_to_be32(commit_transaction->t_tid); - tagp = &descriptor->b_data[sizeof(journal_header_t)]; space_left = descriptor->b_size - sizeof(journal_header_t); @@ -721,7 +698,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); - jbd2_descr_block_csum_set(journal, descriptor); + jbd2_descriptor_block_csum_set(journal, descriptor); start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 81e622681c82..de73a9516a54 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -805,10 +805,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, * But we don't bother doing that, so there will be coherency problems with * mmaps of blockdevs which hold live JBD-controlled filesystems. */ -struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) +struct buffer_head * +jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) { + journal_t *journal = transaction->t_journal; struct buffer_head *bh; unsigned long long blocknr; + journal_header_t *header; int err; err = jbd2_journal_next_log_block(journal, &blocknr); @@ -821,12 +824,31 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return NULL; lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); + header = (journal_header_t *)bh->b_data; + header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(type); + header->h_sequence = cpu_to_be32(transaction->t_tid); set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "return this buffer"); return bh; } +void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) +{ + struct jbd2_journal_block_tail *tail; + __u32 csum; + + if (!jbd2_journal_has_csum_v2or3(j)) + return; + + tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - + sizeof(struct jbd2_journal_block_tail)); + tail->t_checksum = 0; + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + tail->t_checksum = cpu_to_be32(csum); +} + /* * Return tid of the oldest transaction in the journal and block in the journal * where the transaction starts. @@ -1408,11 +1430,12 @@ out: /** * jbd2_mark_journal_empty() - Mark on disk journal as empty. * @journal: The journal to update. + * @write_op: With which operation should we write the journal sb * * Update a journal's dynamic superblock fields to show that journal is empty. * Write updated superblock to disk waiting for IO to complete. */ -static void jbd2_mark_journal_empty(journal_t *journal) +static void jbd2_mark_journal_empty(journal_t *journal, int write_op) { journal_superblock_t *sb = journal->j_superblock; @@ -1430,7 +1453,7 @@ static void jbd2_mark_journal_empty(journal_t *journal) sb->s_start = cpu_to_be32(0); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_FUA); + jbd2_write_superblock(journal, write_op); /* Log is no longer empty */ write_lock(&journal->j_state_lock); @@ -1716,7 +1739,13 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal); + + write_lock(&journal->j_state_lock); + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + write_unlock(&journal->j_state_lock); + + jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1975,7 +2004,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal); + jbd2_mark_journal_empty(journal, WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2021,7 +2050,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal); + jbd2_mark_journal_empty(journal, WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } @@ -2565,7 +2594,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal, restart: spin_lock(&journal->j_list_lock); /* Is commit writing out inode - we have to wait */ - if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) { + if (jinode->i_flags & JI_COMMIT_RUNNING) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 7f277e49fe88..08a456b96e4e 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -174,8 +174,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, return 0; } -static int jbd2_descr_block_csum_verify(journal_t *j, - void *buf) +static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) { struct jbd2_journal_block_tail *tail; __be32 provided; @@ -522,8 +521,8 @@ static int do_one_pass(journal_t *journal, descr_csum_size = sizeof(struct jbd2_journal_block_tail); if (descr_csum_size > 0 && - !jbd2_descr_block_csum_verify(journal, - bh->b_data)) { + !jbd2_descriptor_block_csum_verify(journal, + bh->b_data)) { printk(KERN_ERR "JBD2: Invalid checksum " "recovering block %lu in log\n", next_log_block); @@ -811,26 +810,6 @@ static int do_one_pass(journal_t *journal, return err; } -static int jbd2_revoke_block_csum_verify(journal_t *j, - void *buf) -{ - struct jbd2_journal_revoke_tail *tail; - __be32 provided; - __u32 calculated; - - if (!jbd2_journal_has_csum_v2or3(j)) - return 1; - - tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - - sizeof(struct jbd2_journal_revoke_tail)); - provided = tail->r_checksum; - tail->r_checksum = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); - tail->r_checksum = provided; - - return provided == cpu_to_be32(calculated); -} - /* Scan a revoke record, marking all blocks mentioned as revoked. */ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, @@ -846,11 +825,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, offset = sizeof(jbd2_journal_revoke_header_t); rcount = be32_to_cpu(header->r_count); - if (!jbd2_revoke_block_csum_verify(journal, header)) + if (!jbd2_descriptor_block_csum_verify(journal, header)) return -EFSBADCRC; if (jbd2_journal_has_csum_v2or3(journal)) - csum_size = sizeof(struct jbd2_journal_revoke_tail); + csum_size = sizeof(struct jbd2_journal_block_tail); if (rcount > journal->j_blocksize - csum_size) return -EINVAL; max = rcount; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 705ae577882b..91171dc352cb 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -122,11 +122,11 @@ struct jbd2_revoke_table_s #ifdef __KERNEL__ -static void write_one_revoke_record(journal_t *, transaction_t *, +static void write_one_revoke_record(transaction_t *, struct list_head *, struct buffer_head **, int *, - struct jbd2_revoke_record_s *, int); -static void flush_descriptor(journal_t *, struct buffer_head *, int, int); + struct jbd2_revoke_record_s *); +static void flush_descriptor(journal_t *, struct buffer_head *, int); #endif /* Utility functions to maintain the revoke table */ @@ -519,11 +519,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. */ -void jbd2_journal_write_revoke_records(journal_t *journal, - transaction_t *transaction, - struct list_head *log_bufs, - int write_op) +void jbd2_journal_write_revoke_records(transaction_t *transaction, + struct list_head *log_bufs) { + journal_t *journal = transaction->t_journal; struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; @@ -544,16 +543,15 @@ void jbd2_journal_write_revoke_records(journal_t *journal, while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; - write_one_revoke_record(journal, transaction, log_bufs, - &descriptor, &offset, - record, write_op); + write_one_revoke_record(transaction, log_bufs, + &descriptor, &offset, record); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset, write_op); + flush_descriptor(journal, descriptor, offset); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -562,18 +560,16 @@ void jbd2_journal_write_revoke_records(journal_t *journal, * block if the old one is full or if we have not already created one. */ -static void write_one_revoke_record(journal_t *journal, - transaction_t *transaction, +static void write_one_revoke_record(transaction_t *transaction, struct list_head *log_bufs, struct buffer_head **descriptorp, int *offsetp, - struct jbd2_revoke_record_s *record, - int write_op) + struct jbd2_revoke_record_s *record) { + journal_t *journal = transaction->t_journal; int csum_size = 0; struct buffer_head *descriptor; int sz, offset; - journal_header_t *header; /* If we are already aborting, this all becomes a noop. We still need to go round the loop in @@ -587,7 +583,7 @@ static void write_one_revoke_record(journal_t *journal, /* Do we need to leave space at the end for a checksum? */ if (jbd2_journal_has_csum_v2or3(journal)) - csum_size = sizeof(struct jbd2_journal_revoke_tail); + csum_size = sizeof(struct jbd2_journal_block_tail); if (jbd2_has_feature_64bit(journal)) sz = 8; @@ -597,19 +593,16 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset + sz > journal->j_blocksize - csum_size) { - flush_descriptor(journal, descriptor, offset, write_op); + flush_descriptor(journal, descriptor, offset); descriptor = NULL; } } if (!descriptor) { - descriptor = jbd2_journal_get_descriptor_buffer(journal); + descriptor = jbd2_journal_get_descriptor_buffer(transaction, + JBD2_REVOKE_BLOCK); if (!descriptor) return; - header = (journal_header_t *)descriptor->b_data; - header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); - header->h_sequence = cpu_to_be32(transaction->t_tid); /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(descriptor, "file in log_bufs"); @@ -630,21 +623,6 @@ static void write_one_revoke_record(journal_t *journal, *offsetp = offset; } -static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) -{ - struct jbd2_journal_revoke_tail *tail; - __u32 csum; - - if (!jbd2_journal_has_csum_v2or3(j)) - return; - - tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - - sizeof(struct jbd2_journal_revoke_tail)); - tail->r_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); - tail->r_checksum = cpu_to_be32(csum); -} - /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to @@ -654,7 +632,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) static void flush_descriptor(journal_t *journal, struct buffer_head *descriptor, - int offset, int write_op) + int offset) { jbd2_journal_revoke_header_t *header; @@ -665,12 +643,12 @@ static void flush_descriptor(journal_t *journal, header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); - jbd2_revoke_csum_set(journal, descriptor); + jbd2_descriptor_block_csum_set(journal, descriptor); set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, write_op); + write_dirty_buffer(descriptor, WRITE_SYNC); } #endif diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 081dff087fc0..01e4652d88f6 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -966,14 +966,8 @@ repeat: if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); - if (!frozen_buffer) { - printk(KERN_ERR "%s: OOM for frozen_buffer\n", - __func__); - JBUFFER_TRACE(jh, "oom!"); - error = -ENOMEM; - goto out; - } + frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS | __GFP_NOFAIL); goto repeat; } jh->b_frozen_data = frozen_buffer; @@ -1226,15 +1220,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) goto out; repeat: - if (!jh->b_committed_data) { - committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); - if (!committed_data) { - printk(KERN_ERR "%s: No memory for committed data\n", - __func__); - err = -ENOMEM; - goto out; - } - } + if (!jh->b_committed_data) + committed_data = jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS|__GFP_NOFAIL); jbd_lock_bh_state(bh); if (!jh->b_committed_data) { diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking index 3ea36554107f..8918ac905a3b 100644 --- a/fs/jffs2/README.Locking +++ b/fs/jffs2/README.Locking @@ -2,10 +2,6 @@ JFFS2 LOCKING DOCUMENTATION --------------------------- -At least theoretically, JFFS2 does not require the Big Kernel Lock -(BKL), which was always helpfully obtained for it by Linux 2.4 VFS -code. It has its own locking, as described below. - This document attempts to describe the existing locking rules for JFFS2. It is not expected to remain perfectly up to date, but ought to be fairly close. @@ -69,6 +65,7 @@ Ordering constraints: any f->sem held. 2. Never attempt to lock two file mutexes in one thread. No ordering rules have been made for doing so. + 3. Never lock a page cache page with f->sem held. erase_completion_lock spinlock diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index a3750f902adc..b288c8ae1236 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mtd/mtd.h> +#include <linux/mm.h> /* kvfree() */ #include "nodelist.h" static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, @@ -49,7 +50,8 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c) static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, - struct jffs2_inode_cache *ic) + struct jffs2_inode_cache *ic, + int *dir_hardlinks) { struct jffs2_full_dirent *fd; @@ -68,19 +70,21 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n", fd->name, fd->ino, ic->ino); jffs2_mark_node_obsolete(c, fd->raw); + /* Clear the ic/raw union so it doesn't cause problems later. */ + fd->ic = NULL; continue; } + /* From this point, fd->raw is no longer used so we can set fd->ic */ + fd->ic = child_ic; + child_ic->pino_nlink++; + /* If we appear (at this stage) to have hard-linked directories, + * set a flag to trigger a scan later */ if (fd->type == DT_DIR) { - if (child_ic->pino_nlink) { - JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n", - fd->name, fd->ino, ic->ino); - /* TODO: What do we do about it? */ - } else { - child_ic->pino_nlink = ic->ino; - } - } else - child_ic->pino_nlink++; + child_ic->flags |= INO_FLAGS_IS_DIR; + if (child_ic->pino_nlink > 1) + *dir_hardlinks = 1; + } dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino); /* Can't free scan_dents so far. We might need them in pass 2 */ @@ -94,8 +98,7 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, */ static int jffs2_build_filesystem(struct jffs2_sb_info *c) { - int ret; - int i; + int ret, i, dir_hardlinks = 0; struct jffs2_inode_cache *ic; struct jffs2_full_dirent *fd; struct jffs2_full_dirent *dead_fds = NULL; @@ -119,7 +122,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) /* Now scan the directory tree, increasing nlink according to every dirent found. */ for_each_inode(i, c, ic) { if (ic->scan_dents) { - jffs2_build_inode_pass1(c, ic); + jffs2_build_inode_pass1(c, ic, &dir_hardlinks); cond_resched(); } } @@ -155,6 +158,20 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) } dbg_fsbuild("pass 2a complete\n"); + + if (dir_hardlinks) { + /* If we detected directory hardlinks earlier, *hopefully* + * they are gone now because some of the links were from + * dead directories which still had some old dirents lying + * around and not yet garbage-collected, but which have + * been discarded above. So clear the pino_nlink field + * in each directory, so that the final scan below can + * print appropriate warnings. */ + for_each_inode(i, c, ic) { + if (ic->flags & INO_FLAGS_IS_DIR) + ic->pino_nlink = 0; + } + } dbg_fsbuild("freeing temporary data structures\n"); /* Finally, we can scan again and free the dirent structs */ @@ -162,6 +179,33 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) while(ic->scan_dents) { fd = ic->scan_dents; ic->scan_dents = fd->next; + /* We do use the pino_nlink field to count nlink of + * directories during fs build, so set it to the + * parent ino# now. Now that there's hopefully only + * one. */ + if (fd->type == DT_DIR) { + if (!fd->ic) { + /* We'll have complained about it and marked the coresponding + raw node obsolete already. Just skip it. */ + continue; + } + + /* We *have* to have set this in jffs2_build_inode_pass1() */ + BUG_ON(!(fd->ic->flags & INO_FLAGS_IS_DIR)); + + /* We clear ic->pino_nlink ∀ directories' ic *only* if dir_hardlinks + * is set. Otherwise, we know this should never trigger anyway, so + * we don't do the check. And ic->pino_nlink still contains the nlink + * value (which is 1). */ + if (dir_hardlinks && fd->ic->pino_nlink) { + JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u is also hard linked from dir ino #%u\n", + fd->name, fd->ino, ic->ino, fd->ic->pino_nlink); + /* Should we unlink it from its previous parent? */ + } + + /* For directories, ic->pino_nlink holds that parent inode # */ + fd->ic->pino_nlink = ic->ino; + } jffs2_free_full_dirent(fd); } ic->scan_dents = NULL; @@ -240,11 +284,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c, /* Reduce nlink of the child. If it's now zero, stick it on the dead_fds list to be cleaned up later. Else just free the fd */ - - if (fd->type == DT_DIR) - child_ic->pino_nlink = 0; - else - child_ic->pino_nlink--; + child_ic->pino_nlink--; if (!child_ic->pino_nlink) { dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n", @@ -383,12 +423,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) return 0; out_free: -#ifndef __ECOS - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else -#endif - kfree(c->blocks); + kvfree(c->blocks); return ret; } diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index d211b8e18566..30c4c9ebb693 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n", __func__, ret); - /* Might as well let the VFS know */ - d_instantiate(new_dentry, d_inode(old_dentry)); - ihold(d_inode(old_dentry)); + /* + * We can't keep the target in dcache after that. + * For one thing, we can't afford dentry aliases for directories. + * For another, if there was a victim, we _can't_ set new inode + * for that sucker and we have to trigger mount eviction - the + * caller won't do it on its own since we are returning an error. + */ + d_invalidate(new_dentry); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index f509f62e12f6..cad86bac3453 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -39,10 +39,10 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Trigger GC to flush any pending writes for this inode */ jffs2_flush_wbuf_gc(c, inode->i_ino); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } @@ -137,39 +137,33 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); - struct jffs2_raw_inode ri; - uint32_t alloc_len = 0; pgoff_t index = pos >> PAGE_CACHE_SHIFT; uint32_t pageofs = index << PAGE_CACHE_SHIFT; int ret = 0; - jffs2_dbg(1, "%s()\n", __func__); - - if (pageofs > inode->i_size) { - ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, - ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); - if (ret) - return ret; - } - - mutex_lock(&f->sem); pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) { - if (alloc_len) - jffs2_complete_reservation(c); - mutex_unlock(&f->sem); + if (!pg) return -ENOMEM; - } *pagep = pg; - if (alloc_len) { + jffs2_dbg(1, "%s()\n", __func__); + + if (pageofs > inode->i_size) { /* Make new hole frag from old EOF to new page */ + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; + uint32_t alloc_len; jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", (unsigned int)inode->i_size, pageofs); + ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) + goto out_page; + + mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -196,6 +190,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, if (IS_ERR(fn)) { ret = PTR_ERR(fn); jffs2_complete_reservation(c); + mutex_unlock(&f->sem); goto out_page; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); @@ -210,10 +205,12 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_mark_node_obsolete(c, fn->raw); jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); + mutex_unlock(&f->sem); goto out_page; } jffs2_complete_reservation(c); inode->i_size = pageofs; + mutex_unlock(&f->sem); } /* @@ -222,18 +219,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, * case of a short-copy. */ if (!PageUptodate(pg)) { + mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); + mutex_unlock(&f->sem); if (ret) goto out_page; } - mutex_unlock(&f->sem); jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); return ret; out_page: unlock_page(pg); page_cache_release(pg); - mutex_unlock(&f->sem); return ret; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2caf1682036d..bead25ae8fe4 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); out_inohash: jffs2_clear_xattr_subsystem(c); kfree(c->inocache_list); diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 5a2dec2b064c..95d5880a63ee 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -1296,14 +1296,17 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era BUG_ON(start > orig_start); } - /* First, use readpage() to read the appropriate page into the page cache */ - /* Q: What happens if we actually try to GC the _same_ page for which commit_write() - * triggered garbage collection in the first place? - * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the - * page OK. We'll actually write it out again in commit_write, which is a little - * suboptimal, but at least we're correct. - */ + /* The rules state that we must obtain the page lock *before* f->sem, so + * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's + * actually going to *change* so we're safe; we only allow reading. + * + * It is important to note that jffs2_write_begin() will ensure that its + * page is marked Uptodate before allocating space. That means that if we + * end up here trying to GC the *same* page that jffs2_write_begin() is + * trying to write out, read_cache_page() will not deadlock. */ + mutex_unlock(&f->sem); pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg); + mutex_lock(&f->sem); if (IS_ERR(pg_ptr)) { pr_warn("read_cache_page() returned error: %ld\n", diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index fa35ff79ab35..0637271f3770 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -194,6 +194,7 @@ struct jffs2_inode_cache { #define INO_STATE_CLEARING 6 /* In clear_inode() */ #define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */ +#define INO_FLAGS_IS_DIR 0x02 /* is a directory */ #define RAWNODE_CLASS_INODE_CACHE 0 #define RAWNODE_CLASS_XATTR_DATUM 1 @@ -249,7 +250,10 @@ struct jffs2_readinode_info struct jffs2_full_dirent { - struct jffs2_raw_node_ref *raw; + union { + struct jffs2_raw_node_ref *raw; + struct jffs2_inode_cache *ic; /* Just during part of build */ + }; struct jffs2_full_dirent *next; uint32_t version; uint32_t ino; /* == zero for unlink */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index bb080c272149..0a9a114bb9d1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); jffs2_flash_cleanup(c); kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 0e026a7bdcd4..4ce7735dd042 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -38,17 +38,17 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } rc |= jfs_commit_inode(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc ? -EIO : 0; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 8db8b7d61e40..8653cac7e12e 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -96,7 +96,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } /* Lock against other parallel changes of flags */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); jfs_get_inode_flags(jfs_inode); oldflags = jfs_inode->mode2; @@ -109,7 +109,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ((flags ^ oldflags) & (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EPERM; goto setflags_out; } @@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) jfs_inode->mode2 = flags; jfs_set_inode_flags(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 900925b5eb8c..4f5d85ba8e23 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -792,7 +792,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - mutex_lock(&inode->i_mutex); + inode_lock(inode); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -824,7 +824,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, } out: if (len == towrite) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } if (inode->i_size < off+len-towrite) @@ -832,7 +832,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return len - towrite; } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 25d71a53cf43..03b688d19f69 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -820,15 +820,22 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { - static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */ - size_t len = strlcpy(path_buf, path, PATH_MAX); - char *p = path_buf; - char *name; + size_t len; + char *p, *name; lockdep_assert_held(&kernfs_mutex); - if (len >= PATH_MAX) + /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */ + spin_lock_irq(&kernfs_rename_lock); + + len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); + + if (len >= sizeof(kernfs_pr_cont_buf)) { + spin_unlock_irq(&kernfs_rename_lock); return NULL; + } + + p = kernfs_pr_cont_buf; while ((name = strsep(&p, "/")) && parent) { if (*name == '\0') @@ -836,6 +843,8 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, parent = kernfs_find_ns(parent, name, ns); } + spin_unlock_irq(&kernfs_rename_lock); + return parent; } @@ -1640,9 +1649,9 @@ static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, struct inode *inode = file_inode(file); loff_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/libfs.c b/fs/libfs.c index 01491299f348..0ca80b2af420 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close); loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -97,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -124,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&dentry->d_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } EXPORT_SYMBOL(dcache_dir_lseek); @@ -941,7 +941,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; @@ -953,7 +953,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, ret = err; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(__generic_file_fsync); diff --git a/fs/locks.c b/fs/locks.c index af1ed74a657f..7c5f91be9b65 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1650,12 +1650,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * bother, maybe that's a sign this just isn't a good file to * hand out a delegation on. */ - if (is_deleg && !mutex_trylock(&inode->i_mutex)) + if (is_deleg && !inode_trylock(inode)) return -EAGAIN; if (is_deleg && arg == F_WRLCK) { /* Write delegations are not currently supported: */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(1); return -EINVAL; } @@ -1732,7 +1732,7 @@ out: spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); if (is_deleg) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error && !my_fl) *flp = NULL; return error; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 1a6f0167b16a..61eaeb1b6cac 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -204,12 +204,12 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = li->li_flags; flags &= LOGFS_FL_USER_MODIFIABLE; flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; li->li_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME; mark_inode_dirty_sync(inode); @@ -230,11 +230,11 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); logfs_get_wblocks(sb, NULL, WF_LOCK); logfs_write_anchor(sb); logfs_put_wblocks(sb, NULL, WF_LOCK); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/mbcache.c b/fs/mbcache.c index 187477ded6b3..eccda3a02de6 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -1,858 +1,433 @@ -/* - * linux/fs/mbcache.c - * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org> - */ - -/* - * Filesystem Meta Information Block Cache (mbcache) - * - * The mbcache caches blocks of block devices that need to be located - * by their device/block number, as well as by other criteria (such - * as the block's contents). - * - * There can only be one cache entry in a cache per device and block number. - * Additional indexes need not be unique in this sense. The number of - * additional indexes (=other criteria) can be hardwired at compile time - * or specified at cache create time. - * - * Each cache entry is of fixed size. An entry may be `valid' or `invalid' - * in the cache. A valid entry is in the main hash tables of the cache, - * and may also be in the lru list. An invalid entry is not in any hashes - * or lists. - * - * A valid cache entry is only in the lru list if no handles refer to it. - * Invalid cache entries will be freed when the last handle to the cache - * entry is released. Entries that cannot be freed immediately are put - * back on the lru list. - */ - -/* - * Lock descriptions and usage: - * - * Each hash chain of both the block and index hash tables now contains - * a built-in lock used to serialize accesses to the hash chain. - * - * Accesses to global data structures mb_cache_list and mb_cache_lru_list - * are serialized via the global spinlock mb_cache_spinlock. - * - * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize - * accesses to its local data, such as e_used and e_queued. - * - * Lock ordering: - * - * Each block hash chain's lock has the highest lock order, followed by an - * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's - * lock), and mb_cach_spinlock, with the lowest order. While holding - * either a block or index hash chain lock, a thread can acquire an - * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. - * - * Synchronization: - * - * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and - * index hash chian, it needs to lock the corresponding hash chain. For each - * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to - * prevent either any simultaneous release or free on the entry and also - * to serialize accesses to either the e_used or e_queued member of the entry. - * - * To avoid having a dangling reference to an already freed - * mb_cache_entry, an mb_cache_entry is only freed when it is not on a - * block hash chain and also no longer being referenced, both e_used, - * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is - * first removed from a block hash chain. - */ - -#include <linux/kernel.h> -#include <linux/module.h> - -#include <linux/hash.h> -#include <linux/fs.h> -#include <linux/mm.h> +#include <linux/spinlock.h> #include <linux/slab.h> -#include <linux/sched.h> +#include <linux/list.h> #include <linux/list_bl.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/workqueue.h> #include <linux/mbcache.h> -#include <linux/init.h> -#include <linux/blockgroup_lock.h> -#include <linux/log2.h> - -#ifdef MB_CACHE_DEBUG -# define mb_debug(f...) do { \ - printk(KERN_DEBUG f); \ - printk("\n"); \ - } while (0) -#define mb_assert(c) do { if (!(c)) \ - printk(KERN_ERR "assertion " #c " failed\n"); \ - } while(0) -#else -# define mb_debug(f...) do { } while(0) -# define mb_assert(c) do { } while(0) -#endif -#define mb_error(f...) do { \ - printk(KERN_ERR f); \ - printk("\n"); \ - } while(0) - -#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) - -#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) -#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ - (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) - -static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); -static struct blockgroup_lock *mb_cache_bg_lock; -static struct kmem_cache *mb_cache_kmem_cache; - -MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); -MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); -MODULE_LICENSE("GPL"); - -EXPORT_SYMBOL(mb_cache_create); -EXPORT_SYMBOL(mb_cache_shrink); -EXPORT_SYMBOL(mb_cache_destroy); -EXPORT_SYMBOL(mb_cache_entry_alloc); -EXPORT_SYMBOL(mb_cache_entry_insert); -EXPORT_SYMBOL(mb_cache_entry_release); -EXPORT_SYMBOL(mb_cache_entry_free); -EXPORT_SYMBOL(mb_cache_entry_get); -#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -EXPORT_SYMBOL(mb_cache_entry_find_first); -EXPORT_SYMBOL(mb_cache_entry_find_next); -#endif /* - * Global data: list of all mbcache's, lru list, and a spinlock for - * accessing cache data structures on SMP machines. The lru list is - * global across all mbcaches. + * Mbcache is a simple key-value store. Keys need not be unique, however + * key-value pairs are expected to be unique (we use this fact in + * mb_cache_entry_delete_block()). + * + * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. + * They use hash of a block contents as a key and block number as a value. + * That's why keys need not be unique (different xattr blocks may end up having + * the same hash). However block number always uniquely identifies a cache + * entry. + * + * We provide functions for creation and removal of entries, search by key, + * and a special "delete entry with given key-value pair" operation. Fixed + * size hash table is used for fast key lookups. */ -static LIST_HEAD(mb_cache_list); -static LIST_HEAD(mb_cache_lru_list); -static DEFINE_SPINLOCK(mb_cache_spinlock); - -static inline void -__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) -{ - spin_lock(bgl_lock_ptr(mb_cache_bg_lock, - MB_CACHE_ENTRY_LOCK_INDEX(ce))); -} - -static inline void -__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) -{ - spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, - MB_CACHE_ENTRY_LOCK_INDEX(ce))); -} - -static inline int -__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) -{ - return !hlist_bl_unhashed(&ce->e_block_list); -} +struct mb_cache { + /* Hash table of entries */ + struct hlist_bl_head *c_hash; + /* log2 of hash table size */ + int c_bucket_bits; + /* Maximum entries in cache to avoid degrading hash too much */ + int c_max_entries; + /* Protects c_list, c_entry_count */ + spinlock_t c_list_lock; + struct list_head c_list; + /* Number of entries in cache */ + unsigned long c_entry_count; + struct shrinker c_shrink; + /* Work for shrinking when the cache has too many entries */ + struct work_struct c_shrink_work; +}; +static struct kmem_cache *mb_entry_cache; -static inline void -__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) -{ - if (__mb_cache_entry_is_block_hashed(ce)) - hlist_bl_del_init(&ce->e_block_list); -} +static unsigned long mb_cache_shrink(struct mb_cache *cache, + unsigned int nr_to_scan); -static inline int -__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) +static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, + u32 key) { - return !hlist_bl_unhashed(&ce->e_index.o_list); + return &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; } -static inline void -__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) -{ - if (__mb_cache_entry_is_index_hashed(ce)) - hlist_bl_del_init(&ce->e_index.o_list); -} +/* + * Number of entries to reclaim synchronously when there are too many entries + * in cache + */ +#define SYNC_SHRINK_BATCH 64 /* - * __mb_cache_entry_unhash_unlock() - * - * This function is called to unhash both the block and index hash - * chain. - * It assumes both the block and index hash chain is locked upon entry. - * It also unlock both hash chains both exit + * mb_cache_entry_create - create entry in cache + * @cache - cache where the entry should be created + * @mask - gfp mask with which the entry should be allocated + * @key - key of the entry + * @block - block that contains data + * @reusable - is the block reusable by other inodes? + * + * Creates entry in @cache with key @key and records that data is stored in + * block @block. The function returns -EBUSY if entry with the same key + * and for the same block already exists in cache. Otherwise 0 is returned. */ -static inline void -__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) +int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, + sector_t block, bool reusable) { - __mb_cache_entry_unhash_index(ce); - hlist_bl_unlock(ce->e_index_hash_p); - __mb_cache_entry_unhash_block(ce); - hlist_bl_unlock(ce->e_block_hash_p); + struct mb_cache_entry *entry, *dup; + struct hlist_bl_node *dup_node; + struct hlist_bl_head *head; + + /* Schedule background reclaim if there are too many entries */ + if (cache->c_entry_count >= cache->c_max_entries) + schedule_work(&cache->c_shrink_work); + /* Do some sync reclaim if background reclaim cannot keep up */ + if (cache->c_entry_count >= 2*cache->c_max_entries) + mb_cache_shrink(cache, SYNC_SHRINK_BATCH); + + entry = kmem_cache_alloc(mb_entry_cache, mask); + if (!entry) + return -ENOMEM; + + INIT_LIST_HEAD(&entry->e_list); + /* One ref for hash, one ref returned */ + atomic_set(&entry->e_refcnt, 1); + entry->e_key = key; + entry->e_block = block; + entry->e_reusable = reusable; + head = mb_cache_entry_head(cache, key); + hlist_bl_lock(head); + hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { + if (dup->e_key == key && dup->e_block == block) { + hlist_bl_unlock(head); + kmem_cache_free(mb_entry_cache, entry); + return -EBUSY; + } + } + hlist_bl_add_head(&entry->e_hash_list, head); + hlist_bl_unlock(head); + + spin_lock(&cache->c_list_lock); + list_add_tail(&entry->e_list, &cache->c_list); + /* Grab ref for LRU list */ + atomic_inc(&entry->e_refcnt); + cache->c_entry_count++; + spin_unlock(&cache->c_list_lock); + + return 0; } +EXPORT_SYMBOL(mb_cache_entry_create); -static void -__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) +void __mb_cache_entry_free(struct mb_cache_entry *entry) { - struct mb_cache *cache = ce->e_cache; - - mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); - kmem_cache_free(cache->c_entry_cache, ce); - atomic_dec(&cache->c_entry_count); + kmem_cache_free(mb_entry_cache, entry); } +EXPORT_SYMBOL(__mb_cache_entry_free); -static void -__mb_cache_entry_release(struct mb_cache_entry *ce) +static struct mb_cache_entry *__entry_find(struct mb_cache *cache, + struct mb_cache_entry *entry, + u32 key) { - /* First lock the entry to serialize access to its local data. */ - __spin_lock_mb_cache_entry(ce); - /* Wake up all processes queuing for this cache entry. */ - if (ce->e_queued) - wake_up_all(&mb_cache_queue); - if (ce->e_used >= MB_CACHE_WRITER) - ce->e_used -= MB_CACHE_WRITER; - /* - * Make sure that all cache entries on lru_list have - * both e_used and e_qued of 0s. - */ - ce->e_used--; - if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { - if (!__mb_cache_entry_is_block_hashed(ce)) { - __spin_unlock_mb_cache_entry(ce); - goto forget; + struct mb_cache_entry *old_entry = entry; + struct hlist_bl_node *node; + struct hlist_bl_head *head; + + head = mb_cache_entry_head(cache, key); + hlist_bl_lock(head); + if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) + node = entry->e_hash_list.next; + else + node = hlist_bl_first(head); + while (node) { + entry = hlist_bl_entry(node, struct mb_cache_entry, + e_hash_list); + if (entry->e_key == key && entry->e_reusable) { + atomic_inc(&entry->e_refcnt); + goto out; } - /* - * Need access to lru list, first drop entry lock, - * then reacquire the lock in the proper order. - */ - spin_lock(&mb_cache_spinlock); - if (list_empty(&ce->e_lru_list)) - list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); - spin_unlock(&mb_cache_spinlock); + node = node->next; } - __spin_unlock_mb_cache_entry(ce); - return; -forget: - mb_assert(list_empty(&ce->e_lru_list)); - __mb_cache_entry_forget(ce, GFP_KERNEL); + entry = NULL; +out: + hlist_bl_unlock(head); + if (old_entry) + mb_cache_entry_put(cache, old_entry); + + return entry; } /* - * mb_cache_shrink_scan() memory pressure callback - * - * This function is called by the kernel memory management when memory - * gets low. + * mb_cache_entry_find_first - find the first entry in cache with given key + * @cache: cache where we should search + * @key: key to look for * - * @shrink: (ignored) - * @sc: shrink_control passed from reclaim - * - * Returns the number of objects freed. + * Search in @cache for entry with key @key. Grabs reference to the first + * entry found and returns the entry. */ -static unsigned long -mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, + u32 key) { - LIST_HEAD(free_list); - struct mb_cache_entry *entry, *tmp; - int nr_to_scan = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - unsigned long freed = 0; - - mb_debug("trying to free %d entries", nr_to_scan); - spin_lock(&mb_cache_spinlock); - while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { - struct mb_cache_entry *ce = - list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_del_init(&ce->e_lru_list); - if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) - continue; - spin_unlock(&mb_cache_spinlock); - /* Prevent any find or get operation on the entry */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - /* Ignore if it is touched by a find/get */ - if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || - !list_empty(&ce->e_lru_list)) { - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_unlock(ce->e_block_hash_p); - spin_lock(&mb_cache_spinlock); - continue; - } - __mb_cache_entry_unhash_unlock(ce); - list_add_tail(&ce->e_lru_list, &free_list); - spin_lock(&mb_cache_spinlock); - } - spin_unlock(&mb_cache_spinlock); - - list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { - __mb_cache_entry_forget(entry, gfp_mask); - freed++; - } - return freed; + return __entry_find(cache, NULL, key); } +EXPORT_SYMBOL(mb_cache_entry_find_first); -static unsigned long -mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +/* + * mb_cache_entry_find_next - find next entry in cache with the same + * @cache: cache where we should search + * @entry: entry to start search from + * + * Finds next entry in the hash chain which has the same key as @entry. + * If @entry is unhashed (which can happen when deletion of entry races + * with the search), finds the first entry in the hash chain. The function + * drops reference to @entry and returns with a reference to the found entry. + */ +struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, + struct mb_cache_entry *entry) { - struct mb_cache *cache; - unsigned long count = 0; - - spin_lock(&mb_cache_spinlock); - list_for_each_entry(cache, &mb_cache_list, c_cache_list) { - mb_debug("cache %s (%d)", cache->c_name, - atomic_read(&cache->c_entry_count)); - count += atomic_read(&cache->c_entry_count); - } - spin_unlock(&mb_cache_spinlock); - - return vfs_pressure_ratio(count); + return __entry_find(cache, entry, entry->e_key); } - -static struct shrinker mb_cache_shrinker = { - .count_objects = mb_cache_shrink_count, - .scan_objects = mb_cache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; +EXPORT_SYMBOL(mb_cache_entry_find_next); /* - * mb_cache_create() create a new cache - * - * All entries in one cache are equal size. Cache entries may be from - * multiple devices. If this is the first mbcache created, registers - * the cache with kernel memory management. Returns NULL if no more - * memory was available. - * - * @name: name of the cache (informal) - * @bucket_bits: log2(number of hash buckets) + * mb_cache_entry_get - get a cache entry by block number (and key) + * @cache - cache we work with + * @key - key of block number @block + * @block - block number */ -struct mb_cache * -mb_cache_create(const char *name, int bucket_bits) +struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, + sector_t block) { - int n, bucket_count = 1 << bucket_bits; - struct mb_cache *cache = NULL; - - if (!mb_cache_bg_lock) { - mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), - GFP_KERNEL); - if (!mb_cache_bg_lock) - return NULL; - bgl_lock_init(mb_cache_bg_lock); - } - - cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); - if (!cache) - return NULL; - cache->c_name = name; - atomic_set(&cache->c_entry_count, 0); - cache->c_bucket_bits = bucket_bits; - cache->c_block_hash = kmalloc(bucket_count * - sizeof(struct hlist_bl_head), GFP_KERNEL); - if (!cache->c_block_hash) - goto fail; - for (n=0; n<bucket_count; n++) - INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]); - cache->c_index_hash = kmalloc(bucket_count * - sizeof(struct hlist_bl_head), GFP_KERNEL); - if (!cache->c_index_hash) - goto fail; - for (n=0; n<bucket_count; n++) - INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]); - if (!mb_cache_kmem_cache) { - mb_cache_kmem_cache = kmem_cache_create(name, - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - if (!mb_cache_kmem_cache) - goto fail2; + struct hlist_bl_node *node; + struct hlist_bl_head *head; + struct mb_cache_entry *entry; + + head = mb_cache_entry_head(cache, key); + hlist_bl_lock(head); + hlist_bl_for_each_entry(entry, node, head, e_hash_list) { + if (entry->e_key == key && entry->e_block == block) { + atomic_inc(&entry->e_refcnt); + goto out; + } } - cache->c_entry_cache = mb_cache_kmem_cache; - - /* - * Set an upper limit on the number of cache entries so that the hash - * chains won't grow too long. - */ - cache->c_max_entries = bucket_count << 4; - - spin_lock(&mb_cache_spinlock); - list_add(&cache->c_cache_list, &mb_cache_list); - spin_unlock(&mb_cache_spinlock); - return cache; - -fail2: - kfree(cache->c_index_hash); - -fail: - kfree(cache->c_block_hash); - kfree(cache); - return NULL; + entry = NULL; +out: + hlist_bl_unlock(head); + return entry; } +EXPORT_SYMBOL(mb_cache_entry_get); - -/* - * mb_cache_shrink() - * - * Removes all cache entries of a device from the cache. All cache entries - * currently in use cannot be freed, and thus remain in the cache. All others - * are freed. +/* mb_cache_entry_delete_block - remove information about block from cache + * @cache - cache we work with + * @key - key of block @block + * @block - block number * - * @bdev: which device's cache entries to shrink + * Remove entry from cache @cache with key @key with data stored in @block. */ -void -mb_cache_shrink(struct block_device *bdev) +void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, + sector_t block) { - LIST_HEAD(free_list); - struct list_head *l; - struct mb_cache_entry *ce, *tmp; - - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - while (!list_is_last(l, &mb_cache_lru_list)) { - l = l->next; - ce = list_entry(l, struct mb_cache_entry, e_lru_list); - if (ce->e_bdev == bdev) { - list_del_init(&ce->e_lru_list); - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt)) - continue; - spin_unlock(&mb_cache_spinlock); - /* - * Prevent any find or get operation on the entry. - */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - /* Ignore if it is touched by a find/get */ - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt) || - !list_empty(&ce->e_lru_list)) { - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_unlock(ce->e_block_hash_p); - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - continue; + struct hlist_bl_node *node; + struct hlist_bl_head *head; + struct mb_cache_entry *entry; + + head = mb_cache_entry_head(cache, key); + hlist_bl_lock(head); + hlist_bl_for_each_entry(entry, node, head, e_hash_list) { + if (entry->e_key == key && entry->e_block == block) { + /* We keep hash list reference to keep entry alive */ + hlist_bl_del_init(&entry->e_hash_list); + hlist_bl_unlock(head); + spin_lock(&cache->c_list_lock); + if (!list_empty(&entry->e_list)) { + list_del_init(&entry->e_list); + cache->c_entry_count--; + atomic_dec(&entry->e_refcnt); } - __mb_cache_entry_unhash_unlock(ce); - mb_assert(!(ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt))); - list_add_tail(&ce->e_lru_list, &free_list); - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); + spin_unlock(&cache->c_list_lock); + mb_cache_entry_put(cache, entry); + return; } } - spin_unlock(&mb_cache_spinlock); - - list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { - __mb_cache_entry_forget(ce, GFP_KERNEL); - } + hlist_bl_unlock(head); } +EXPORT_SYMBOL(mb_cache_entry_delete_block); - -/* - * mb_cache_destroy() +/* mb_cache_entry_touch - cache entry got used + * @cache - cache the entry belongs to + * @entry - entry that got used * - * Shrinks the cache to its minimum possible size (hopefully 0 entries), - * and then destroys it. If this was the last mbcache, un-registers the - * mbcache from kernel memory management. + * Marks entry as used to give hit higher chances of surviving in cache. */ -void -mb_cache_destroy(struct mb_cache *cache) +void mb_cache_entry_touch(struct mb_cache *cache, + struct mb_cache_entry *entry) { - LIST_HEAD(free_list); - struct mb_cache_entry *ce, *tmp; - - spin_lock(&mb_cache_spinlock); - list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { - if (ce->e_cache == cache) - list_move_tail(&ce->e_lru_list, &free_list); - } - list_del(&cache->c_cache_list); - spin_unlock(&mb_cache_spinlock); - - list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { - list_del_init(&ce->e_lru_list); - /* - * Prevent any find or get operation on the entry. - */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - mb_assert(!(ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt))); - __mb_cache_entry_unhash_unlock(ce); - __mb_cache_entry_forget(ce, GFP_KERNEL); - } - - if (atomic_read(&cache->c_entry_count) > 0) { - mb_error("cache %s: %d orphaned entries", - cache->c_name, - atomic_read(&cache->c_entry_count)); - } - - if (list_empty(&mb_cache_list)) { - kmem_cache_destroy(mb_cache_kmem_cache); - mb_cache_kmem_cache = NULL; - } - kfree(cache->c_index_hash); - kfree(cache->c_block_hash); - kfree(cache); + entry->e_referenced = 1; } +EXPORT_SYMBOL(mb_cache_entry_touch); -/* - * mb_cache_entry_alloc() - * - * Allocates a new cache entry. The new entry will not be valid initially, - * and thus cannot be looked up yet. It should be filled with data, and - * then inserted into the cache using mb_cache_entry_insert(). Returns NULL - * if no more memory was available. - */ -struct mb_cache_entry * -mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) +static unsigned long mb_cache_count(struct shrinker *shrink, + struct shrink_control *sc) { - struct mb_cache_entry *ce; - - if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { - struct list_head *l; - - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - while (!list_is_last(l, &mb_cache_lru_list)) { - l = l->next; - ce = list_entry(l, struct mb_cache_entry, e_lru_list); - if (ce->e_cache == cache) { - list_del_init(&ce->e_lru_list); - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt)) - continue; - spin_unlock(&mb_cache_spinlock); - /* - * Prevent any find or get operation on the - * entry. - */ - hlist_bl_lock(ce->e_block_hash_p); - hlist_bl_lock(ce->e_index_hash_p); - /* Ignore if it is touched by a find/get */ - if (ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt) || - !list_empty(&ce->e_lru_list)) { - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_unlock(ce->e_block_hash_p); - l = &mb_cache_lru_list; - spin_lock(&mb_cache_spinlock); - continue; - } - mb_assert(list_empty(&ce->e_lru_list)); - mb_assert(!(ce->e_used || ce->e_queued || - atomic_read(&ce->e_refcnt))); - __mb_cache_entry_unhash_unlock(ce); - goto found; - } - } - spin_unlock(&mb_cache_spinlock); - } + struct mb_cache *cache = container_of(shrink, struct mb_cache, + c_shrink); - ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); - if (!ce) - return NULL; - atomic_inc(&cache->c_entry_count); - INIT_LIST_HEAD(&ce->e_lru_list); - INIT_HLIST_BL_NODE(&ce->e_block_list); - INIT_HLIST_BL_NODE(&ce->e_index.o_list); - ce->e_cache = cache; - ce->e_queued = 0; - atomic_set(&ce->e_refcnt, 0); -found: - ce->e_block_hash_p = &cache->c_block_hash[0]; - ce->e_index_hash_p = &cache->c_index_hash[0]; - ce->e_used = 1 + MB_CACHE_WRITER; - return ce; + return cache->c_entry_count; } - -/* - * mb_cache_entry_insert() - * - * Inserts an entry that was allocated using mb_cache_entry_alloc() into - * the cache. After this, the cache entry can be looked up, but is not yet - * in the lru list as the caller still holds a handle to it. Returns 0 on - * success, or -EBUSY if a cache entry for that device + inode exists - * already (this may happen after a failed lookup, but when another process - * has inserted the same cache entry in the meantime). - * - * @bdev: device the cache entry belongs to - * @block: block number - * @key: lookup key - */ -int -mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, - sector_t block, unsigned int key) +/* Shrink number of entries in cache */ +static unsigned long mb_cache_shrink(struct mb_cache *cache, + unsigned int nr_to_scan) { - struct mb_cache *cache = ce->e_cache; - unsigned int bucket; - struct hlist_bl_node *l; - struct hlist_bl_head *block_hash_p; - struct hlist_bl_head *index_hash_p; - struct mb_cache_entry *lce; - - mb_assert(ce); - bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), - cache->c_bucket_bits); - block_hash_p = &cache->c_block_hash[bucket]; - hlist_bl_lock(block_hash_p); - hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { - if (lce->e_bdev == bdev && lce->e_block == block) { - hlist_bl_unlock(block_hash_p); - return -EBUSY; + struct mb_cache_entry *entry; + struct hlist_bl_head *head; + unsigned int shrunk = 0; + + spin_lock(&cache->c_list_lock); + while (nr_to_scan-- && !list_empty(&cache->c_list)) { + entry = list_first_entry(&cache->c_list, + struct mb_cache_entry, e_list); + if (entry->e_referenced) { + entry->e_referenced = 0; + list_move_tail(&cache->c_list, &entry->e_list); + continue; } + list_del_init(&entry->e_list); + cache->c_entry_count--; + /* + * We keep LRU list reference so that entry doesn't go away + * from under us. + */ + spin_unlock(&cache->c_list_lock); + head = mb_cache_entry_head(cache, entry->e_key); + hlist_bl_lock(head); + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); + atomic_dec(&entry->e_refcnt); + } + hlist_bl_unlock(head); + if (mb_cache_entry_put(cache, entry)) + shrunk++; + cond_resched(); + spin_lock(&cache->c_list_lock); } - mb_assert(!__mb_cache_entry_is_block_hashed(ce)); - __mb_cache_entry_unhash_block(ce); - __mb_cache_entry_unhash_index(ce); - ce->e_bdev = bdev; - ce->e_block = block; - ce->e_block_hash_p = block_hash_p; - ce->e_index.o_key = key; - hlist_bl_add_head(&ce->e_block_list, block_hash_p); - hlist_bl_unlock(block_hash_p); - bucket = hash_long(key, cache->c_bucket_bits); - index_hash_p = &cache->c_index_hash[bucket]; - hlist_bl_lock(index_hash_p); - ce->e_index_hash_p = index_hash_p; - hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); - hlist_bl_unlock(index_hash_p); - return 0; -} + spin_unlock(&cache->c_list_lock); + return shrunk; +} -/* - * mb_cache_entry_release() - * - * Release a handle to a cache entry. When the last handle to a cache entry - * is released it is either freed (if it is invalid) or otherwise inserted - * in to the lru list. - */ -void -mb_cache_entry_release(struct mb_cache_entry *ce) +static unsigned long mb_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) { - __mb_cache_entry_release(ce); + int nr_to_scan = sc->nr_to_scan; + struct mb_cache *cache = container_of(shrink, struct mb_cache, + c_shrink); + return mb_cache_shrink(cache, nr_to_scan); } +/* We shrink 1/X of the cache when we have too many entries in it */ +#define SHRINK_DIVISOR 16 -/* - * mb_cache_entry_free() - * - */ -void -mb_cache_entry_free(struct mb_cache_entry *ce) +static void mb_cache_shrink_worker(struct work_struct *work) { - mb_assert(ce); - mb_assert(list_empty(&ce->e_lru_list)); - hlist_bl_lock(ce->e_index_hash_p); - __mb_cache_entry_unhash_index(ce); - hlist_bl_unlock(ce->e_index_hash_p); - hlist_bl_lock(ce->e_block_hash_p); - __mb_cache_entry_unhash_block(ce); - hlist_bl_unlock(ce->e_block_hash_p); - __mb_cache_entry_release(ce); + struct mb_cache *cache = container_of(work, struct mb_cache, + c_shrink_work); + mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); } - /* - * mb_cache_entry_get() + * mb_cache_create - create cache + * @bucket_bits: log2 of the hash table size * - * Get a cache entry by device / block number. (There can only be one entry - * in the cache per device and block.) Returns NULL if no such cache entry - * exists. The returned cache entry is locked for exclusive access ("single - * writer"). + * Create cache for keys with 2^bucket_bits hash entries. */ -struct mb_cache_entry * -mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, - sector_t block) +struct mb_cache *mb_cache_create(int bucket_bits) { - unsigned int bucket; - struct hlist_bl_node *l; - struct mb_cache_entry *ce; - struct hlist_bl_head *block_hash_p; - - bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), - cache->c_bucket_bits); - block_hash_p = &cache->c_block_hash[bucket]; - /* First serialize access to the block corresponding hash chain. */ - hlist_bl_lock(block_hash_p); - hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { - mb_assert(ce->e_block_hash_p == block_hash_p); - if (ce->e_bdev == bdev && ce->e_block == block) { - /* - * Prevent a free from removing the entry. - */ - atomic_inc(&ce->e_refcnt); - hlist_bl_unlock(block_hash_p); - __spin_lock_mb_cache_entry(ce); - atomic_dec(&ce->e_refcnt); - if (ce->e_used > 0) { - DEFINE_WAIT(wait); - while (ce->e_used > 0) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - __spin_unlock_mb_cache_entry(ce); - schedule(); - __spin_lock_mb_cache_entry(ce); - ce->e_queued--; - } - finish_wait(&mb_cache_queue, &wait); - } - ce->e_used += 1 + MB_CACHE_WRITER; - __spin_unlock_mb_cache_entry(ce); + struct mb_cache *cache; + int bucket_count = 1 << bucket_bits; + int i; - if (!list_empty(&ce->e_lru_list)) { - spin_lock(&mb_cache_spinlock); - list_del_init(&ce->e_lru_list); - spin_unlock(&mb_cache_spinlock); - } - if (!__mb_cache_entry_is_block_hashed(ce)) { - __mb_cache_entry_release(ce); - return NULL; - } - return ce; - } + if (!try_module_get(THIS_MODULE)) + return NULL; + + cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); + if (!cache) + goto err_out; + cache->c_bucket_bits = bucket_bits; + cache->c_max_entries = bucket_count << 4; + INIT_LIST_HEAD(&cache->c_list); + spin_lock_init(&cache->c_list_lock); + cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), + GFP_KERNEL); + if (!cache->c_hash) { + kfree(cache); + goto err_out; } - hlist_bl_unlock(block_hash_p); - return NULL; -} + for (i = 0; i < bucket_count; i++) + INIT_HLIST_BL_HEAD(&cache->c_hash[i]); -#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) + cache->c_shrink.count_objects = mb_cache_count; + cache->c_shrink.scan_objects = mb_cache_scan; + cache->c_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&cache->c_shrink); -static struct mb_cache_entry * -__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, - struct block_device *bdev, unsigned int key) -{ + INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); - /* The index hash chain is alredy acquire by caller. */ - while (l != NULL) { - struct mb_cache_entry *ce = - hlist_bl_entry(l, struct mb_cache_entry, - e_index.o_list); - mb_assert(ce->e_index_hash_p == head); - if (ce->e_bdev == bdev && ce->e_index.o_key == key) { - /* - * Prevent a free from removing the entry. - */ - atomic_inc(&ce->e_refcnt); - hlist_bl_unlock(head); - __spin_lock_mb_cache_entry(ce); - atomic_dec(&ce->e_refcnt); - ce->e_used++; - /* Incrementing before holding the lock gives readers - priority over writers. */ - if (ce->e_used >= MB_CACHE_WRITER) { - DEFINE_WAIT(wait); - - while (ce->e_used >= MB_CACHE_WRITER) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - __spin_unlock_mb_cache_entry(ce); - schedule(); - __spin_lock_mb_cache_entry(ce); - ce->e_queued--; - } - finish_wait(&mb_cache_queue, &wait); - } - __spin_unlock_mb_cache_entry(ce); - if (!list_empty(&ce->e_lru_list)) { - spin_lock(&mb_cache_spinlock); - list_del_init(&ce->e_lru_list); - spin_unlock(&mb_cache_spinlock); - } - if (!__mb_cache_entry_is_block_hashed(ce)) { - __mb_cache_entry_release(ce); - return ERR_PTR(-EAGAIN); - } - return ce; - } - l = l->next; - } - hlist_bl_unlock(head); + return cache; + +err_out: + module_put(THIS_MODULE); return NULL; } - +EXPORT_SYMBOL(mb_cache_create); /* - * mb_cache_entry_find_first() - * - * Find the first cache entry on a given device with a certain key in - * an additional index. Additional matches can be found with - * mb_cache_entry_find_next(). Returns NULL if no match was found. The - * returned cache entry is locked for shared access ("multiple readers"). + * mb_cache_destroy - destroy cache + * @cache: the cache to destroy * - * @cache: the cache to search - * @bdev: the device the cache entry should belong to - * @key: the key in the index + * Free all entries in cache and cache itself. Caller must make sure nobody + * (except shrinker) can reach @cache when calling this. */ -struct mb_cache_entry * -mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, - unsigned int key) +void mb_cache_destroy(struct mb_cache *cache) { - unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct hlist_bl_node *l; - struct mb_cache_entry *ce = NULL; - struct hlist_bl_head *index_hash_p; - - index_hash_p = &cache->c_index_hash[bucket]; - hlist_bl_lock(index_hash_p); - if (!hlist_bl_empty(index_hash_p)) { - l = hlist_bl_first(index_hash_p); - ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); - } else - hlist_bl_unlock(index_hash_p); - return ce; -} + struct mb_cache_entry *entry, *next; + unregister_shrinker(&cache->c_shrink); -/* - * mb_cache_entry_find_next() - * - * Find the next cache entry on a given device with a certain key in an - * additional index. Returns NULL if no match could be found. The previous - * entry is atomatically released, so that mb_cache_entry_find_next() can - * be called like this: - * - * entry = mb_cache_entry_find_first(); - * while (entry) { - * ... - * entry = mb_cache_entry_find_next(entry, ...); - * } - * - * @prev: The previous match - * @bdev: the device the cache entry should belong to - * @key: the key in the index - */ -struct mb_cache_entry * -mb_cache_entry_find_next(struct mb_cache_entry *prev, - struct block_device *bdev, unsigned int key) -{ - struct mb_cache *cache = prev->e_cache; - unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct hlist_bl_node *l; - struct mb_cache_entry *ce; - struct hlist_bl_head *index_hash_p; - - index_hash_p = &cache->c_index_hash[bucket]; - mb_assert(prev->e_index_hash_p == index_hash_p); - hlist_bl_lock(index_hash_p); - mb_assert(!hlist_bl_empty(index_hash_p)); - l = prev->e_index.o_list.next; - ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); - __mb_cache_entry_release(prev); - return ce; + /* + * We don't bother with any locking. Cache must not be used at this + * point. + */ + list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); + atomic_dec(&entry->e_refcnt); + } else + WARN_ON(1); + list_del(&entry->e_list); + WARN_ON(atomic_read(&entry->e_refcnt) != 1); + mb_cache_entry_put(cache, entry); + } + kfree(cache->c_hash); + kfree(cache); + module_put(THIS_MODULE); } +EXPORT_SYMBOL(mb_cache_destroy); -#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ - -static int __init init_mbcache(void) +static int __init mbcache_init(void) { - register_shrinker(&mb_cache_shrinker); + mb_entry_cache = kmem_cache_create("mbcache", + sizeof(struct mb_cache_entry), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + BUG_ON(!mb_entry_cache); return 0; } -static void __exit exit_mbcache(void) +static void __exit mbcache_exit(void) { - unregister_shrinker(&mb_cache_shrinker); + kmem_cache_destroy(mb_entry_cache); } -module_init(init_mbcache) -module_exit(exit_mbcache) +module_init(mbcache_init) +module_exit(mbcache_exit) +MODULE_AUTHOR("Jan Kara <jack@suse.cz>"); +MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); +MODULE_LICENSE("GPL"); diff --git a/fs/mpage.c b/fs/mpage.c index 1480d3a18037..6bd9fd90964e 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -24,6 +24,7 @@ #include <linux/highmem.h> #include <linux/prefetch.h> #include <linux/mpage.h> +#include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> @@ -366,7 +367,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, map_bh.b_state = 0; map_bh.b_size = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); + struct page *page = lru_to_page(pages); prefetchw(&page->flags); list_del(&page->lru); diff --git a/fs/namei.c b/fs/namei.c index bceefd5588a2..794f81dce766 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1220,8 +1220,8 @@ static int follow_managed(struct path *path, struct nameidata *nd) if (need_mntput && path->mnt == mnt) mntput(path->mnt); - if (ret == -EISDIR) - ret = 0; + if (ret == -EISDIR || !ret) + ret = 1; if (need_mntput) nd->flags |= LOOKUP_JUMPED; if (unlikely(ret < 0)) @@ -1444,40 +1444,26 @@ static int follow_dotdot(struct nameidata *nd) * This looks up the name in dcache, possibly revalidates the old dentry and * allocates a new one if not found or not valid. In the need_lookup argument * returns whether i_op->lookup is necessary. - * - * dir->d_inode->i_mutex must be held */ -static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, - unsigned int flags, bool *need_lookup) +static struct dentry *lookup_dcache(const struct qstr *name, + struct dentry *dir, + unsigned int flags) { struct dentry *dentry; int error; - *need_lookup = false; dentry = d_lookup(dir, name); if (dentry) { if (dentry->d_flags & DCACHE_OP_REVALIDATE) { error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { - if (error < 0) { - dput(dentry); - return ERR_PTR(error); - } else { + if (!error) d_invalidate(dentry); - dput(dentry); - dentry = NULL; - } + dput(dentry); + return ERR_PTR(error); } } } - - if (!dentry) { - dentry = d_alloc(dir, name); - if (unlikely(!dentry)) - return ERR_PTR(-ENOMEM); - - *need_lookup = true; - } return dentry; } @@ -1506,45 +1492,44 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry, return dentry; } -static struct dentry *__lookup_hash(struct qstr *name, +static struct dentry *__lookup_hash(const struct qstr *name, struct dentry *base, unsigned int flags) { - bool need_lookup; - struct dentry *dentry; + struct dentry *dentry = lookup_dcache(name, base, flags); - dentry = lookup_dcache(name, base, flags, &need_lookup); - if (!need_lookup) + if (dentry) return dentry; + dentry = d_alloc(base, name); + if (unlikely(!dentry)) + return ERR_PTR(-ENOMEM); + return lookup_real(base->d_inode, dentry, flags); } -/* - * It's more convoluted than I'd like it to be, but... it's still fairly - * small and for now I'd prefer to have fast path as straight as possible. - * It _is_ time-critical. - */ static int lookup_fast(struct nameidata *nd, struct path *path, struct inode **inode, unsigned *seqp) { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; - int need_reval = 1; int status = 1; int err; /* * Rename seqlock is not required here because in the off chance - * of a false negative due to a concurrent rename, we're going to - * do the non-racy lookup, below. + * of a false negative due to a concurrent rename, the caller is + * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { unsigned seq; bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); - if (!dentry) - goto unlazy; + if (unlikely(!dentry)) { + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; + return 0; + } /* * This sequence count validates that the inode matches @@ -1552,7 +1537,7 @@ static int lookup_fast(struct nameidata *nd, */ *inode = d_backing_inode(dentry); negative = d_is_negative(dentry); - if (read_seqcount_retry(&dentry->d_seq, seq)) + if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) return -ECHILD; /* @@ -1562,81 +1547,89 @@ static int lookup_fast(struct nameidata *nd, * The memory barrier in read_seqcount_begin of child is * enough, we can use __read_seqcount_retry here. */ - if (__read_seqcount_retry(&parent->d_seq, nd->seq)) + if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) return -ECHILD; *seqp = seq; - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) status = d_revalidate(dentry, nd->flags); - if (unlikely(status <= 0)) { - if (status != -ECHILD) - need_reval = 0; - goto unlazy; - } + if (unlikely(status <= 0)) { + if (unlazy_walk(nd, dentry, seq)) + return -ECHILD; + if (status == -ECHILD) + status = d_revalidate(dentry, nd->flags); + } else { + /* + * Note: do negative dentry check after revalidation in + * case that drops it. + */ + if (unlikely(negative)) + return -ENOENT; + path->mnt = mnt; + path->dentry = dentry; + if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + return 1; + if (unlazy_walk(nd, dentry, seq)) + return -ECHILD; } - /* - * Note: do negative dentry check after revalidation in - * case that drops it. - */ - if (negative) - return -ENOENT; - path->mnt = mnt; - path->dentry = dentry; - if (likely(__follow_mount_rcu(nd, path, inode, seqp))) - return 0; -unlazy: - if (unlazy_walk(nd, dentry, seq)) - return -ECHILD; } else { dentry = __d_lookup(parent, &nd->last); + if (unlikely(!dentry)) + return 0; + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) + status = d_revalidate(dentry, nd->flags); } - - if (unlikely(!dentry)) - goto need_lookup; - - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) - status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { - if (status < 0) { - dput(dentry); - return status; - } - d_invalidate(dentry); + if (!status) + d_invalidate(dentry); dput(dentry); - goto need_lookup; + return status; } - if (unlikely(d_is_negative(dentry))) { dput(dentry); return -ENOENT; } + path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd); - if (likely(!err)) + if (likely(err > 0)) *inode = d_backing_inode(path->dentry); return err; - -need_lookup: - return 1; } /* Fast lookup failed, do it the slow way */ -static int lookup_slow(struct nameidata *nd, struct path *path) +static struct dentry *lookup_slow(const struct qstr *name, + struct dentry *dir, + unsigned int flags) { - struct dentry *dentry, *parent; - - parent = nd->path.dentry; - BUG_ON(nd->inode != parent->d_inode); - - mutex_lock(&parent->d_inode->i_mutex); - dentry = __lookup_hash(&nd->last, parent, nd->flags); - mutex_unlock(&parent->d_inode->i_mutex); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - path->mnt = nd->path.mnt; - path->dentry = dentry; - return follow_managed(path, nd); + struct dentry *dentry; + inode_lock(dir->d_inode); + dentry = d_lookup(dir, name); + if (unlikely(dentry)) { + if ((dentry->d_flags & DCACHE_OP_REVALIDATE) && + !(flags & LOOKUP_NO_REVAL)) { + int error = d_revalidate(dentry, flags); + if (unlikely(error <= 0)) { + if (!error) + d_invalidate(dentry); + dput(dentry); + dentry = ERR_PTR(error); + } + } + if (dentry) { + inode_unlock(dir->d_inode); + return dentry; + } + } + dentry = d_alloc(dir, name); + if (unlikely(!dentry)) { + inode_unlock(dir->d_inode); + return ERR_PTR(-ENOMEM); + } + dentry = lookup_real(dir->d_inode, dentry, flags); + inode_unlock(dir->d_inode); + return dentry; } static inline int may_lookup(struct nameidata *nd) @@ -1712,6 +1705,11 @@ static inline int should_follow_link(struct nameidata *nd, struct path *link, return 0; if (!follow) return 0; + /* make sure that d_is_symlink above matches inode */ + if (nd->flags & LOOKUP_RCU) { + if (read_seqcount_retry(&link->dentry->d_seq, seq)) + return -ECHILD; + } return pick_link(nd, link, inode, seq); } @@ -1735,19 +1733,24 @@ static int walk_component(struct nameidata *nd, int flags) return err; } err = lookup_fast(nd, &path, &inode, &seq); - if (unlikely(err)) { + if (unlikely(err <= 0)) { if (err < 0) return err; - - err = lookup_slow(nd, &path); - if (err < 0) + path.dentry = lookup_slow(&nd->last, nd->path.dentry, + nd->flags); + if (IS_ERR(path.dentry)) + return PTR_ERR(path.dentry); + if (unlikely(d_is_negative(path.dentry))) { + dput(path.dentry); + return -ENOENT; + } + path.mnt = nd->path.mnt; + err = follow_managed(&path, nd); + if (unlikely(err < 0)) return err; - inode = d_backing_inode(path.dentry); seq = 0; /* we are already out of RCU mode */ - err = -ENOENT; - if (d_is_negative(path.dentry)) - goto out_path_put; + inode = d_backing_inode(path.dentry); } if (flags & WALK_PUT) @@ -1759,10 +1762,6 @@ static int walk_component(struct nameidata *nd, int flags) nd->inode = inode; nd->seq = seq; return 0; - -out_path_put: - path_to_nameidata(&path, nd); - return err; } /* @@ -2229,10 +2228,10 @@ struct dentry *kern_path_locked(const char *name, struct path *path) putname(filename); return ERR_PTR(-EINVAL); } - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = __lookup_hash(&last, path->dentry, 0); if (IS_ERR(d)) { - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); } putname(filename); @@ -2282,7 +2281,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) unsigned int c; int err; - WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); this.name = name; this.len = len; @@ -2368,21 +2367,9 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (err) return ERR_PTR(err); - /* - * __d_lookup() is used to try to get a quick answer and avoid the - * mutex. A false-negative does no harm. - */ - ret = __d_lookup(base, &this); - if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) { - dput(ret); - ret = NULL; - } - if (ret) - return ret; - - mutex_lock(&base->d_inode->i_mutex); - ret = __lookup_hash(&this, base, 0); - mutex_unlock(&base->d_inode->i_mutex); + ret = lookup_dcache(&this, base, 0); + if (!ret) + ret = lookup_slow(&this, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2460,31 +2447,21 @@ mountpoint_last(struct nameidata *nd, struct path *path) if (error) return error; dentry = dget(nd->path.dentry); - goto done; - } - - mutex_lock(&dir->d_inode->i_mutex); - dentry = d_lookup(dir, &nd->last); - if (!dentry) { - /* - * No cached dentry. Mounted dentries are pinned in the cache, - * so that means that this dentry is probably a symlink or the - * path doesn't actually point to a mounted dentry. - */ - dentry = d_alloc(dir, &nd->last); + } else { + dentry = d_lookup(dir, &nd->last); if (!dentry) { - mutex_unlock(&dir->d_inode->i_mutex); - return -ENOMEM; - } - dentry = lookup_real(dir->d_inode, dentry, nd->flags); - if (IS_ERR(dentry)) { - mutex_unlock(&dir->d_inode->i_mutex); - return PTR_ERR(dentry); + /* + * No cached dentry. Mounted dentries are pinned in the + * cache, so that means that this dentry is probably + * a symlink or the path doesn't actually point + * to a mounted dentry. + */ + dentry = lookup_slow(&nd->last, dir, + nd->flags | LOOKUP_NO_REVAL); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); - -done: if (d_is_negative(dentry)) { dput(dentry); return -ENOENT; @@ -2672,7 +2649,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) struct dentry *p; if (p1 == p2) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } @@ -2680,29 +2657,29 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) p = d_ancestor(p2, p1); if (p) { - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); return p; } p = d_ancestor(p1, p2); if (p) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); return p; } - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { - mutex_unlock(&p1->d_inode->i_mutex); + inode_unlock(p1->d_inode); if (p1 != p2) { - mutex_unlock(&p2->d_inode->i_mutex); + inode_unlock(p2->d_inode); mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } @@ -3013,16 +2990,22 @@ static int lookup_open(struct nameidata *nd, struct path *path, struct inode *dir_inode = dir->d_inode; struct dentry *dentry; int error; - bool need_lookup; + bool need_lookup = false; *opened &= ~FILE_CREATED; - dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup); + dentry = lookup_dcache(&nd->last, dir, nd->flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); - /* Cached positive dentry: will open in f_op->open */ - if (!need_lookup && dentry->d_inode) + if (!dentry) { + dentry = d_alloc(dir, &nd->last); + if (unlikely(!dentry)) + return -ENOMEM; + need_lookup = true; + } else if (dentry->d_inode) { + /* Cached positive dentry: will open in f_op->open */ goto out_no_open; + } if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { return atomic_open(nd, dentry, path, file, op, got_write, @@ -3106,13 +3089,14 @@ static int do_last(struct nameidata *nd, nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ error = lookup_fast(nd, &path, &inode, &seq); - if (likely(!error)) + if (likely(error > 0)) goto finish_lookup; if (error < 0) return error; BUG_ON(nd->inode != dir->d_inode); + BUG_ON(nd->flags & LOOKUP_RCU); } else { /* create side of things */ /* @@ -3141,9 +3125,9 @@ retry_lookup: * dropping this one anyway. */ } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write, opened); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (error <= 0) { if (error) @@ -3167,12 +3151,6 @@ retry_lookup: } /* - * create/update audit record if it already exists. - */ - if (d_is_positive(path.dentry)) - audit_inode(nd->name, path.dentry, 0); - - /* * If atomic_open() acquired write access it is dropped now due to * possible mount and symlink following (this might be optimized away if * necessary...) @@ -3182,6 +3160,16 @@ retry_lookup: got_write = false; } + if (unlikely(d_is_negative(path.dentry))) { + path_to_nameidata(&path, nd); + return -ENOENT; + } + + /* + * create/update audit record if it already exists. + */ + audit_inode(nd->name, path.dentry, 0); + if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { path_to_nameidata(&path, nd); return -EEXIST; @@ -3191,13 +3179,8 @@ retry_lookup: if (unlikely(error < 0)) return error; - BUG_ON(nd->flags & LOOKUP_RCU); - inode = d_backing_inode(path.dentry); seq = 0; /* out of RCU mode, so the value doesn't matter */ - if (unlikely(d_is_negative(path.dentry))) { - path_to_nameidata(&path, nd); - return -ENOENT; - } + inode = d_backing_inode(path.dentry); finish_lookup: if (nd->depth) put_link(nd); @@ -3206,11 +3189,6 @@ finish_lookup: if (unlikely(error)) return error; - if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) { - path_to_nameidata(&path, nd); - return -ELOOP; - } - if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) { path_to_nameidata(&path, nd); } else { @@ -3229,6 +3207,10 @@ finish_open: return error; } audit_inode(nd->name, nd->path.dentry, 0); + if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) { + error = -ELOOP; + goto out; + } error = -EISDIR; if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) goto out; @@ -3273,6 +3255,10 @@ opened: goto exit_fput; } out: + if (unlikely(error > 0)) { + WARN_ON(1); + error = -EINVAL; + } if (got_write) mnt_drop_write(nd->path.mnt); path_put(&save_parent); @@ -3489,7 +3475,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * Do the final lookup. */ lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path->dentry, lookup_flags); if (IS_ERR(dentry)) goto unlock; @@ -3518,7 +3504,7 @@ fail: dput(dentry); dentry = ERR_PTR(error); unlock: - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: @@ -3538,7 +3524,7 @@ EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } @@ -3699,31 +3685,6 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) return sys_mkdirat(AT_FDCWD, pathname, mode); } -/* - * The dentry_unhash() helper will try to drop the dentry early: we - * should have a usage count of 1 if we're the only user of this - * dentry, and if that is true (possibly after pruning the dcache), - * then we drop the dentry now. - * - * A low-level filesystem can, if it choses, legally - * do a - * - * if (!d_unhashed(dentry)) - * return -EBUSY; - * - * if it cannot handle the case of removing a directory - * that is still in use by something else.. - */ -void dentry_unhash(struct dentry *dentry) -{ - shrink_dcache_parent(dentry); - spin_lock(&dentry->d_lock); - if (dentry->d_lockref.count == 1) - __d_drop(dentry); - spin_unlock(&dentry->d_lock); -} -EXPORT_SYMBOL(dentry_unhash); - int vfs_rmdir(struct inode *dir, struct dentry *dentry) { int error = may_delete(dir, dentry, 1); @@ -3735,7 +3696,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) return -EPERM; dget(dentry); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry)) @@ -3755,7 +3716,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) detach_mounts(dentry); out: - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete(dentry); @@ -3794,7 +3755,7 @@ retry: if (error) goto exit1; - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -3810,7 +3771,7 @@ retry: exit3: dput(dentry); exit2: - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit1: path_put(&path); @@ -3856,7 +3817,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&target->i_mutex); + inode_lock(target); if (is_local_mountpoint(dentry)) error = -EBUSY; else { @@ -3873,7 +3834,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate } } out: - mutex_unlock(&target->i_mutex); + inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { @@ -3916,7 +3877,7 @@ retry: if (error) goto exit1; retry_deleg: - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -3934,7 +3895,7 @@ retry_deleg: exit2: dput(dentry); } - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; @@ -4086,7 +4047,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; @@ -4103,7 +4064,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; @@ -4303,7 +4264,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); else if (target) - mutex_lock(&target->i_mutex); + inode_lock(target); error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) @@ -4356,7 +4317,7 @@ out: if (!is_dir || (flags & RENAME_EXCHANGE)) unlock_two_nondirectories(source, target); else if (target) - mutex_unlock(&target->i_mutex); + inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, old_name, is_dir, diff --git a/fs/namespace.c b/fs/namespace.c index a830e1463704..4fb1691b4355 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1961,9 +1961,9 @@ static struct mountpoint *lock_mount(struct path *path) struct vfsmount *mnt; struct dentry *dentry = path->dentry; retry: - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (unlikely(cant_mount(dentry))) { - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ERR_PTR(-ENOENT); } namespace_lock(); @@ -1974,13 +1974,13 @@ retry: mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return mp; } return mp; } namespace_unlock(); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); path->mnt = mnt; dentry = path->dentry = dget(mnt->mnt_root); @@ -1992,7 +1992,7 @@ static void unlock_mount(struct mountpoint *where) struct dentry *dentry = where->m_dentry; put_mountpoint(where); namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index f0e3e9e747dd..b7f8eaeea5d8 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -369,7 +369,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) if (!res) { struct inode *inode = d_inode(dentry); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { ncp_new_dentry(dentry); val=1; @@ -377,7 +377,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) ncp_dbg(2, "found, but dirEntNum changed\n"); ncp_update_inode2(inode, &finfo); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } finished: @@ -633,15 +633,15 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, d_rehash(newdent); } else { spin_lock(&dentry->d_lock); - NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE; + NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE; spin_unlock(&dentry->d_lock); } } else { struct inode *inode = d_inode(newdent); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode, I_MUTEX_CHILD); ncp_update_inode2(inode, entry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (ctl.idx >= NCP_DIRCACHE_SIZE) { diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 011324ce9df2..dd38ca1f2ecb 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -224,10 +224,10 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos = pos; if (pos > i_size_read(inode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (pos > i_size_read(inode)) i_size_write(inode, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } ncp_dbg(1, "exit %pD2\n", file); outrel: diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index c59a59c37f3d..35ab51c04814 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -476,6 +476,7 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, for (i = 0; i < nr_pages; i++) put_page(arg->layoutupdate_pages[i]); + vfree(arg->start_p); kfree(arg->layoutupdate_pages); } else { put_page(arg->layoutupdate_page); @@ -559,10 +560,15 @@ retry: if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { void *p = start_p, *end = p + arg->layoutupdate_len; + struct page *page = NULL; int i = 0; - for ( ; p < end; p += PAGE_SIZE) - arg->layoutupdate_pages[i++] = vmalloc_to_page(p); + arg->start_p = start_p; + for ( ; p < end; p += PAGE_SIZE) { + page = vmalloc_to_page(p); + arg->layoutupdate_pages[i++] = page; + get_page(page); + } } dprintk("%s found %zu ranges\n", __func__, count); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c82a21228a34..4bfa7d8bcade 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -940,7 +940,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -957,7 +957,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->duped = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -972,9 +972,9 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); - mutex_lock(&inode->i_mutex); + inode_lock(inode); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } @@ -1360,19 +1360,15 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); - res = ERR_PTR(-ENAMETOOLONG); - if (dentry->d_name.len > NFS_SERVER(dir)->namelen) - goto out; + if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen)) + return ERR_PTR(-ENAMETOOLONG); /* * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. */ - if (nfs_is_exclusive_create(dir, flags)) { - d_instantiate(dentry, NULL); - res = NULL; - goto out; - } + if (nfs_is_exclusive_create(dir, flags)) + return NULL; res = ERR_PTR(-ENOMEM); fhandle = nfs_alloc_fhandle(); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7ab7ec9f4eed..7a0cfd3266e5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -580,7 +580,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, if (!count) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) goto out_unlock; @@ -608,7 +608,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -622,7 +622,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return result; } @@ -1005,7 +1005,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) @@ -1045,7 +1045,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_CACHE_SHIFT, end); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -1066,7 +1066,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4ef8f5addcad..748bb813b8ec 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -278,9 +278,9 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index bb1f4e7a3270..3384dc8e6683 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -971,7 +971,7 @@ filelayout_mark_request_commit(struct nfs_page *req, u32 i, j; if (fl->commit_through_mds) { - nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + nfs_request_add_commit_list(req, cinfo); } else { /* Note that we are calling nfs4_fl_calc_j_index on each page * that ends up being committed to a data server. An attractive diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 6594e9f903a0..0cb1abd535e3 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1215,7 +1215,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; - set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &hdr->lseg->pls_layout->plh_flags); pnfs_read_resend_pnfs(hdr); return task->tk_status; @@ -1948,11 +1948,9 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, start = xdr_reserve_space(xdr, 4); BUG_ON(!start); - if (ff_layout_encode_ioerr(flo, xdr, args)) - goto out; - + ff_layout_encode_ioerr(flo, xdr, args); ff_layout_encode_iostats(flo, xdr, args); -out: + *start = cpu_to_be32((xdr->p - start - 1) * 4); dprintk("%s: Return\n", __func__); } diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index bd0327541366..eb370460ce20 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -218,63 +218,55 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, err->length = end - err->offset; } -static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - nfs4_stateid *stateid, - struct nfs4_deviceid *deviceid) +static int +ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, + const struct nfs4_ff_layout_ds_err *e2) { - return err->status == status && err->opnum == opnum && - nfs4_stateid_match(&err->stateid, stateid) && - !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) && - end_offset(err->offset, err->length) >= offset && - err->offset <= end_offset(offset, length); -} - -static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old, - struct nfs4_ff_layout_ds_err *new) -{ - if (!ds_error_can_merge(old, new->offset, new->length, new->status, - new->opnum, &new->stateid, &new->deviceid)) - return false; - - extend_ds_error(old, new->offset, new->length); - return true; + int ret; + + if (e1->opnum != e2->opnum) + return e1->opnum < e2->opnum ? -1 : 1; + if (e1->status != e2->status) + return e1->status < e2->status ? -1 : 1; + ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid)); + if (ret != 0) + return ret; + ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); + if (ret != 0) + return ret; + if (end_offset(e1->offset, e1->length) < e2->offset) + return -1; + if (e1->offset > end_offset(e2->offset, e2->length)) + return 1; + /* If ranges overlap or are contiguous, they are the same */ + return 0; } -static bool +static void ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_ds_err *dserr) { - struct nfs4_ff_layout_ds_err *err; - - list_for_each_entry(err, &flo->error_list, list) { - if (merge_ds_error(err, dserr)) { - return true; - } - } - - list_add(&dserr->list, &flo->error_list); - return false; -} - -static bool -ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - nfs4_stateid *stateid, struct nfs4_deviceid *deviceid) -{ - bool found = false; - struct nfs4_ff_layout_ds_err *err; - - list_for_each_entry(err, &flo->error_list, list) { - if (ds_error_can_merge(err, offset, length, status, opnum, - stateid, deviceid)) { - found = true; - extend_ds_error(err, offset, length); + struct nfs4_ff_layout_ds_err *err, *tmp; + struct list_head *head = &flo->error_list; + int match; + + /* Do insertion sort w/ merges */ + list_for_each_entry_safe(err, tmp, &flo->error_list, list) { + match = ff_ds_error_match(err, dserr); + if (match < 0) + continue; + if (match > 0) { + /* Add entry "dserr" _before_ entry "err" */ + head = &err->list; break; } + /* Entries match, so merge "err" into "dserr" */ + extend_ds_error(dserr, err->offset, err->length); + list_del(&err->list); + kfree(err); } - return found; + list_add_tail(&dserr->list, head); } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, @@ -283,7 +275,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; - bool needfree; if (status == 0) return 0; @@ -291,14 +282,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, if (mirror->mirror_ds == NULL) return -EINVAL; - spin_lock(&flo->generic_hdr.plh_inode->i_lock); - if (ff_layout_update_ds_error(flo, offset, length, status, opnum, - &mirror->stateid, - &mirror->mirror_ds->id_node.deviceid)) { - spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - return 0; - } - spin_unlock(&flo->generic_hdr.plh_inode->i_lock); dserr = kmalloc(sizeof(*dserr), gfp_flags); if (!dserr) return -ENOMEM; @@ -313,10 +296,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); - needfree = ff_layout_add_ds_error_locked(flo, dserr); + ff_layout_add_ds_error_locked(flo, dserr); spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - if (needfree) - kfree(dserr); return 0; } @@ -431,7 +412,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, OP_ILLEGAL, GFP_NOIO); if (!fail_return) { if (ff_layout_has_available_ds(lseg)) - set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lseg->pls_layout->plh_flags); else pnfs_error_mark_layout_for_return(ino, lseg); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8e24d886d2c5..86faecf8f328 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -661,9 +661,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs_sync_inode(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err) goto out; } @@ -1178,9 +1178,9 @@ static int __nfs_revalidate_mapping(struct inode *inode, spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); if (may_lock) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_invalidate_mapping(inode, mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } else ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 4e8cc942336c..9a547aa3ec8e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -484,7 +484,7 @@ void nfs_retry_commit(struct list_head *page_list, struct nfs_commit_info *cinfo, u32 ds_commit_idx); void nfs_commitdata_release(struct nfs_commit_data *data); -void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, +void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6e8174930a48..dff83460e5a6 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -16,29 +16,8 @@ #define NFSDBG_FACILITY NFSDBG_PROC -static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, - fmode_t fmode) -{ - struct nfs_open_context *open; - struct nfs_lock_context *lock; - int ret; - - open = get_nfs_open_context(nfs_file_open_context(file)); - lock = nfs_get_lock_context(open); - if (IS_ERR(lock)) { - put_nfs_open_context(open); - return PTR_ERR(lock); - } - - ret = nfs4_set_rw_stateid(dst, open, lock, fmode); - - nfs_put_lock_context(lock); - put_nfs_open_context(open); - return ret; -} - static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, - loff_t offset, loff_t len) + struct nfs_lock_context *lock, loff_t offset, loff_t len) { struct inode *inode = file_inode(filep); struct nfs_server *server = NFS_SERVER(inode); @@ -56,7 +35,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, msg->rpc_argp = &args; msg->rpc_resp = &res; - status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE); + status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context, + lock, FMODE_WRITE); if (status) return status; @@ -78,15 +58,26 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, { struct nfs_server *server = NFS_SERVER(file_inode(filep)); struct nfs4_exception exception = { }; + struct nfs_lock_context *lock; int err; + lock = nfs_get_lock_context(nfs_file_open_context(filep)); + if (IS_ERR(lock)) + return PTR_ERR(lock); + + exception.inode = file_inode(filep); + exception.state = lock->open_context->state; + do { - err = _nfs42_proc_fallocate(msg, filep, offset, len); - if (err == -ENOTSUPP) - return -EOPNOTSUPP; + err = _nfs42_proc_fallocate(msg, filep, lock, offset, len); + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + break; + } err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); + nfs_put_lock_context(lock); return err; } @@ -101,13 +92,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -123,7 +114,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return -EOPNOTSUPP; nfs_wb_all(inode); - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) @@ -131,11 +122,12 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } -static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +static loff_t _nfs42_proc_llseek(struct file *filep, + struct nfs_lock_context *lock, loff_t offset, int whence) { struct inode *inode = file_inode(filep); struct nfs42_seek_args args = { @@ -156,7 +148,8 @@ static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) if (!nfs_server_capable(inode, NFS_CAP_SEEK)) return -ENOTSUPP; - status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ); + status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context, + lock, FMODE_READ); if (status) return status; @@ -175,17 +168,28 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct nfs_server *server = NFS_SERVER(file_inode(filep)); struct nfs4_exception exception = { }; + struct nfs_lock_context *lock; loff_t err; + lock = nfs_get_lock_context(nfs_file_open_context(filep)); + if (IS_ERR(lock)) + return PTR_ERR(lock); + + exception.inode = file_inode(filep); + exception.state = lock->open_context->state; + do { - err = _nfs42_proc_llseek(filep, offset, whence); + err = _nfs42_proc_llseek(filep, lock, offset, whence); if (err >= 0) break; - if (err == -ENOTSUPP) - return -EOPNOTSUPP; + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + break; + } err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); + nfs_put_lock_context(lock); return err; } @@ -298,8 +302,9 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, } static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, - struct file *dst_f, loff_t src_offset, - loff_t dst_offset, loff_t count) + struct file *dst_f, struct nfs_lock_context *src_lock, + struct nfs_lock_context *dst_lock, loff_t src_offset, + loff_t dst_offset, loff_t count) { struct inode *src_inode = file_inode(src_f); struct inode *dst_inode = file_inode(dst_f); @@ -320,11 +325,13 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, msg->rpc_argp = &args; msg->rpc_resp = &res; - status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ); + status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + src_lock, FMODE_READ); if (status) return status; - status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE); + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + dst_lock, FMODE_WRITE); if (status) return status; @@ -349,22 +356,48 @@ int nfs42_proc_clone(struct file *src_f, struct file *dst_f, }; struct inode *inode = file_inode(src_f); struct nfs_server *server = NFS_SERVER(file_inode(src_f)); - struct nfs4_exception exception = { }; - int err; + struct nfs_lock_context *src_lock; + struct nfs_lock_context *dst_lock; + struct nfs4_exception src_exception = { }; + struct nfs4_exception dst_exception = { }; + int err, err2; if (!nfs_server_capable(inode, NFS_CAP_CLONE)) return -EOPNOTSUPP; + src_lock = nfs_get_lock_context(nfs_file_open_context(src_f)); + if (IS_ERR(src_lock)) + return PTR_ERR(src_lock); + + src_exception.inode = file_inode(src_f); + src_exception.state = src_lock->open_context->state; + + dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f)); + if (IS_ERR(dst_lock)) { + err = PTR_ERR(dst_lock); + goto out_put_src_lock; + } + + dst_exception.inode = file_inode(dst_f); + dst_exception.state = dst_lock->open_context->state; + do { - err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset, - dst_offset, count); + err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock, + src_offset, dst_offset, count); if (err == -ENOTSUPP || err == -EOPNOTSUPP) { NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE; - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + break; } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; + err2 = nfs4_handle_exception(server, err, &src_exception); + err = nfs4_handle_exception(server, err, &dst_exception); + if (!err) + err = err2; + } while (src_exception.retry || dst_exception.retry); + nfs_put_lock_context(dst_lock); +out_put_src_lock: + nfs_put_lock_context(src_lock); + return err; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 26f9a23e2b25..57ca1c8039c1 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -141,11 +141,11 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by @@ -219,13 +219,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ if (same_inode) { - mutex_lock(&src_inode->i_mutex); + inode_lock(src_inode); } else if (dst_inode < src_inode) { - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dst_inode, I_MUTEX_PARENT); + inode_lock_nested(src_inode, I_MUTEX_CHILD); } else { - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(src_inode, I_MUTEX_PARENT); + inode_lock_nested(dst_inode, I_MUTEX_CHILD); } /* flush all pending writes on both src and dst so that server @@ -246,13 +246,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, out_unlock: if (same_inode) { - mutex_unlock(&src_inode->i_mutex); + inode_unlock(src_inode); } else if (dst_inode < src_inode) { - mutex_unlock(&src_inode->i_mutex); - mutex_unlock(&dst_inode->i_mutex); + inode_unlock(src_inode); + inode_unlock(dst_inode); } else { - mutex_unlock(&dst_inode->i_mutex); - mutex_unlock(&src_inode->i_mutex); + inode_unlock(dst_inode); + inode_unlock(src_inode); } out: return ret; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4bfc33ad0563..400a70b3be7b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2461,14 +2461,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, dentry = opendata->dentry; if (d_really_is_negative(dentry)) { - /* FIXME: Is this d_drop() ever needed? */ + struct dentry *alias; d_drop(dentry); - dentry = d_add_unique(dentry, igrab(state->inode)); - if (dentry == NULL) { - dentry = opendata->dentry; - } else if (dentry != ctx->dentry) { + alias = d_exact_alias(dentry, state->inode); + if (!alias) + alias = d_splice_alias(igrab(state->inode), dentry); + /* d_splice_alias() can't fail here - it's a non-directory */ + if (alias) { dput(ctx->dentry); - ctx->dentry = dget(dentry); + ctx->dentry = dentry = alias; } nfs_set_verifier(dentry, nfs_save_change_attribute(d_inode(opendata->dir))); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a3592cc34a20..2fa483e6dbe2 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -52,9 +52,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock); */ static LIST_HEAD(pnfs_modules_tbl); -static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, - enum pnfs_iomode iomode, bool sync); +static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo); /* Return the registered pnfs layout driver module matching given id */ static struct pnfs_layoutdriver_type * @@ -243,6 +241,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { struct inode *inode = lo->plh_inode; + pnfs_layoutreturn_before_put_layout_hdr(lo); + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); @@ -252,6 +252,27 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) } } +/* + * Mark a pnfs_layout_hdr and all associated layout segments as invalid + * + * In order to continue using the pnfs_layout_hdr, a full recovery + * is required. + * Note that caller must hold inode->i_lock. + */ +static int +pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list) +{ + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range); +} + static int pnfs_iomode_to_fail_bit(u32 iomode) { @@ -345,58 +366,6 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } -/* Return true if layoutreturn is needed */ -static bool -pnfs_layout_need_return(struct pnfs_layout_hdr *lo, - struct pnfs_layout_segment *lseg) -{ - struct pnfs_layout_segment *s; - - if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) - return false; - - list_for_each_entry(s, &lo->plh_segs, pls_list) - if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) - return false; - - return true; -} - -static bool -pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) -{ - if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) - return false; - lo->plh_return_iomode = 0; - pnfs_get_layout_hdr(lo); - clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); - return true; -} - -static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, - struct pnfs_layout_hdr *lo, struct inode *inode) -{ - lo = lseg->pls_layout; - inode = lo->plh_inode; - - spin_lock(&inode->i_lock); - if (pnfs_layout_need_return(lo, lseg)) { - nfs4_stateid stateid; - enum pnfs_iomode iomode; - bool send; - - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - iomode = lo->plh_return_iomode; - send = pnfs_prepare_layoutreturn(lo); - spin_unlock(&inode->i_lock); - if (send) { - /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, &stateid, iomode, false); - } - } else - spin_unlock(&inode->i_lock); -} - void pnfs_put_lseg(struct pnfs_layout_segment *lseg) { @@ -410,15 +379,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) atomic_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - /* Handle the case where refcount != 1 */ - if (atomic_add_unless(&lseg->pls_refcount, -1, 1)) - return; - lo = lseg->pls_layout; inode = lo->plh_inode; - /* Do we need a layoutreturn? */ - if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) - pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { @@ -613,9 +575,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) spin_lock(&nfsi->vfs_inode.i_lock); lo = nfsi->layout; if (lo) { - lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ - pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); pnfs_get_layout_hdr(lo); + pnfs_mark_layout_stateid_invalid(lo, &tmp_list); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); spin_unlock(&nfsi->vfs_inode.i_lock); @@ -676,11 +637,6 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, { struct pnfs_layout_hdr *lo; struct inode *inode; - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; LIST_HEAD(lseg_list); int ret = 0; @@ -695,11 +651,11 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, spin_lock(&inode->i_lock); list_del_init(&lo->plh_bulk_destroy); - lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ - if (is_bulk_recall) - set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) + if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) { + if (is_bulk_recall) + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); ret = -EAGAIN; + } spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&lseg_list); /* Free all lsegs that are attached to commit buckets */ @@ -937,6 +893,17 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } +static bool +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +{ + if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return false; + lo->plh_return_iomode = 0; + pnfs_get_layout_hdr(lo); + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + return true; +} + static int pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync) @@ -971,6 +938,48 @@ out: return status; } +/* Return true if layoutreturn is needed */ +static bool +pnfs_layout_need_return(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *s; + + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + return false; + + /* Defer layoutreturn until all lsegs are done */ + list_for_each_entry(s, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) + return false; + } + + return true; +} + +static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode= lo->plh_inode; + + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + return; + spin_lock(&inode->i_lock); + if (pnfs_layout_need_return(lo)) { + nfs4_stateid stateid; + enum pnfs_iomode iomode; + bool send; + + nfs4_stateid_copy(&stateid, &lo->plh_stateid); + iomode = lo->plh_return_iomode; + send = pnfs_prepare_layoutreturn(lo); + spin_unlock(&inode->i_lock); + if (send) { + /* Send an async layoutreturn so we dont deadlock */ + pnfs_send_layoutreturn(lo, &stateid, iomode, false); + } + } else + spin_unlock(&inode->i_lock); +} + /* * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr * when the layout segment list is empty. @@ -1091,7 +1100,7 @@ bool pnfs_roc(struct inode *ino) nfs4_stateid_copy(&stateid, &lo->plh_stateid); /* always send layoutreturn if being marked so */ - if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) layoutreturn = pnfs_prepare_layoutreturn(lo); @@ -1744,8 +1753,19 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) if (lo->plh_return_iomode != 0) iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); } +/** + * pnfs_mark_matching_lsegs_return - Free or return matching layout segments + * @lo: pointer to layout header + * @tmp_list: list header to be used with pnfs_free_lseg_list() + * @return_range: describe layout segment ranges to be returned + * + * This function is mainly intended for use by layoutrecall. It attempts + * to free the layout segment immediately, or else to mark it for return + * as soon as its reference count drops to zero. + */ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, @@ -1768,12 +1788,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); + if (mark_lseg_invalid(lseg, tmp_list)) + continue; + remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); pnfs_set_plh_return_iomode(lo, return_range->iomode); - if (!mark_lseg_invalid(lseg, tmp_list)) - remaining++; - set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); } return remaining; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 9f4e2a47f4aa..1ac1db5f6dad 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -94,8 +94,8 @@ enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ - NFS_LAYOUT_RETURN, /* Return this layout ASAP */ - NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ + NFS_LAYOUT_RETURN, /* layoutreturn in progress */ + NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ce43cd6d88c6..5754835a2886 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -830,11 +830,10 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); * holding the nfs_page lock. */ void -nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, - struct nfs_commit_info *cinfo) +nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { spin_lock(cinfo->lock); - nfs_request_add_commit_list_locked(req, dst, cinfo); + nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); spin_unlock(cinfo->lock); nfs_mark_page_unstable(req->wb_page, cinfo); } @@ -892,7 +891,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, { if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx)) return; - nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + nfs_request_add_commit_list(req, cinfo); } static void diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 819ad812c71b..4cba7865f496 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -55,10 +55,10 @@ nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u struct inode *inode = d_inode(resfh->fh_dentry); int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = security_inode_setsecctx(resfh->fh_dentry, label->data, label->len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (status) /* diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 79f0307a5ec8..195fe2668207 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -32,10 +32,10 @@ * */ +#include <crypto/hash.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> -#include <linux/crypto.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/module.h> @@ -104,29 +104,35 @@ static int nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) { struct xdr_netobj cksum; - struct hash_desc desc; - struct scatterlist sg; + struct crypto_shash *tfm; int status; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(desc.tfm)) { - status = PTR_ERR(desc.tfm); + tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(tfm)) { + status = PTR_ERR(tfm); goto out_no_tfm; } - cksum.len = crypto_hash_digestsize(desc.tfm); + cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) { status = -ENOMEM; goto out; } - sg_init_one(&sg, clname->data, clname->len); + { + SHASH_DESC_ON_STACK(desc, tfm); + + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + status = crypto_shash_digest(desc, clname->data, clname->len, + cksum.data); + shash_desc_zero(desc); + } - status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data); if (status) goto out; @@ -135,7 +141,7 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) status = 0; out: kfree(cksum.data); - crypto_free_hash(desc.tfm); + crypto_free_shash(tfm); out_no_tfm: return status; } @@ -192,7 +198,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dir = nn->rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { @@ -213,7 +219,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) out_put: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (status == 0) { if (nn->in_grace) { crp = nfs4_client_to_reclaim(dname, nn); @@ -286,7 +292,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { @@ -302,7 +308,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) list_del(&entry->list); kfree(entry); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); nfs4_reset_creds(original_cred); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { @@ -322,7 +328,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); @@ -335,7 +341,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) out: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return status; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 0770bcb543c8..f84fe6bf9aee 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -288,7 +288,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) } inode = d_inode(dentry); - mutex_lock_nested(&inode->i_mutex, subclass); + inode_lock_nested(inode, subclass); fill_pre_wcc(fhp); fhp->fh_locked = true; } @@ -307,7 +307,7 @@ fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { fill_post_wcc(fhp); - mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); + inode_unlock(d_inode(fhp->fh_dentry)); fhp->fh_locked = false; } } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6739077f17fe..d40010e4f1a9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -493,9 +493,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); host_error = security_inode_setsecctx(dentry, label->data, label->len); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return nfserrno(host_error); } #else @@ -870,7 +870,7 @@ __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); + host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0); set_fs(oldfs); return nfsd_finish_read(file, count, host_err); } @@ -957,7 +957,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0); set_fs(oldfs); if (host_err < 0) goto out_nfserr; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 10b22527a617..21a1e2e0d92f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1003,7 +1003,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); @@ -1113,6 +1113,6 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index aba43811d6ef..e8fe24882b5b 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -158,7 +158,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, flags = nilfs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = NILFS_I(inode)->i_flags; @@ -186,7 +186,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, nilfs_mark_inode_dirty(inode); ret = nilfs_transaction_commit(inode->i_sb); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 45d650addd56..c20df77eff99 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -180,7 +180,7 @@ void nilfs_page_bug(struct page *page) printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " "mapping=%p ino=%lu\n", - page, atomic_read(&page->_count), + page, page_ref_count(page), (unsigned long long)page->index, page->flags, m, ino); if (page_has_buffers(page)) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c index cfcbf114676e..7115c5d7d373 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -91,7 +91,14 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" +#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ + struct srcu_struct fsnotify_mark_srcu; +static DEFINE_SPINLOCK(destroy_lock); +static LIST_HEAD(destroy_list); + +static void fsnotify_mark_destroy(struct work_struct *work); +static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy); void fsnotify_get_mark(struct fsnotify_mark *mark) { @@ -165,19 +172,10 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) atomic_dec(&group->num_marks); } -static void -fsnotify_mark_free_rcu(struct rcu_head *rcu) -{ - struct fsnotify_mark *mark; - - mark = container_of(rcu, struct fsnotify_mark, g_rcu); - fsnotify_put_mark(mark); -} - /* - * Free fsnotify mark. The freeing is actually happening from a call_srcu - * callback. Caller must have a reference to the mark or be protected by - * fsnotify_mark_srcu. + * Free fsnotify mark. The freeing is actually happening from a kthread which + * first waits for srcu period end. Caller must have a reference to the mark + * or be protected by fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { @@ -192,7 +190,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); - call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); /* * Some groups like to know that marks are being freed. This is a @@ -388,7 +390,12 @@ err: spin_unlock(&mark->lock); - call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); + return ret; } @@ -491,3 +498,21 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; } + +static void fsnotify_mark_destroy(struct work_struct *work) +{ + struct fsnotify_mark *mark, *next; + struct list_head private_destroy_list; + + spin_lock(&destroy_lock); + /* exchange the list head */ + list_replace_init(&destroy_list, &private_destroy_list); + spin_unlock(&destroy_lock); + + synchronize_srcu(&fsnotify_mark_srcu); + + list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { + list_del_init(&mark->g_list); + fsnotify_put_mark(mark); + } +} diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 9e38dafa3bc7..b2eff5816adc 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1509,7 +1509,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(!S_ISDIR(vi->i_mode)); /* If the bitmap attribute inode is in memory sync it, too. */ @@ -1532,7 +1532,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 9d383e5eff0e..bed4d427dfae 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1944,14 +1944,14 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written = 0; ssize_t err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); /* We can write back this queue in page reclaim. */ current->backing_dev_info = inode_to_bdi(vi); err = ntfs_prepare_file_for_write(iocb, from); if (iov_iter_count(from) && !err) written = ntfs_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); if (likely(written > 0)) { err = generic_write_sync(file, iocb->ki_pos, written); if (err < 0) @@ -1996,7 +1996,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) @@ -2015,7 +2015,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c index d80e3315cab0..9793e68ba1dd 100644 --- a/fs/ntfs/quota.c +++ b/fs/ntfs/quota.c @@ -48,7 +48,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_error(vol->sb, "Quota inodes are not open."); return false; } - mutex_lock(&vol->quota_q_ino->i_mutex); + inode_lock(vol->quota_q_ino); ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); if (!ictx) { ntfs_error(vol->sb, "Failed to get index context."); @@ -98,7 +98,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_index_entry_mark_dirty(ictx); set_done: ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); /* * We set the flag so we do not try to mark the quotas out of date * again on remount. @@ -110,7 +110,7 @@ done: err_out: if (ictx) ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); return false; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 2f77f8dfb861..1b38abdaa3ed 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1284,10 +1284,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol) * Find the inode number for the hibernation file by looking up the * filename hiberfil.sys in the root directory. */ - mutex_lock(&vol->root_ino->i_mutex); + inode_lock(vol->root_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, &name); - mutex_unlock(&vol->root_ino->i_mutex); + inode_unlock(vol->root_ino); if (IS_ERR_MREF(mref)) { ret = MREF_ERR(mref); /* If the file does not exist, Windows is not hibernated. */ @@ -1377,10 +1377,10 @@ static bool load_and_init_quota(ntfs_volume *vol) * Find the inode number for the quota file by looking up the filename * $Quota in the extended system files directory $Extend. */ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, quotas are disabled and have @@ -1460,10 +1460,10 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol) * Find the inode number for the transaction log file by looking up the * filename $UsnJrnl in the extended system files directory $Extend. */ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, transaction logging is disabled, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a3ded88718c9..d002579c6f2b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5719,7 +5719,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto bail; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5776,7 +5776,7 @@ int ocfs2_remove_btree_range(struct inode *inode, out_commit: ocfs2_commit_trans(osb, handle); out: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -5832,7 +5832,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); @@ -5980,7 +5980,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); di = (struct ocfs2_dinode *) tl_bh->b_data; @@ -6008,7 +6008,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - mutex_lock(&data_alloc_inode->i_mutex); + inode_lock(data_alloc_inode); status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); if (status < 0) { @@ -6035,7 +6035,7 @@ out_unlock: ocfs2_inode_unlock(data_alloc_inode, 1); out_mutex: - mutex_unlock(&data_alloc_inode->i_mutex); + inode_unlock(data_alloc_inode); iput(data_alloc_inode); out: @@ -6047,9 +6047,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); status = __ocfs2_flush_truncate_log(osb); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6208,7 +6208,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, (unsigned long long)le64_to_cpu(tl_copy->i_blkno), num_recs); - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); for(i = 0; i < num_recs; i++) { if (ocfs2_truncate_log_needs_flush(osb)) { status = __ocfs2_flush_truncate_log(osb); @@ -6239,7 +6239,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, } bail_up: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6346,7 +6346,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { @@ -6395,7 +6395,7 @@ out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); out: while(head) { @@ -6439,7 +6439,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, handle_t *handle; int ret = 0; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); while (head) { if (ocfs2_truncate_log_needs_flush(osb)) { @@ -6471,7 +6471,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); while (head) { /* Premature exit may have left some dangling items. */ @@ -7355,7 +7355,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); if (ret < 0) { @@ -7422,7 +7422,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: return ret; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7f604727f487..cda0361e95a4 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -956,6 +956,7 @@ clean_orphan: tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, update_isize, end); if (tmp_ret < 0) { + ocfs2_inode_unlock(inode, 1); ret = tmp_ret; mlog_errno(ret); brelse(di_bh); @@ -2046,9 +2047,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, int ret = 0; unsigned int truncated_clusters; - mutex_lock(&osb->osb_tl_inode->i_mutex); + inode_lock(osb->osb_tl_inode); truncated_clusters = osb->truncated_clusters; - mutex_unlock(&osb->osb_tl_inode->i_mutex); + inode_unlock(osb->osb_tl_inode); /* * Check whether we can succeed in allocating if we free diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a3cc6d2fc896..ef6a2ec494de 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -287,7 +287,6 @@ struct o2hb_bio_wait_ctxt { static void o2hb_write_timeout(struct work_struct *work) { int failed, quorum; - unsigned long flags; struct o2hb_region *reg = container_of(work, struct o2hb_region, hr_write_timeout_work.work); @@ -297,14 +296,14 @@ static void o2hb_write_timeout(struct work_struct *work) jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { - spin_lock_irqsave(&o2hb_live_lock, flags); + spin_lock(&o2hb_live_lock); if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); failed = bitmap_weight(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); quorum = bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); - spin_unlock_irqrestore(&o2hb_live_lock, flags); + spin_unlock(&o2hb_live_lock); mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", quorum, failed); @@ -1254,15 +1253,15 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { - kfree(o2hb_db_livenodes); - kfree(o2hb_db_liveregions); - kfree(o2hb_db_quorumregions); - kfree(o2hb_db_failedregions); debugfs_remove(o2hb_debug_failedregions); debugfs_remove(o2hb_debug_quorumregions); debugfs_remove(o2hb_debug_liveregions); debugfs_remove(o2hb_debug_livenodes); debugfs_remove(o2hb_debug_dir); + kfree(o2hb_db_livenodes); + kfree(o2hb_db_liveregions); + kfree(o2hb_db_quorumregions); + kfree(o2hb_db_failedregions); } static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, @@ -1438,13 +1437,15 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slots); - kfree(reg->hr_db_regnum); - kfree(reg->hr_db_livenodes); debugfs_remove(reg->hr_debug_livenodes); debugfs_remove(reg->hr_debug_regnum); debugfs_remove(reg->hr_debug_elapsed_time); debugfs_remove(reg->hr_debug_pinned); debugfs_remove(reg->hr_debug_dir); + kfree(reg->hr_db_livenodes); + kfree(reg->hr_db_regnum); + kfree(reg->hr_debug_elapsed_time); + kfree(reg->hr_debug_pinned); spin_lock(&o2hb_live_lock); list_del(®->hr_all_item); @@ -2423,11 +2424,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); int o2hb_check_node_heartbeating_no_sem(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - unsigned long flags; - spin_lock_irqsave(&o2hb_live_lock, flags); + spin_lock(&o2hb_live_lock); o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); - spin_unlock_irqrestore(&o2hb_live_lock, flags); + spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index ebe543894db0..b17d180bdc16 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -630,7 +630,6 @@ static void o2nm_cluster_release(struct config_item *item) { struct o2nm_cluster *cluster = to_o2nm_cluster(item); - kfree(cluster->cl_group.default_groups); kfree(cluster); } @@ -666,7 +665,6 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g struct o2nm_cluster *cluster = NULL; struct o2nm_node_group *ns = NULL; struct config_group *o2hb_group = NULL, *ret = NULL; - void *defs = NULL; /* this runs under the parent dir's i_mutex; there can be only * one caller in here at a time */ @@ -675,20 +673,18 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); - defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); o2hb_group = o2hb_alloc_hb_set(); - if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + if (cluster == NULL || ns == NULL || o2hb_group == NULL) goto out; config_group_init_type_name(&cluster->cl_group, name, &o2nm_cluster_type); + configfs_add_default_group(&ns->ns_group, &cluster->cl_group); + config_group_init_type_name(&ns->ns_group, "node", &o2nm_node_group_type); + configfs_add_default_group(o2hb_group, &cluster->cl_group); - cluster->cl_group.default_groups = defs; - cluster->cl_group.default_groups[0] = &ns->ns_group; - cluster->cl_group.default_groups[1] = o2hb_group; - cluster->cl_group.default_groups[2] = NULL; rwlock_init(&cluster->cl_nodes_lock); cluster->cl_node_ip_tree = RB_ROOT; cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; @@ -704,7 +700,6 @@ out: kfree(cluster); kfree(ns); o2hb_free_hb_set(o2hb_group); - kfree(defs); ret = ERR_PTR(-ENOMEM); } @@ -714,18 +709,11 @@ out: static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) { struct o2nm_cluster *cluster = to_o2nm_cluster(item); - int i; - struct config_item *killme; BUG_ON(o2nm_single_cluster != cluster); o2nm_single_cluster = NULL; - for (i = 0; cluster->cl_group.default_groups[i]; i++) { - killme = &cluster->cl_group.default_groups[i]->cg_item; - cluster->cl_group.default_groups[i] = NULL; - config_item_put(killme); - } - + configfs_remove_default_groups(&cluster->cl_group); config_item_put(item); } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ffecf89c8c1c..e1adf285fc31 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, mlog_errno(ret); goto out; } - mutex_lock(&dx_alloc_inode->i_mutex); + inode_lock(dx_alloc_inode); ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); if (ret) { @@ -4410,7 +4410,7 @@ out_unlock: ocfs2_inode_unlock(dx_alloc_inode, 1); out_mutex: - mutex_unlock(&dx_alloc_inode->i_mutex); + inode_unlock(dx_alloc_inode); brelse(dx_alloc_bh); out: iput(dx_alloc_inode); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 68c607e63ff6..004f2cbe8f71 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_DROPPING_REF 0x00000040 #define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 #define DLM_LOCK_RES_SETREF_INPROG 0x00002000 +#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000 /* max milliseconds to wait to sync up a network failure with a node death */ #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) @@ -451,6 +452,7 @@ enum { DLM_QUERY_REGION = 519, DLM_QUERY_NODEINFO = 520, DLM_BEGIN_EXIT_DOMAIN_MSG = 521, + DLM_DEREF_LOCKRES_DONE = 522, }; struct dlm_reco_node_data @@ -545,7 +547,7 @@ struct dlm_master_requery * }; * * from ../cluster/tcp.h - * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) + * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) * (roughly 4080 bytes) * and sizeof(dlm_migratable_lockres) = 112 bytes * and sizeof(dlm_migratable_lock) = 16 bytes @@ -586,7 +588,7 @@ struct dlm_migratable_lockres /* from above, 128 bytes * for some undetermined future use */ -#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ +#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \ DLM_MIG_LOCKRES_MAX_LEN) struct dlm_create_lock @@ -782,6 +784,20 @@ struct dlm_deref_lockres u8 name[O2NM_MAX_NAME_LEN]; }; +enum { + DLM_DEREF_RESPONSE_DONE = 0, + DLM_DEREF_RESPONSE_INPROG = 1, +}; + +struct dlm_deref_lockres_done { + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + static inline enum dlm_status __dlm_lockres_state_to_status(struct dlm_lock_resource *res) { @@ -789,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) status = DLM_RECOVERING; else if (res->state & DLM_LOCK_RES_MIGRATING) status = DLM_MIGRATING; @@ -968,6 +985,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, void dlm_assert_master_post_handler(int status, void *data, void *ret_data); int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); +int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data); int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, void **ret_data); int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, @@ -1009,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) { __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING| DLM_LOCK_RES_MIGRATING)); } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2ee7fe747cea..12e064b8be9a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); * - Message DLM_QUERY_NODEINFO added to allow online node removes * New in version 1.2: * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain + * New in version 1.3: + * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the + * refmap is cleared */ static const struct dlm_protocol_version dlm_protocol = { .pv_major = 1, - .pv_minor = 2, + .pv_minor = 3, }; #define DLM_DOMAIN_BACKOFF_MS 200 @@ -1396,7 +1399,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm, unsigned int map_size) { int status, tmpstat; - unsigned int node; + int node; if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))) { @@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) sizeof(struct dlm_exit_domain), dlm_begin_exit_domain_handler, dlm, NULL, &dlm->dlm_domain_handlers); + if (status) + goto bail; + status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key, + sizeof(struct dlm_deref_lockres_done), + dlm_deref_lockres_done_handler, + dlm, NULL, &dlm->dlm_domain_handlers); bail: if (status) dlm_unregister_domain_handlers(dlm); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9477d6e1de37..9aed6e202201 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) dlm_print_one_lock_resource(res); BUG(); } - return ret; + return ret ? ret : r; } int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, @@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, res->lockname.len, res->lockname.name, node); dlm_print_one_lock_resource(res); } - ret = 0; + ret = DLM_DEREF_RESPONSE_DONE; goto done; } @@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->work_lock); queue_work(dlm->dlm_worker, &dlm->dispatched_work); - return 0; + return DLM_DEREF_RESPONSE_INPROG; done: if (res) @@ -2375,6 +2375,122 @@ done: return ret; } +int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres_done *deref + = (struct dlm_deref_lockres_done *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF)); + if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purgelist\n", + dlm->name, res->lockname.len, res->lockname.name); + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } + + __dlm_unhash_lockres(dlm, res); + + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + + /* lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. + */ + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + dlm_lockres_put(res); + + spin_unlock(&dlm->spinlock); + +done: + dlm_put(dlm); + return ret; +} + +static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 node) +{ + struct dlm_deref_lockres_done deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key, + &deref, sizeof(deref), node, &r); + if (ret < 0) { + mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE " + " to node %u\n", dlm->name, namelen, + lockname, ret, node); + } else if (r < 0) { + /* ignore the error */ + mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", + dlm->name, namelen, lockname, node, r); + dlm_print_one_lock_resource(res); + } +} + static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) { struct dlm_ctxt *dlm; @@ -2395,6 +2511,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) } spin_unlock(&res->spinlock); + dlm_drop_lockres_ref_done(dlm, res, node); + if (cleared) { mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", dlm->name, res->lockname.len, res->lockname.name, node); @@ -2432,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, return 0; /* delay migration when the lockres is in RECOCERING state */ - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) return 0; if (res->owner != dlm->node_num) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index c5bdf02c213b..cd38488a10fc 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1403,12 +1403,24 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, * and RECOVERY flag changed when it completes. */ hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len, + res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len, hash); if (res) { /* this will get a ref on res */ /* mark it as recovering/migrating and hash it */ spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(0, "%s: node is attempting to migrate " + "lockres %.*s, but marked as dropping " + " ref!\n", dlm->name, + mres->lockname_len, mres->lockname); + ret = -EINVAL; + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + dlm_lockres_put(res); + goto leave; + } + if (mres->flags & DLM_MRES_RECOVERY) { res->state |= DLM_LOCK_RES_RECOVERING; } else { @@ -2163,6 +2175,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, bucket, hash_node) { + if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + if (!(res->state & DLM_LOCK_RES_RECOVERING)) continue; @@ -2300,6 +2319,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, res->lockname.len, res->lockname.name, freed, dead_node); __dlm_print_one_lock_resource(res); } + res->state |= DLM_LOCK_RES_RECOVERY_WAITING; dlm_lockres_clear_refmap_bit(dlm, res, dead_node); } else if (test_bit(dead_node, res->refmap)) { mlog(0, "%s:%.*s: dead node %u had a ref, but had " @@ -2367,6 +2387,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) break; } } + dlm_lockres_clear_refmap_bit(dlm, res, + dead_node); spin_unlock(&res->spinlock); continue; } @@ -2375,14 +2397,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_revalidate_lvb(dlm, res, dead_node); if (res->owner == dead_node) { if (res->state & DLM_LOCK_RES_DROPPING_REF) { - mlog(ML_NOTICE, "%s: res %.*s, Skip " - "recovery as it is being freed\n", - dlm->name, res->lockname.len, - res->lockname.name); - } else - dlm_move_lockres_to_recovery_list(dlm, - res); - + mlog(0, "%s:%.*s: owned by " + "dead node %u, this node was " + "dropping its ref when it died. " + "continue, dropping the flag.\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + } + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + dlm_move_lockres_to_recovery_list(dlm, + res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index c5f6c241ecd7..68d239ba0c63 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) return 0; - if (res->state & DLM_LOCK_RES_RECOVERING) + if (res->state & (DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_RECOVERY_WAITING)) return 0; /* Another node has this resource with this node as the master */ @@ -202,6 +203,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, dlm->purge_count--; } + if (!master && ret != 0) { + mlog(0, "%s: deref %.*s in progress or master goes down\n", + dlm->name, res->lockname.len, res->lockname.name); + spin_unlock(&res->spinlock); + return; + } + if (!__dlm_lockres_unused(res)) { mlog(ML_ERROR, "%s: res %.*s in use after deref\n", dlm->name, res->lockname.len, res->lockname.name); @@ -700,7 +708,8 @@ static int dlm_thread(void *data) * dirty for a short while. */ BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); if (res->state & (DLM_LOCK_RES_IN_PROGRESS | - DLM_LOCK_RES_RECOVERING)) { + DLM_LOCK_RES_RECOVERING | + DLM_LOCK_RES_RECOVERY_WAITING)) { /* move it to the tail and keep going */ res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index f92612e4b9d6..474e57f834e6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unsigned int gen; int noqueue_attempted = 0; int dlm_locked = 0; + int kick_dc = 0; if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { mlog_errno(-EINVAL); @@ -1524,7 +1525,12 @@ update_holders: unlock: lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + /* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ + kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (kick_dc) + ocfs2_wake_downconvert_thread(osb); out: /* * This is helping work around a lock inversion between the page lock diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d63127932509..7cb38fdca229 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1872,7 +1872,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes on other nodes @@ -1991,7 +1991,7 @@ out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -2299,7 +2299,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); relock: /* @@ -2435,7 +2435,7 @@ out: ocfs2_rw_unlock(inode, rw_level); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (written) ret = written; @@ -2547,7 +2547,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_SET: @@ -2585,7 +2585,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret) return ret; return offset; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 97a563bab9a8..36294446d960 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -630,10 +630,10 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); mlog_errno(status); goto bail; @@ -680,7 +680,7 @@ bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode_alloc_inode, 1); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); @@ -751,10 +751,10 @@ static int ocfs2_wipe_inode(struct inode *inode, /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); mlog_errno(status); goto bail; @@ -803,7 +803,7 @@ bail_unlock_dir: return status; ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 16b0bb482ea7..4506ec5ec2ea 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, unsigned oldflags; int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { @@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, bail_unlock: ocfs2_inode_unlock(inode, 1); bail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); brelse(bh); @@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) - mutex_lock(&inode_alloc->i_mutex); + inode_lock(inode_alloc); if (o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); @@ -317,7 +317,7 @@ bail: ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) - mutex_unlock(&inode_alloc->i_mutex); + inode_unlock(inode_alloc); brelse(bh); @@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = ocfs2_inode_lock(gb_inode, &bh, 0); @@ -604,7 +604,7 @@ bail: ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); iput(gb_inode); brelse(bh); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3772a2dbb980..61b833b721d8 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2088,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, return status; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); if (status < 0) { mlog_errno(status); @@ -2106,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, out_cluster: ocfs2_inode_unlock(orphan_dir_inode, 0); out: - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); return status; } @@ -2196,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, oi->ip_next_orphan = NULL; if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2235,7 +2235,7 @@ unlock_inode: unlock_rw: ocfs2_rw_unlock(inode, 1); unlock_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* clear dio flag in ocfs2_inode_info */ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index e9c99e35f5ea..7d62c43a2c3e 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -414,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -468,7 +468,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -506,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, goto bail; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_read_inode_block_full(inode, &alloc_bh, OCFS2_BH_IGNORE_CACHE); @@ -539,7 +539,7 @@ bail: brelse(alloc_bh); if (inode) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); } @@ -571,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -601,7 +601,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); brelse(main_bm_bh); @@ -643,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - mutex_lock(&local_alloc_inode->i_mutex); + inode_lock(local_alloc_inode); /* * We must double check state and allocator bits because @@ -709,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, status = 0; bail: if (status < 0 && local_alloc_inode) { - mutex_unlock(&local_alloc_inode->i_mutex); + inode_unlock(local_alloc_inode); iput(local_alloc_inode); } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9581d190f6e1..77ebc2bc1cca 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 124471d26a73..e3d05d9901a3 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; */ - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -338,7 +338,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock_mutex: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); @@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out; } - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { @@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out_unlock_gb_mutex; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -708,11 +708,11 @@ out_commit: brelse(gd_bh); out_unlock_tl_inode: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ocfs2_inode_unlock(gb_inode, 1); out_unlock_gb_mutex: - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); @@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes from other nodes @@ -969,7 +969,7 @@ out_inode_unlock: out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ab42c38031b1..6b3e87189a64 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1045,7 +1045,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -1664,7 +1664,7 @@ bail: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2121,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, return ret; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (ret < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(ret); @@ -2226,7 +2226,7 @@ out: if (ret) { ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); } @@ -2495,7 +2495,7 @@ out: ocfs2_free_alloc_context(inode_ac); /* Unroll orphan dir locking */ - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); ocfs2_inode_unlock(orphan_dir, 1); iput(orphan_dir); } @@ -2602,7 +2602,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2689,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); ocfs2_free_dir_lookup_result(&orphan_insert); @@ -2721,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, goto bail; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(status); goto bail; @@ -2770,7 +2770,7 @@ bail_commit: bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); iput(orphan_dir_inode); @@ -2834,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { mlog_errno(status); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); goto leave; } @@ -2901,7 +2901,7 @@ out_commit: ocfs2_commit_trans(osb, handle); orphan_unlock: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); leave: diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index fde9ef18cff3..9c9dd30bc945 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) WARN_ON(bh != oinfo->dqi_gqi_bh); spin_unlock(&dq_data_lock); if (ex) { - mutex_lock(&oinfo->dqi_gqinode->i_mutex); + inode_lock(oinfo->dqi_gqinode); down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } else { down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); @@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) { if (ex) { up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); - mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + inode_unlock(oinfo->dqi_gqinode); } else { up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 252119860e6c..3eff031aaf26 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); if (ret) { @@ -867,7 +867,7 @@ out_unlock: } out_mutex: if (alloc_inode) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); } out: @@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out; } - mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(new_inode, I_MUTEX_CHILD); ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, OI_LS_REFLINK_TARGET); if (ret) { @@ -4231,7 +4231,7 @@ inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); out_unlock: - mutex_unlock(&new_inode->i_mutex); + inode_unlock(new_inode); out: if (!ret) { ret = filemap_fdatawait(inode->i_mapping); @@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, return error; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = dquot_initialize(dir); if (!error) error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_create(dir, new_dentry); return error; diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 79b8021302b3..576b9a04873f 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -301,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -375,7 +375,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -486,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -590,7 +590,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index fc6d25f6d444..2f19aeec5482 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) if (ac->ac_which != OCFS2_AC_USE_LOCAL) ocfs2_inode_unlock(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); ac->ac_inode = NULL; @@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, return -EINVAL; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); status = ocfs2_inode_lock(alloc_inode, &bh, 1); if (status < 0) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); mlog_errno(status); @@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); @@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); ocfs2_inode_unlock(inode_alloc_inode, 0); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); brelse(alloc_bh); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index faa1365097bc..302854ee0985 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -236,6 +236,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) struct ocfs2_recovery_map *rm = osb->recovery_map; struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; int i, out = 0; + unsigned long flags; out += snprintf(buf + out, len - out, "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", @@ -271,14 +272,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) cconn->cc_version.pv_minor); } - spin_lock(&osb->dc_task_lock); + spin_lock_irqsave(&osb->dc_task_lock, flags); out += snprintf(buf + out, len - out, "%10s => Pid: %d Count: %lu WakeSeq: %lu " "WorkSeq: %lu\n", "DownCnvt", (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), osb->blocked_lock_count, osb->dc_wake_sequence, osb->dc_work_sequence); - spin_unlock(&osb->dc_task_lock); + spin_unlock_irqrestore(&osb->dc_task_lock, flags); spin_lock(&osb->osb_lock); out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f0e241ffd94f..7d3d979f57d9 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2524,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, mlog_errno(ret); goto out; } - mutex_lock(&xb_alloc_inode->i_mutex); + inode_lock(xb_alloc_inode); ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); if (ret < 0) { @@ -2549,7 +2549,7 @@ out_unlock: ocfs2_inode_unlock(xb_alloc_inode, 1); brelse(xb_alloc_bh); out_mutex: - mutex_unlock(&xb_alloc_inode->i_mutex); + inode_unlock(xb_alloc_inode); iput(xb_alloc_inode); out: brelse(blk_bh); @@ -3619,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode, } } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); if (ret < 0) { - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); mlog_errno(ret); goto cleanup; } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, &xbs, &ctxt, ref_meta, &credits); @@ -5460,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, return ret; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5504,7 +5504,7 @@ out_commit: out: ocfs2_schedule_truncate_log_flush(osb, 1); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (meta_ac) ocfs2_free_alloc_context(meta_ac); diff --git a/fs/open.c b/fs/open.c index b25b1542c530..55bdc75e2172 100644 --- a/fs/open.c +++ b/fs/open.c @@ -58,10 +58,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(dentry, &newattrs, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ret; } @@ -510,7 +510,7 @@ static int chmod_common(struct path *path, umode_t mode) if (error) return error; retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; @@ -518,7 +518,7 @@ retry_deleg: newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(path->dentry, &newattrs, &delegated_inode); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) @@ -593,11 +593,11 @@ retry_deleg: if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chown(path, uid, gid); if (!error) error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index eff6319d5037..d894e7cd9a86 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -248,9 +248,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (err) goto out_cleanup; - mutex_lock(&newdentry->d_inode->i_mutex); + inode_lock(newdentry->d_inode); err = ovl_set_attr(newdentry, stat); - mutex_unlock(&newdentry->d_inode->i_mutex); + inode_unlock(newdentry->d_inode); if (err) goto out_cleanup; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 692ceda3bc21..52f6de5d40a9 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -167,7 +167,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; - mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); err = PTR_ERR(newdentry); @@ -185,7 +185,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, out_dput: dput(newdentry); out_unlock: - mutex_unlock(&udir->i_mutex); + inode_unlock(udir); return err; } @@ -258,9 +258,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - mutex_lock(&opaquedir->d_inode->i_mutex); + inode_lock(opaquedir->d_inode); err = ovl_set_attr(opaquedir, &stat); - mutex_unlock(&opaquedir->d_inode->i_mutex); + inode_unlock(opaquedir->d_inode); if (err) goto out_cleanup; @@ -599,7 +599,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *upper = ovl_dentry_upper(dentry); int err; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); err = -ESTALE; if (upper->d_parent == upperdir) { /* Don't let d_delete() think it can reset d_inode */ @@ -618,8 +618,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) * sole user of this dentry. Too tricky... Just unhash for * now. */ - d_drop(dentry); - mutex_unlock(&dir->i_mutex); + if (!err) + d_drop(dentry); + inode_unlock(dir); return err; } @@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (!overwrite && new_is_dir && !old_opaque && new_opaque) ovl_remove_opaque(newdentry); + /* + * Old dentry now lives in different location. Dentries in + * lowerstack are stale. We cannot drop them here because + * access to them is lockless. This could be only pure upper + * or opaque directory - numlower is zero. Or upper non-dir + * entry - its pureness is tracked by flag opaque. + */ if (old_opaque != new_opaque) { ovl_dentry_set_opaque(old, new_opaque); if (!overwrite) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index bf996e574f3d..a4ff5d0d7db9 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -63,9 +63,11 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) { upperdentry = ovl_dentry_upper(dentry); - mutex_lock(&upperdentry->d_inode->i_mutex); + inode_lock(upperdentry->d_inode); err = notify_change(upperdentry, attr, NULL); - mutex_unlock(&upperdentry->d_inode->i_mutex); + if (!err) + ovl_copyattr(upperdentry->d_inode, dentry->d_inode); + inode_unlock(upperdentry->d_inode); } ovl_drop_write(dentry); out: diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index adcb1398c481..fdaf28f75e12 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -228,7 +228,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) dput(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); } revert_creds(old_cred); put_cred(override_cred); @@ -399,7 +399,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) loff_t res; struct ovl_dir_file *od = file->private_data; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); if (!file->f_pos) ovl_dir_reset(file); @@ -429,7 +429,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) res = offset; } out_unlock: - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return res; } @@ -454,10 +454,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); smp_mb__before_spinlock(); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return PTR_ERR(realfile); } od->upperfile = realfile; @@ -467,7 +467,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, fput(realfile); realfile = od->upperfile; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } } @@ -479,9 +479,9 @@ static int ovl_dir_release(struct inode *inode, struct file *file) struct ovl_dir_file *od = file->private_data; if (od->cache) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ovl_cache_put(od, file->f_path.dentry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } fput(od->realfile); if (od->upperfile) @@ -557,7 +557,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; - mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; @@ -575,5 +575,5 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) ovl_cleanup(upper->d_inode, dentry); dput(dentry); } - mutex_unlock(&upper->d_inode->i_mutex); + inode_unlock(upper->d_inode); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d250604f985a..619ad4b016d2 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) if (oe->__upperdentry) { type = __OVL_PATH_UPPER; - if (oe->numlower) { - if (S_ISDIR(dentry->d_inode->i_mode)) - type |= __OVL_PATH_MERGE; - } else if (!oe->opaque) { + /* + * Non-dir dentry can hold lower dentry from previous + * location. Its purity depends only on opaque flag. + */ + if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode)) + type |= __OVL_PATH_MERGE; + else if (!oe->opaque) type |= __OVL_PATH_PURE; - } } else { if (oe->numlower > 1) type |= __OVL_PATH_MERGE; @@ -229,7 +231,7 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); WARN_ON(oe->__upperdentry); BUG_ON(!upperdentry->d_inode); /* @@ -244,7 +246,7 @@ void ovl_dentry_version_inc(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); oe->version++; } @@ -252,7 +254,7 @@ u64 ovl_dentry_version_get(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); return oe->version; } @@ -341,6 +343,7 @@ static const struct dentry_operations ovl_dentry_operations = { static const struct dentry_operations ovl_reval_dentry_operations = { .d_release = ovl_dentry_release, + .d_select_inode = ovl_d_select_inode, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, }; @@ -375,9 +378,9 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, { struct dentry *dentry; - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = lookup_one_len(name->name, dir, name->len); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -744,7 +747,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt, if (err) return ERR_PTR(err); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); retry: work = lookup_one_len(OVL_WORKDIR_NAME, dentry, strlen(OVL_WORKDIR_NAME)); @@ -770,7 +773,7 @@ retry: goto out_dput; } out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); mnt_drop_write(mnt); return work; diff --git a/fs/pipe.c b/fs/pipe.c index 42cf8ddf0e55..ab8dad3ccb6a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576; */ unsigned int pipe_min_size = PAGE_SIZE; +/* Maximum allocatable pages per user. Hard limit is unset by default, soft + * matches default values. + */ +unsigned long pipe_user_pages_hard; +unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } +static void account_pipe_buffers(struct pipe_inode_info *pipe, + unsigned long old, unsigned long new) +{ + atomic_long_add(new - old, &pipe->user->pipe_bufs); +} + +static bool too_many_pipe_buffers_soft(struct user_struct *user) +{ + return pipe_user_pages_soft && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; +} + +static bool too_many_pipe_buffers_hard(struct user_struct *user) +{ + return pipe_user_pages_hard && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; +} + struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); + + if (!too_many_pipe_buffers_hard(user)) { + if (too_many_pipe_buffers_soft(user)) + pipe_bufs = 1; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + } + if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = PIPE_DEF_BUFFERS; + pipe->buffers = pipe_bufs; + pipe->user = user; + account_pipe_buffers(pipe, 0, pipe_bufs); mutex_init(&pipe->mutex); return pipe; } + free_uid(user); kfree(pipe); } @@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; + account_pipe_buffers(pipe, pipe->buffers, 0); + free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) @@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } + account_pipe_buffers(pipe, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; @@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; + } else if ((too_many_pipe_buffers_hard(pipe->user) || + too_many_pipe_buffers_soft(pipe->user)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; } ret = pipe_set_size(pipe, nr_pages); break; diff --git a/fs/pnode.c b/fs/pnode.c index 6367e1e435c6..c524fdddc7fb 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -202,6 +202,11 @@ static struct mount *last_dest, *last_source, *dest_master; static struct mountpoint *mp; static struct hlist_head *list; +static inline bool peers(struct mount *m1, struct mount *m2) +{ + return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; +} + static int propagate_one(struct mount *m) { struct mount *child; @@ -212,7 +217,7 @@ static int propagate_one(struct mount *m) /* skip if mountpoint isn't covered by it */ if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) return 0; - if (m->mnt_group_id == last_dest->mnt_group_id) { + if (peers(m, last_dest)) { type = CL_MAKE_SHARED; } else { struct mount *n, *p; @@ -223,7 +228,7 @@ static int propagate_one(struct mount *m) last_source = last_source->mnt_master; last_dest = last_source->mnt_parent; } - if (n->mnt_group_id != last_dest->mnt_group_id) { + if (!peers(n, last_dest)) { last_source = last_source->mnt_master; last_dest = last_source->mnt_parent; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 4f764c2ac1a5..b1755b23893e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -434,7 +434,7 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); else - seq_putc(m, '0'); + seq_puts(m, "0\n"); return 0; } @@ -2158,6 +2158,7 @@ static const struct file_operations proc_map_files_operations = { .llseek = default_llseek, }; +#ifdef CONFIG_CHECKPOINT_RESTORE struct timers_private { struct pid *pid; struct task_struct *task; @@ -2256,6 +2257,73 @@ static const struct file_operations proc_timers_operations = { .llseek = seq_lseek, .release = seq_release_private, }; +#endif + +static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + u64 slack_ns; + int err; + + err = kstrtoull_from_user(buf, count, 10, &slack_ns); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + } else + count = -EPERM; + + put_task_struct(p); + + return count; +} + +static int timerslack_ns_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + int err = 0; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + } else + err = -EPERM; + + put_task_struct(p); + + return err; +} + +static int timerslack_ns_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timerslack_ns_show, inode); +} + +static const struct file_operations proc_pid_set_timerslack_ns_operations = { + .open = timerslack_ns_open, + .read = seq_read, + .write = timerslack_ns_write, + .llseek = seq_lseek, + .release = single_release, +}; static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) @@ -2831,6 +2899,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_CHECKPOINT_RESTORE REG("timers", S_IRUGO, proc_timers_operations), #endif + REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 92e6726f6e37..a939f5ed7f89 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -552,9 +552,9 @@ static int open_kcore(struct inode *inode, struct file *filp) if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, proc_root_kcore->size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index df4661abadc4..83720460c5bc 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -29,10 +29,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long committed; long cached; long available; - unsigned long pagecache; - unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; - struct zone *zone; int lru; /* @@ -51,33 +48,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_page_state(NR_LRU_BASE + lru); - for_each_zone(zone) - wmark_low += zone->watermark[WMARK_LOW]; - - /* - * Estimate the amount of memory available for userspace allocations, - * without causing swapping. - */ - available = i.freeram - totalreserve_pages; - - /* - * Not all the page cache can be freed, otherwise the system will - * start swapping. Assume at least half of the page cache, or the - * low watermark worth of cache, needs to stay. - */ - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; - pagecache -= min(pagecache / 2, wmark_low); - available += pagecache; - - /* - * Part of the reclaimable slab consists of items that are in use, - * and cannot be freed. Cap this estimate at the low watermark. - */ - available += global_page_state(NR_SLAB_RECLAIMABLE) - - min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); - - if (available < 0) - available = 0; + available = si_mem_available(); /* * Tagged format, for easy grepping and expansion. diff --git a/fs/proc/page.c b/fs/proc/page.c index b2855eea5405..712f1b9992cc 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page) * pseudo flags for the well known (anonymous) memory mapped pages * * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the - * simple test in page_mapcount() is not enough. + * simple test in page_mapped() is not enough. */ - if (!PageSlab(page) && page_mapcount(page)) + if (!PageSlab(page) && page_mapped(page)) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; @@ -148,6 +148,8 @@ u64 stable_page_flags(struct page *page) */ if (PageBuddy(page)) u |= 1 << KPF_BUDDY; + else if (page_count(page) == 0 && is_free_buddy_page(page)) + u |= 1 << KPF_BUDDY; if (PageBalloon(page)) u |= 1 << KPF_BALLOON; @@ -158,6 +160,8 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); + if (PageTail(page) && PageSlab(compound_head(page))) + u |= 1 << KPF_SLAB; u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); diff --git a/fs/proc/self.c b/fs/proc/self.c index 67e8db442cf0..b6a8d3529fea 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -50,7 +50,7 @@ int proc_setup_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode_pseudo(s); @@ -69,7 +69,7 @@ int proc_setup_self(struct super_block *s) } else { self = ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(self)) { pr_err("proc_fill_super: can't allocate /proc/self\n"); return PTR_ERR(self); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 71ffc91060f6..9df431642042 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -259,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. + */ +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; + int stack = 0; - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + if (is_pid) { + stack = vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } static void @@ -335,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) name = arch_vma_name(vma); if (!name) { - pid_t tid; - if (!mm) { name = "[vdso]"; goto done; @@ -348,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - tid = pid_of_stack(priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) { - name = "[stack]"; - } else { - /* Thread stack in /proc/PID/maps */ - seq_pad(m, ' '); - seq_printf(m, "[stack:%d]", tid); - } - } + if (is_stack(priv, vma, is_pid)) + name = "[stack]"; } done: @@ -602,7 +593,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; @@ -668,11 +660,20 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + /* These come out via ProtectionKey: */ + [ilog2(VM_PKEY_BIT0)] = "", + [ilog2(VM_PKEY_BIT1)] = "", + [ilog2(VM_PKEY_BIT2)] = "", + [ilog2(VM_PKEY_BIT3)] = "", +#endif }; size_t i; seq_puts(m, "VmFlags: "); for (i = 0; i < BITS_PER_LONG; i++) { + if (!mnemonics[i][0]) + continue; if (vma->vm_flags & (1UL << i)) { seq_printf(m, "%c%c ", mnemonics[i][0], mnemonics[i][1]); @@ -710,6 +711,10 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, } #endif /* HUGETLB_PAGE */ +void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) +{ +} + static int show_smap(struct seq_file *m, void *v, int is_pid) { struct vm_area_struct *vma = v; @@ -791,6 +796,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) (vma->vm_flags & VM_LOCKED) ? (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); + arch_show_smap(m, vma); show_smap_vma_flags(m, vma); m_cache_vma(m, vma); return 0; @@ -913,7 +919,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; @@ -1187,7 +1194,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge_lock(pmdp, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmdp, vma); + if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; @@ -1519,7 +1527,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; @@ -1548,18 +1557,19 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { + pte_t huge_pte = huge_ptep_get(pte); struct numa_maps *md; struct page *page; - if (!pte_present(*pte)) + if (!pte_present(huge_pte)) return 0; - page = pte_page(*pte); + page = pte_page(huge_pte); if (!page) return 0; md = walk->private; - gather_stats(page, md, pte_dirty(*pte), 1); + gather_stats(page, md, pte_dirty(huge_pte), 1); return 0; } @@ -1613,19 +1623,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else { - pid_t tid = pid_of_stack(proc_priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_puts(m, " stack"); - else - seq_printf(m, " stack:%d", tid); - } + } else if (is_stack(proc_priv, vma, is_pid)) { + seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index e0d64c92e4f6..faacb0c0d857 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + struct mm_struct *mm = vma->vm_mm; + int stack = 0; + + if (is_pid) { + stack = vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } /* @@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm) { - pid_t tid = pid_of_stack(priv, vma, is_pid); - - if (tid != 0) { - seq_pad(m, ' '); - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_printf(m, "[stack]"); - else - seq_printf(m, "[stack:%d]", tid); - } + } else if (mm && is_stack(priv, vma, is_pid)) { + seq_pad(m, ' '); + seq_printf(m, "[stack]"); } seq_putc(m, '\n'); diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 9eacd59e0360..e58a31e8fb2a 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -52,7 +52,7 @@ int proc_setup_thread_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *thread_self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); if (thread_self) { struct inode *inode = new_inode_pseudo(s); @@ -71,7 +71,7 @@ int proc_setup_thread_self(struct super_block *s) } else { thread_self = ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(thread_self)) { pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); return PTR_ERR(thread_self); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 4e61388ec03d..55bb57e6a30d 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -231,7 +231,9 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, list_for_each_entry(m, &vmcore_list, list) { if (*fpos < m->offset + m->size) { - tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); + tsz = (size_t)min_t(unsigned long long, + m->offset + m->size - *fpos, + buflen); start = m->paddr + *fpos - m->offset; tmp = read_from_oldmem(buffer, tsz, &start, userbuf); if (tmp < 0) @@ -461,7 +463,8 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (start < m->offset + m->size) { u64 paddr = 0; - tsz = min_t(size_t, m->offset + m->size - start, size); + tsz = (size_t)min_t(unsigned long long, + m->offset + m->size - start, size); paddr = m->paddr + start - m->offset; if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len, paddr >> PAGE_SHIFT, tsz, diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 2256e7e23e67..3f1190d18991 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -199,6 +199,8 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) if (sb->s_op->show_devname) { seq_puts(m, "device "); err = sb->s_op->show_devname(m, mnt_path.dentry); + if (err) + goto out; } else { if (r->mnt_devname) { seq_puts(m, "device "); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d8c439d813ce..dc645b66cd79 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -377,7 +377,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, break; } - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, name); if (!dentry) @@ -397,12 +397,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return 0; fail_lockedalloc: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); kfree(private); fail_alloc: iput(inode); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 319c3a60cfa5..bd9812e83461 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -55,8 +55,8 @@ static ulong ramoops_pmsg_size = MIN_MEM_SIZE; module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400); MODULE_PARM_DESC(pmsg_size, "size of user space message log"); -static ulong mem_address; -module_param(mem_address, ulong, 0400); +static unsigned long long mem_address; +module_param(mem_address, ullong, 0400); MODULE_PARM_DESC(mem_address, "start of reserved RAM used to store oops/panic logs"); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index fbd70af98820..04ca0cc6d065 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -682,9 +682,9 @@ int dquot_quota_sync(struct super_block *sb, int type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - mutex_lock(&dqopt->files[cnt]->i_mutex); + inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); - mutex_unlock(&dqopt->files[cnt]->i_mutex); + inode_unlock(dqopt->files[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2162,12 +2162,12 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_loaded(sb, cnt)) { - mutex_lock(&toputinode[cnt]->i_mutex); + inode_lock(toputinode[cnt]); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, 0); - mutex_unlock(&toputinode[cnt]->i_mutex); + inode_unlock(toputinode[cnt]); mark_inode_dirty_sync(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2258,11 +2258,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more * references can be added @@ -2305,12 +2305,12 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: @@ -2430,9 +2430,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - mutex_lock(&d_inode(sb->s_root)->i_mutex); - dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); - mutex_unlock(&d_inode(sb->s_root)->i_mutex); + dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name)); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/read_write.c b/fs/read_write.c index 06b07d5a08fe..cf377cf9dfe3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -17,6 +17,7 @@ #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> +#include <linux/fs.h> #include "internal.h" #include <asm/uaccess.h> @@ -183,7 +184,7 @@ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, - ~0ULL, 0); + OFFSET_MAX, 0); default: return -EINVAL; } @@ -238,7 +239,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); @@ -283,7 +284,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) retval = offset; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); @@ -692,12 +693,17 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) EXPORT_SYMBOL(iov_shorten); static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, iter_fn_t fn) + loff_t *ppos, iter_fn_t fn, int flags) { struct kiocb kiocb; ssize_t ret; + if (flags & ~RWF_HIPRI) + return -EOPNOTSUPP; + init_sync_kiocb(&kiocb, filp); + if (flags & RWF_HIPRI) + kiocb.ki_flags |= IOCB_HIPRI; kiocb.ki_pos = *ppos; ret = fn(&kiocb, iter); @@ -708,10 +714,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, io_fn_t fn) + loff_t *ppos, io_fn_t fn, int flags) { ssize_t ret = 0; + if (flags & ~RWF_HIPRI) + return -EOPNOTSUPP; + while (iov_iter_count(iter)) { struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; @@ -812,7 +821,8 @@ out: static ssize_t do_readv_writev(int type, struct file *file, const struct iovec __user * uvector, - unsigned long nr_segs, loff_t *pos) + unsigned long nr_segs, loff_t *pos, + int flags) { size_t tot_len; struct iovec iovstack[UIO_FASTIOV]; @@ -844,9 +854,9 @@ static ssize_t do_readv_writev(int type, struct file *file, } if (iter_fn) - ret = do_iter_readv_writev(file, &iter, pos, iter_fn); + ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags); else - ret = do_loop_readv_writev(file, &iter, pos, fn); + ret = do_loop_readv_writev(file, &iter, pos, fn, flags); if (type != READ) file_end_write(file); @@ -863,40 +873,40 @@ out: } ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos) + unsigned long vlen, loff_t *pos, int flags) { if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; - return do_readv_writev(READ, file, vec, vlen, pos); + return do_readv_writev(READ, file, vec, vlen, pos, flags); } EXPORT_SYMBOL(vfs_readv); ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos) + unsigned long vlen, loff_t *pos, int flags) { if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; - return do_readv_writev(WRITE, file, vec, vlen, pos); + return do_readv_writev(WRITE, file, vec, vlen, pos, flags); } EXPORT_SYMBOL(vfs_writev); -SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen) +static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, int flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos = file_pos_read(f.file); - ret = vfs_readv(f.file, vec, vlen, &pos); + ret = vfs_readv(f.file, vec, vlen, &pos, flags); if (ret >= 0) file_pos_write(f.file, pos); fdput_pos(f); @@ -908,15 +918,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, return ret; } -SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen) +static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, int flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos = file_pos_read(f.file); - ret = vfs_writev(f.file, vec, vlen, &pos); + ret = vfs_writev(f.file, vec, vlen, &pos, flags); if (ret >= 0) file_pos_write(f.file, pos); fdput_pos(f); @@ -934,10 +944,9 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } -SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, loff_t pos, int flags) { - loff_t pos = pos_from_hilo(pos_h, pos_l); struct fd f; ssize_t ret = -EBADF; @@ -948,7 +957,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) - ret = vfs_readv(f.file, vec, vlen, &pos); + ret = vfs_readv(f.file, vec, vlen, &pos, flags); fdput(f); } @@ -958,10 +967,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, return ret; } -SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, - unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, + unsigned long vlen, loff_t pos, int flags) { - loff_t pos = pos_from_hilo(pos_h, pos_l); struct fd f; ssize_t ret = -EBADF; @@ -972,7 +980,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) - ret = vfs_writev(f.file, vec, vlen, &pos); + ret = vfs_writev(f.file, vec, vlen, &pos, flags); fdput(f); } @@ -982,11 +990,64 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, return ret; } +SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen) +{ + return do_readv(fd, vec, vlen, 0); +} + +SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen) +{ + return do_writev(fd, vec, vlen, 0); +} + +SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + + return do_preadv(fd, vec, vlen, pos, 0); +} + +SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, + int, flags) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + + if (pos == -1) + return do_readv(fd, vec, vlen, flags); + + return do_preadv(fd, vec, vlen, pos, flags); +} + +SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + + return do_pwritev(fd, vec, vlen, pos, 0); +} + +SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, + unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, + int, flags) +{ + loff_t pos = pos_from_hilo(pos_h, pos_l); + + if (pos == -1) + return do_writev(fd, vec, vlen, flags); + + return do_pwritev(fd, vec, vlen, pos, flags); +} + #ifdef CONFIG_COMPAT static ssize_t compat_do_readv_writev(int type, struct file *file, const struct compat_iovec __user *uvector, - unsigned long nr_segs, loff_t *pos) + unsigned long nr_segs, loff_t *pos, + int flags) { compat_ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; @@ -1018,9 +1079,9 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, } if (iter_fn) - ret = do_iter_readv_writev(file, &iter, pos, iter_fn); + ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags); else - ret = do_loop_readv_writev(file, &iter, pos, fn); + ret = do_loop_readv_writev(file, &iter, pos, fn, flags); if (type != READ) file_end_write(file); @@ -1038,7 +1099,7 @@ out: static size_t compat_readv(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos) + unsigned long vlen, loff_t *pos, int flags) { ssize_t ret = -EBADF; @@ -1049,7 +1110,7 @@ static size_t compat_readv(struct file *file, if (!(file->f_mode & FMODE_CAN_READ)) goto out; - ret = compat_do_readv_writev(READ, file, vec, vlen, pos); + ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags); out: if (ret > 0) @@ -1058,9 +1119,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, - const struct compat_iovec __user *,vec, - compat_ulong_t, vlen) +static size_t do_compat_readv(compat_ulong_t fd, + const struct compat_iovec __user *vec, + compat_ulong_t vlen, int flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1069,16 +1130,24 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, if (!f.file) return -EBADF; pos = f.file->f_pos; - ret = compat_readv(f.file, vec, vlen, &pos); + ret = compat_readv(f.file, vec, vlen, &pos, flags); if (ret >= 0) f.file->f_pos = pos; fdput_pos(f); return ret; + +} + +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, + const struct compat_iovec __user *,vec, + compat_ulong_t, vlen) +{ + return do_compat_readv(fd, vec, vlen, 0); } -static long __compat_sys_preadv64(unsigned long fd, +static long do_compat_preadv64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos) + unsigned long vlen, loff_t pos, int flags) { struct fd f; ssize_t ret; @@ -1090,7 +1159,7 @@ static long __compat_sys_preadv64(unsigned long fd, return -EBADF; ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) - ret = compat_readv(f.file, vec, vlen, &pos); + ret = compat_readv(f.file, vec, vlen, &pos, flags); fdput(f); return ret; } @@ -1100,7 +1169,7 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos) { - return __compat_sys_preadv64(fd, vec, vlen, pos); + return do_compat_preadv64(fd, vec, vlen, pos, 0); } #endif @@ -1110,12 +1179,25 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return __compat_sys_preadv64(fd, vec, vlen, pos); + return do_compat_preadv64(fd, vec, vlen, pos, 0); +} + +COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, + const struct compat_iovec __user *,vec, + compat_ulong_t, vlen, u32, pos_low, u32, pos_high, + int, flags) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + + if (pos == -1) + return do_compat_readv(fd, vec, vlen, flags); + + return do_compat_preadv64(fd, vec, vlen, pos, flags); } static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos) + unsigned long vlen, loff_t *pos, int flags) { ssize_t ret = -EBADF; @@ -1126,7 +1208,7 @@ static size_t compat_writev(struct file *file, if (!(file->f_mode & FMODE_CAN_WRITE)) goto out; - ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, 0); out: if (ret > 0) @@ -1135,9 +1217,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, - const struct compat_iovec __user *, vec, - compat_ulong_t, vlen) +static size_t do_compat_writev(compat_ulong_t fd, + const struct compat_iovec __user* vec, + compat_ulong_t vlen, int flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1146,16 +1228,23 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, if (!f.file) return -EBADF; pos = f.file->f_pos; - ret = compat_writev(f.file, vec, vlen, &pos); + ret = compat_writev(f.file, vec, vlen, &pos, flags); if (ret >= 0) f.file->f_pos = pos; fdput_pos(f); return ret; } -static long __compat_sys_pwritev64(unsigned long fd, +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, + const struct compat_iovec __user *, vec, + compat_ulong_t, vlen) +{ + return do_compat_writev(fd, vec, vlen, 0); +} + +static long do_compat_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos) + unsigned long vlen, loff_t pos, int flags) { struct fd f; ssize_t ret; @@ -1167,7 +1256,7 @@ static long __compat_sys_pwritev64(unsigned long fd, return -EBADF; ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) - ret = compat_writev(f.file, vec, vlen, &pos); + ret = compat_writev(f.file, vec, vlen, &pos, flags); fdput(f); return ret; } @@ -1177,7 +1266,7 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos) { - return __compat_sys_pwritev64(fd, vec, vlen, pos); + return do_compat_pwritev64(fd, vec, vlen, pos, 0); } #endif @@ -1187,8 +1276,21 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return __compat_sys_pwritev64(fd, vec, vlen, pos); + return do_compat_pwritev64(fd, vec, vlen, pos, 0); } + +COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, + const struct compat_iovec __user *,vec, + compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + + if (pos == -1) + return do_compat_writev(fd, vec, vlen, flags); + + return do_compat_pwritev64(fd, vec, vlen, pos, flags); +} + #endif static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, @@ -1532,10 +1634,12 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || - (file_out->f_flags & O_APPEND) || - !file_in->f_op->clone_file_range) + (file_out->f_flags & O_APPEND)) return -EBADF; + if (!file_in->f_op->clone_file_range) + return -EOPNOTSUPP; + ret = clone_verify_area(file_in, pos_in, len, false); if (ret) return ret; @@ -1656,6 +1760,9 @@ next_file: mnt_drop_write_file(dst_file); next_loop: fdput(dst_fd); + + if (fatal_signal_pending(current)) + goto out; } out: diff --git a/fs/readdir.c b/fs/readdir.c index ced679179cac..e69ef3b79787 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -44,7 +44,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) fsnotify_access(file); file_accessed(file); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return res; } diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 4a024e2ceb9f..3abd4004184b 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -38,11 +38,11 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); reiserfs_write_lock(inode->i_sb); err = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err < 0) return err; return 0; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 96a1bcf33db4..9424a4ba93a9 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -158,7 +158,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); BUG_ON(!S_ISREG(inode->i_mode)); err = sync_mapping_buffers(inode->i_mapping); reiserfs_write_lock(inode->i_sb); @@ -166,7 +166,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (barrier_done < 0) return barrier_done; return (err < 0) ? -EIO : 0; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 6ec8a30a0911..036a1fc0a8c3 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -224,7 +224,7 @@ out_unlock: page_cache_release(page); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); reiserfs_write_unlock(inode->i_sb); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 05db7473bcb5..c0306ec8ed7b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s) pathrelse(&path); inode = reiserfs_iget(s, &obj_key); - if (!inode) { + if (IS_ERR_OR_NULL(inode)) { /* * the unlink almost completed, it just did not * manage to remove "save" link and release objectid diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e5ddb4e5ea94..57e0b2310532 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -64,14 +64,14 @@ #ifdef CONFIG_REISERFS_FS_XATTR static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->create(dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->mkdir(dir, dentry, mode); } @@ -85,11 +85,11 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -100,13 +100,13 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) d_inode(dentry)->i_flags |= S_DEAD; - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -123,7 +123,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) if (d_really_is_negative(privroot)) return ERR_PTR(-ENODATA); - mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) @@ -139,7 +139,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) } } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); return xaroot; } @@ -156,7 +156,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR); xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { @@ -170,7 +170,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) } } - mutex_unlock(&d_inode(xaroot)->i_mutex); + inode_unlock(d_inode(xaroot)); dput(xaroot); return xadir; } @@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, container_of(ctx, struct reiserfs_dentry_buf, ctx); struct dentry *dentry; - WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir))); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; @@ -254,7 +254,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, goto out_dir; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); buf.xadir = dir; while (1) { @@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, break; buf.count = 0; } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); cleanup_dentry_buf(&buf); @@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (!err) { int jerror; - mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, + inode_lock_nested(d_inode(dir->d_parent), I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&d_inode(dir->d_parent)->i_mutex); + inode_unlock(d_inode(dir->d_parent)); err = jerror ?: err; } } @@ -384,7 +384,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (IS_ERR(xadir)) return ERR_CAST(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); xafile = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(xafile)) { err = PTR_ERR(xafile); @@ -404,7 +404,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (err) dput(xafile); out: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); if (err) return ERR_PTR(err); @@ -469,7 +469,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) if (IS_ERR(xadir)) return PTR_ERR(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); dentry = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -483,7 +483,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) dput(dentry); out_dput: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); return err; } @@ -580,11 +580,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); err = reiserfs_setattr(dentry, &newattrs); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); } else update_ctime(inode); out_unlock: @@ -888,9 +888,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) goto out; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (!err) err = buf.pos; @@ -905,7 +905,7 @@ static int create_privroot(struct dentry *dentry) int err; struct inode *inode = d_inode(dentry->d_parent); - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); err = xattr_mkdir(inode, dentry, 0700); if (err || d_really_is_negative(dentry)) { @@ -995,7 +995,7 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&d_inode(s->s_root)->i_mutex); + inode_lock(d_inode(s->s_root)); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { @@ -1005,7 +1005,7 @@ int reiserfs_lookup_privroot(struct super_block *s) d_inode(dentry)->i_flags |= S_PRIVATE; } else err = PTR_ERR(dentry); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); return err; } @@ -1025,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) goto error; if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { - mutex_lock(&d_inode(s->s_root)->i_mutex); + inode_lock(d_inode(s->s_root)); err = create_privroot(REISERFS_SB(s)->priv_root); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); } if (d_really_is_positive(privroot)) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&d_inode(privroot)->i_mutex); + inode_lock(d_inode(privroot)); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; @@ -1043,7 +1043,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) else err = PTR_ERR(dentry); } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); } error: diff --git a/fs/select.c b/fs/select.c index 79d0d4953cad..869293988c2a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -long select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec *tv) { - unsigned long ret; + u64 ret; struct timespec now; /* @@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; @@ -784,7 +784,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; diff --git a/fs/splice.c b/fs/splice.c index 82bc0d64fc38..9947b5c69664 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int spd_pages = spd->nr_pages; int ret, do_wakeup, page_nr; + if (!spd_pages) + return 0; + ret = 0; do_wakeup = 0; page_nr = 0; @@ -577,7 +580,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec, old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0); set_fs(old_fs); return res; diff --git a/fs/super.c b/fs/super.c index 1182af8fd5ff..74914b1bae70 100644 --- a/fs/super.c +++ b/fs/super.c @@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb) sb->s_flags &= ~MS_ACTIVE; fsnotify_unmount_inodes(sb); + cgroup_writeback_umount(); evict_inodes(sb); diff --git a/fs/timerfd.c b/fs/timerfd.c index b94fa6c3c6eb..053818dd6c18 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -153,7 +153,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) if (isalarm(ctx)) remaining = alarm_expires_remaining(&ctx->t.alarm); else - remaining = hrtimer_expires_remaining(&ctx->t.tmr); + remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr); return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index c66f2423e1f5..4a0e48f92104 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -84,9 +84,9 @@ static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umo * the files within the tracefs system. It is up to the individual * mkdir routine to handle races. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = tracefs_ops.mkdir(name); - mutex_lock(&inode->i_mutex); + inode_lock(inode); kfree(name); @@ -109,13 +109,13 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) * This time we need to unlock not only the parent (inode) but * also the directory that is being deleted. */ - mutex_unlock(&inode->i_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(inode); + inode_unlock(dentry->d_inode); ret = tracefs_ops.rmdir(name); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock_nested(inode, I_MUTEX_PARENT); + inode_lock(dentry->d_inode); kfree(name); @@ -334,7 +334,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = tracefs_mount->mnt_root; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); @@ -342,7 +342,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); dput(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; @@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); return dentry; } @@ -544,9 +544,9 @@ void tracefs_remove(struct dentry *dentry) if (!parent || !parent->d_inode) return; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); ret = __tracefs_remove(dentry, parent); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (!ret) simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -572,7 +572,7 @@ void tracefs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); loop: /* * The parent->d_subdirs is protected by the d_lock. Outside that @@ -587,7 +587,7 @@ void tracefs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); parent = child; goto down; } @@ -608,10 +608,10 @@ void tracefs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); child = parent; parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); if (child != dentry) /* go up */ @@ -619,7 +619,7 @@ void tracefs_remove_recursive(struct dentry *dentry) if (!__tracefs_remove(child, parent)) simple_release_fs(&tracefs_mount, &tracefs_mount_count); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); } /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e49bd2808bf3..795992a8321e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -515,8 +515,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) @@ -572,8 +572,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -661,8 +661,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry, inode->i_ino, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = check_dir_empty(c, d_inode(dentry)); if (err) return err; @@ -996,10 +996,10 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu", old_dentry, old_inode->i_ino, old_dir->i_ino, new_dentry, new_dir->i_ino); - ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); - ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + ubifs_assert(inode_is_locked(old_dir)); + ubifs_assert(inode_is_locked(new_dir)); if (unlink) - ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + ubifs_assert(inode_is_locked(new_inode)); if (unlink && is_dir) { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index eff62801acbf..065c88f8e4b8 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1317,7 +1317,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Synchronize the inode unless this is a 'datasync()' call. */ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { @@ -1332,7 +1332,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ err = ubifs_sync_wbufs_by_inode(c, inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e53292d0c21b..c7f4d434d098 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -313,7 +313,7 @@ static int setxattr(struct inode *host, const char *name, const void *value, union ubifs_key key; int err, type; - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; @@ -550,7 +550,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) dbg_gen("xattr '%s', ino %lu ('%pd')", name, host->i_ino, dentry); - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); err = check_namespace(&nm); if (err < 0) diff --git a/fs/udf/file.c b/fs/udf/file.c index bddf3d071dae..1af98963d860 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct udf_inode_info *iinfo = UDF_I(inode); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = generic_write_checks(iocb, from); if (retval <= 0) @@ -136,7 +136,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) (udf_file_entry_alloc_offset(inode) + end)) { err = udf_expand_file_adinicb(inode); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); udf_debug("udf_expand_adinicb: err=%d\n", err); return err; } @@ -149,7 +149,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) retval = __generic_file_write_iter(iocb, from); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval > 0) { mark_inode_dirty(inode); @@ -223,12 +223,12 @@ static int udf_release_file(struct inode *inode, struct file *filp) * Grab i_mutex to avoid races with writes changing i_size * while we are running. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 87dc16d15572..166d3ed32c39 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -262,7 +262,7 @@ int udf_expand_file_adinicb(struct inode *inode) .nr_to_write = 1, }; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; diff --git a/fs/udf/super.c b/fs/udf/super.c index 0fbb4c7c72e8..a522c15a0bfd 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -279,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) { int i; int nr_groups = bitmap->s_nr_groups; - int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * - nr_groups); for (i = 0; i < nr_groups; i++) if (bitmap->s_block_bitmap[i]) brelse(bitmap->s_block_bitmap[i]); - if (size <= PAGE_SIZE) - kfree(bitmap); - else - vfree(bitmap); + kvfree(bitmap); } static void udf_free_partition(struct udf_part_map *map) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 50311703135b..66cdb44616d5 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -287,6 +287,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, goto out; /* + * We don't do userfault handling for the final child pid update. + */ + if (current->flags & PF_EXITING) + goto out; + + /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY diff --git a/fs/utimes.c b/fs/utimes.c index aa138d64560a..85c40f4f373d 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -103,9 +103,9 @@ static int utimes_common(struct path *path, struct timespec *times) } } retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/xattr.c b/fs/xattr.c index d5dd6c8b82a7..4861322e28e8 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -129,7 +129,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_setxattr(dentry, name, value, size, flags); if (error) goto out; @@ -137,7 +137,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, error = __vfs_setxattr_noperm(dentry, name, value, size, flags); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); @@ -277,7 +277,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_removexattr(dentry, name); if (error) goto out; @@ -290,7 +290,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); @@ -940,7 +940,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, bool trusted = capable(CAP_SYS_ADMIN); struct simple_xattr *xattr; ssize_t remaining_size = size; - int err; + int err = 0; #ifdef CONFIG_FS_POSIX_ACL if (inode->i_acl) { @@ -965,11 +965,11 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) - return err; + break; } spin_unlock(&xattrs->lock); - return size - remaining_size; + return err ? err : size - remaining_size; } /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e2536bb1c760..dc97eb21af07 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -984,8 +984,6 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) /* * Values for di_flags - * There should be a one-to-one correspondence between these flags and the - * XFS_XFLAG_s. */ #define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ #define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ @@ -1026,6 +1024,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) /* + * Values for di_flags2 These start by being exposed to userspace in the upper + * 16 bits of the XFS_XFLAG_s range. + */ +#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ +#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) + +#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX) + +/* * Inode number format: * low inopblog bits - offset in block * next agblklog bits - block number in ag diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b2b73a998d42..fffe3d01bd9f 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -36,40 +36,6 @@ struct dioattr { #endif /* - * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. - */ -#ifndef HAVE_FSXATTR -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; -#endif - -/* - * Flags for the bs_xflags/fsx_xflags field - * There should be a one-to-one correspondence between these flags and the - * XFS_DIFLAG_s. - */ -#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ -#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ -#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ -#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ -#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ -#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ -#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ -#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ -#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ -#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ -#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ -#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ -#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ -#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ -#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ - -/* * Structure for XFS_IOC_GETBMAP. * On input, fill in bmv_offset and bmv_length of the first structure * to indicate the area of interest in the file, and bmv_entries with @@ -514,8 +480,8 @@ typedef struct xfs_swapext #define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) #define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) -#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) -#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb051..5c57b7b40728 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -55,7 +55,7 @@ xfs_count_page_state( } while ((bh = bh->b_this_page) != head); } -STATIC struct block_device * +struct block_device * xfs_find_bdev_for_inode( struct inode *inode) { @@ -1208,6 +1208,10 @@ xfs_vm_writepages( struct writeback_control *wbc) { xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + if (dax_mapping(mapping)) + return dax_writeback_mapping_range(mapping, + xfs_find_bdev_for_inode(mapping->host), wbc); + return generic_writepages(mapping, wbc); } @@ -1953,7 +1957,6 @@ xfs_vm_set_page_dirty( loff_t end_offset; loff_t offset; int newly_dirty; - struct mem_cgroup *memcg; if (unlikely(!mapping)) return !TestSetPageDirty(page); @@ -1974,10 +1977,10 @@ xfs_vm_set_page_dirty( } while (bh != head); } /* - * Use mem_group_begin_page_stat() to keep PageDirty synchronized with - * per-memcg dirty page counters. + * Lock out page->mem_cgroup migration to keep PageDirty + * synchronized with per-memcg dirty page counters. */ - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); @@ -1988,13 +1991,13 @@ xfs_vm_set_page_dirty( spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irqrestore(&mapping->tree_lock, flags); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index f6ffc9ae5ceb..a4343c63fb38 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -62,5 +62,6 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); extern void xfs_count_page_state(struct page *, int *, int *); +extern struct block_device *xfs_find_bdev_for_inode(struct inode *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 45ec9e40150c..6c876012b2e5 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -75,7 +75,8 @@ xfs_zero_extent( ssize_t size = XFS_FSB_TO_B(mp, count_fsb); if (IS_DAX(VFS_I(ip))) - return dax_clear_blocks(VFS_I(ip), block, size); + return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), + sector, size); /* * let the block layer decide on the fastest method of diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index daed4bfb85b2..435c7de42e5f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1527,6 +1527,16 @@ xfs_wait_buftarg( LIST_HEAD(dispose); int loop = 0; + /* + * We need to flush the buffer workqueue to ensure that all IO + * completion processing is 100% done. Just waiting on buffer locks is + * not sufficient for async IO as the reference count held over IO is + * not released until after the buffer lock is dropped. Hence we need to + * ensure here that all reference counts have been dropped before we + * start walking the LRU list. + */ + drain_workqueue(btp->bt_mount->m_buf_workqueue); + /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ebe9b8290a70..52883ac3cf84 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -55,7 +55,7 @@ xfs_rw_ilock( int type) { if (type & XFS_IOLOCK_EXCL) - mutex_lock(&VFS_I(ip)->i_mutex); + inode_lock(VFS_I(ip)); xfs_ilock(ip, type); } @@ -66,7 +66,7 @@ xfs_rw_iunlock( { xfs_iunlock(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } static inline void @@ -76,7 +76,7 @@ xfs_rw_ilock_demote( { xfs_ilock_demote(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } /* @@ -1610,9 +1610,8 @@ xfs_filemap_pmd_fault( /* * pfn_mkwrite was originally inteneded to ensure we capture time stamp * updates on write faults. In reality, it's need to serialise against - * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite() - * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault - * barrier in place. + * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED + * to ensure we serialise the fault barrier in place. */ static int xfs_filemap_pfn_mkwrite( @@ -1635,6 +1634,8 @@ xfs_filemap_pfn_mkwrite( size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else if (IS_DAX(inode)) + ret = dax_pfn_mkwrite(vma, vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ae3758a90ed6..ceba1a83cacc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -610,60 +610,69 @@ __xfs_iflock( STATIC uint _xfs_dic2xflags( - __uint16_t di_flags) + __uint16_t di_flags, + uint64_t di_flags2, + bool has_attr) { uint flags = 0; if (di_flags & XFS_DIFLAG_ANY) { if (di_flags & XFS_DIFLAG_REALTIME) - flags |= XFS_XFLAG_REALTIME; + flags |= FS_XFLAG_REALTIME; if (di_flags & XFS_DIFLAG_PREALLOC) - flags |= XFS_XFLAG_PREALLOC; + flags |= FS_XFLAG_PREALLOC; if (di_flags & XFS_DIFLAG_IMMUTABLE) - flags |= XFS_XFLAG_IMMUTABLE; + flags |= FS_XFLAG_IMMUTABLE; if (di_flags & XFS_DIFLAG_APPEND) - flags |= XFS_XFLAG_APPEND; + flags |= FS_XFLAG_APPEND; if (di_flags & XFS_DIFLAG_SYNC) - flags |= XFS_XFLAG_SYNC; + flags |= FS_XFLAG_SYNC; if (di_flags & XFS_DIFLAG_NOATIME) - flags |= XFS_XFLAG_NOATIME; + flags |= FS_XFLAG_NOATIME; if (di_flags & XFS_DIFLAG_NODUMP) - flags |= XFS_XFLAG_NODUMP; + flags |= FS_XFLAG_NODUMP; if (di_flags & XFS_DIFLAG_RTINHERIT) - flags |= XFS_XFLAG_RTINHERIT; + flags |= FS_XFLAG_RTINHERIT; if (di_flags & XFS_DIFLAG_PROJINHERIT) - flags |= XFS_XFLAG_PROJINHERIT; + flags |= FS_XFLAG_PROJINHERIT; if (di_flags & XFS_DIFLAG_NOSYMLINKS) - flags |= XFS_XFLAG_NOSYMLINKS; + flags |= FS_XFLAG_NOSYMLINKS; if (di_flags & XFS_DIFLAG_EXTSIZE) - flags |= XFS_XFLAG_EXTSIZE; + flags |= FS_XFLAG_EXTSIZE; if (di_flags & XFS_DIFLAG_EXTSZINHERIT) - flags |= XFS_XFLAG_EXTSZINHERIT; + flags |= FS_XFLAG_EXTSZINHERIT; if (di_flags & XFS_DIFLAG_NODEFRAG) - flags |= XFS_XFLAG_NODEFRAG; + flags |= FS_XFLAG_NODEFRAG; if (di_flags & XFS_DIFLAG_FILESTREAM) - flags |= XFS_XFLAG_FILESTREAM; + flags |= FS_XFLAG_FILESTREAM; } + if (di_flags2 & XFS_DIFLAG2_ANY) { + if (di_flags2 & XFS_DIFLAG2_DAX) + flags |= FS_XFLAG_DAX; + } + + if (has_attr) + flags |= FS_XFLAG_HASATTR; + return flags; } uint xfs_ip2xflags( - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_icdinode_t *dic = &ip->i_d; + struct xfs_icdinode *dic = &ip->i_d; - return _xfs_dic2xflags(dic->di_flags) | - (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } uint xfs_dic2xflags( - xfs_dinode_t *dip) + struct xfs_dinode *dip) { - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | - (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), + be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); } /* @@ -862,7 +871,8 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint di_flags = 0; + uint64_t di_flags2 = 0; + uint di_flags = 0; if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) @@ -898,7 +908,11 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + ip->i_d.di_flags |= di_flags; + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42738deec6d..478d04e07f95 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -859,25 +859,25 @@ xfs_merge_ioc_xflags( unsigned int xflags = start; if (flags & FS_IMMUTABLE_FL) - xflags |= XFS_XFLAG_IMMUTABLE; + xflags |= FS_XFLAG_IMMUTABLE; else - xflags &= ~XFS_XFLAG_IMMUTABLE; + xflags &= ~FS_XFLAG_IMMUTABLE; if (flags & FS_APPEND_FL) - xflags |= XFS_XFLAG_APPEND; + xflags |= FS_XFLAG_APPEND; else - xflags &= ~XFS_XFLAG_APPEND; + xflags &= ~FS_XFLAG_APPEND; if (flags & FS_SYNC_FL) - xflags |= XFS_XFLAG_SYNC; + xflags |= FS_XFLAG_SYNC; else - xflags &= ~XFS_XFLAG_SYNC; + xflags &= ~FS_XFLAG_SYNC; if (flags & FS_NOATIME_FL) - xflags |= XFS_XFLAG_NOATIME; + xflags |= FS_XFLAG_NOATIME; else - xflags &= ~XFS_XFLAG_NOATIME; + xflags &= ~FS_XFLAG_NOATIME; if (flags & FS_NODUMP_FL) - xflags |= XFS_XFLAG_NODUMP; + xflags |= FS_XFLAG_NODUMP; else - xflags &= ~XFS_XFLAG_NODUMP; + xflags &= ~FS_XFLAG_NODUMP; return xflags; } @@ -945,40 +945,51 @@ xfs_set_diflags( unsigned int xflags) { unsigned int di_flags; + uint64_t di_flags2; /* can't set PREALLOC this way, just preserve it */ di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); - if (xflags & XFS_XFLAG_IMMUTABLE) + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) + if (xflags & FS_XFLAG_APPEND) di_flags |= XFS_DIFLAG_APPEND; - if (xflags & XFS_XFLAG_SYNC) + if (xflags & FS_XFLAG_SYNC) di_flags |= XFS_DIFLAG_SYNC; - if (xflags & XFS_XFLAG_NOATIME) + if (xflags & FS_XFLAG_NOATIME) di_flags |= XFS_DIFLAG_NOATIME; - if (xflags & XFS_XFLAG_NODUMP) + if (xflags & FS_XFLAG_NODUMP) di_flags |= XFS_DIFLAG_NODUMP; - if (xflags & XFS_XFLAG_NODEFRAG) + if (xflags & FS_XFLAG_NODEFRAG) di_flags |= XFS_DIFLAG_NODEFRAG; - if (xflags & XFS_XFLAG_FILESTREAM) + if (xflags & FS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; if (S_ISDIR(ip->i_d.di_mode)) { - if (xflags & XFS_XFLAG_RTINHERIT) + if (xflags & FS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; - if (xflags & XFS_XFLAG_NOSYMLINKS) + if (xflags & FS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (xflags & XFS_XFLAG_EXTSZINHERIT) + if (xflags & FS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; - if (xflags & XFS_XFLAG_PROJINHERIT) + if (xflags & FS_XFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(ip->i_d.di_mode)) { - if (xflags & XFS_XFLAG_REALTIME) + if (xflags & FS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; - if (xflags & XFS_XFLAG_EXTSIZE) + if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; + + /* diflags2 only valid for v3 inodes. */ + if (ip->i_d.di_version < 3) + return; + + di_flags2 = 0; + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 = di_flags2; + } STATIC void @@ -988,22 +999,27 @@ xfs_diflags_to_linux( struct inode *inode = VFS_I(ip); unsigned int xflags = xfs_ip2xflags(ip); - if (xflags & XFS_XFLAG_IMMUTABLE) + if (xflags & FS_XFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) + if (xflags & FS_XFLAG_APPEND) inode->i_flags |= S_APPEND; else inode->i_flags &= ~S_APPEND; - if (xflags & XFS_XFLAG_SYNC) + if (xflags & FS_XFLAG_SYNC) inode->i_flags |= S_SYNC; else inode->i_flags &= ~S_SYNC; - if (xflags & XFS_XFLAG_NOATIME) + if (xflags & FS_XFLAG_NOATIME) inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; + if (xflags & FS_XFLAG_DAX) + inode->i_flags |= S_DAX; + else + inode->i_flags &= ~S_DAX; + } static int @@ -1016,11 +1032,11 @@ xfs_ioctl_setattr_xflags( /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) + XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) return -EINVAL; /* If realtime flag is set then must have realtime device */ - if (fa->fsx_xflags & XFS_XFLAG_REALTIME) { + if (fa->fsx_xflags & FS_XFLAG_REALTIME) { if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) return -EINVAL; @@ -1031,7 +1047,7 @@ xfs_ioctl_setattr_xflags( * we have appropriate permission. */ if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || - (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; @@ -1095,8 +1111,8 @@ out_cancel: * extent size hint validation is somewhat cumbersome. Rules are: * * 1. extent size hint is only valid for directories and regular files - * 2. XFS_XFLAG_EXTSIZE is only valid for regular files - * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories. + * 2. FS_XFLAG_EXTSIZE is only valid for regular files + * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. * 4. can only be changed on regular files if no extents are allocated * 5. can be changed on directories at any time * 6. extsize hint of 0 turns off hints, clears inode flags. @@ -1112,10 +1128,10 @@ xfs_ioctl_setattr_check_extsize( { struct xfs_mount *mp = ip->i_mount; - if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) + if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) return -EINVAL; - if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) && + if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && !S_ISDIR(ip->i_d.di_mode)) return -EINVAL; @@ -1132,7 +1148,7 @@ xfs_ioctl_setattr_check_extsize( return -EINVAL; if (XFS_IS_REALTIME_INODE(ip) || - (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + (fa->fsx_xflags & FS_XFLAG_REALTIME)) { size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; } else { size = mp->m_sb.sb_blocksize; @@ -1143,7 +1159,7 @@ xfs_ioctl_setattr_check_extsize( if (fa->fsx_extsize % size) return -EINVAL; } else - fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT); + fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); return 0; } @@ -1168,7 +1184,7 @@ xfs_ioctl_setattr_check_projid( if (xfs_get_projid(ip) != fa->fsx_projid) return -EINVAL; - if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) != + if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) != (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) return -EINVAL; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 06eafafe636e..76b71a1c6c32 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1205,8 +1205,8 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - /* XXX: Also needs an on-disk per inode flag! */ - if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + if (ip->i_mount->m_flags & XFS_MOUNT_DAX || + ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) inode->i_flags |= S_DAX; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index da37beb76f6e..be5568839442 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1109,27 +1109,10 @@ xlog_verify_head( bool tmp_wrapped; /* - * Search backwards through the log looking for the log record header - * block. This wraps all the way back around to the head so something is - * seriously wrong if we can't find it. - */ - found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk, - rhead, wrapped); - if (found < 0) - return found; - if (!found) { - xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); - return -EIO; - } - - *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); - - /* - * Now that we have a tail block, check the head of the log for torn - * writes. Search again until we hit the tail or the maximum number of - * log record I/Os that could have been in flight at one time. Use a - * temporary buffer so we don't trash the rhead/bp pointer from the - * call above. + * Check the head of the log for torn writes. Search backwards from the + * head until we hit the tail or the maximum number of log record I/Os + * that could have been in flight at one time. Use a temporary buffer so + * we don't trash the rhead/bp pointers from the caller. */ tmp_bp = xlog_get_bp(log, 1); if (!tmp_bp) @@ -1216,6 +1199,115 @@ xlog_verify_head( } /* + * Check whether the head of the log points to an unmount record. In other + * words, determine whether the log is clean. If so, update the in-core state + * appropriately. + */ +static int +xlog_check_unmount_rec( + struct xlog *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk, + struct xlog_rec_header *rhead, + xfs_daddr_t rhead_blk, + struct xfs_buf *bp, + bool *clean) +{ + struct xlog_op_header *op_head; + xfs_daddr_t umount_data_blk; + xfs_daddr_t after_umount_blk; + int hblks; + int error; + char *offset; + + *clean = false; + + /* + * Look for unmount record. If we find it, then we know there was a + * clean unmount. Since 'i' could be the last block in the physical + * log, we convert to a log block before comparing to the head_blk. + * + * Save the current tail lsn to use to pass to xlog_clear_stale_blocks() + * below. We won't want to clear the unmount record if there is one, so + * we pass the lsn of the unmount record rather than the block after it. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rhead->h_size); + int h_version = be32_to_cpu(rhead->h_version); + + if ((h_version & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + } else { + hblks = 1; + } + } else { + hblks = 1; + } + after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); + after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); + if (*head_blk == after_umount_blk && + be32_to_cpu(rhead->h_num_logops) == 1) { + umount_data_blk = rhead_blk + hblks; + umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); + error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + if (error) + return error; + + op_head = (struct xlog_op_header *)offset; + if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { + /* + * Set tail and last sync so that newly written log + * records will point recovery to after the current + * unmount record. + */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); + *tail_blk = after_umount_blk; + + *clean = true; + } + } + + return 0; +} + +static void +xlog_set_state( + struct xlog *log, + xfs_daddr_t head_blk, + struct xlog_rec_header *rhead, + xfs_daddr_t rhead_blk, + bool bump_cycle) +{ + /* + * Reset log values according to the state of the log when we + * crashed. In the case where head_blk == 0, we bump curr_cycle + * one because the next write starts a new cycle rather than + * continuing the cycle of the last good log record. At this + * point we have guaranteed that all partial log records have been + * accounted for. Therefore, we know that the last good log record + * written was complete and ended exactly on the end boundary + * of the physical log. + */ + log->l_prev_block = rhead_blk; + log->l_curr_block = (int)head_blk; + log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); + if (bump_cycle) + log->l_curr_cycle++; + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); +} + +/* * Find the sync block number or the tail of the log. * * This will be the block number of the last record to have its @@ -1238,22 +1330,20 @@ xlog_find_tail( xfs_daddr_t *tail_blk) { xlog_rec_header_t *rhead; - xlog_op_header_t *op_head; char *offset = NULL; xfs_buf_t *bp; int error; - xfs_daddr_t umount_data_blk; - xfs_daddr_t after_umount_blk; xfs_daddr_t rhead_blk; xfs_lsn_t tail_lsn; - int hblks; bool wrapped = false; + bool clean = false; /* * Find previous log record */ if ((error = xlog_find_head(log, head_blk))) return error; + ASSERT(*head_blk < INT_MAX); bp = xlog_get_bp(log, 1); if (!bp) @@ -1271,100 +1361,75 @@ xlog_find_tail( } /* - * Trim the head block back to skip over torn records. We can have - * multiple log I/Os in flight at any time, so we assume CRC failures - * back through the previous several records are torn writes and skip - * them. + * Search backwards through the log looking for the log record header + * block. This wraps all the way back around to the head so something is + * seriously wrong if we can't find it. */ - ASSERT(*head_blk < INT_MAX); - error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk, - &rhead, &wrapped); - if (error) - goto done; + error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, + &rhead_blk, &rhead, &wrapped); + if (error < 0) + return error; + if (!error) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + return -EIO; + } + *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); /* - * Reset log values according to the state of the log when we - * crashed. In the case where head_blk == 0, we bump curr_cycle - * one because the next write starts a new cycle rather than - * continuing the cycle of the last good log record. At this - * point we have guaranteed that all partial log records have been - * accounted for. Therefore, we know that the last good log record - * written was complete and ended exactly on the end boundary - * of the physical log. + * Set the log state based on the current head record. */ - log->l_prev_block = rhead_blk; - log->l_curr_block = (int)*head_blk; - log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); - if (wrapped) - log->l_curr_cycle++; - atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); - atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); - xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); - xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); + xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped); + tail_lsn = atomic64_read(&log->l_tail_lsn); /* - * Look for unmount record. If we find it, then we know there - * was a clean unmount. Since 'i' could be the last block in - * the physical log, we convert to a log block before comparing - * to the head_blk. + * Look for an unmount record at the head of the log. This sets the log + * state to determine whether recovery is necessary. + */ + error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead, + rhead_blk, bp, &clean); + if (error) + goto done; + + /* + * Verify the log head if the log is not clean (e.g., we have anything + * but an unmount record at the head). This uses CRC verification to + * detect and trim torn writes. If discovered, CRC failures are + * considered torn writes and the log head is trimmed accordingly. * - * Save the current tail lsn to use to pass to - * xlog_clear_stale_blocks() below. We won't want to clear the - * unmount record if there is one, so we pass the lsn of the - * unmount record rather than the block after it. + * Note that we can only run CRC verification when the log is dirty + * because there's no guarantee that the log data behind an unmount + * record is compatible with the current architecture. */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - int h_size = be32_to_cpu(rhead->h_size); - int h_version = be32_to_cpu(rhead->h_version); + if (!clean) { + xfs_daddr_t orig_head = *head_blk; - if ((h_version & XLOG_VERSION_2) && - (h_size > XLOG_HEADER_CYCLE_SIZE)) { - hblks = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - hblks++; - } else { - hblks = 1; - } - } else { - hblks = 1; - } - after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); - after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); - tail_lsn = atomic64_read(&log->l_tail_lsn); - if (*head_blk == after_umount_blk && - be32_to_cpu(rhead->h_num_logops) == 1) { - umount_data_blk = rhead_blk + hblks; - umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); - error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + error = xlog_verify_head(log, head_blk, tail_blk, bp, + &rhead_blk, &rhead, &wrapped); if (error) goto done; - op_head = (xlog_op_header_t *)offset; - if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { - /* - * Set tail and last sync so that newly written - * log records will point recovery to after the - * current unmount record. - */ - xlog_assign_atomic_lsn(&log->l_tail_lsn, - log->l_curr_cycle, after_umount_blk); - xlog_assign_atomic_lsn(&log->l_last_sync_lsn, - log->l_curr_cycle, after_umount_blk); - *tail_blk = after_umount_blk; - - /* - * Note that the unmount was clean. If the unmount - * was not clean, we need to know this to rebuild the - * superblock counters from the perag headers if we - * have a filesystem using non-persistent counters. - */ - log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + /* update in-core state again if the head changed */ + if (*head_blk != orig_head) { + xlog_set_state(log, *head_blk, rhead, rhead_blk, + wrapped); + tail_lsn = atomic64_read(&log->l_tail_lsn); + error = xlog_check_unmount_rec(log, head_blk, tail_blk, + rhead, rhead_blk, bp, + &clean); + if (error) + goto done; } } /* + * Note that the unmount was clean. If the unmount was not clean, we + * need to know this to rebuild the superblock counters from the perag + * headers if we have a filesystem using non-persistent counters. + */ + if (clean) + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + + /* * Make sure that there are no blocks in front of the head * with the same cycle number as the head. This can happen * because we allow multiple outstanding log writes concurrently, @@ -4491,7 +4556,7 @@ xlog_recover_process( * know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc)) + if (rhead->h_crc && crc != rhead->h_crc) return -EFSBADCRC; return 0; } @@ -4502,7 +4567,7 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != le32_to_cpu(rhead->h_crc)) { + if (crc != rhead->h_crc) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index dc6221942b85..ade236e90bb3 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -42,11 +42,11 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; if (with_imutex) - mutex_lock(&inode->i_mutex); + inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index aa67339b9537..4f18fd92ca13 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -497,7 +497,6 @@ xfsaild( long tout = 0; /* milliseconds */ current->flags |= PF_MEMALLOC; - set_freezable(); while (!kthread_should_stop()) { if (tout && tout <= 20) |